# 資治通鑑 唐記 topic modeling

This notebook loads a corpus of the 資治通鑑 唐記 to experiment with different settings for topic modeling. The data were created by the project "Toward an Overall Inheritance and Development of Kanji Culture, East Asian Center for Informatics in Humanities, the 21st Century COE, Kyoto University" (http://coe21.zinbun.kyoto-u.ac.jp/).


In [None]:
# Tokens that are discarded during tokenization.  All active entries are
# single characters (particles, numerals and other very common characters).
# The commented-out groups -- month names at the top and the sixty
# sexagenary-cycle names at the bottom -- can be re-enabled to filter
# date expressions as well.
stopwords = [
    # u'一月', u'正月', u'二月', u'三月', u'四月', u'五月', u'六月',
    # u'七月', u'八月', u'九月', u'十月', u'十一月', u'十二月',
    u'之', u'是', u'于', u'元', u'哉', u'還', u'甚', u'氏', u'焉', u'不', u'與',
    u'在', u'外', u'也', u'夫', u'非', u'稱', u'左', u'以', u'可', u'六',
    u'雖', u'屬', u'己', u'興', u'千', u'而', u'五', u'諸', u'足', u'邪',
    u'耳', u'亦', u'其', u'將', u'又', u'九', u'然', u'高', u'終', u'首',
    u'益', u'通', u'常', u'七', u'曰', u'何', u'若', u'內', u'女', u'遠',
    u'由', u'應', u'對', u'過', u'方', u'者', u'至', u'及', u'當', u'右',
    u'盡', u'共', u'十', u'所', u'此', u'乃', u'子', u'四', u'未', u'去',
    u'敢', u'異', u'徒', u'則', u'故', u'太', u'百', u'有', u'矣', u'萬',
    u'北', u'前', u'進', u'任', u'無', u'三', u'謂', u'皆', u'於', u'自',
    u'吾', u'來', u'易', u'初', u'更', u'一', u'二', u'如', u'乎',
    # u'甲子', u'乙丑', u'丙寅', u'丁卯', u'戊辰', u'己巳', u'庚午', u'辛未', u'壬申',
    # u'癸酉', u'甲戌', u'乙亥', u'丙子', u'丁丑', u'戊寅', u'己卯', u'庚辰',
    # u'辛巳', u'壬午', u'癸未', u'甲申', u'乙酉', u'丙戌', u'丁亥', u'戊子',
    # u'己丑', u'庚寅', u'辛卯', u'壬辰', u'癸巳', u'甲午', u'乙未', u'丙申',
    # u'丁酉', u'戊戌', u'己亥', u'庚子', u'辛丑', u'壬寅', u'癸卯', u'甲辰',
    # u'乙巳', u'丙午', u'丁未', u'戊申', u'己酉', u'庚戌', u'辛亥', u'壬子',
    # u'癸丑', u'甲寅', u'乙卯', u'丙辰', u'丁巳', u'戊午', u'己未', u'庚申',
    # u'辛酉', u'壬戌', u'癸亥',
]
# Single-argument parenthesized print: same output on Python 2, and it
# matches the print(...) calls used in the later cells.
print(",".join(stopwords))

The following code prepares for loading the corpus.

In [None]:
from __future__ import division
import gensim, mmseg, os, codecs, re
from collections import defaultdict
from matplotlib import pyplot as plt

import logging
# Show INFO-level log records with a timestamp (presumably so that progress
# messages logged by the libraries used below become visible).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Matches any single code point in U+3001-U+33FF or U+FF00-U+FF7F; tokens
# containing such a character are dropped by the tokenizer.  Note that
# U+3000 (ideographic space) is just outside the first range.
# Written as a plain u"" literal rather than ur"": the pattern contains only
# \u escapes, so the resulting string is identical on Python 2, and the ur
# prefix is a syntax error on Python 3.
punc_re = re.compile(u"[\u3001-\u33FF\uFF00-\uFF7F]")

def mmseg_split(s, normalize_ent=False):
    """Tokenize one piece of corpus text into a list of tokens.

    The input is split on whitespace.  A chunk that contains "@" is treated
    as an already identified named entity: only the text after its last "@"
    is kept, and the chunk is not segmented any further.  Every other chunk
    is segmented with mmseg; segments that match ``punc_re`` or that are
    listed in ``stopwords`` are discarded.

    ``normalize_ent`` is accepted but not used by this function.
    """
    tokens = []
    for chunk in s.split():
        if "@" not in chunk:
            # Ordinary text: let mmseg segment it, then filter the segments.
            for tok in mmseg.Algorithm(chunk):
                text = tok.text
                if punc_re.search(text) or text in stopwords:
                    continue
                tokens.append(text)
            continue
        # Marked-up entity: keep whatever follows the final "@".
        tokens.append(chunk.rpartition("@")[2])
    return tokens

# Labels of the form "<year>-<paragraph no.>" for the documents produced by
# load_documents(); entry i labels document i of the returned list.
doclabel = []
def load_documents(files):
    """Read corpus files and split them into tokenized paragraphs.

    Lines starting with '*' or '#' are markup and never contribute tokens:

    * a markup line containing ' year' sets the current year to its last
      whitespace-separated field and resets the paragraph counter;
    * any other markup line containing ' p' starts a new paragraph;
    * all remaining markup lines are ignored.

    Every other line is tokenized with ``mmseg_split`` and added to the
    current paragraph.

    Args:
        files: paths of UTF-8 encoded corpus files, read in the given order.

    Returns:
        A list of token lists.  For each file it holds one entry for the
        text preceding the first paragraph marker (possibly empty) followed
        by one entry per paragraph marker.

    Side effects:
        Appends exactly one label per returned document to the module-level
        ``doclabel`` list, so ``doclabel[i]`` describes ``ret[i]`` as long
        as ``doclabel`` was empty before the first call.  Text preceding
        the first paragraph marker of a file is labelled with paragraph
        number 00.
    """
    ret = []
    # Placeholder so a paragraph marker that precedes any year marker gets a
    # label instead of raising on an unbound name.
    y = "unknown"
    for f in files:
        r1 = []
        pcnt = 0
        # Label of the paragraph currently collected in r1; None while we
        # are still in the text that precedes the first paragraph marker.
        label = None
        # The context manager closes the file even if tokenizing fails.
        with codecs.open(f, 'r', 'utf-8') as fh:
            for line in fh:
                if line.startswith(('*', '#')):
                    if ' year' in line:
                        # split() already ignores the trailing newline.
                        y = line.split()[-1]
                        pcnt = 0
                    elif ' p' in line:
                        # Close the paragraph collected so far under *its
                        # own* label, then open the next one.
                        ret.append(r1)
                        doclabel.append(label if label is not None
                                        else "%s-%2.2d" % (y, 0))
                        pcnt += 1
                        label = "%s-%2.2d" % (y, pcnt)
                        r1 = []
                    continue
                r1.extend(mmseg_split(line))
        # The last paragraph of the file is closed by end-of-file.
        ret.append(r1)
        doclabel.append(label if label is not None else "%s-%2.2d" % (y, 0))
    return ret

# Root of the local working tree; the corpus files and the segmenter's
# dictionary files are located relative to it.
kstm_base = '/home/chris/00scratch/kansekitm'
corpus_base = '{0}/corpus/zztj'.format(kstm_base)

# Load the character list and the word list used for segmentation.
# NOTE(review): `t` is never passed to mmseg.Algorithm, so loading
# presumably registers the data globally inside mmseg -- confirm against
# the mmseg binding in use.
t = mmseg.Dictionary()
t.load_chars('{0}/dic/chars.dic'.format(kstm_base))
t.load_words('{0}/dic/words.dic'.format(kstm_base))
    

Now we load the corpus and produce the frequency table.

In [None]:
# Uncomment the next line to tokenize without any stopword filtering.
# stopwords=[]
print("Number of stopwords: %d" % len(stopwords))
cut_off = 20  # how many of the most frequent tokens to list below

# Read every corpus file whose name starts with 'zztj', in sorted order.
files = sorted(os.listdir(corpus_base))
files = ["%s/%s" % (corpus_base, f) for f in files if f.startswith('zztj')]
texts = load_documents(files)
print("Loaded %d documents (paragraphs)" % len(texts))

# Corpus-wide number of occurrences of every token.
frequency = defaultdict(int)
for p in texts:
    for token in p:
        frequency[token] += 1

fsum = sum(frequency.values())  # total number of tokens
# Total number of characters: token length weighted by its count.
csum = sum(len(tok) * cnt for tok, cnt in frequency.items())
# Formatted so that the output matches the former two-item print statement
# (which separated the items with an additional space).
print("%s %s" % ("Total # of characters: ", csum))
# Most frequent first; ties are ordered by token, descending.
fq = sorted(frequency.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
print("%d most frequent characters:" % cut_off)
for f in fq[0:cut_off]:
    # Relies on true division (`from __future__ import division` on
    # Python 2) to report the share of all tokens in percent.
    print("%s\t%5.5d\t%2.2f" % (f[0], f[1], (f[1] / fsum) * 100))



In [None]:
# Spot check: print the label and the token sequence stored at index `doc`.
doc = 10
print(doclabel[doc])
print(" ".join(texts[doc]))

In [None]:
# Reduced copy of the corpus: within every document keep only the tokens
# whose corpus-wide count lies strictly between min_freq and max_freq.
min_freq = 1
max_freq = 5000
red_texts = []
for text in texts:
    red_texts.append(
        [token for token in text if min_freq < frequency[token] < max_freq]
    )


In [None]:
# Build the vocabulary from the frequency-filtered texts, i.e. from the same
# data the bag-of-words corpus is built from.  Building it from the
# unfiltered `texts` would keep ids for tokens that can never occur in
# `corpus`, inflating both the reported vocabulary size and the model.
dictionary = gensim.corpora.Dictionary(red_texts)
# Optional further pruning: drop words that occur in fewer than 20
# documents or in more than 50% of the documents.
# dictionary.filter_extremes(no_below=20, no_above=0.5)

# One bag-of-words vector per (filtered) document.
corpus = [dictionary.doc2bow(text) for text in red_texts]
print('Number of unique tokens: %d' % len(dictionary))



In [None]:
# Train an LDA topic model on the bag-of-words corpus.
# The statement is prefixed with the IPython line magic %time, which reports
# how long it takes; this cell therefore only runs under IPython/Jupyter.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10  # number of topics to fit
chunksize = 2000  # see the gensim LdaModel docs for chunksize/passes/iterations
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index-to-word dictionary.
# NOTE(review): dictionary[0] assumes the dictionary is non-empty -- confirm.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)



In [None]:
# Report the average coherence and list the words of every topic.
# Each entry of top_topics is indexed as (topic representation, coherence);
# the second element of each item in the representation is the word.
top_topics = model.top_topics(corpus)
avg_topic_coherence = sum([topic[1] for topic in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
# The loop variable is deliberately not called `t`, so the module-level
# mmseg dictionary `t` is not overwritten by running this cell.
for i, topic in enumerate(top_topics):
    # Same output as the former two-item print statement: index, space, words.
    print("%d %s" % (i, ",".join([z[1] for z in topic[0]])))