In [1]:
# import packages
import pandas as pd
import os
import csv
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.models import ldaseqmodel
import time
from gensim.models.wrappers import DtmModel
import pickle
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Load the raw articles; the CSV has no header row, so the columns are named here.
all_data = pd.read_csv('./all_data.csv', header=None)
all_data.columns = ["year", "title", "article"]
# Positional labels 0..n-1 that become the index after sorting by year
new_index = np.arange(len(all_data))
all_data = (
    all_data
    .sort_values(by=['year'])        # chronological order
    .assign(new_index=new_index)     # number the sorted rows 0..n-1
    .set_index('new_index')          # and use that numbering as the index
)

In [3]:
# construct the stop word list: one entry per line of the file, surrounding whitespace stripped
# `with` guarantees the file handle is closed as soon as the list has been built
with open("stop_words_copy2.txt", encoding='UTF-8') as stop_words_file:
    stopwords = [line.strip() for line in stop_words_file]

In [4]:
# construct the function that preprocesses the text
def preprocess(text):
    '''
    Tokenize a text into uni-grams and drop the stop words.

    The text is split with gensim.utils.tokenize and every token that
    appears in the module-level `stopwords` list is removed.
    NOTE(review): removal of numbers and punctuation is left entirely to
    the tokenizer; confirm against the gensim documentation.

    input:
        text: text for preprocessing (str)
    output: a list of the remaining tokens, in their original order
    '''
    # Build a set once per call so each membership test is O(1) instead of
    # scanning the whole stop word list for every single token.
    stopword_set = set(stopwords)
    return [token for token in gensim.utils.tokenize(text) if token not in stopword_set]

In [5]:
# preprocess the data: run preprocess() on every entry of the 'article' column
# NOTE: despite its name, processed_df is the result of Series.map (one token list per article)
processed_df = all_data['article'].map(preprocess)

In [6]:
# Prepare the inputs for the DTM model.
# Dictionary built from the token lists of all preprocessed articles
dic_all = corpora.Dictionary(processed_df)

# Corpus: the doc2bow (bag-of-words) representation of each article, in order
corpus_all = list(map(dic_all.doc2bow, processed_df))

In [7]:
# set the time slice: number of documents in each consecutive period.
# all_data is sorted by year, so every period is a contiguous range of rows.
# Row ranges below were read off by inspecting the rows for the boundary years
# (1978, 1989, 2003) -- NOTE(review): re-derive them if all_data.csv changes.
#   period 0: rows   0-70   (through 1978)  ->  71 documents
#   period 1: rows  71-876  (through 1989)  -> 806 documents
#   period 2: rows 877-1811 (through 2003)  -> 935 documents
time_slice = [71, 806, 935]
# every document has to fall into exactly one slice
assert sum(time_slice) == len(all_data), "time_slice does not cover all documents"

In [77]:
# location of the external DTM executable; passed to DtmModel as its first argument in dtm_model()
path_to_dtm_binary = "./dtm-linux64.dms"

In [78]:
# Construct the function that builds the DTM model
def dtm_model(corpus, dictionary, num_topics, time_slice):
    '''
    Build a DtmModel with `num_topics` topics from the corpus and dictionary.
    The binary at the module-level `path_to_dtm_binary` is used and the
    rng_seed is fixed to 100.
    input:
        corpus: the Term Document Frequency corpus used to build the model
        dictionary: the dictionary handed to the model as id2word
        num_topics: number of topics
        time_slice: handed to the model as time_slices
    output: the DtmModel object
    '''
    return DtmModel(
        path_to_dtm_binary,
        corpus=corpus,
        time_slices=time_slice,
        num_topics=num_topics,
        id2word=dictionary,
        rng_seed=100,
    )

In [8]:
# fit the model: build the 10-topic DTM via dtm_model() and print the elapsed wall-clock seconds
# NOTE(review): the saved output of this cell is a NameError for dtm_model; presumably it was
# last run in a kernel where the cell defining dtm_model had not been executed -- re-run to confirm
start = time.time()
dtm_10 = dtm_model(corpus_all, dic_all, 10, time_slice)
end = time.time()
print(end - start)

NameError: name 'dtm_model' is not defined

In [35]:
# load a previously saved model from disk; `with` closes the file handle afterwards
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself
with open("ldaseq_model.sav", 'rb') as model_file:
    dtm_5 = pickle.load(model_file)

In [70]:
# u_mass coherence of the topics of dtm_5 at time slice 1
topics_dtm_5 = dtm_5.dtm_coherence(time=1)
cm_DTM = CoherenceModel(
    topics=topics_dtm_5,
    corpus=corpus_all,
    dictionary=dic_all,
    coherence='u_mass',
)
print("U_mass topic coherence")
print("DTM Python coherence is", cm_DTM.get_coherence())

U_mass topic coherence
DTM Python coherence is -1.0184342296330713


In [9]:
# fit the 10-topic model and print the elapsed wall-clock seconds
# (the recorded run below took about 17646 s, so avoid re-running this cell casually)
# NOTE(review): no random seed is passed here -- confirm whether results are reproducible across runs
start = time.time()
ldaseq_10 = ldaseqmodel.LdaSeqModel(corpus=corpus_all, id2word=dic_all, time_slice=time_slice, num_topics=10)
end = time.time()
print(end - start)

  convergence = np.fabs((bound - old_bound) / old_bound)


17645.63276195526


In [11]:
# save model; the `with` block flushes and closes the file before it is read back
with open("ldaseq_model_10.sav", 'wb') as model_file:
    pickle.dump(ldaseq_10, model_file)
# check: read the file back to confirm that it can be unpickled
with open("ldaseq_model_10.sav", 'rb') as model_file:
    loaded_model_10 = pickle.load(model_file)

In [19]:
# calculate the u_mass coherence score of the ldaseq_10 topics at time slice 2
topics_dtm_10 = ldaseq_10.dtm_coherence(time=2)
cm_DTM = CoherenceModel(
    topics=topics_dtm_10,
    corpus=corpus_all,
    dictionary=dic_all,
    coherence='u_mass',
)
print("U_mass topic coherence")
print("DTM Python coherence is", cm_DTM.get_coherence())

U_mass topic coherence
DTM Python coherence is -1.1009556066645674


In [20]:
# print the first period (pre 1979)
# (word, weight) pairs for every topic at time slice 0
first_period = ldaseq_10.print_topics(time=0)
# bare name as the last expression so the notebook displays the result
first_period

[[('人口', 0.07044011391455908),
  ('发展', 0.03212724088126665),
  ('经济', 0.015670058053402644),
  ('增长', 0.014933569531724188),
  ('社会', 0.013791119278192515),
  ('问题', 0.013208117751693315),
  ('资源', 0.006955607605569462),
  ('世界', 0.006891711411821534),
  ('城市', 0.00688562063825718),
  ('中国', 0.006571154126188291),
  ('环境', 0.005985000851009634),
  ('持续', 0.005965706589814484),
  ('提高', 0.0058253272788478215),
  ('地区', 0.004767395138293136),
  ('增加', 0.004737527107598841),
  ('水平', 0.004676346459132804),
  ('高', 0.004646891119548609),
  ('占', 0.004602968749656407),
  ('国家', 0.004487546871806497),
  ('解决', 0.004392143942600883)],
 [('人口', 0.016073668780779326),
  ('人', 0.015775779112891155),
  ('生产', 0.012209643140856794),
  ('我国', 0.009669379097010917),
  ('增加', 0.00961450029840283),
  ('平均', 0.0089777256696764),
  ('增长', 0.008877390679896183),
  ('生育', 0.007732360418673885),
  ('问题', 0.007370004387919109),
  ('孩子', 0.007100062730931866),
  ('生', 0.006701694307698201),
  ('岁', 0.006605

In [22]:
# print topics of the second time period (1980s)
# (word, weight) pairs for every topic at time slice 1
second_period = ldaseq_10.print_topics(time=1)
# bare name as the last expression so the notebook displays the result
second_period

[[('人口', 0.07215431864841855),
  ('发展', 0.03260532145200407),
  ('经济', 0.015755767854712253),
  ('增长', 0.014659310405805424),
  ('社会', 0.013901865836030983),
  ('问题', 0.013140379658351851),
  ('资源', 0.007098475275367026),
  ('城市', 0.006912530915946427),
  ('世界', 0.006856742438397656),
  ('中国', 0.006613659104759338),
  ('持续', 0.006165262699322873),
  ('环境', 0.006097832098788117),
  ('提高', 0.005866733151001197),
  ('增加', 0.004959327289923502),
  ('占', 0.00484976406624263),
  ('高', 0.004837042895461679),
  ('地区', 0.0047998890575028565),
  ('水平', 0.004695687632224133),
  ('国家', 0.004432862482225496),
  ('解决', 0.004377356065118538)],
 [('人口', 0.01619583608188276),
  ('人', 0.015898964629489563),
  ('生产', 0.0123421851201752),
  ('增加', 0.009808554152347607),
  ('我国', 0.009745107279821951),
  ('平均', 0.009075860221834053),
  ('增长', 0.008990422533611512),
  ('问题', 0.007342687825174518),
  ('生育', 0.007294821094062583),
  ('孩子', 0.007179389833001094),
  ('生', 0.006783658134194607),
  ('岁', 0.006716

In [23]:
# print topics of the third time period (1990s)
# (word, weight) pairs for every topic at time slice 2
third_period = ldaseq_10.print_topics(time=2)
# bare name as the last expression so the notebook displays the result
third_period

[[('人口', 0.058179073956352585),
  ('发展', 0.03388559130402996),
  ('经济', 0.016202032967118186),
  ('社会', 0.014344453120308812),
  ('问题', 0.013293729284627527),
  ('增长', 0.012932735834688245),
  ('资源', 0.007548870888007063),
  ('城市', 0.007077591195087214),
  ('世界', 0.00695579768799344),
  ('中国', 0.006851334095542709),
  ('持续', 0.0066642881087955585),
  ('环境', 0.0064073591097044205),
  ('提高', 0.006044765821354888),
  ('地区', 0.0049503839711663895),
  ('水平', 0.004812554636382315),
  ('增加', 0.0047798280318509904),
  ('占', 0.0047449484211176465),
  ('高', 0.004576730208087994),
  ('亿', 0.004469229301025177),
  ('国家', 0.004455910295390735)],
 [('人口', 0.016309306938904682),
  ('人', 0.016017790848713662),
  ('生产', 0.012448237899621776),
  ('增加', 0.009949797808331853),
  ('我国', 0.009700763226634065),
  ('平均', 0.009161679735612153),
  ('增长', 0.009085982232539109),
  ('生育', 0.007427853493431441),
  ('问题', 0.0073376839732916845),
  ('孩子', 0.007253596718148721),
  ('生', 0.006858620259593076),
  ('岁', 

In [24]:
# print topic evolution
# evolution of 1st topic (index 0): one list of (word, weight) pairs per time slice
first_topic = ldaseq_10.print_topic_times(topic=0) 
# bare name as the last expression so the notebook displays the result
first_topic

[[('人口', 0.07044011391455908),
  ('发展', 0.03212724088126665),
  ('经济', 0.015670058053402644),
  ('增长', 0.014933569531724188),
  ('社会', 0.013791119278192515),
  ('问题', 0.013208117751693315),
  ('资源', 0.006955607605569462),
  ('世界', 0.006891711411821534),
  ('城市', 0.00688562063825718),
  ('中国', 0.006571154126188291),
  ('环境', 0.005985000851009634),
  ('持续', 0.005965706589814484),
  ('提高', 0.0058253272788478215),
  ('地区', 0.004767395138293136),
  ('增加', 0.004737527107598841),
  ('水平', 0.004676346459132804),
  ('高', 0.004646891119548609),
  ('占', 0.004602968749656407),
  ('国家', 0.004487546871806497),
  ('解决', 0.004392143942600883)],
 [('人口', 0.07215431864841855),
  ('发展', 0.03260532145200407),
  ('经济', 0.015755767854712253),
  ('增长', 0.014659310405805424),
  ('社会', 0.013901865836030983),
  ('问题', 0.013140379658351851),
  ('资源', 0.007098475275367026),
  ('城市', 0.006912530915946427),
  ('世界', 0.006856742438397656),
  ('中国', 0.006613659104759338),
  ('持续', 0.006165262699322873),
  ('环境', 0.00

In [27]:
# evolution of 2nd topic (index 1): one list of (word, weight) pairs per time slice
second_topic = ldaseq_10.print_topic_times(topic=1) 
# bare name as the last expression so the notebook displays the result
second_topic

[[('人口', 0.016073668780779326),
  ('人', 0.015775779112891155),
  ('生产', 0.012209643140856794),
  ('我国', 0.009669379097010917),
  ('增加', 0.00961450029840283),
  ('平均', 0.0089777256696764),
  ('增长', 0.008877390679896183),
  ('生育', 0.007732360418673885),
  ('问题', 0.007370004387919109),
  ('孩子', 0.007100062730931866),
  ('生', 0.006701694307698201),
  ('岁', 0.0066050827347741555),
  ('生活', 0.005993488863870001),
  ('每年', 0.005700316414605513),
  ('人民', 0.005619783040818375),
  ('农业', 0.005229863718435157),
  ('家庭', 0.005138966588968502),
  ('国家', 0.005048335926170881),
  ('提高', 0.004823175647374086),
  ('粮食', 0.004783176974104591)],
 [('人口', 0.01619583608188276),
  ('人', 0.015898964629489563),
  ('生产', 0.0123421851201752),
  ('增加', 0.009808554152347607),
  ('我国', 0.009745107279821951),
  ('平均', 0.009075860221834053),
  ('增长', 0.008990422533611512),
  ('问题', 0.007342687825174518),
  ('生育', 0.007294821094062583),
  ('孩子', 0.007179389833001094),
  ('生', 0.006783658134194607),
  ('岁', 0.0067168

In [29]:
# evolution of 3rd topic (index 2): one list of (word, weight) pairs per time slice
third_topic = ldaseq_10.print_topic_times(topic=2) 
# bare name as the last expression so the notebook displays the result
third_topic

[[('计划生育', 0.05918448641154572),
  ('工作', 0.05062229438730188),
  ('群众', 0.016694895917644437),
  ('宣传', 0.011287767595107702),
  ('协会', 0.010367250861932726),
  ('干部', 0.009355739589480794),
  ('领导', 0.00844706877371619),
  ('全国', 0.006884970871143747),
  ('计生', 0.006724225552581118),
  ('服务', 0.006157635672466006),
  ('新', 0.006073548488116918),
  ('教育', 0.0058565465199890724),
  ('各级', 0.005832208785566449),
  ('基层', 0.005633419035724777),
  ('加强', 0.005343538048361407),
  ('思想', 0.005300768306359272),
  ('先进', 0.005076858792814353),
  ('开展', 0.005074334209104178),
  ('抓', 0.004999062952211988),
  ('会议', 0.0048476187301332895)],
 [('计划生育', 0.06748762977178856),
  ('工作', 0.05042099155490915),
  ('群众', 0.01674023808267892),
  ('宣传', 0.011197386392414717),
  ('协会', 0.010507131040595044),
  ('干部', 0.008429326389372618),
  ('领导', 0.00837731535537986),
  ('全国', 0.006893335054272378),
  ('计生', 0.0068260810505116785),
  ('服务', 0.0062294030038314),
  ('新', 0.006091239260594674),
  ('教育', 0.0

In [31]:
# evolution of 4th topic (index 3): one list of (word, weight) pairs per time slice
fourth_topic = ldaseq_10.print_topic_times(topic=3) 
# bare name as the last expression so the notebook displays the result
fourth_topic

[[('计划生育', 0.029247642376217575),
  ('人口', 0.02410989860356437),
  ('万', 0.014056215916530152),
  ('工作', 0.011175631321288473),
  ('增长率', 0.009534194503180719),
  ('去年', 0.008379517219949825),
  ('自然', 0.008200906458903298),
  ('生育', 0.007825209571520863),
  ('农村', 0.007421219410328785),
  ('全省', 0.007398944717416805),
  ('县', 0.0073133286618306155),
  ('责任制', 0.007221847983670644),
  ('管理', 0.006977974631066138),
  ('生产', 0.006846269128873217),
  ('人', 0.006832457123776688),
  ('实行', 0.006418918617685854),
  ('计划', 0.006375030914184514),
  ('保险', 0.005972651846792435),
  ('服务', 0.005969375403918669),
  ('元', 0.005773296620855325)],
 [('计划生育', 0.028658065056668413),
  ('人口', 0.02443287563732102),
  ('万', 0.014252795231785506),
  ('工作', 0.011326963584806145),
  ('增长率', 0.008010254288361888),
  ('生育', 0.00789927232184031),
  ('管理', 0.007702599736042424),
  ('农村', 0.0075055057846061864),
  ('全省', 0.007478929058608533),
  ('县', 0.007371722421617475),
  ('去年', 0.007289780142849802),
  ('自然'

In [32]:
# evolution of 5th topic (index 4): one list of (word, weight) pairs per time slice
fifth_topic = ldaseq_10.print_topic_times(topic=4) 
# bare name as the last expression so the notebook displays the result
fifth_topic

[[('人口', 0.07495285833334345),
  ('中国', 0.022131501400502553),
  ('世界', 0.019236122564590515),
  ('增长', 0.013789291360029722),
  ('国家', 0.01352208772446121),
  ('国际', 0.013354930286330072),
  ('会议', 0.01121266835282075),
  ('问题', 0.011090215115733807),
  ('联合国', 0.01083142646536086),
  ('发展', 0.010761753613316563),
  ('计划生育', 0.010249787690368366),
  ('控制', 0.009121708389283105),
  ('亿', 0.009015940081055643),
  ('政府', 0.008982518920545452),
  ('政策', 0.00830233022445664),
  ('亚洲', 0.006886561203308084),
  ('人', 0.006465331916658413),
  ('代表', 0.0060761108520387755),
  ('美国', 0.005992822392274204),
  ('各国', 0.005829282606939825)],
 [('人口', 0.0756600818312705),
  ('中国', 0.02368671879710511),
  ('世界', 0.019465727256727475),
  ('增长', 0.013855516629844867),
  ('国际', 0.013651620826926492),
  ('国家', 0.013580482397062955),
  ('会议', 0.01125819537110394),
  ('问题', 0.011169799611089702),
  ('联合国', 0.010971859715701872),
  ('发展', 0.010877008552317177),
  ('计划生育', 0.010328905464980841),
  ('控制', 0.

In [33]:
# evolution of 6th topic (index 5): one list of (word, weight) pairs per time slice
sixth_topic = ldaseq_10.print_topic_times(topic=5) 
# bare name as the last expression so the notebook displays the result
sixth_topic

[[('人口', 0.057078284087703796),
  ('我国', 0.019915464429290607),
  ('计划生育', 0.01949966935488804),
  ('工作', 0.016919869447917582),
  ('控制', 0.014828887041762323),
  ('增长', 0.013943793145230449),
  ('发展', 0.012768206700214207),
  ('经济', 0.00959486904456213),
  ('社会', 0.008366476166126055),
  ('生育', 0.008364600430979075),
  ('问题', 0.007807055740813231),
  ('政策', 0.007364850457071684),
  ('全国', 0.007356895050228631),
  ('水平', 0.006763431567083178),
  ('提高', 0.006158008556833964),
  ('国家', 0.0053574875410632074),
  ('出生', 0.005048125593182324),
  ('实现', 0.0047940781293120465),
  ('必须', 0.004744662582671363),
  ('环境', 0.004518782113172572)],
 [('人口', 0.05757087680391414),
  ('我国', 0.020016767627034124),
  ('计划生育', 0.019687563476286234),
  ('工作', 0.017141914204055315),
  ('控制', 0.013901682968988192),
  ('发展', 0.012933989223064555),
  ('增长', 0.012732803523645374),
  ('经济', 0.009694795355930527),
  ('社会', 0.008530062673395981),
  ('生育', 0.008419681541242827),
  ('问题', 0.007698568688760564),
  ('

In [34]:
# evolution of 7th topic (index 6): one list of (word, weight) pairs per time slice
seventh_topic = ldaseq_10.print_topic_times(topic=6) 
# bare name as the last expression so the notebook displays the result
seventh_topic

[[('孩子', 0.02242742826583915),
  ('生', 0.017694485514162417),
  ('计划生育', 0.012638173152585778),
  ('人', 0.009219985291022262),
  ('夫妇', 0.007246798812393834),
  ('妇女', 0.006989022198399595),
  ('大队', 0.0068859631807402025),
  ('群众', 0.006772771290448951),
  ('工作', 0.006521110589056569),
  ('农民', 0.006502488817660084),
  ('少', 0.006416706009564076),
  ('教育', 0.006015475078985495),
  ('元', 0.005080296278781145),
  ('干部', 0.004810625948883908),
  ('生育', 0.004717343303381513),
  ('计生', 0.00463683483388115),
  ('农村', 0.004356228139837498),
  ('观念', 0.004233397187878021),
  ('富', 0.004170979468200818),
  ('胎', 0.004151081240992724)],
 [('生', 0.01788355998826645),
  ('孩子', 0.017358930636394716),
  ('计划生育', 0.01277487877052146),
  ('人', 0.009027279523380364),
  ('妇女', 0.007063439283702878),
  ('夫妇', 0.006996894517089877),
  ('群众', 0.006870271058542766),
  ('农民', 0.00664031291925394),
  ('工作', 0.006592284301635169),
  ('少', 0.006565820519918213),
  ('大队', 0.006490261587377099),
  ('教育', 0.00610

In [35]:
# evolution of 8th topic (index 7): one list of (word, weight) pairs per time slice
eighth_topic = ldaseq_10.print_topic_times(topic=7) 
# bare name as the last expression so the notebook displays the result
eighth_topic

[[('计划生育', 0.020189144606385788),
  ('人', 0.01475450963837376),
  ('工作', 0.012243392979131911),
  ('流动', 0.009603372207706937),
  ('胎', 0.009495244537985916),
  ('生育', 0.009275595916380228),
  ('超生', 0.00836032881755665),
  ('人口', 0.007354722907365415),
  ('生', 0.0063717817539568735),
  ('妇女', 0.00546636841126169),
  ('部门', 0.005433317132495005),
  ('人员', 0.0053405427348694615),
  ('干部', 0.005280392030753855),
  ('出生', 0.004855754110120875),
  ('育龄', 0.004776662012927697),
  ('调查', 0.004713354894315926),
  ('统计', 0.004648279025546136),
  ('外', 0.004518187754580191),
  ('措施', 0.004425959646211638),
  ('进行', 0.004370086239400245)],
 [('计划生育', 0.021797129865162017),
  ('人', 0.014832229522692452),
  ('工作', 0.012284286930516383),
  ('流动', 0.009766858247616781),
  ('生育', 0.009295145687429422),
  ('超生', 0.008786891992419804),
  ('胎', 0.008313719230913303),
  ('人口', 0.007433573469212904),
  ('生', 0.005754450142692591),
  ('部门', 0.005518467418935917),
  ('人员', 0.005498850662175436),
  ('妇女', 0.

In [36]:
# evolution of 9th topic (index 8): one list of (word, weight) pairs per time slice
ninth_topic = ldaseq_10.print_topic_times(topic=8) 
# bare name as the last expression so the notebook displays the result
ninth_topic

[[('节育', 0.0210096313317803),
  ('手术', 0.01949765563043628),
  ('研究', 0.01741821486194611),
  ('避孕', 0.016403730398859503),
  ('理论', 0.011243155241190206),
  ('方法', 0.008920793152771403),
  ('人', 0.0084485040778737),
  ('技术', 0.007973637062509894),
  ('认为', 0.007077250218834372),
  ('科学', 0.006306060664651073),
  ('种', 0.006076171395879671),
  ('我国', 0.005909186759693054),
  ('医院', 0.005899869497668031),
  ('进行', 0.005633000472969327),
  ('生态', 0.005143065487475419),
  ('新', 0.0051242596890007834),
  ('问题', 0.004782487209178527),
  ('结扎', 0.0043805670571146395),
  ('使用', 0.0040432758715125226),
  ('马尔萨斯', 0.0039057846632004323)],
 [('节育', 0.021139578145833275),
  ('手术', 0.019449269245982097),
  ('研究', 0.01755550018782779),
  ('避孕', 0.01629343849273596),
  ('理论', 0.011103729149172334),
  ('方法', 0.008898591233677044),
  ('人', 0.008459043004660329),
  ('技术', 0.008147819356466138),
  ('认为', 0.007114477256655199),
  ('科学', 0.00629973549795842),
  ('种', 0.006238385454836567),
  ('我国', 0.0059

In [37]:
# evolution of 10th topic (index 9): one list of (word, weight) pairs per time slice
tenth_topic = ldaseq_10.print_topic_times(topic=9) 
# bare name as the last expression so the notebook displays the result
tenth_topic

[[('计划生育', 0.03750753913920411),
  ('人口', 0.024640235993731385),
  ('国家', 0.01179758967409812),
  ('服务', 0.01067424566506121),
  ('技术', 0.010648558339324491),
  ('中国', 0.008595722982708397),
  ('全国', 0.00803296315034466),
  ('健康', 0.007732036108345835),
  ('我国', 0.007409228543143996),
  ('规定', 0.007312019970134711),
  ('社会', 0.007144589959104045),
  ('教育', 0.006825426873211588),
  ('文化', 0.006498423640634047),
  ('法律', 0.00648766846124253),
  ('工作', 0.006412479927016965),
  ('北京', 0.006235832735332146),
  ('研究', 0.006186246548629616),
  ('科技', 0.005785201416711324),
  ('妇女', 0.005529619515779615),
  ('生育', 0.005144208702711765)],
 [('计划生育', 0.03786986431767568),
  ('人口', 0.024871586881893348),
  ('国家', 0.011831121358000326),
  ('服务', 0.010915346728120112),
  ('技术', 0.010835405915971758),
  ('中国', 0.008707907115851655),
  ('全国', 0.008077891937450541),
  ('健康', 0.007706057094564323),
  ('规定', 0.0073721966884492925),
  ('我国', 0.0072548829360546425),
  ('社会', 0.007224400441307131),
  ('教育'

In [None]:
# fit the 15-topic model and print the elapsed wall-clock seconds
# The result is kept under its own name (ldaseq_15) so that the 10-topic model
# held in ldaseq_10 is not overwritten by a model with a different topic count.
start = time.time()
ldaseq_15 = ldaseqmodel.LdaSeqModel(corpus=corpus_all, id2word=dic_all, time_slice=time_slice, num_topics=15)
end = time.time()
print(end - start)