In [1]:
# import packages
import pandas as pd
import os
import csv
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.models import ldaseqmodel
import time
from gensim.models.wrappers import DtmModel
import pickle
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [2]:
# load the raw articles; the CSV has no header row, so the columns are named by hand
all_data = pd.read_csv('./all_data.csv', header=None)
all_data.columns = ["year", "title", "article"]
# order the articles by year
all_data = all_data.sort_values(by='year')
# relabel the sorted rows 0..n-1 under an index named "new_index"
new_index = np.arange(len(all_data))
all_data = all_data.assign(new_index=new_index).set_index('new_index')

In [3]:
# construct the stop word list: one entry per line of the file, with
# surrounding whitespace stripped.
# The context manager closes the file handle as soon as the list is built
# instead of leaving that to the garbage collector.
with open("stop_words_copy2.txt", encoding='UTF-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

In [4]:
# construct the function that preprocesses the text
def preprocess(text):
    '''
    Tokenize a text into uni-grams and drop its stop words.

    The text is split with gensim.utils.tokenize (per the gensim
    documentation its tokens are runs of alphabetic characters, so numbers
    and punctuation do not survive tokenization). Every token found in the
    module-level ``stopwords`` collection is discarded; the remaining
    tokens are returned in their original order.

    input:
        text: text for preprocessing (str)
    output: a list of words
    '''
    # A membership test against a list scans the whole list for every token;
    # building a set once per call makes each lookup constant-time while
    # producing exactly the same result.
    stopword_set = frozenset(stopwords)
    return [token for token in gensim.utils.tokenize(text)
            if token not in stopword_set]

In [5]:
# run preprocess over every article; the result is a Series holding one
# token list per article
processed_df = all_data['article'].apply(preprocess)

In [6]:
# prepare the inputs for the DTM model
# build a single gensim Dictionary over the token lists of all articles
dic_all = corpora.Dictionary(processed_df)

# build the corpus: the bag-of-words form of each article, in article order
corpus_all = list(map(dic_all.doc2bow, processed_df))

In [7]:
# set the time slice: the number of consecutive documents (rows of the
# year-sorted all_data) assigned to each period.
# Row ranges recorded by inspecting the rows of each boundary year, e.g.
# all_data[all_data['year'] == 1978]:
#   before 60s  (boundary year 1958): rows    0-35
#   before 1979 (boundary year 1978): rows   36-70
#   before 1990 (boundary year 1989): rows   71-876
#   before 2003 (boundary year 2003): rows  877-1811
# The first two ranges are merged into one slice: 71 + 806 + 935 documents.
time_slice = [71, 806, 935]
# Fail fast, before the long model fit, if the hard-coded counts no longer
# add up to the number of documents in the data.
assert sum(time_slice) == len(all_data), (
    "time_slice covers %d documents but all_data has %d rows"
    % (sum(time_slice), len(all_data))
)

In [8]:
# fit the 15-topic model and print the elapsed training time in seconds
start = time.time()
ldaseq_15 = ldaseqmodel.LdaSeqModel(
    corpus=corpus_all,
    id2word=dic_all,
    time_slice=time_slice,
    num_topics=15,
    chain_variance=0.13,
    random_state=100,
)
end = time.time()
print(end - start)

  convergence = np.fabs((bound - old_bound) / old_bound)


19464.67514872551


In [9]:
# save model
# The context manager flushes and closes the file before it is read back,
# instead of relying on garbage collection of an anonymous file object.
with open("ldaseq_model_15.sav", 'wb') as model_file:
    pickle.dump(ldaseq_15, model_file)
# check: reload the model from the file that was just written
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("ldaseq_model_15.sav", 'rb') as model_file:
    loaded_model_15 = pickle.load(model_file)

In [12]:
# calculate the u_mass coherence of the fitted model's topics at time slice 0
topics_dtm_15 = ldaseq_15.dtm_coherence(time=0)
cm_DTM_15 = CoherenceModel(
    topics=topics_dtm_15,
    corpus=corpus_all,
    dictionary=dic_all,
    coherence='u_mass',
)
print("U_mass topic coherence")
print("DTM Python coherence is", cm_DTM_15.get_coherence())

U_mass topic coherence
DTM Python coherence is -1.4068604518005443


In [14]:
# calculate the u_mass coherence of the reloaded model's topics at time slice 2
topics_dtm_15_test = loaded_model_15.dtm_coherence(time=2)
cm_DTM_15_test = CoherenceModel(
    topics=topics_dtm_15_test,
    corpus=corpus_all,
    dictionary=dic_all,
    coherence='u_mass',
)
print("U_mass topic coherence")
print("DTM Python coherence is", cm_DTM_15_test.get_coherence())

U_mass topic coherence
DTM Python coherence is -1.283978424278397
