In [38]:
# import packages
import pandas as pd
import os
import csv
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.models import ldaseqmodel
import time
from gensim.models.wrappers import DtmModel
import pickle
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# read in csv file
all_data = pd.read_csv('./all_data.csv', header=None)
# add columns
all_data.columns = ["year", "title", "article"]
# sort by year
all_data = all_data.sort_values(by=['year'])
# change index
new_index = np.arange(0, len(all_data))
all_data["new_index"] = new_index
all_data = all_data.set_index('new_index')

In [61]:
# construct the stop word list
stopwords = [line.strip() for line in open("stop_words_copy.txt",encoding='UTF-8').readlines()]

In [62]:
# construct the function that preprocess the text
def preprocess(text):
    '''
    Preprocess the text by tokenizing the string into uni-grams, deleting all 
    numbers, punctuations and stop words. Store the preprocessed sting into 
    a list of words
    
    input: 
        text: text for preprocessing(str)
    output: a list of words
    '''
    result = []
    tokens = gensim.utils.tokenize(text)
    tokens_lst = list(tokens)
    for token in tokens_lst:
        if token not in stopwords:
            result.append(token)
            
    return result

In [63]:
# preprocess the data
processed_df = all_data['article'].map(preprocess)

In [64]:
# prepare Document-Term Matrix for the DTA model 
# Create Dictionaries for unique word counts of each decade
dic_all = corpora.Dictionary(processed_df)

# Create Corpus: Term Document Frequency
corpus_all = [dic_all.doc2bow(text) for text in processed_df]

In [65]:
# set the time slice
all_data[all_data['year']==1958]
# before 60s: 0-35
all_data[all_data['year']==1978]
# before 1979: 36-70
all_data[all_data['year']==1989]
# before 1990: 71-876
all_data[all_data['year']==2003]
# before 2003: 877-1811
time_slice = [71, 806, 935]

In [77]:
path_to_dtm_binary = "./dtm-linux64.dms"

In [78]:
# Construct the function that builds the lda model with 10 topics
def dtm_model(corpus, dictionary, num_topics, time_slice):
    '''
    Build a lda model with 10 topics using the corpus and dictionary
    imput:
        corpus: the Term Document Frequency corpus used to build the mode
        dictionary: a dictionary containing word counts of each unique word
                    in the dataframe for each decade
    output: a lda model object
    '''
    dtm_model = DtmModel(path_to_dtm_binary, corpus=corpus, time_slices=time_slice, \
                         num_topics=num_topics,id2word=dictionary, rng_seed=100)
    return dtm_model

In [79]:
# fit the model
start = time.time()
dtm_10 = dtm_model(corpus_all, dic_all, 10, time_slice)
end = time.time()
print(end - start)

PermissionError: [Errno 13] Permission denied: './dtm-linux64.dms'

In [35]:
dtm_5 = pickle.load(open("ldaseq_model.sav", 'rb'))

In [70]:
topics_dtm_5 = dtm_5.dtm_coherence(time=1)
cm_DTM = CoherenceModel(topics=topics_dtm_5, corpus=corpus_all, dictionary=dic_all, coherence='u_mass')
print ("U_mass topic coherence")
print ("DTM Python coherence is", cm_DTM.get_coherence())

U_mass topic coherence
DTM Python coherence is -1.0184342296330713


In [None]:
# fit the 10-topic model
start = time.time()
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus_all, id2word=dic_all, time_slice=time_slice, num_topics=10)
end = time.time()
print(end - start)