In [1]:
import pandas as pd
import nltk # also install stop words corpus
import gensim as gs # also install pattern
import pyLDAvis.gensim
#import numpy as np
#import enchant

In [2]:
# Load strata_scheduled.csv into a DataFrame, decoding the file as UTF-8.
raw = pd.read_csv('strata_scheduled.csv',encoding='utf-8') 
# NOTE (from the original author): record[10] contains a "`s" that causes trouble
# unless the text is handled as UTF-8 -- the exact failure is not shown here.

In [3]:
# Peek at the first row to check which columns were loaded.
raw.head(1)

Unnamed: 0,id,event_id,proposer_id,name,description,abstract,scheduled
0,16242,55,2657,Google Cloud for Data Crunchers,"Many of the tools Google created to store, que...",Google is a Data business: over the past few y...,1


In [4]:
# Inspect the row index of the loaded frame (its range gives the row count).
raw.index

RangeIndex(start=0, stop=3522, step=1)

In [12]:
class preproc(object):
    """
    Text-cleaning and LDA topic-modelling pipeline over DataFrame text columns.

    Intended call order:
        token_normal() -> create_bow() -> create_model(...) -> visualize()

    Each step stores its result on the instance: ``self.df`` gains one new
    column per processed column, and the modelling steps set
    ``self.dictionary``, ``self.corpus``, ``self.lda_model``,
    ``self.lda_corpus`` and ``self.data_vis``.
    """
    # TODO: add a check that the dataframe has more than one row.

    # Suffixes appended to a column name by the tokenizing and BOW stages.
    TOKEN_SUFFIX = '_tok_norm'
    BOW_SUFFIX = '_bow'

    def __init__(self,dataframe,*col_names):
        """
        Store the data and load the references used for cleaning.

        dataframe -- pandas DataFrame holding the text columns. It is not
                     modified in place; derived columns are added to new
                     frames that replace ``self.df``.
        col_names -- names of the text columns to process.
        """
        self.df = dataframe
        self.col_names = col_names
        # Tokenizer pattern: splits "n't" contractions ("don't" -> "do", "n't"),
        # emits apostrophe suffixes ("it's" -> "it", "'s") as their own tokens,
        # and keeps a bare hyphen as a token.
        self.tzer = nltk.RegexpTokenizer(r"\w+(?=n't)|n't|\w+(?=')|'\w+|\w+|-")
        self.stopEng = set(nltk.corpus.stopwords.words('english'))
        # Optional references, deliberately left disabled:
        #self.dict_en = enchant.Dict('en_US') # Only use if want words and non-words
        #self.p_stemmer = nltk.PorterStemmer()

    def _normalize(self, text):
        # Lowercase, tokenize and drop stop words for one cell; returns a list.
        if pd.isnull(text):
            # Missing text (NaN/None) has no tokens; avoids calling .lower() on a float.
            return []
        tokens = self.tzer.tokenize(text.lower())
        # A list comprehension (not filter()) so the result is a real, reusable
        # list on Python 3 as well as Python 2.
        return [word for word in tokens if word not in self.stopEng]

    def _set_column(self, name, values):
        # Attach (or overwrite) column `name` on a new frame stored in self.df.
        # dtype=object is explicit because every cell holds a Python list.
        column = pd.Series(values, index=self.df.index, name=name, dtype=object)
        # assign() returns a new DataFrame and replaces an existing column of the
        # same name, so re-running a step does not fail on overlapping columns.
        self.df = self.df.assign(**{name: column})

    def token_normal(self):
        """
        Tokenize and normalize every column named in ``self.col_names``.

        Each text is lowercased, tokenized with the nltk RegexpTokenizer and
        stripped of English stop words. The token lists are stored in
        ``self.df`` under ``<col>_tok_norm``. Missing values produce an empty
        token list.

        The data was clean enough that splitting into dictionary words and
        non-words (PyEnchant) and stemming were intentionally left out.
        """
        for col in self.col_names:
            token_lists = [self._normalize(text) for text in self.df[col]]
            self._set_column(col + self.TOKEN_SUFFIX, token_lists)

    def create_bow(self):
        """
        Create a bag of words for each document, updating the dictionary as needed.

        Requires ``token_normal()`` to have been run. For every
        ``<col>_tok_norm`` column a ``<col>_tok_norm_bow`` column is stored in
        ``self.df``; each cell is the list of ``(token_id, count)`` pairs
        returned by ``Dictionary.doc2bow``. The dictionary is kept in
        ``self.dictionary``.
        """
        # A list (not map()) so the names can be iterated safely on Python 3.
        columns = [original + self.TOKEN_SUFFIX for original in self.col_names]

        # Load a previously saved dictionary if one exists, else start fresh.
        try:
            self.dictionary = gs.corpora.Dictionary.load('dictionary.dic')
        except IOError:
            # Start empty instead of pre-seeding from the first column: every
            # document is registered below via doc2bow(allow_update=True), so
            # pre-seeding would pass the first column through the dictionary
            # twice and inflate its document statistics.
            self.dictionary = gs.corpora.Dictionary()

        for col in columns:
            bows = [self.dictionary.doc2bow(tokens, allow_update=True)
                    for tokens in self.df[col]]
            self._set_column(col + self.BOW_SUFFIX, bows)

    def create_model(self, n_topics, column_name, random_state=None):
        '''
        Fit an LDA model with the specified number of topics.

        n_topics     -- number of topics, as an integer.
        column_name  -- name of the bag-of-words column in ``self.df`` to
                        train on (e.g. 'abstract_tok_norm_bow').
        random_state -- optional seed forwarded to gensim's LdaModel so a fit
                        can be reproduced; None keeps gensim's default.

        Sets ``self.n_topics``, ``self.corpus``, ``self.lda_model`` and
        ``self.lda_corpus``; the corpus and model are needed by ``visualize()``.
        '''
        self.n_topics = n_topics
        self.corpus = list(self.df[column_name])
        self.lda_model = gs.models.ldamodel.LdaModel(corpus=self.corpus, id2word=self.dictionary,
                                                     num_topics=self.n_topics, update_every=1,
                                                     chunksize=100, passes=10,
                                                     random_state=random_state)
        self.lda_corpus = self.lda_model[self.corpus]

        # Optional persistence / similarity index, left disabled:
        #self.index = gs.similarities.MatrixSimilarity(self.lda_corpus, num_features = n_topics)
        #self.lda_model.save('lda_model_'+str(n_topics)+'.model')
        #self.index.save('simIndex_'+str(n_topics)+'.index')
        #gs.corpora.mmcorpus.MmCorpus.serialize('corpus.mm',self.corpus)

    def visualize(self):
        """
        Prepare the pyLDAvis visualization and write it to an HTML file.

        Requires ``create_model()`` to have been run (reads ``self.lda_model``,
        ``self.corpus``, ``self.dictionary`` and ``self.n_topics``). The
        prepared data is kept in ``self.data_vis`` and the page is written to
        ``pyLDAvis_<n_topics>.html`` in the working directory.
        """
        self.data_vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.dictionary)
        html = pyLDAvis.prepared_data_to_html(self.data_vis)
        # Context manager closes the file even if the write fails.
        with open('pyLDAvis_'+str(self.n_topics)+'.html','w') as out_file:
            out_file.write(html)

In [13]:
# Wrap the raw frame, selecting the 'description' and 'abstract' text columns,
# then tokenize/normalize them (adds '<col>_tok_norm' columns to preprocessed.df).
preprocessed = preproc(raw,'description','abstract')
preprocessed.token_normal()

In [14]:
# Build a bag-of-words column ('<col>_tok_norm_bow') for each tokenized column.
preprocessed.create_bow()

In [15]:
# Fit a 20-topic LDA model on the bags of words built from the abstracts.
preprocessed.create_model(20,'abstract_tok_norm_bow')

In [16]:
# Prepare the pyLDAvis data and write it to 'pyLDAvis_<n_topics>.html'.
preprocessed.visualize()

In [17]:
# Display the prepared visualization stored by visualize().
pyLDAvis.display(preprocessed.data_vis)
# Alternatives left disabled by the author:
#pyLDAvis.enable_notebook(data_vis)
#pyLDAvis.show(data_vis)

### Why is dtype=object needed when writing a list of tuples or integers to a pd.Series, but not when writing a list of strings?

In [None]:
def create_bow(df, column='description_tok_norm'):
    """
    Create a bag of words for each document, updating the dictionary as needed.

    df     -- DataFrame whose `column` holds one list of tokens per row.
    column -- name of the tokenized column to convert; defaults to
              'description_tok_norm'.

    Returns a pandas Series named '<column>_bow' that shares df's index. Each
    value is the list of (token_id, count) pairs returned by
    Dictionary.doc2bow. The dictionary itself is local and is not returned.
    """
    # Load a previously saved dictionary if one exists, else start fresh.
    try:
        dictionary = gs.corpora.Dictionary.load('dictionary.dic')
    except IOError:
        # Start empty: every document is registered below through
        # doc2bow(allow_update=True), so pre-seeding the dictionary with the
        # same documents would count them twice.
        dictionary = gs.corpora.Dictionary()

    bows = [dictionary.doc2bow(tokens, allow_update=True) for tokens in df[column]]

    # Build the Series in one step from the list, aligned to df.index.
    # Assigning with new[i] = bow treats the loop counter as an index *label*,
    # which is only correct for a default 0..n-1 index.
    #
    # Notes from the original experiments with element-wise assignment into a
    # Series created without dtype=object:
    #   new[new.index[i]] = ('hello','hi')  # worked, as did ['hello','hi']
    #   new[new.index[i]] = [1]             # did not work
    #   new[new.index[i]] = (1,2)           # did not work
    # Building a one-column DataFrame first and taking the column as a Series
    # also worked.
    # NOTE(review): presumably the Series defaulted to a numeric dtype, so
    # numeric sequences were not stored as single objects -- confirm against
    # the pandas version in use. Passing dtype=object avoids the ambiguity.
    return pd.Series(bows, index=df.index, name=column + '_bow', dtype=object)