In [None]:
import codecs, nltk, string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import gensim

# Shared resources used by nlp_pipeline, built once when this cell runs.

# words that nlp_pipeline filters out as stopwords
stop_word_list = stopwords.words('english')

# the individual characters of string.punctuation, as a set
exclude = set(string.punctuation)

# lemmatizer instance reused for every token
wordnet_lemmatizer = WordNetLemmatizer()

def nlp_pipeline(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. word-tokenize the string with nltk.word_tokenize
      2. part-of-speech tag the tokens with nltk.pos_tag
      3. lowercase and lemmatize each token; tokens whose tag contains "V"
         are lemmatized with pos "v", all others with the lemmatizer's
         default part of speech
      4. drop tokens that are in ``exclude`` or are not purely alphabetic
         (this removes punctuation and numbers)
      5. drop tokens found in ``stop_word_list`` - be careful with this step

    Sentence splitting (nltk.sent_tokenize) is deliberately not done here;
    the whole string is tokenized in one go.

    Args:
        text: the input, which should be a string.

    Returns:
        A list with the surviving lemmas, in their original order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for word, tag in tagged_tokens:
        lowered = word.lower()

        # verbs need an explicit pos hint, everything else uses the default
        if "V" in tag:
            lemma = wordnet_lemmatizer.lemmatize(lowered, "v")
        else:
            lemma = wordnet_lemmatizer.lemmatize(lowered)

        # punctuation and anything containing non-letters (e.g. numbers)
        if lemma in exclude or not lemma.isalpha():
            continue

        # stopwords are checked on the lemma, not on the surface form
        if lemma in stop_word_list:
            continue

        lemmas.append(lemma)

    return lemmas

In [None]:
#topic models, finally!

import codecs, os

# Directory that holds the manifesto files; only names ending in ".txt" are read.
MANIFESTO_DIR = "../datasets/US-Manifestos/"

# One entry per quasi-sentence: the token list returned by nlp_pipeline.
manifesto_quasi_sents = []

# sorted() pins the document order. os.listdir returns names in arbitrary order,
# which would make the corpus order (and everything trained on it) vary by machine.
for filename in sorted(os.listdir(MANIFESTO_DIR)):
    # endswith instead of a substring test, so names like "x.txt.bak" are skipped
    if not filename.endswith(".txt"):
        continue

    # the context manager closes the file handle as soon as the content is read
    # NOTE(review): no encoding is passed, so the platform default is used -
    # confirm the dataset's encoding and set it explicitly
    with open(os.path.join(MANIFESTO_DIR, filename), "r") as manifesto_file:
        doc = manifesto_file.read().strip().split("\n")

    # the first two lines of every file are skipped
    # (presumably header/metadata - verify against the dataset)
    for line in doc[2:]:
        # only the first tab-separated field of the line is used as text
        sent = line.split("\t")[0]

        # for LDA we need tokens not embeddings!
        # be careful, each text-processing step you'll do will influence the analysis
        sent = nlp_pipeline(sent)

        # keep only quasi-sentences with at least two tokens left after preprocessing
        if len(sent) > 1:
            manifesto_quasi_sents.append(sent)

print("ready!")

In [None]:
# sanity check: show the token list of the second collected quasi-sentence
print(manifesto_quasi_sents[1])

In [None]:
from gensim import corpora, models

# gensim's LDA works on a dictionary built from every tokenized quasi-sentence
dictionary = corpora.Dictionary(manifesto_quasi_sents)

# ... and on the per-document word counts (bag-of-words), one entry per quasi-sentence
X = list(map(dictionary.doc2bow, manifesto_quasi_sents))

print("ready!")

In [None]:
# Model settings, named so they are easy to find and tune.
NUM_TOPICS = 10
LDA_ITERATIONS = 500
# LDA training is randomized; without a fixed seed the topics change on every run.
RANDOM_SEED = 42

# Train the topic model on the bag-of-words corpus X, using `dictionary`
# to map word ids back to the actual words.
ldamodel = models.ldamodel.LdaModel(
    X,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    iterations=LDA_ITERATIONS,
    random_state=RANDOM_SEED,
)
print("done!")

In [None]:
# show the 7 highest-weighted words of each topic, one topic per line
topic_summaries = ldamodel.print_topics(num_words=7)
for summary in topic_summaries:
    print(summary)