# 200_Model_Creation

In [8]:
# libraries must be installed before importing 
import pickle
import time

import numpy as np
import pandas as pd

import gensim
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from spacy.lang.en import English

In [9]:
## download the nltk wordnet corpus and build the spaCy English tokenizer
nltk.download('wordnet')
# English() is a blank pipeline (tokenizer only) and needs no downloaded model.
# The previous `spacy.load('en')` call loaded a full model only to discard the
# result, and the 'en' shortcut name is rejected by spaCy v3, so it is omitted.
parser = English()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/donalmallon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 201: Methods to prepare/preprocess text for lda

In [10]:
def tokenize(text):
    """Split `text` into lower-cased token strings using the global `parser`.

    Tokens whose original text consists only of whitespace are skipped.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list with the lower-cased form of every non-whitespace token, in
        document order.
    """
    return [tok.lower_ for tok in parser(text) if not tok.orth_.isspace()]

In [11]:
def get_lemma(word):
    """Return the WordNet lemma of `word`.

    A fresh `WordNetLemmatizer` is created on every call and its
    `lemmatize` result is returned unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [12]:
## fetch the nltk stopword corpus and collect the English stopwords in a set
nltk.download('stopwords')
en_stop = set()
en_stop.update(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/donalmallon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def prepare_text_for_lda(text):
    """Turn raw `text` into the list of lemmas used as one LDA document.

    The text is tokenized with `tokenize`; tokens of four characters or fewer
    and tokens present in `en_stop` are discarded; each surviving token is
    passed through `get_lemma`.

    Args:
        text: the raw string to preprocess.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        # length is checked on the un-lemmatized token, before the stopword test
        if len(tok) > 4 and tok not in en_stop
    ]

## 202: Load in petitions info and other transcripts 



In [14]:
# Read the two pickled inputs from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
petitions_info = pd.read_pickle("petitions_info.pickle")
other_tscripts = pd.read_pickle("other_transcripts.pickle")

In [15]:
# keep only the petitions that have received more than 20 signatures
mask = petitions_info['Signatures Count'].gt(20)
# renumber the surviving rows 0..n-1 and discard the old index
petitions_info = petitions_info[mask].reset_index(drop=True)
petitions_info.shape

(20961, 7)

## 203: Create a universal dictionary from the text used by both models

In [None]:
## Preprocess petitions text data for LDA
#//2 mins//#
petitions_txt = []
for x, (information, more_info) in enumerate(
        zip(petitions_info.information, petitions_info.more_info)):
    # progress indicator: print the row number every 10,000 petitions
    if x % 10000 == 0:
        print(x)
    # Join the two fields with a space so the last word of `information` and
    # the first word of `more_info` are not fused into a single bogus token.
    petitions_txt.append(prepare_text_for_lda(str(information) + " " + str(more_info)))


In [None]:
## Preprocess other transcripts text data for LDA
#//~3 hours//#

tscript_txt = []

# Iterate over the column values by position rather than looking each row up
# with `tscript[x]`: that label-based lookup only works when the unpickled
# frame happens to carry a default 0..n-1 index.
for x, tscript in enumerate(other_tscripts.tscript):
    # progress indicator: print the row number every 1,000 transcripts
    if x % 1000 == 0:
        print(x)
    tscript_txt.append(prepare_text_for_lda(str(tscript)))


In [18]:
# Build one token<->id mapping over the petitions and transcript documents
# together, then persist it to disk.
dictionary = corpora.Dictionary(documents=petitions_txt + tscript_txt)
dictionary.save('dictionary.gensim')

## 204: Create LDA model for petitions data

In [21]:
## create corpus based on universal dictionary and save
petitions_corpus = [dictionary.doc2bow(text) for text in petitions_txt]
# `with` guarantees the file is flushed and closed even if pickling fails
with open('petitions_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(petitions_corpus, corpus_file)

In [22]:
numTopics = 20
# perf_counter() is a monotonic clock, so the elapsed value cannot be skewed by
# system clock adjustments the way time.time() differences can
t = time.perf_counter() ##time process
petitions_ldamodel = LdaMulticore(petitions_corpus, num_topics=numTopics, id2word=dictionary)
petitions_ldamodel.save('petitions_model.gensim')
# seconds spent training and saving the model
elapsed = time.perf_counter() - t
elapsed

29.714300870895386

In [23]:
## view topics: one (topic id, top-5 weighted words) tuple per line
topics = petitions_ldamodel.print_topics(num_words=5)
print(*topics, sep='\n')

(0, '0.008*"people" + 0.007*"government" + 0.006*"service" + 0.006*"country" + 0.005*"british"')
(1, '0.007*"government" + 0.007*"would" + 0.005*"people" + 0.005*"petition" + 0.004*"crime"')
(2, '0.008*"police" + 0.008*"people" + 0.008*"driver" + 0.007*"government" + 0.004*"pension"')
(3, '0.022*"school" + 0.011*"would" + 0.010*"child" + 0.008*"petition" + 0.006*"education"')
(4, '0.005*"government" + 0.004*"people" + 0.004*"child" + 0.004*"petition" + 0.003*"would"')
(5, '0.018*"child" + 0.018*"people" + 0.009*"parent" + 0.007*"government" + 0.007*"would"')
(6, '0.011*"people" + 0.008*"government" + 0.008*"animal" + 0.007*"child" + 0.006*"health"')
(7, '0.009*"people" + 0.007*"government" + 0.007*"health" + 0.006*"year" + 0.006*"patient"')
(8, '0.027*"child" + 0.013*"government" + 0.007*"parent" + 0.005*"people" + 0.005*"petition"')
(9, '0.011*"government" + 0.008*"vehicle" + 0.005*"child" + 0.004*"would" + 0.004*"public"')
(10, '0.005*"people" + 0.005*"petition" + 0.004*"school" + 0.

In [24]:
## topic distribution for the document at index 89 (i.e. the 90th document)
doc_bow = petitions_corpus[89]
petitions_ldamodel[doc_bow]

[(0, 0.453937), (16, 0.5032059)]

## 205: Create LDA model for debates data 

In [25]:
## create corpus for lda based on universal dictionary and save
tscript_corpus = [dictionary.doc2bow(text) for text in tscript_txt]
# `with` guarantees the file is flushed and closed even if pickling fails
with open('tscript_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(tscript_corpus, corpus_file)

In [26]:
## train the transcripts LDA model on the shared dictionary, then save it
# 25 minutes
numTopics = 20
tscript_ldamodel = LdaMulticore(
    tscript_corpus,
    num_topics=numTopics,
    id2word=dictionary,
    passes=10,
)
tscript_ldamodel.save('tscript_model.gensim')

1512.2018258571625

In [28]:
## view topics: one (topic id, top-5 weighted words) tuple per line

topics = tscript_ldamodel.print_topics(num_words=5)
print(*topics, sep='\n')

(0, '0.012*"right" + 0.012*"court" + 0.012*"justice" + 0.011*"prison" + 0.009*"government"')
(1, '0.023*"business" + 0.020*"government" + 0.009*"people" + 0.008*"economy" + 0.008*"friend"')
(2, '0.027*"ireland" + 0.025*"northern" + 0.013*"minister" + 0.012*"student" + 0.011*"university"')
(3, '0.012*"government" + 0.011*"member" + 0.009*"local" + 0.009*"north" + 0.008*"wale"')
(4, '0.020*"health" + 0.016*"service" + 0.012*"hospital" + 0.012*"patient" + 0.009*"people"')
(5, '0.013*"right" + 0.010*"country" + 0.010*"people" + 0.010*"government" + 0.008*"member"')
(6, '0.029*"people" + 0.018*"government" + 0.011*"benefit" + 0.010*"credit" + 0.010*"minister"')
(7, '0.022*"child" + 0.016*"woman" + 0.012*"member" + 0.008*"people" + 0.008*"issue"')
(8, '0.020*"government" + 0.015*"pension" + 0.012*"would" + 0.011*"budget" + 0.010*"people"')
(9, '0.019*"european" + 0.014*"union" + 0.013*"government" + 0.012*"right" + 0.012*"minister"')
(10, '0.016*"defence" + 0.015*"force" + 0.010*"armed" + 0.