In [132]:
import gensim
import gensim.models.ldamodel as LDA
from gensim import corpora
from gensim.models import CoherenceModel

import pickle 
import json, time, os, string

# # Run these the first time you do this
# nltk.download('stopwords')
# nltk.download('punkt')

from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer
stop = set(stopwords.words('english'))

import spacy
nlp = spacy.load('en_core_web_sm')

import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt

### Load in list of text blocks

In [133]:
filehandler = open('Brexit_text_list', 'rb') 
hit_text = pickle.load(filehandler)

### Performing preprocessing 

In [47]:
#Preprocessing functions for entire text blocks

def LemmatizerBlock(text):
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    tokens = [wnl.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def FilterPunc(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    return ' '.join(tokens)

def stopwordRemovalBlock(text, stop_words):
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if not w in stop_words]
    return ' '.join(tokens)

def clean(text, stop_words):
#     text = text.lower()
    no_stop = stopwordRemovalBlock(text, stop_words)
    no_punc = FilterPunc(no_stop)
    return no_punc

# Gensim prefers BOW corpora to be a list of lists of words for each document 
def gensimPrep(list_of_texts):
    tokenized = [word_tokenize(text) for text in list_of_texts]
    return tokenized    

In [71]:
# Lowercasing and removing punctuation and converting to gensim's desired format
lowered = [hit.lower() for hit in hit_text]
no_punc = [FilterPunc(low) for low in lowered]
to_gensim = gensimPrep(no_punc)

In [72]:
# Setting up bi/trigram models
bigram = gensim.models.Phrases(to_gensim, min_count=5, threshold=100) 
trigram = gensim.models.Phrases(bigram[to_gensim], threshold=100)  

# Phrase the sentence to build bi/trigram models 
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [73]:
print(trigram_mod[bigram_mod[to_gensim[0]]])

['in', 'the', 'first', 'of', 'two', 'extracts', 'from', 'their', 'new', 'book', 'liam', 'halligan', 'and', 'gerard', 'lyons', 'say', 'the', 'commonly', 'held', 'belief', 'that', 'britain', 'would', 'be', 'better', 'off', 'inside', 'the', 'single_market', 'and', 'customs_union', 'is', 'misconceived', 'there', 'has', 'been', 'much', 'talk', 'of', 'hard', 'brexit', 'versus', 'soft', 'brexit', 'such', 'labels', 'are', 'ubiquitous', 'during', 'these', 'article', 'negotiations', 'used', 'freely', 'by', 'the', 'broadcast', 'media', 'yet', 'they', 'are', 'partisan', 'and', 'deeply', 'misleading', 'hard', 'brexit', 'makes', 'leaving', 'the', 'european_union', 'sound', 'extreme', 'and', 'damaging', 'suggesting', 'and', 'a', 'bleak', 'economic', 'future', 'soft', 'brexit', 'conversely', 'conveys', 'a', 'comfortable', 'ongoing', 'relationship', 'with', 'the', 'eu', 'with', 'britain', 'still', 'part', 'of', 'the', 'club', 'leaving', 'the', 'single_market', 'and', 'the', 'customs_union', 'isn_t', 'h

In [60]:
# Running the bigram model on the text
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# Running the trigram model on the bigram model (Gensim works through this nested pattern)
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Removing stopwords from the nested lists using nltk defined stopwords
def stopwordRemovalList(text, stop_words):
    out_words = [w for w in text if not w in stop_words]
    return out_words

# Use spacy lemmatization and only keep the select parts of speech, runs somewhat slowly
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [74]:
# Make trigrams and bigrams in the text, remove stopwords, and lemmatize
trigrams = make_trigrams(to_gensim)
stop_free = [stopwordRemovalList(text, stop) for text in trigrams]
lemmatized_text = lemmatization(stop_free)

### Generating topic models with Gensim

In [94]:
# First build the corpus in Gensim's required format
id2word = corpora.Dictionary(final_output)   # Initialize a dictionary for indexing all unique words
corpus = [id2word.doc2bow(text) for text in lemmatized_text] # Generate a list including all unique words anf their frequency

In [109]:
# Generate the topic model
lda_model = LDA.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=5, 
                                       random_state=0,
                                       update_every=1,
                                       chunksize=100,
                                       passes=10,
                                       alpha='auto',
                                       per_word_topics=True)

In [136]:
# Print the topics and the top 10 words in each 
lda_model.print_topics()

[(0,
  '0.014*"say" + 0.011*"year" + 0.009*"business" + 0.008*"company" + 0.007*"uk" + 0.007*"bank" + 0.007*"datum" + 0.006*"new" + 0.006*"market" + 0.005*"also"'),
 (1,
  '0.010*"people" + 0.007*"say" + 0.007*"go" + 0.007*"get" + 0.007*"make" + 0.007*"think" + 0.006*"time" + 0.005*"would" + 0.005*"take" + 0.004*"thing"'),
 (2,
  '0.009*"trump" + 0.007*"say" + 0.005*"country" + 0.005*"world" + 0.004*"include" + 0.004*"american" + 0.004*"europe" + 0.004*"president" + 0.003*"refugee" + 0.003*"germany"'),
 (3,
  '0.021*"say" + 0.017*"government" + 0.015*"would" + 0.010*"britain" + 0.009*"brexit" + 0.009*"uk" + 0.007*"deal" + 0.007*"make" + 0.007*"may" + 0.006*"vote"'),
 (4,
  '0.055*"party" + 0.037*"labour" + 0.020*"tory" + 0.016*"election" + 0.015*"say" + 0.014*"conference" + 0.014*"conservative" + 0.013*"leader" + 0.013*"may" + 0.012*"johnson"')]

In [137]:
# Evaluating the topic models with a coherence score (higher the better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized_text, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.41588594612823726


### Visualizing the Topics

In [None]:
# Prepare the topic models for the visualizer
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

# Loads the visualization in another webpage 
pyLDAvis.show(vis)  

In [120]:
# Saving this because it takes a while to generate
import pickle 
filehandler = open('LDAvis5Topic', 'wb') 
pickle.dump(vis, filehandler)

In [125]:
# Load this in instead of running the .prepare block
filehandler = open('LDAvis5Topic', 'rb') 
vis = pickle.load(filehandler)

In [124]:
pyLDAvis.show(vis)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [09/Aug/2019 10:13:30] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [09/Aug/2019 10:13:30] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [09/Aug/2019 10:13:30] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [09/Aug/2019 10:13:30] code 404, message Not Found
127.0.0.1 - - [09/Aug/2019 10:13:30] "GET /favicon.ico HTTP/1.1" 404 -
----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 60909)
Traceback (most recent call last):
  File "c:\users\jdward\appdata\local\continuum\anaconda3\envs\elastic\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\users\jdward\appdata\local\continuum\anaconda3\envs\elastic\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "c:\users\jdward\appdata\local\continuum\anaconda3\envs\elastic\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_ad


stopping Server...


### Trying to use Mallet instead because the topics will probably be better

In [139]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
os.environ['MALLET_HOME'] = r'C:\Users\jdward\mallet-2.0.8\mallet-2.0.8'
mallet_path = r'C:\Users\jdward\mallet-2.0.8\mallet-2.0.8\bin\mallet' 
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [140]:
# Print the topics and the top 10 words in each 
ldamallet.print_topics()

[(0,
  '0.047*"brexit" + 0.033*"deal" + 0.032*"uk" + 0.032*"britain" + 0.025*"talk" + 0.020*"british" + 0.020*"negotiation" + 0.017*"eu" + 0.016*"make" + 0.015*"leave"'),
 (1,
  '0.065*"party" + 0.051*"labour" + 0.021*"election" + 0.020*"corbyn" + 0.015*"vote" + 0.014*"leader" + 0.013*"policy" + 0.013*"voter" + 0.012*"campaign" + 0.012*"conference"'),
 (2,
  '0.017*"time" + 0.011*"bad" + 0.011*"thing" + 0.010*"turn" + 0.009*"back" + 0.008*"day" + 0.008*"good" + 0.007*"fact" + 0.007*"long" + 0.007*"end"'),
 (3,
  '0.015*"system" + 0.012*"risk" + 0.011*"change" + 0.010*"technology" + 0.008*"project" + 0.008*"energy" + 0.008*"model" + 0.007*"work" + 0.007*"power" + 0.007*"create"'),
 (4,
  '0.054*"government" + 0.033*"brexit" + 0.019*"parliament" + 0.018*"bill" + 0.017*"vote" + 0.017*"law" + 0.015*"labour" + 0.015*"minister" + 0.013*"mps" + 0.012*"leave"'),
 (5,
  '0.026*"company" + 0.020*"business" + 0.015*"market" + 0.014*"year" + 0.013*"london" + 0.013*"bank" + 0.013*"firm" + 0.010*"bi

In [143]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=lemmatized_text, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('Coherence Score: ', coherence_ldamallet)


Coherence Score:  0.4643946777377191
