In [None]:
import logging
# Send log records of level INFO and above to the file model.log.
# Note: basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(filename="model.log", format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.warning('Watch out!')  # written to model.log rather than the console, because filename= is set
logging.info('I told you so')  # also written to model.log: INFO meets the configured level

# Download the NLTK stopword corpus and build the English stopword list.
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extra tokens to drop on top of the standard English list (consumed by remove_stopwords).
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
from tqdm import tqdm
import json
import pickle
import re
import numpy as np
import pandas as pd
from pprint import pprint
import random

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Load the source CSV (path is relative to the notebook's working directory) and preview it.
df = pd.read_csv("../dataset/complete_data_by_speech.csv")
df.head()

In [None]:
# Order rows by date, then by intervention id.
filter_df = df.sort_values(["date", "intervention_id"])
# filter_df = filter_df[["date","intervention_id","text","mep_id","full_name","role","is_mep","langdetect","langid"]]

# Flag the rows on which the two language detectors agree.
filter_df["lang_checkup"] = filter_df["langdetect"] == filter_df["langid"]

# Keep only rows that langdetect labels "en" and that are flagged as MEP
# speeches, then renumber the index from zero.
english_mask = filter_df["langdetect"] == "en"
mep_mask = filter_df["is_mep"] == True
filter_df = filter_df[english_mask & mep_mask].reset_index(drop=True)

print(filter_df.shape)
filter_df.head()

In [None]:
# Convert to list
# Take the speech texts as a plain Python list and strip the distracting
# single quotes, then the double quotes, from each one.
data = [
    re.sub("\"", "", re.sub("\'", "", sent))
    for sent in filter_df.text.values.tolist()
]

# Disabled cleaning steps, kept for reference:
# Remove Emails
# data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
# Remove new line characters
# data = [re.sub('\s+', ' ', sent) for sent in data]

pprint(data[:1])

In [None]:
# Number of texts taken from filter_df.
len(data)

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Yields one token list per input item, showing a tqdm progress bar while
    iterating. Each item is coerced with ``str()`` first. ``deacc=True``
    strips accent marks from tokens; punctuation is dropped by gensim's
    tokenizer itself.
    """
    for raw_text in tqdm(sentences):
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialize the generator so the token lists can be reused by the following cells.
data_words = list(sent_to_words(data))

print(len(data_words))
print(data_words[:1])

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus, so it sees already-merged bigram tokens.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` with every token found in ``stop_words`` removed.

    Each document is passed through ``str()`` and ``simple_preprocess``
    again before filtering, so gensim re-tokenizes it first.

    Args:
        texts: iterable of documents (token lists in this notebook).

    Returns:
        A list holding one filtered token list per input document.
    """
    print("stopwords")
    # Snapshot the module-level list as a set so each membership test is a
    # hash lookup instead of a linear scan of the whole stopword list.
    stopword_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stopword_set] for doc in tqdm(texts)]

def make_bigrams(texts):
    """Apply the module-level bigram phraser ``bigram_mod`` to every document.

    Returns a list with one transformed document per item of ``texts``.
    """
    print("bigrams")
    merged_docs = []
    for tokens in tqdm(texts):
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list with one transformed document per item of ``texts``.
    """
    print("trigrams")
    merged_docs = []
    for tokens in tqdm(texts):
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the annotation scheme.

    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces and run through ``nlp``.
        allowed_postags: collection of ``token.pos_`` values to keep,
            e.g. ``['NOUN']``.

    Returns:
        A list with one list of lemma strings per input document.
    """
    print("lemmatization")
    texts_out = []
    for sent in tqdm(texts):
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
# NOTE(review): make_trigrams() applies bigram_mod again before trigram_mod, so the
# bigram phraser runs a second time on already-merged documents — confirm this is intended.
data_words_trigrams = make_trigrams(data_words_bigrams)

# Load the spaCy 'en_core_web_sm' pipeline. No components are disabled in this call;
# pass disable=[...] to spacy.load if a reduced pipeline is wanted.
# python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only nouns
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN'])

print(data_lemmatized[0])

In [None]:
# Save data_lemmatized as JSON (the file name has no extension; it is read back
# under the same name by the "Load resources" cell).
with open("data_lemmatized", "w") as fp:
    json.dump(data_lemmatized, fp)

In [None]:
# One space-joined string of lemmas per document.
data_lemmatized_list = [' '.join(lemmas) for lemmas in tqdm(data_lemmatized)]

data_lemmatized_df = pd.DataFrame(data_lemmatized_list, columns=["text"])

# Attach the remaining filter_df columns. The join aligns on the index:
# filter_df was reset to 0..n-1 and the lemma list follows the same row order.
data_lemmatized_df = data_lemmatized_df.join(filter_df.drop(["text"], axis = 1))

data_lemmatized_df.to_csv("../dataset/data_lemmatized_df.csv")

# Preview last so the cell actually displays it; a bare expression in the
# middle of a cell produces no output.
data_lemmatized_df.head()

In [None]:
# Create Dictionary (gensim mapping between lemmas and integer ids)
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: one bag-of-words per document, built with the dictionary above
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Save corpus
# Note: JSON has no tuple type, so tuples inside corpus are written as lists
# and come back as lists when the file is reloaded.
with open("corpus", "w") as fp:
    json.dump(corpus, fp)

In [None]:
# Load resources if necessary
# (uncomment the blocks below to resume from the saved files instead of
# re-running the preprocessing cells above)

# with open("corpus", "r") as fp:
#     corpus = json.load(fp)

# with open("data_lemmatized", "r") as fp:
#     data_lemmatized = json.load(fp)

# Rebuild the dictionary from data_lemmatized (it is not saved to disk by the cells above).
id2word = corpora.Dictionary(data_lemmatized)

In [None]:
# Seed used for the random training sample, the LDA random_state and the models_<seed_no>/ directory name.
seed_no = 42

In [None]:
# Draw a reproducible sample of one tenth of the corpus positions, in ascending order.
random.seed(seed_no)
n_docs = len(corpus)
random_training_index = sorted(random.sample(range(n_docs), int(n_docs / 10)))
print(random_training_index[:10], len(random_training_index))

In [None]:
# Persist the sampled positions as JSON so the same training subset can be reused.
with open("random_training_index", "w") as fp:
    json.dump(random_training_index, fp)

In [None]:
# Same positions again, written as a one-column CSV.
random_training_index_df = pd.DataFrame(random_training_index, columns=["training_index"])
random_training_index_df.to_csv("../dataset/random_training_index_df.csv")

In [None]:
# Bag-of-words documents at the sampled positions.
training_corpus = [corpus[index] for index in random_training_index]
len(training_corpus)

In [None]:
# Positions within training_corpus of the empty documents
# (fixes the missing ':' on the original for-statement, which was a SyntaxError).
remove_list = [i for i, bow in enumerate(training_corpus) if len(bow) == 0]

len(remove_list)

In [None]:
# Drop the empty documents. Filtering by content, instead of deleting the
# positions stored in remove_list, keeps this cell safe to re-run: a second run
# is a no-op rather than deleting whatever now sits at the stale positions.
training_corpus = [bow for bow in training_corpus if len(bow) > 0]

len(training_corpus)

In [None]:
def train_and_save_lda_model(n_topics):
    """Train an LDA model with ``n_topics`` topics and save it to disk.

    The model is trained on the module-level ``training_corpus`` with the
    dictionary ``id2word`` and ``random_state=seed_no``, then saved as
    ``models_<seed_no>/lda_model_<n_topics>``.

    Args:
        n_topics: number of topics for the model.

    Returns:
        None. The trained model is only written to disk.
    """
    import os  # local import: the notebook's import cell does not bring in os

    # Create the output directory up front; otherwise a missing folder would
    # only surface at save time, after the full 50-pass training run.
    output_dir = f"models_{seed_no}"
    os.makedirs(output_dir, exist_ok=True)

    lda_model = gensim.models.ldamodel.LdaModel(corpus=training_corpus,
                                           id2word=id2word,
                                           num_topics=n_topics,
                                           random_state=seed_no,
                                           update_every=1,
                                           passes=50)

    lda_model.save(f"{output_dir}/lda_model_{n_topics}")

In [None]:
# Topic counts to sweep: min_topics..max_topics inclusive, in increments of steps.
min_topics = 2
max_topics = 40
steps = 1

In [None]:
# Train and save one LDA model per topic count in the sweep.
for n_topics in tqdm(range(min_topics,max_topics+1,steps)):
    train_and_save_lda_model(n_topics)

In [None]:
# Accumulates one [n_topics, coherence] row per call to calculate_coherence.
coherence_list = []

def calculate_coherence(n_topics):
    """Load the saved ``n_topics`` model and record its c_v coherence.

    The score is computed against the full ``corpus`` / ``data_lemmatized``
    (not only the training subset) and appended to the module-level
    ``coherence_list`` as ``[n_topics, coherence]``. Returns None.
    """
    model_path = f"models_{seed_no}/lda_model_{n_topics}"
    lda = gensim.models.ldamodel.LdaModel.load(model_path)
    scorer = gensim.models.coherencemodel.CoherenceModel(
        model=lda,
        corpus=corpus,
        coherence="c_v",
        texts=data_lemmatized,
    )
    coherence_list.append([n_topics, scorer.get_coherence()])

# Score every saved model; each call appends [n_topics, coherence] to coherence_list.
# Note: this loop rebinds the notebook-level name n_topics.
for n_topics in tqdm(range(min_topics, max_topics+1, steps)):
    calculate_coherence(n_topics)

coherence_df = pd.DataFrame(coherence_list, columns=["topics", "coherence"])

coherence_df

In [None]:
# Persist the coherence scores (the visualisation/ directory must already exist).
coherence_df.to_csv("visualisation/coherence_df.csv")

In [None]:
# Coherence score against the number of topics.
coherence_df.plot.line(x='topics', y='coherence')

Topic counts worth a closer look, by seed (`seed_no`):

- seed 42: 15, 16, 19, **34 (Topic 26)**
- seed 404: 35, 36

In [None]:
# Explore different models
n_topics = 34

# Load the pickled dictionary stored next to the model file
# (presumably written by LdaModel.save alongside the model — confirm).
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced itself.
with open(f"models_{seed_no}/lda_model_{n_topics}.id2word", "rb") as fp:
    id2word = pickle.load(fp)

model_load = gensim.models.ldamodel.LdaModel.load(f"models_{seed_no}/lda_model_{n_topics}")

# Print the keywords of all n_topics topics
pprint(model_load.print_topics(num_topics=n_topics))
# Apply the model to the full corpus (not only the training sample).
doc_lda = model_load[corpus]

In [None]:
# Top 100 words (with probabilities) of the topic under inspection.
topic_no = 26
topic_df = pd.DataFrame(model_load.show_topic(topic_no, topn=100), columns = ["word", "probability"])
topic_df.to_csv(f"visualisation/model_{n_topics}_topic_{topic_no}.csv")

# Display last: a bare expression in the middle of a cell produces no output.
topic_df

In [None]:
# One row per document: the probability of each topic 0..n_topics-1, in order.
prob_list_complete = []

for doc in tqdm(corpus):
    # Build the topic -> probability lookup once per document instead of
    # once per topic.
    topic_probs = dict(model_load.get_document_topics(doc))
    # Topics the model did not report for this document are recorded as None.
    prob_list_complete.append([topic_probs.get(i) for i in range(n_topics)])

In [None]:
# Tabulate the per-document topic probabilities, one topic_<i> column per topic.
prob_df = pd.DataFrame(prob_list_complete)

colnames = [f"topic_{i}" for i in range(0, n_topics, 1)]
prob_df.columns = colnames

prob_df.tail()

In [None]:
# posterior_df = filter_df[filter_df.index.isin(random_training_index)].reset_index(drop=True)
# posterior_df = filter_df[~filter_df.index.isin(remove_list)].reset_index(drop=True)
# Attach the topic probabilities to the speeches. The join aligns on the index:
# filter_df was reset to 0..n-1 and prob_df has one row per corpus document in the same order.
posterior_df = filter_df.join(prob_df)
posterior_df.tail()

In [None]:
# Speeches whose topic_26 probability is at least 0.30 (rows with a missing value do not pass the comparison).
# NOTE(review): the column name is hard-coded rather than derived from topic_no —
# update both together if the topic of interest changes.
refugee_df = posterior_df[posterior_df["topic_26"]>=0.30]
refugee_df.shape

In [None]:
# Size of the full table, for comparison with refugee_df.shape.
posterior_df.shape

In [None]:
# Visualize the topics
vis = pyLDAvis.gensim_models.prepare(model_load, corpus, id2word)
# Saved as HTML to a file named vis_<n_topics> (note: no .html extension in the name).
pyLDAvis.save_html(vis, f"vis_{n_topics}")

In [None]:
# Show the prepared visualisation inside the notebook.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)