In [4]:
import json
import logging
import os
import pickle
import random
import timeit
from operator import itemgetter
from pprint import pprint

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',filename="model.log", level=logging.INFO)

import numpy as np
import pandas as pd

import gensim
from gensim import corpora
from gensim.models import ldaseqmodel
from gensim.models import LdaModel

In [None]:
# Experiment configuration.
# seed_no  -> seeds `random` for the training sample and is passed to the model as random_state.
# n_topics -> number of topics requested from the dynamic topic model.
seed_no, n_topics = 42, 34

In [None]:
# Read the per-speech dataset into a DataFrame.
speech_csv_path = "../dataset/complete_data_by_speech.csv"
df = pd.read_csv(speech_csv_path)

In [None]:
# Build the working frame in a single chain so that every step returns a new
# DataFrame. The previous version assigned columns onto frames obtained by
# list / boolean-mask indexing, which triggers pandas' SettingWithCopyWarning.
selected_columns = ["date","intervention_id","text","mep_id","full_name","role","is_mep","langdetect","langid"]

filter_df = (
    df.sort_values(["date", "intervention_id"])  # sorted on the raw `date` values, before datetime conversion
    .loc[:, selected_columns]
    # True where the two language detectors agree.
    .assign(lang_checkup=lambda frame: frame["langdetect"] == frame["langid"])
    # Keep rows detected as "en" by langdetect whose is_mep flag equals True.
    # `== True` is kept on purpose so missing / non-boolean values compare False.
    .loc[lambda frame: (frame["langdetect"] == "en") & (frame["is_mep"] == True)]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(year=lambda frame: frame["date"].dt.year)
    .reset_index(drop=True)
)

In [5]:
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, "r") as handle:
        return json.load(handle)


# Pre-computed artefacts stored as JSON under ../LDA.
corpus = _read_json("../LDA/corpus")
data_lemmatized = _read_json("../LDA/data_lemmatized")

# Dictionary rebuilt from the lemmatized documents.
# NOTE(review): assumes `corpus` was encoded with an identically built dictionary - confirm.
id2word = corpora.Dictionary(data_lemmatized)

In [None]:
# Draw a reproducible sample of one tenth of the document positions and keep
# the positions in ascending order.
random.seed(seed_no)
n_documents = len(corpus)
sample_size = int(n_documents / 10)
random_training_index = sorted(random.sample(range(n_documents), sample_size))
random_training_index[:10]

In [None]:
# Collect the sampled documents, in the order of the sampled positions.
training_corpus = []
for doc_position in random_training_index:
    training_corpus.append(corpus[doc_position])

print(len(corpus), len(training_corpus))

In [None]:
# Year of every row of filter_df, as a plain Python list.
year_column = filter_df["year"]
date_list = year_column.tolist()
len(date_list)

In [None]:
# The sampled positions were drawn from `corpus` but are applied to `date_list`
# (derived from filter_df). That only makes sense when both describe the same
# documents, so fail loudly instead of raising an IndexError or silently
# pairing documents with the wrong year.
if len(date_list) != len(corpus):
    raise ValueError(
        f"date_list has {len(date_list)} entries but corpus has {len(corpus)} documents; "
        "they must be aligned one-to-one"
    )

training_date_list = [date_list[index] for index in random_training_index]
len(training_date_list)

In [None]:
# Positions (within training_corpus) of documents that have no entries at all.
remove_list = [
    position
    for position, document in enumerate(training_corpus)
    if len(document) == 0
]

len(remove_list)

In [None]:
# Remove the empty documents in place. Positions are visited from highest to
# lowest so that removing one entry does not shift the positions still pending.
for position in reversed(sorted(remove_list)):
    training_corpus.pop(position)

len(training_corpus)

In [None]:
# Remove the years at the same positions in place, visiting positions from
# highest to lowest so earlier removals do not shift the ones still pending.
for position in reversed(sorted(remove_list)):
    training_date_list.pop(position)

len(training_date_list)

In [None]:
# time_slice holds the number of documents per year, in ascending year order
# (np.unique returns the sorted unique values). The model consumes the corpus
# slice by slice in that order, so the counts are only meaningful when
#   (a) there is exactly one year per training document, and
#   (b) the documents are already ordered by year.
# Verify both instead of trusting the upstream cells.
if len(training_date_list) != len(training_corpus):
    raise ValueError(
        f"{len(training_date_list)} years for {len(training_corpus)} training documents; "
        "the two lists must stay aligned"
    )
if any(earlier > later for earlier, later in zip(training_date_list, training_date_list[1:])):
    raise ValueError("training documents are not in chronological (year) order")

uniqueyears, time_slice = np.unique(training_date_list, return_counts=True)
time_slice = time_slice.tolist()
time_slice

In [None]:
# Total number of documents across all time slices.
documents_in_slices = sum(time_slice)
documents_in_slices

In [None]:
# Settings for the dynamic topic model, gathered in one place so they are easy to read.
dtm_settings = dict(
    corpus=training_corpus,
    id2word=id2word,
    time_slice=time_slice,
    num_topics=n_topics,
    random_state=seed_no,
    passes=50,
)

# Train the sequential LDA model on the sampled corpus.
dynamic_lda_model = ldaseqmodel.LdaSeqModel(**dtm_settings)

In [None]:
# Persist the trained model. The output directory is created first so that a
# missing folder cannot make the save fail after the training run has finished.
model_dir = f"models_{seed_no}"
os.makedirs(model_dir, exist_ok=True)
dynamic_lda_model.save(f"{model_dir}/dtm_model_{n_topics}")

In [None]:
# Reload the dynamic topic model from the path it was saved under
# ("dtm_model_<n_topics>"). The previous path pointed at "lda_model_<n_topics>",
# which is not the file written by the save cell.
dtm_load = ldaseqmodel.LdaSeqModel.load(f"models_{seed_no}/dtm_model_{n_topics}")

In [None]:
# Top terms of one topic for every time slice. Despite its name, `topic_dict`
# is indexed by slice position in the next cell, where each entry is read as
# two-column rows (term, weight).
topic_of_interest = 26
topic_dict = dtm_load.print_topic_times(topic=topic_of_interest)

In [None]:
# Build a year x term table of weights for the selected topic.
# The number of time slices is taken from the data instead of a hard-coded 23,
# which raised IndexError with fewer slices and silently dropped years with more.
if len(topic_dict) != len(uniqueyears):
    raise ValueError(
        f"model reports {len(topic_dict)} time slices but {len(uniqueyears)} years are available"
    )

yearly_df = pd.DataFrame(columns=["term"])

for year, slice_terms in zip(uniqueyears, topic_dict):
    # One column per year, named after the year, keyed by term.
    temp_df = pd.DataFrame(slice_terms, columns=["term", str(year)])
    yearly_df = pd.merge(yearly_df, temp_df, on="term", how="outer").set_index("term", drop=True)

# Keep only terms that have a value in every year, then put years on the rows.
yearly_df = yearly_df.dropna().transpose()

yearly_df

In [None]:
# Export the year x term table for the visualisation step.
yearly_csv_path = "visualisation/yearly_df.csv"
yearly_df.to_csv(yearly_csv_path)

In [None]:
# Line chart with one line per column of yearly_df, plotted against its index.
yearly_df.plot(kind="line")