In [None]:
# External libraries used for data handling:
import os
import warnings
import pandas as pd
from datetime import datetime
from dateutil import parser
import pickle
# Packages for LDA
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, MmCorpus
import pyLDAvis # LDA visualisation
import pyLDAvis.gensim
# Helper functions
from lda_helper import *

In [None]:
# Load the two article CSV files and stack them into a single frame.
# Earlier experiments on single date ranges, kept for reference:
# df = pd.read_csv('df_1934_to_1998_JDG.csv')
# df = pd.read_csv('df_1826_to_1874_JDG.csv')
df = pd.read_csv('df_GDL_all.csv')
df2 = pd.read_csv('df_JDG_all.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# ignore_index=True gives every article a unique row label. Without it both
# files contribute rows labelled 0, 1, 2, ... so label lookups such as
# df.text[0] and joins on the index match more than one row.
df = pd.concat([df, df2], ignore_index=True)

# cast every entry of 'text' to str (a missing value read as NaN becomes 'nan')
df.text = df.text.map(str)
# parse the strings in the 'date' column with dateutil
df.date = df.date.map(parser.parse)

In [None]:
# Remove the two 'Unnamed' columns (presumably index columns written by an
# earlier to_csv call; verify against the CSV files).
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df.head()

In [None]:
# Add 'year' and 'month' columns holding the '%Y' and '%m' strings of each
# article's date.
# The columns are assigned directly rather than joined in from a helper frame:
# an outer join on the index pairs every row with every other row sharing its
# label, which multiplies rows whenever the index contains duplicates, and it
# raises on a second run because the columns already exist.
df = df.assign(
    year=df.date.map(lambda d: d.strftime('%Y')),
    month=df.date.map(lambda d: d.strftime('%m')),
)

In [None]:
# Show the last rows of df to check the frame after the steps above.
df.tail()


In [None]:
# Check the type of the first parsed date. .iloc[0] selects by position;
# df.date[0] selects by label and returns every row labelled 0, which is more
# than one row when the combined frame kept the per-file row labels.
type(df.date.iloc[0])
# Selection of articles between dates can be done like so:
# df.date = df.date.apply(lambda x: parser.parse(x))
# df = df.set_index('date').sort_index()
# df[datetime(1851,1,24):datetime(1852,8,22)]

# Latent Dirichlet Allocation
The following files are created during this section and can be loaded to resume from a previous point:
* dico.pickle
* corpus.mm
* lda_model_all
* lda_model_all.id2word  
* lda_model_all.expElogbeta.npy 
* lda_model_all.state
* LDAvis_prepared

In [None]:
# Directory that holds every artefact written by this section. The default is
# the path the notebook was originally run with; set the LDA_PROJECT_PATH
# environment variable to point at another location without editing this cell.
project_path = os.environ.get(
    'LDA_PROJECT_PATH',
    '/Users/robin/GIT/ADA/ADA2017_GroupWork/Project_temp/5_topics/')
# file the dictionary is saved to and loaded from
dico_fp = os.path.join(project_path, 'dico.pickle')
# file the LDA model is saved to and loaded from
lda_model_filepath = os.path.join(project_path, 'lda_model_all')
# file the prepared pyLDAvis data is pickled to
ldavis_path = os.path.join(project_path, 'LDAvis_prepared')

In [None]:
# Build the vocabulary from the whitespace-separated tokens of every article.
dico = Dictionary(text.split() for text in df.text.tolist())

# Prune the vocabulary: no_below=0 keeps even the tokens that occur in a
# single document, no_above=0.4 discards tokens found in more than 40% of
# the documents.
# NOTE(review): filter_extremes also applies its default keep_n cap (documented
# by gensim as the 100000 most frequent tokens) - confirm that this is intended,
# given that the aim here is to keep rare tokens.
dico.filter_extremes(no_above=0.4, no_below=0)

# reassign the integer token ids so that they have no gaps
dico.compactify()
# write the dictionary to dico_fp
dico.save(dico_fp)

In [None]:
# Load the dictionary saved at dico_fp, so the notebook can resume from here
# without rebuilding the vocabulary.
dico = Dictionary.load(dico_fp)

In [None]:
# Turn every article into its bag-of-words representation and store the
# result on disk as the Matrix Market file 'corpus.mm'.
# bow_generator comes from lda_helper (not shown in this notebook).
article_texts = df.text.tolist()
MmCorpus.serialize(
    os.path.join(project_path, 'corpus.mm'),
    bow_generator(article_texts, dico),
)

In [None]:
# Open the bag-of-words corpus stored in 'corpus.mm', so the notebook can
# resume from here without re-serializing the articles.
bow_corpus = MmCorpus(os.path.join(project_path, 'corpus.mm'))

In [None]:
# number of documents in the loaded corpus
len(bow_corpus)

In [None]:
# Train the LDA topic model on the bag-of-words corpus and store it on disk.
with warnings.catch_warnings():
    # silence the warnings emitted while the model is trained and saved
    warnings.simplefilter('ignore')

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one.
    # random_state => seeds the model's random initialisation; without it
    # every run of this cell yields different topics. NOTE: gensim documents
    # that multicore results can still vary between runs because of how the
    # operating system schedules the worker processes.
    lda = LdaMulticore(bow_corpus,
                       num_topics=5,
                       id2word=dico,
                       workers=3,
                       random_state=42)
    # saving the LDA model to disk
    lda.save(lda_model_filepath)

In [None]:
# Load the finished LDA model from lda_model_filepath, so the notebook can
# resume from here without retraining.
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
# Prepare the pyLDAvis visualisation data from the model, corpus and dictionary.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, dico)

In [None]:
# pickle the prepared visualisation data to ldavis_path
with open(ldavis_path, 'wb') as ldavis_file:
    pickle.dump(LDAvis_prepared, ldavis_file)

In [None]:
# read the pickled visualisation data back from ldavis_path
# (only unpickle files this notebook wrote itself: unpickling can execute code)
with open(ldavis_path, 'rb') as ldavis_file:
    LDAvis_prepared = pickle.load(ldavis_file)

In [None]:
# render the prepared pyLDAvis data in the notebook
pyLDAvis.display(LDAvis_prepared)

## Todo
* map articles to their LDA topics
* exporting graphics from pyLDAvis
* try with 2-3 different # of topics
* time distribution of topics -> see if we can pick out any trends
* time distribution (year) of all votation articles
* comparison of GDL vs JDG
* seasonal comparison, i.e. on a month-to-month basis

In [None]:
# Text of the first article. .iloc[0] selects by position; df.text[0] selects
# by label and returns every row labelled 0, which is more than one row when
# the combined frame kept the per-file row labels.
df.text.iloc[0]

In [None]:
# Sanity checks, shown together: a cell only displays its last expression, so
# the three values are gathered into one tuple instead of two being discarded.
# dico[12800] goes through Dictionary.__getitem__; gensim fills the id2token
# attribute lazily, so reading it directly can hit an empty mapping.
len(bow_corpus), df.shape, dico[12800]

In [None]:
# Take the first five documents of the corpus. Slicing the corpus yields a
# lazy sliced view; list() materialises it so that a single document can be
# picked by position (NOTE(review): the view itself is assumed not to support
# t[0] - confirm against the installed gensim version).
t = list(bow_corpus[0:5])
# show both values: a cell only displays its last expression
len(t), t[0]

In [None]:
# Build a table of topic probabilities: one row per document of bow_corpus
# (in corpus order, row label = position in the corpus), one column per topic.
# The rows are collected first and turned into a DataFrame once. Appending to
# a DataFrame inside the loop copies the whole frame on every iteration, and
# DataFrame.append was removed in pandas 2.0.
topic_rows = [
    dict(lda.get_document_topics(bow, minimum_probability=0))
    for bow in bow_corpus
]
# Keying each row by topic id keeps every score in its own topic's column even
# if a topic is left out of a document's result; such gaps are filled with 0.
df_new = pd.DataFrame(topic_rows, columns=range(lda.num_topics)).fillna(0.0)


In [None]:
# label the columns of the topic-score table 1 .. num_topics
df_new.columns = range(1, lda.num_topics+1)

In [None]:
# display the topic-score table
df_new

In [None]:
# topic distribution of the fifth document (index 4); minimum_probability=0
# asks for the topics without a probability cut-off
lda.get_document_topics(bow_corpus[4], minimum_probability=0)

In [None]:
# Scratch check of the row-building step: turn one list of values into a
# single-row frame and stack it onto an empty frame.
# NOTE(review): z1 is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it held the topic
# probabilities of one document; define it or delete this cell.
dfn = pd.DataFrame()
dfn2 = pd.DataFrame(z1).transpose()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
dfn = pd.concat([dfn, dfn2])
dfn.head()

### Sorting a document's topics by probability (highest first):

In [None]:
# topic distribution of the third document (index 2), using the model's
# default probability cut-off
dist = lda.get_document_topics(bow_corpus[2])
dist
# dist = [p[1] for p in dist] # returns probabilities for each topic
# dist.index(max(dist))

In [None]:
# order the entries of dist in place by their second element, largest first
dist.sort(reverse=True, key=lambda pair: pair[1])


In [None]:
# List the column names of df.
# The drop below is left disabled; kept for reference only.
# df=df.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1)
df.columns