In [22]:
# External libraries used for data handling:
import os
import warnings
import pandas as pd
from datetime import datetime
from dateutil import parser
# Packages for LDA
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, MmCorpus
import pyLDAvis
import pyLDAvis.gensim
# Helper functions
from lda_helper import *

In [61]:
# Load the JDG article table for 1826-1874.
# (The 1934-1998 table is stored in 'df_1934_to_1998_JDG.csv'.)
df = pd.read_csv('df_1826_to_1874_JDG.csv')

# Coerce every entry of the 'text' column to a Python string.
df['text'] = df['text'].map(str)
# Parse the 'date' strings with dateutil so the column holds datetimes.
df['date'] = df['date'].map(parser.parse)

In [62]:
# Remove the stray 'Unnamed: ...' columns (presumably row indices written
# out by earlier to_csv calls and read back in as data).
# errors='ignore' keeps this cell re-runnable: without it, drop() raises a
# KeyError as soon as one of the two columns is absent from the CSV.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df.head()

Unnamed: 0,date,newspaper,text
0,1826-01-05,JDG,syndic développer proposition déposer séance d...
1,1826-01-05,JDG,diverses élection lieu séance annoncer feuille...
2,1826-01-19,JDG,contenir graver ordre chose préexister détruir...
3,1826-02-02,JDG,militaire généralement mauvais juger remplir f...
4,1826-02-09,JDG,personne prendre parole commission membre nomm...


In [63]:
# Write the cleaned frame back to the CSV it was loaded from.
# (The 1934-1998 table would be written to 'df_1934_to_1998_JDG.csv'.)
# index=False keeps the row index out of the file; writing it would add
# one more 'Unnamed: N' column every time the file is saved and reloaded.
df.to_csv('df_1826_to_1874_JDG.csv', index=False)

# Latent Dirichlet Allocation

In [64]:
# Sanity check: the article texts converted to a plain Python list.
type(df['text'].tolist())

list

In [65]:
# Build the token dictionary from the whitespace-tokenised articles.
dico = Dictionary(text.split() for text in df['text'].tolist())

# Discard tokens that appear in more than 40% of the articles.
# NOTE(review): no_below=0 sets no minimum document frequency, so rare
# tokens are NOT removed here - confirm that this is intended.
dico.filter_extremes(no_above=0.4, no_below=0)

# Reassign the integer token ids so that they are contiguous again.
dico.compactify()

In [66]:
# Generate the bag-of-words representation of every article, save it to
# disk as a Matrix Market file, then load it back as `bow_corpus`.

# Directory that receives the generated artefacts. Set the
# ADA_PROJECT_PATH environment variable to point it at your own checkout;
# the fallback is the original author's local path.
project_path = os.environ.get(
    'ADA_PROJECT_PATH',
    '/Users/robin/GIT/ADA/ADA2017_GroupWork/Project/')
corpus_filepath = os.path.join(project_path, 'corpus.mm')

# Switch to False to reuse an existing corpus.mm instead of rebuilding it.
# Leave it True whenever `df` or `dico` has changed (e.g. after switching
# to the other CSV), otherwise a stale corpus would be loaded.
rebuild_corpus = True

if rebuild_corpus:
    MmCorpus.serialize(corpus_filepath,
                       bow_generator(df.text.tolist(), dico))

bow_corpus = MmCorpus(corpus_filepath)

In [67]:
# Train the LDA model on the bag-of-words corpus, store it on disk and
# reload it from there.
lda_model_filepath = os.path.join(project_path, 'lda_model_all')

# Fixed seed for the model's random initialisation. Without it the topics
# (and their numbering, which later cells rely on via hard-coded topic
# ids) can differ from one run to the next.
lda_random_state = 42

with warnings.catch_warnings():
    # Silence all warnings raised while training and saving.
    warnings.simplefilter('ignore')

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lda = LdaMulticore(bow_corpus,
                       num_topics=5,
                       id2word=dico,
                       workers=1,
                       random_state=lda_random_state)

    lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [80]:
# Development aid: show the LdaMulticore documentation. Uses the builtin
# help() instead of IPython's `?` prefix so the cell is valid Python.
# Consider deleting this cell before sharing the notebook.
help(LdaMulticore)

In [74]:
# Inspect topic 4 of the trained model, limited to 10 terms.
# explore_topic comes from lda_helper; presumably it prints the topn
# highest-weighted terms of the topic - confirm against the helper.
explore_topic(lda, topic_number=4, topn=10)

term                 frequency

conseil              0.018
votation             0.015
fédéral              0.012
scrutin              0.012
proposition          0.012
voter                0.011
commission           0.010
loi                  0.009
assemblée            0.009
voix                 0.009


In [75]:
# Gather the articles that the lda_helper function selects for topic 3.
# NOTE(review): the explore_topic cell in this notebook inspects
# topic_number=4 while this call passes 3 - confirm which one is intended.
all_articles = df['text'].tolist()
topic_docs = articles_from_topic(lda, bow_corpus, all_articles, 3)

In [76]:
# Number of articles returned by articles_from_topic.
len(topic_docs)


5749

In [78]:
# Peek at the first returned article as a quick sanity check.
topic_docs[0]

'diverses élection lieu séance annoncer feuiller avis répéter conseil organe monsieur syndic rendre compter administration année'

In [37]:
# Sanity check: confirm the type of the dictionary object built earlier.
type(dico)

gensim.corpora.dictionary.Dictionary

In [34]:
# Build the pyLDAvis data structure from the trained model, the
# bag-of-words corpus and the dictionary.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, dico)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [35]:
# Render the prepared pyLDAvis visualisation inline in the notebook.
pyLDAvis.display(LDAvis_prepared)

## Todo
* map articles to their LDA topics
* time distribution of topics -> see if we can pick out any trends
* time distribution (year) of all votation articles
* comparison of GDL vs JDG
* seasonal comparison, i.e. on a month-to-month basis