In [1]:
import spacy
import pandas as pd
import os

In [93]:
from lxml import etree
from datetime import datetime
from article_selection import article_selection
import json

In [3]:
def month_dates(start, end):
    f = lambda date: date.month + 12 * date.year

    res = []
    for tot_m in range(f(start)-1, f(end)):
        y, m = divmod(tot_m, 12)
        res.append(str(y) + '/' + '%02d' % (m+1))
    
    return res

In [4]:
def get_date(article):
    """
    This method returns the date of the article
    """
    str_date = article.find('entity').find('meta').find('issue_date').text
    return datetime.strptime(str_date, '%d/%m/%Y')

In [5]:
def get_articles_in_file(file, start_date, end_date):
    articles = []  
    for article in file.iter('article'):
        if article.find('entity') is not None:
            a = ''
            date = get_date(article)
            if start_date <= date <= end_date:
                for entity in article.iter('entity'):
                    a += entity.findtext('full_text') + ' '
                articles.append(date.strftime('%d/%m/%Y') + ' ' + a)
    return articles

In [6]:
def get_articles(path, start_date, end_date):
    articles = []
    for m_date in month_dates(start_date, end_date):
        try:
            file = etree.parse(path + m_date + '.xml')
            articles.append(get_articles_in_file(file, start_date, end_date))
        except (FileNotFoundError, IOError):
            pass
    return [a for file in articles for a in file]  

In [7]:
def get_entity_text(file, box_id):
    res = None
    for article in file.iter('article'):
        if article.find('entity') is not None:
            date = get_date(article)
            for entity in article.iter('entity'):
                if   box_id == entity.find('meta').find('box').text:
                    res = date.strftime('%d/%m/%Y') + ' ' + entity.findtext('full_text')
                    break
    return res

In [8]:
path = '/home/mbanga/Desktop/JDG/'
start_date =  datetime(1990, 1, 1)
end_date = datetime(1998, 2, 28)

In [9]:
articles = get_articles(path, start_date, end_date)

In [10]:
len(articles)

358455

In [11]:
import fr_core_news_sm
import enchant

In [12]:
nlp = fr_core_news_sm.load()

In [13]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuaiton or whitespace
    """
    
    return token.is_punct or token.is_space

In [14]:
def is_french(word):
    """
    helper function to eliminate tokens that
    are not french words.
    """
    d = enchant.Dict('fr_FR')
    return d.check(word)

In [334]:
def lemmatized_corpus(corpus):
    """
    generator function to use spaCy to parse articles,
    lemmatize the text, and yield sentences
    """

    pos = ['VERB', 'PROPN', 'NOUN', 'ADJ', 'ADV']
    for parsed_article in nlp.pipe(corpus, 
                                   batch_size=100, n_threads=5):
        # save the date
        date = parsed_article[0].text
        
        yield (date, ' '.join([token.lemma_ for token in parsed_article if token.pos_ in pos]))
                             
        '''if not punct_space(token) and is_french(token.text)
                                and not token.is_stop and not token.is_digit
                                and not token.like_num]))'''

In [16]:
def corpus_votation(articles, lemmas):
    votations = []
    for article in articles:
        if any(lemma in article for lemma in lemmas): 
            votations.append(article)
    return votations

In [17]:
def corpus_votation_bis(articles, lemmas):
    votations = []
    for article in articles:
        if any(lemma in article.replace(' ', '') for lemma in lemmas):
            votations.append(article)
    return votations

In [18]:
# Naive selection (First Filtering)
lems = ['votation', 'referendum']

#articles_votation_third = corpus_votation_bis(articles, lemmas)
#articles_votation_bis = corpus_votation(articles, lemmas)

articles_votation = article_selection(articles, lems)

In [19]:
len(articles_votation)

2561

In [188]:
if 0 == 1:
    %%time
    # Time consuming !!
    lemmatized_corpus = [(date, lemmas) for date, lemmas in lemmatized_corpus(articles_votation)]

    # retrieve dates
    dates = [pair[0] for pair in lemmatized_corpus]

    # retrieve articles
    corpus = [pair[1] for pair in lemmatized_corpus]

In [None]:
if 0 == 1:
    project_path = '/home/mbanga/Epfl/AppliedDataAnalysis/ADA2017_GroupWork/Project/'

    with open(os.path.join(project_path, 'lemmatized articles 1990-1998.txt'), 'w') as file:
        for article in corpus:
            file.write(article + '\n')

In [187]:
if 0 == 1:
    project_path = '/home/mbanga/Epfl/AppliedDataAnalysis/ADA2017_GroupWork/Project/'

    with open(os.path.join(project_path, 'lemmatized articles 1990-1998 json'), 'w') as file:
        json.dump(lemmatized_corpus, file)

In [None]:
if 0 == 1:
    # check ouput of lemmatizer (lemmatized_corpus) 
    file = etree.parse('/home/mbanga/Desktop/JDG/1990/01.xml')
    box_id = '24 123 1446 2167'

    original_text = [get_entity_text(file, box_id)]

    for lemmatized in lemmatized_corpus(original_text):
        print(lemmatized[1], '\n')
    print(original_text)

In [None]:
if 0 == 1:
    # check naive selection
    file = etree.parse('/home/mbanga/Desktop/JDG/1990/01.xml')
    box_id = '50 163 1090 888'

    original_text = [get_entity_text(file, box_id)]
    lemmas = ['vote', 'voter', 'votation', 'referendum']
    res = corpus_votation(original_text, lemmas)

# Filtering articles about votations

> Assumption: The subject of a votation is most likely to be found in
the neighborhoud of the terms 'votation' or 'referendum' in the article. 

In [147]:
import re

In [330]:
# get all phrases index with the searched term
keywords = ['votation']
#j = 0

articles_votation_sents = []
for article in articles_votation:
    date = re.findall(r'^([^\s]+)', article)[0]
    #print('article', (j+1), ': ', date)
    
    sent = ''
    phrases = article.split('.')    
    for i, phrase in enumerate(phrases):
        if any(keyword in phrase for keyword in keywords):
            if len(phrases) < 2:
                sent += phrase
            elif i == 0:
                sent += phrase[phrase.index(' ') + 1:] + ' '  + phrases[i+1]
            elif i == len(phrases) - 1:
                sent += ' ' + phrases[i-1] + ' ' + phrase
            elif 0 < i < len(phrases) - 1:
                sent += ' ' + phrases[i-1] + ' ' + phrase + ' ' + phrases[i+1]
    articles_votation_sents.append(date + ' ' + sent)
            #print(' {:}'.format(phrases[i-1] + phrase + phrases[i+1]))
    #print('\n')
    #j += 1

In [331]:
len(articles_votation_sents)

2561

In [332]:
articles_votation_sents[501]

"11/02/1992   Communications et renseignements à « Fréquence verte », Radio Cité, CP, 1227 Carouge  (EC) YOTATIONS A Onex, ce sera le 17 mai Le 17 mai prochain, les habitants de la commune d'Onex devront se prononcer lors d'une votation communale référendaire sur la construction simultanée d'un garage privé et d'un abri de la protection civile  Opposée à ce projet, l'Association des habitants de la région Vieux Moulin-Evaux est partie en guerre"

In [335]:
if 1 == 1:
    %%time
    # Time consuming !!
    lemmatized_corpus = [(date, lemmas) for date, lemmas in lemmatized_corpus(articles_votation_sents)]

    # retrieve dates
    dates = [pair[0] for pair in lemmatized_corpus]

    # retrieve articles
    corpus = [pair[1] for pair in lemmatized_corpus]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.53 µs


In [336]:
len(lemmatized_corpus)

2561

In [338]:
lemmatized_corpus[501]

('11/02/1992',
 '11/02/1992 communications renseignement fréquence vert radio cité cp carouge ec yotations a onex être mai mai prochain habitant commun onex prononcer lors votation communal référendaire construction simultané garage priver abri protection civil opposée projet association habitant région vieux moulin evaux partir guerre')

# Latent Dirichlet Allocation

In [339]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, MmCorpus

import pyLDAvis
import pyLDAvis.gensim
import warnings
#import cPickle as pickle

In [340]:
# learn the dictionnary by iterating over all of the articles
dico = Dictionary([article.split() for article in corpus])

# filter tokens that are very rare or too common from
# the dictionary 
dico.filter_extremes(no_below=0, no_above=0.4)

# reassign integer lda
dico.compactify()

In [341]:
def bow_generator(corpus):
    """
    generator function to read articles from a file
    and yield a bag-of-words representation
    """
    for article in corpus:
        yield dico.doc2bow(article.split())

In [342]:
# generate bag-of-word representations for
# all reviews and save them as a matrix
project_path = '/home/mbanga/Epfl/AppliedDataAnalysis/ADA2017_GroupWork/Project/'
MmCorpus.serialize(os.path.join(project_path, 'corpus.mm'),
                                bow_generator(corpus))

bow_corpus = MmCorpus(os.path.join(project_path, 'corpus.mm'))

In [343]:
lda_model_filepath = os.path.join(project_path, 'lda_model_all')

In [350]:
if 1 == 1:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(bow_corpus,
                           num_topics=50,
                           id2word=dico,
                           workers=5)
        
        lda.save(lda_model_filepath)

#load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [351]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
    
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    
    for term, frequency in lda.show_topic(topic_number, topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [352]:
explore_topic(topic_number=15, topn=10)

term                 frequency

suisse               0.011
avoir                0.011
populaire            0.009
initiative           0.009
pas                  0.009
million              0.007
faire                0.006
plaire               0.006
franc                0.006
prochain             0.005


In [353]:
# The goal is to find all documents related to the same topic
def articles_topic(lda, bow_corpus, corpus, topic):
    """
    return the list of articles associated
    with a given topic.
    """
    assert len(bow_corpus) == len(corpus)
    nb_topics = len(lda.get_topics())
    
    documents = []
    if 0 <= topic < nb_topics:
        k = 0
        for bow_article in bow_corpus:
            dist = lda.get_document_topics(bow_article, minimum_probability=0)
            dist = [p[1] for p in dist]
            idx_max = dist.index(max(dist))
            if idx_max == topic:
                documents.append(corpus[k])
            k += 1
    
    return documents

In [354]:
docs = articles_topic(lda, bow_corpus, articles_votation_sents, 10)

In [355]:
docs[1]

"20/06/1990   n appartient maintenant au Conseil national de se prononcer  BERNE : MARIE-JEANNE KRILL Echaudé par deux échecs successifs en votation populaire (1977 et 1979), le Conseil fédéral a, rappelons-le, renoncé à proposer une nouvelle fois le passage à la TVA  Il préconise en revanche l'introduction d'un leha modernisé, avec notamment la suppression de la taxe occulte, ainsi qu'une extension de l'impôt à certaines prestations de service et aux Otto Stich, une obstination peu payante (Keystone), agents énergétiques"

In [356]:
if 1 == 1:     
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus, dico)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


In [357]:
pyLDAvis.display(LDAvis_prepared)