In [1]:
import spacy
import pandas as pd
from nltk.stem.snowball import FrenchStemmer

In [2]:
from lxml import etree
from datetime import datetime

In [3]:
def month_dates(start, end):
    """
    List every month from ``start`` to ``end`` (both included) as
    '<year>/<MM>' strings, e.g. '1990/01'.

    Only the ``year`` and ``month`` attributes of the two dates are used.
    The list is empty when ``end`` lies in an earlier month than ``start``.
    """
    # Place both months on one continuous axis: year * 12 + zero-based month.
    first = start.year * 12 + start.month - 1
    last = end.year * 12 + end.month - 1

    return [str(index // 12) + '/' + '%02d' % (index % 12 + 1)
            for index in range(first, last + 1)]

In [4]:
def get_date(article):
    """
    Return the issue date of an article as a ``datetime``.

    The date is read from the ``issue_date`` element under the first
    ``meta`` element of the article's first ``entity``; its text must be in
    day/month/year form (``%d/%m/%Y``).
    """
    first_entity = article.find('entity')
    meta = first_entity.find('meta')
    issue_date = meta.find('issue_date')
    return datetime.strptime(issue_date.text, '%d/%m/%Y')

In [5]:
def get_articles_in_file(file, start_date, end_date):
    """
    Extract the articles of one parsed XML file that fall in a date range.

    Parameters:
        file: parsed XML tree whose ``article`` elements are iterated.
        start_date, end_date: inclusive ``datetime`` bounds compared with the
            date returned by ``get_date``.

    Returns:
        A list of strings, one per retained article, made of the date
        formatted as 'dd/mm/YYYY', a space, and the ``full_text`` of every
        ``entity`` of the article, each followed by a space.

    Articles without any ``entity`` are ignored. Entities without a
    ``full_text`` child are skipped: ``findtext`` returns None for them,
    which previously raised a TypeError and aborted the whole file.
    """
    articles = []
    for article in file.iter('article'):
        if article.find('entity') is None:
            continue

        date = get_date(article)
        if not (start_date <= date <= end_date):
            continue

        texts = (entity.findtext('full_text') for entity in article.iter('entity'))
        # Each text keeps its trailing space so the output format is unchanged.
        body = ''.join(text + ' ' for text in texts if text is not None)
        articles.append(date.strftime('%d/%m/%Y') + ' ' + body)
    return articles

In [6]:
def get_articles(path, start_date, end_date):
    """
    Gather the articles of every monthly XML file between two dates.

    For each month returned by ``month_dates`` the file
    ``path + '<year>/<MM>' + '.xml'`` is parsed and its articles within
    ``[start_date, end_date]`` are collected. Months whose file cannot be
    read are skipped silently.

    Returns a flat list of article strings, in month order.
    """
    collected = []
    for month in month_dates(start_date, end_date):
        try:
            tree = etree.parse(path + month + '.xml')
            collected.extend(get_articles_in_file(tree, start_date, end_date))
        except (FileNotFoundError, IOError):
            # Best effort: a missing or unreadable month contributes nothing.
            continue
    return collected

In [7]:
# Root of the archive. get_articles appends '<year>/<MM>.xml' directly to this
# string, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = '/home/mbanga/Desktop/JDG/'
# Date range of the articles to load; both bounds are inclusive.
start_date =  datetime(1990, 1, 1)
end_date = datetime(1990, 1, 31)

In [8]:
# Load every article dated within [start_date, end_date] as 'dd/mm/YYYY <text>' strings.
articles = get_articles(path, start_date, end_date)

In [9]:
# Number of articles collected for the configured date range.
len(articles)

3434

In [10]:
# Load the small French pipeline by package name through spaCy's own loader.
# `fr_core_news_sm` is only imported in a later cell, so calling
# `fr_core_news_sm.load()` here fails with NameError on a fresh kernel;
# `spacy` is imported in the first cell and loads the same installed package.
nlp = spacy.load('fr_core_news_sm')

In [122]:
import fr_core_news_sm
import enchant

In [15]:
def punct_space(token):
    """
    Tell whether a token should be dropped because it is pure
    punctuation or pure whitespace.

    Returns the token's ``is_punct`` flag when it is truthy, otherwise its
    ``is_space`` flag.
    """
    punctuation = token.is_punct
    if punctuation:
        return punctuation
    return token.is_space

In [101]:
def is_french(word):
    """
    Helper used to eliminate tokens that are not French words.

    Returns the result of checking ``word`` against the enchant 'fr_FR'
    dictionary. An empty word is reported as not French (False) instead of
    being passed to enchant, which refuses to check empty strings.

    The dictionary is created on the first call and reused afterwards:
    this function is called once per token, and building a new
    ``enchant.Dict`` each time is pure repeated work.
    """
    if not word:
        return False

    try:
        dictionary = is_french._dictionary
    except AttributeError:
        # First call: build the dictionary once and keep it on the function.
        dictionary = is_french._dictionary = enchant.Dict('fr_FR')
    return dictionary.check(word)

In [115]:
def lemmatized_sentence_corpus(corpus):
    """
    Generator that parses articles with spaCy and yields one lemmatized
    string per article, in input order.

    Each yielded string is the text of the article's first token (expected
    to be the date prefix of the article string) followed by a space and the
    space-joined lemmas of the tokens that
      * are neither punctuation nor whitespace (``punct_space``),
      * are accepted by ``is_french`` (checked on the raw token text), and
      * are not stop words.

    An article that parses to no tokens yields an empty string, so the
    output stays aligned with ``corpus`` instead of the generator dying on
    an IndexError.
    """
    # NOTE(review): `n_threads` is kept from the original call; it is believed
    # to be deprecated in recent spaCy releases — confirm against the
    # installed version before removing it.
    for parsed_article in nlp.pipe(corpus,
                                   batch_size=50, n_threads=1):
        if len(parsed_article) == 0:
            yield u''
            continue

        date = parsed_article[0].text + ' '
        lemmas = [token.lemma_ for token in parsed_article
                  if not punct_space(token)
                  and is_french(token.text)
                  and not token.is_stop]
        yield date + u' '.join(lemmas)

In [121]:
# Preview the lemmatized form of the first article only.
for lem in lemmatized_sentence_corpus(articles[:1]):
    print(lem, '\n')

01/01/1990 le panama tremplin le panama jamais prendre sérieux voisin latin dans pays créer début siècle autour canal chapeau indépendance formel rester aléatoire largement américain dollar servir monnayer national troupe américain intervenir foi an la unième foi injustifiable plan droit international entourer série circonstance atténuant commencer régime inique subir isthme général il falloir ajouter renverser régime coup forcer prendre soin éviter victime civil américains accomplir boire principal ils ré surcroît reportage 10- reconnaissance le difficile matière militaire décrocher jour passer opération critiquer si partisan dictateur déposer regrouper harceler fragile autorité civil risquer lutte 1-drogue en pression courir nonciature lu mauvais grand oeil ion or curer hiérarchie combattre justement modernisation américanisation société accroissement injustice social dépourvu international panama apparaître géographiquement délicat articulation nord sud continent creuset rationalité

In [129]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, MmCorpus

import pyLDAvis
import pyLDAvis.gensim
import warnings
#import cPickle as pickle

In [104]:
# Two sample French sentences used to inspect the pipeline's token attributes.
text = ['Hier, je suis allé mangé des pommes avec les frères', 
        'Je ne sais pas quoi fera pour gamins violente']

# Print the part-of-speech tag and the stop-word flag of each token of the
# first sentence.
for token in nlp(text[0]):
    print(token.pos_, ' ', token.is_stop)

ADV   False
PUNCT   False
PRON   True
AUX   True
VERB   False
VERB   False
DET   True
NOUN   False
ADP   True
DET   True
NOUN   False
