In [1]:
from os import path
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from lxml import etree
import spacy
from glob import glob
from unicodedata import normalize

In [2]:
nlp = spacy.load('fr_core_news_md')

In [3]:
%matplotlib inline

In [4]:
def lemmatize(path):
    list_lemma = []
    with open(path, encoding="utf8") as file:
        tree = etree.parse(file)
        if tree.findall(".//p"):
            for paragraphe in tree.findall(".//p"):
                if paragraphe.text:
                    clean_text = normalize("NFKD", paragraphe.text)
                    docs = nlp(clean_text)
                    for token in docs:
                        if token.pos_ != "PUNCT" and "SPACE" and "X" and "SYM":
                            list_lemma.append(token.lemma_)
    return list_lemma

In [12]:
def postaggize(path):
    list_pos = []
    with open(path, encoding="utf8") as file:
        tree = etree.parse(file)
        if tree.findall(".//p"):
            for paragraphe in tree.findall(".//p"):
                if paragraphe.text:
                    clean_text = normalize("NFKD", paragraphe.text)
                    docs = nlp(clean_text)
                    for token in docs:
                        if token.pos_ != "PUNCT" and "SPACE" and "X" and "SYM":
                            list_pos.append(token.pos_)
    return list_pos

In [6]:
def bigrammize(list_lemma):
    list_bigram = []
    for indice_lemma in range(len(list_lemma)-1):
        bigram = list_lemma[indice_lemma]+'_'+list_lemma[indice_lemma+1]
        list_bigram.append(bigram)
    return list_bigram

In [7]:
def ngrammize(list_lemma, n):
    list_ngram = []
    for indice_lemma in range(len(list_lemma) - n):
        ngram = ""
        for i in range(n):
            soudure = "_" if i != n-1 else ""
            ngram+=list_lemma[indice_lemma+i]+soudure
        list_ngram.append(ngram)
    return list_ngram

In [8]:
path_test = 'corpus_test/1829_Hugo-Victor_Le-dernier-jour-d-un-condamne.xml'

In [9]:
list_lemma_hugo = lemmatize(path_test)
list_bigram_hugo = bigrammize(list_lemma_hugo)    
print(list_bigram_hugo)

['il_n’', 'n’_y', 'y_avoir', 'avoir_en', 'en_tête', 'tête_un', 'un_première', 'première_édition', 'édition_de', 'de_ce', 'ce_ouvrage', 'ouvrage_publié', 'publié_d’', 'd’_abord', 'abord_sans', 'sans_nom', 'nom_d’', 'd’_auteur', 'auteur_que', 'que_le', 'le_quelque', 'quelque_ligne', 'ligne_qu’', 'qu’_on', 'on_aller', 'aller_lire', 'lire_il', 'il_y', 'y_avoir', 'avoir_deux', 'deux_manière', 'manière_de', 'de_se', 'se_rendre', 'rendre_compte', 'compte_de', 'de_l’', 'l’_existence', 'existence_de', 'de_ce', 'ce_livre', 'livre_ou', 'ou_il', 'il_y', 'y_avoir', 'avoir_avoir', 'avoir_en', 'en_effet', 'effet_un', 'un_liasse', 'liasse_de', 'de_papier', 'papier_jaune', 'jaune_et', 'et_inégau', 'inégau_sur', 'sur_lequel', 'lequel_on', 'on_avoir', 'avoir_trouvé', 'trouvé_enregistréer', 'enregistréer_un', 'un_à', 'à_un', 'un_le', 'le_dernière', 'dernière_penséer', 'penséer_d’', 'd’_un', 'un_misérable', 'misérable_ou', 'ou_il', 'il_s’', 's’_être', 'être_rencontré', 'rencontré_

In [10]:
Counter(list_bigram_hugo).most_common(5)

[('de_le', 180),
 ('j’_avoir', 165),
 ('c’_être', 157),
 ('dans_le', 136),
 ('d’_un', 123)]

In [13]:
list_pos_hugo = postaggize(path_test)
list_bigram_pos_hugo = bigrammize(list_pos_hugo)    
print(list_bigram_pos_hugo)

['PRON_ADV', 'ADV_PRON', 'PRON_VERB', 'VERB_ADP', 'ADP_NOUN', 'NOUN_DET', 'DET_ADJ', 'ADJ_NOUN', 'NOUN_ADP', 'ADP_DET', 'DET_NOUN', 'NOUN_VERB', 'VERB_ADP', 'ADP_ADV', 'ADV_ADP', 'ADP_NOUN', 'NOUN_ADP', 'ADP_NOUN', 'NOUN_SCONJ', 'SCONJ_DET', 'DET_DET', 'DET_NOUN', 'NOUN_PRON', 'PRON_PRON', 'PRON_VERB', 'VERB_VERB', 'VERB_PRON', 'PRON_PRON', 'PRON_VERB', 'VERB_NUM', 'NUM_NOUN', 'NOUN_ADP', 'ADP_PRON', 'PRON_VERB', 'VERB_NOUN', 'NOUN_ADP', 'ADP_DET', 'DET_NOUN', 'NOUN_ADP', 'ADP_DET', 'DET_NOUN', 'NOUN_CCONJ', 'CCONJ_PRON', 'PRON_PRON', 'PRON_AUX', 'AUX_VERB', 'VERB_ADP', 'ADP_NOUN', 'NOUN_DET', 'DET_NOUN', 'NOUN_ADP', 'ADP_NOUN', 'NOUN_ADJ', 'ADJ_CCONJ', 'CCONJ_NOUN', 'NOUN_ADP', 'ADP_PRON', 'PRON_PRON', 'PRON_AUX', 'AUX_VERB', 'VERB_VERB', 'VERB_DET', 'DET_NOUN', 'NOUN_DET', 'DET_DET', 'DET_NOUN', 'NOUN_VERB', 'VERB_ADP', 'ADP_DET', 'DET_NOUN', 'NOUN_CCONJ', 'CCONJ_PRON', 'PRON_PRON', 'PRON_AUX', 'AUX_ADJ', 'ADJ_DET', 'DET_NOUN', 'NOUN_DET', 'DET_NOUN', 'NOUN_PROPN', 'PROPN_ADP', 'ADP_

In [14]:
Counter(list_bigram_pos_hugo).most_common(5)

[('DET_NOUN', 3984),
 ('NOUN_ADP', 2344),
 ('PRON_VERB', 1988),
 ('ADP_NOUN', 1749),
 ('ADP_DET', 1707)]

In [15]:
list_ngram_hugo = ngrammize(list_lemma_hugo, 4)    
print(list_ngram_hugo)

['il_n’_y_avoir', 'n’_y_avoir_en', 'y_avoir_en_tête', 'avoir_en_tête_un', 'en_tête_un_première', 'tête_un_première_édition', 'un_première_édition_de', 'première_édition_de_ce', 'édition_de_ce_ouvrage', 'de_ce_ouvrage_publié', 'ce_ouvrage_publié_d’', 'ouvrage_publié_d’_abord', 'publié_d’_abord_sans', 'd’_abord_sans_nom', 'abord_sans_nom_d’', 'sans_nom_d’_auteur', 'nom_d’_auteur_que', 'd’_auteur_que_le', 'auteur_que_le_quelque', 'que_le_quelque_ligne', 'le_quelque_ligne_qu’', 'quelque_ligne_qu’_on', 'ligne_qu’_on_aller', 'qu’_on_aller_lire', 'on_aller_lire_il', 'aller_lire_il_y', 'lire_il_y_avoir', 'il_y_avoir_deux', 'y_avoir_deux_manière', 'avoir_deux_manière_de', 'deux_manière_de_se', 'manière_de_se_rendre', 'de_se_rendre_compte', 'se_rendre_compte_de', 'rendre_compte_de_l’', 'compte_de_l’_existence', 'de_l’_existence_de', 'l’_existence_de_ce', 'existence_de_ce_livre', 'de_ce_livre_ou', 'ce_livre_ou_il', 'livre_ou_il_y', 'ou_il_y_avoir', 'il_y_avoir_avoir', 'y_avoir

In [16]:
Counter(list_ngram_hugo).most_common(5)

[('le_peine_de_mort', 43),
 ('il_y_avoir_un', 25),
 ('qu’_il_y_avoir', 20),
 ('il_n’_y_avoir', 19),
 ('de_le_peine_de', 17)]

In [17]:
list_ngram_pos_hugo = ngrammize(list_pos_hugo, 4)    
print(list_ngram_pos_hugo)

['PRON_ADV_PRON_VERB', 'ADV_PRON_VERB_ADP', 'PRON_VERB_ADP_NOUN', 'VERB_ADP_NOUN_DET', 'ADP_NOUN_DET_ADJ', 'NOUN_DET_ADJ_NOUN', 'DET_ADJ_NOUN_ADP', 'ADJ_NOUN_ADP_DET', 'NOUN_ADP_DET_NOUN', 'ADP_DET_NOUN_VERB', 'DET_NOUN_VERB_ADP', 'NOUN_VERB_ADP_ADV', 'VERB_ADP_ADV_ADP', 'ADP_ADV_ADP_NOUN', 'ADV_ADP_NOUN_ADP', 'ADP_NOUN_ADP_NOUN', 'NOUN_ADP_NOUN_SCONJ', 'ADP_NOUN_SCONJ_DET', 'NOUN_SCONJ_DET_DET', 'SCONJ_DET_DET_NOUN', 'DET_DET_NOUN_PRON', 'DET_NOUN_PRON_PRON', 'NOUN_PRON_PRON_VERB', 'PRON_PRON_VERB_VERB', 'PRON_VERB_VERB_PRON', 'VERB_VERB_PRON_PRON', 'VERB_PRON_PRON_VERB', 'PRON_PRON_VERB_NUM', 'PRON_VERB_NUM_NOUN', 'VERB_NUM_NOUN_ADP', 'NUM_NOUN_ADP_PRON', 'NOUN_ADP_PRON_VERB', 'ADP_PRON_VERB_NOUN', 'PRON_VERB_NOUN_ADP', 'VERB_NOUN_ADP_DET', 'NOUN_ADP_DET_NOUN', 'ADP_DET_NOUN_ADP', 'DET_NOUN_ADP_DET', 'NOUN_ADP_DET_NOUN', 'ADP_DET_NOUN_CCONJ', 'DET_NOUN_CCONJ_PRON', 'NOUN_CCONJ_PRON_PRON', 'CCONJ_PRON_PRON_AUX', 'PRON_PRON_AUX_VERB', 'PRON_AUX_VERB_ADP', 'AUX_VERB_ADP_NOUN', 'VERB_ADP




In [18]:
Counter(list_ngram_pos_hugo).most_common(5)

[('NOUN_ADP_DET_NOUN', 634),
 ('DET_NOUN_ADP_NOUN', 563),
 ('ADP_DET_NOUN_ADP', 456),
 ('DET_NOUN_ADP_DET', 392),
 ('PRON_VERB_DET_NOUN', 378)]

pour mesurer le temps d'exec du code d'une cellule

In [None]:
import timeit

In [None]:
start_time = timeit.default_timer()
# code you want to evaluate
elapsed = timeit.default_timer() - start_time

pour resoudre les problemes du '\xa0'

In [None]:
import unicodedata

In [None]:
text_string = BeautifulSoup(raw_html, "lxml").text
clean_text = unicodedata.normalize("NFKD",text_string)
print clean_text