In [1]:
import spacy
import timeit
import math
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from collections import Counter
from lxml import etree
from glob import glob
from unicodedata import normalize

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
%matplotlib inline

In [47]:
nlp = spacy.load('fr_core_news_lg', exclude=["ner"])

In [4]:
nlp.max_length = 2000000

In [49]:
path_name = 'corpus_temp/*.txt'
#path_name = 'corpus_test/*.txt'
#path_name = 'corpus_main_txt/*.txt'
window = 1000
nombre_bigrammes = 100

In [6]:
def pipeline_spacy(path):
    pos_ko = ["NUM", "X", "SYM", "PUNCT", "SPACE"]
    list_lemma = []
    list_pos = []
    nombre_tokens = 0
    with open(path, encoding="utf8") as file:
        text = file.readlines()
        text_clean = clean_text(str(text).lower())
        docs = nlp(text_clean)
        nombre_tokens += len(docs)
        for token in docs:
            #si le token est bien un mot on récupère son lemme
            if token.pos_ not in pos_ko:
                list_lemma.append(token.lemma_)
                list_pos.append(token.pos_)

    return list_lemma, list_pos, nombre_tokens

In [23]:
def clean_text(txt):
    txt_res = normalize("NFKD", txt.replace('\xa0', ' '))
    txt_res = txt_res.replace('\\xa0', '')
    return txt_res

In [8]:
def bigrammize(list_token):
    """fonction qui prend en parametre une liste de tokens et retourne une liste de bi-grammes"""
    list_bigram = []
    for indice_token in range(len(list_token)-1):
        bigram = list_token[indice_token]+'_'+list_token[indice_token+1]
        list_bigram.append(bigram)
    return list_bigram

In [9]:
def trigrammize(list_token):
    """fonction qui prend en parametre une liste de tokens et retourne une liste de tri-grammes"""
    list_trigram = []
    for indice_token in range(len(list_token)-2):
        trigram = list_token[indice_token]+'_'+list_token[indice_token+1]+'_'+list_token[indice_token+2]
        list_trigram.append(trigram)
    return list_trigram

In [10]:
def rollingntokens(list_tokens, n):
    """fonction qui prend en parametre une liste de tokens et un nombre n et decoupe cette liste en fragments de n tokens puis retourne une liste de listes de tokens"""
    i = 0 # i stocke l'indice auquel on est dans le rolling
    list_rolling = []
    while i-n < len(list_tokens):
        list_rolling.append(list_tokens[i:i+n])
        i+=n
    return list_rolling 

In [11]:
def rolling_operationnalisation(rolling_list, window, doc_name, nombre_bigrammes, feature):
    """fonction qui prend en parametre une liste de liste de token  et une fenetre et calcul le ratio type-token pour chaque liste et retourne une liste de liste de valeurs"""
    i = 1
    list_type_token = []
    list_entropy = []
    for list_tokens in rolling_list:
        if type(list_tokens) == list:
            table_freq = Counter(list_tokens)
            list_type_token.append(type_token(table_freq, window))
            list_entropy.append(entropy(table_freq, window, nombre_bigrammes))
            #On stocke chaque table de frequences pour les chunks de 1000 mots dans des dataframmes
            zipped = list(zip(table_freq.keys(), table_freq.values()))
            df = pd.DataFrame(zipped, columns=[feature, 'Nombre d\'apparitions'])
            df['Frequence d\'apparition'] = round(df['Nombre d\'apparitions']/sum(table_freq.values()),5)
            df.to_csv(r'tables_frequences/'+feature+'/'+doc_name+'_'+feature+str(i)+'freq.csv', index = False)
            i+=1
        
    return list_type_token, list_entropy

In [12]:
"""fonction qui prend en parametre une table de frequence et retourne le calcul du ratio type-token"""
def type_token(table_freq, window):
    #test de verification sur les valeurs recuperees
    if sum(table_freq.values()) == window:
        #On calcule le type_token ratio et on le retourne
        return round(len(table_freq)/sum(table_freq.values()),5)

In [13]:
def entropy(table_freq, window, nombre_bigrammes):
    """fonction qui prend en parametre une table de frequence et un nombre de bigramme et retourne leur entropie"""
    # initialisation de l'indice de shannon
    shannon_sum = 0
    #test de verification sur les valeurs recuperees
    if sum(table_freq.values()) == window:
        #On calcule l'entropie et on la retourne
        table_freq_select = dict(table_freq.most_common(nombre_bigrammes))
        for bigram in table_freq_select.keys():
            prop = table_freq_select[bigram]/window
            shannon_sum += prop * (math.log(prop, 5))   
        return round(shannon_sum * -1,2)

In [32]:
def get_n_most_common_features(list_tokens, n):
    table_freq = dict(Counter(list_tokens).most_common(n))
    return list(table_freq.keys())

In [14]:
n_most_common_features = 1000

In [39]:
def moulinette(path_name, n):
    """fonction main qui utilise les fonctions précédentes et tourne sur le corpus"""
    
    i = 1
    nombre_total_tokens = 0
 
    list_lemma_global = []
    list_bigram_lemma_global = []
    list_trigram_lemma_global = []
    list_pos_global = []
    list_bigram_pos_global = []
    list_trigram_pos_global = []

    
    print("\n\nBEGIN PROCESSING CORPUS-----------")
    
    for doc in glob(path_name):
        
        print("\n\nBEGIN PROCESSING NOVEL-----------")

        
        doc_name = path.splitext(path.basename(doc))[0]
        date = doc_name.split("_")[0]
        print(doc_name)
        
        #On recupere le texte des romans sous forme de listes de lemmes et de pos grâce à spacy
        
        list_lemma_temp, list_pos_temp, nombre_tokens = pipeline_spacy(doc)
        
        print("PIPELINE SPACY ----------- OK")
        
        print("NOMBRE TOKENS = ", nombre_tokens)
        
        nombre_total_tokens += nombre_tokens
        
        list_lemma_global += list_lemma_temp
        list_bigram_lemma_global += bigrammize(list_lemma_temp)
        list_trigram_lemma_global += trigrammize(list_lemma_temp)
        
        list_pos_global += list_pos_temp
        list_bigram_pos_global += bigrammize(list_pos_temp)
        list_trigram_pos_global += trigrammize(list_pos_temp)
    
        i+=1

        print("END PROCESSING NOVEL --------------\n\n")
        print("PROGRESSION ", round(i/2,3),'% COMPLETED\n')
        
    print("\n GET LISTS RESULTS -----------")
    list_lemma_result = get_n_most_common_features(list_lemma_global, n)
    list_bigram_lemma_result = get_n_most_common_features(list_bigram_lemma_global, n)
    list_trigram_lemma_result = get_n_most_common_features(list_trigram_lemma_global, n)
    
    list_pos_result = get_n_most_common_features(list_pos_global, n)
    list_bigram_pos_result = get_n_most_common_features(list_bigram_pos_global, n)
    list_trigram_pos_result = get_n_most_common_features(list_trigram_pos_global, n)

        
    print("\n NOMBRE TOTAL TOKENS = ", nombre_total_tokens)
    print("\n RETURN LISTS RESULTS -----------")
    print("\n\n END PROCESSING CORPUS --------------\n\n")
            
    return list_lemma_result, list_bigram_lemma_result, list_trigram_lemma_result, list_pos_result, list_bigram_pos_result, list_trigram_pos_result

In [40]:
starttime = timeit.default_timer()
list_lemma_result, list_bigram_lemma_result, list_trigram_lemma_result, list_pos_result, list_bigram_pos_result, list_trigram_pos_result = moulinette(path_name, n_most_common_features) 
print("Le temps total d'execution en secondes est de : ", timeit.default_timer() - starttime)



BEGIN PROCESSING CORPUS-----------


BEGIN PROCESSING NOVEL-----------
2001_Nothomb-Amelie_Cosmetique-de-l-ennemi
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  26259
END PROCESSING NOVEL --------------


PROGRESSION  1.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1864_Erckmann-Chatrian_Histoire-d-un-conscrit-de-1813
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  80653
END PROCESSING NOVEL --------------


PROGRESSION  1.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1866_Sand-George_Promenades-autour-d-un-village
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  51939
END PROCESSING NOVEL --------------


PROGRESSION  2.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1931_Simenon-Georges_Au-Rendez-vous-des-Terre-Neuvas
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  45650
END PROCESSING NOVEL --------------


PROGRESSION  2.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1888_Guy-de-Maupassant_Pierre-et-Jean
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  66802
END PROCES

PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  46786
END PROCESSING NOVEL --------------


PROGRESSION  21.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1881_Berthet-Elie_Le-Charlatan
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  100838
END PROCESSING NOVEL --------------


PROGRESSION  21.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1899_Eekhoud-Georges_Escal-Vigor
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  55107
END PROCESSING NOVEL --------------


PROGRESSION  22.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1886_Daudet-Alphonse_La-Belle-Nivernaise
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  35650
END PROCESSING NOVEL --------------


PROGRESSION  22.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1928_Leroux-Gaston_Les-Mohicans-de-Babel
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  103745
END PROCESSING NOVEL --------------


PROGRESSION  23.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1840_Sand-George_Pauline
PIPELINE SPACY ----------- OK
NOMBRE 

PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  82498
END PROCESSING NOVEL --------------


PROGRESSION  41.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1931_Simenon-Georges_La-nuit-du-carrefour
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  45575
END PROCESSING NOVEL --------------


PROGRESSION  42.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1967_Simon-Claude_Histoire
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  119590
END PROCESSING NOVEL --------------


PROGRESSION  42.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1843_Feval-Paul_Le-loup-blanc
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  96094
END PROCESSING NOVEL --------------


PROGRESSION  43.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1952_San-Antonio_Mes-hommages-a-la-donzelle
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  44921
END PROCESSING NOVEL --------------


PROGRESSION  43.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1868_La-Rive-William-de_La-Marquise-de-Clerol-par-William-de-La-Ri

PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  70951
END PROCESSING NOVEL --------------


PROGRESSION  62.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1881_Sand-George_Jeanne
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  125366
END PROCESSING NOVEL --------------


PROGRESSION  63.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1921_Renard-Maurice_L-Homme-Truque
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  40560
END PROCESSING NOVEL --------------


PROGRESSION  63.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1900_Colette_La-retraite-sentimentale
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  55430
END PROCESSING NOVEL --------------


PROGRESSION  64.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1845_Balzac-Honore-de_Adieu
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  18068
END PROCESSING NOVEL --------------


PROGRESSION  64.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1884_Silvestre-Armand_En-pleine-fantaisie
PIPELINE SPACY ----------- OK
NOMBRE TOKEN

PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  50570
END PROCESSING NOVEL --------------


PROGRESSION  83.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1922_Jaloux-Edmond_L-escalier-d-or
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  46799
END PROCESSING NOVEL --------------


PROGRESSION  84.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1897_Lemonnier-Camille_L-homme-en-amour
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  60455
END PROCESSING NOVEL --------------


PROGRESSION  84.5 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1920_Leroux-Gaston_Aventures-effroyables-de-M-Herbert-de-Renich_Tome-II-La-Bataille-invisible
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  106672
END PROCESSING NOVEL --------------


PROGRESSION  85.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1880_Stapleaux-Leopold_Le-pendu-de-la-Foret-Noire
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  111706
END PROCESSING NOVEL --------------


PROGRESSION  85.5 % COMPLETED



BEGIN PROCESSING NOVEL-

In [41]:
list_lemma_result

['le',
 'de',
 'un',
 'et',
 'il',
 'avoir',
 'à',
 'lui',
 'être',
 'son',
 'que',
 'l’',
 'ce',
 'je',
 'en',
 'd’',
 'qui',
 'pas',
 'ne',
 'vous',
 'se',
 'dans',
 'qu’',
 'tout',
 'pour',
 'faire',
 'dire',
 's’',
 'mon',
 'au',
 'éter',
 'n’',
 'sur',
 'plus',
 '-',
 'mais',
 'on',
 '–',
 'avec',
 'me',
 'par',
 'comme',
 'c’',
 'nous',
 'pouvoir',
 'si',
 'bien',
 'j’',
 'y',
 'voir',
 'aller',
 'même',
 'moi',
 'tu',
 'leur',
 'sans',
 'être',
 'm’',
 'vouloir',
 'savoir',
 'où',
 'venir',
 'homme',
 'autre',
 'ou',
 'à',
 'petit',
 'quelque',
 'prendre',
 'grand',
 'encore',
 'votre',
 'femme',
 'rien',
 'quand',
 'là',
 'main',
 'peu',
 'jour',
 'celui',
 'dont',
 'bon',
 'mettre',
 'aussi',
 'jeune',
 'heure',
 'cela',
 'devoir',
 'falloir',
 'été',
 'non',
 'croire',
 'temps',
 'oeil',
 'puis',
 'chose',
 'donc',
 'sous',
 'ça',
 'jamais',
 'Monsieur',
 'fois',
 'toujours',
 'passer',
 'notre',
 'après',
 'seul',
 'ton',
 'tête',
 'alors',
 'porte',
 'entendre',


In [42]:
list_bigram_lemma_result

['de_le',
 'dans_le',
 'd’_un',
 'qu’_il',
 'de_son',
 'à_le',
 'c’_être',
 'et_le',
 'sur_le',
 'que_le',
 'de_l’',
 'n’_avoir',
 'il_avoir',
 'et_de',
 'de_ce',
 'que_je',
 'j’_avoir',
 'tout_le',
 'je_ne',
 'y_avoir',
 'à_l’',
 'qu’_lui',
 'lui_avoir',
 'avoir_pas',
 'à_son',
 'par_le',
 'dans_un',
 'il_être',
 'il_ne',
 'avec_un',
 'avoir_un',
 'il_y',
 'il_se',
 'pour_le',
 'dans_son',
 'il_n’',
 'être_pas',
 'l’_avoir',
 'que_vous',
 'n’_être',
 'comme_un',
 'être_un',
 'je_être',
 'le_plus',
 'ne_pouvoir',
 'je_vous',
 'à_ce',
 'ce_être',
 'qu’_on',
 'et_il',
 'à_un',
 'vous_avoir',
 'de_mon',
 'c’_éter',
 'avoir_été',
 'un_peu',
 'il_falloir',
 'ce_que',
 'qui_avoir',
 'le_main',
 'de_tout',
 'je_me',
 'être_-ce',
 'avoir_le',
 'être_le',
 'et_je',
 'dire_il',
 'il_s’',
 'm’_avoir',
 'pas_de',
 'par_un',
 '-_être',
 'je_n’',
 'pouvoir_-',
 'qu’_un',
 'faire_un',
 'pas_le',
 'avoir_faire',
 'le_porte',
 'que_ce',
 'sur_son',
 'ce_qui',
 'avec_le',
 'dans_l’',
 'il_éter

In [43]:
list_trigram_lemma_result

['il_y_avoir',
 'n’_avoir_pas',
 'pouvoir_-_être',
 'n’_être_pas',
 'qu’_il_avoir',
 'je_n’_avoir',
 'lui_-_même',
 'c’_être_un',
 'que_j’_avoir',
 'il_n’_y',
 'n’_y_avoir',
 'ne_être_pas',
 'ce_qu’_il',
 'y_avoir_un',
 'n’_éter_pas',
 'il_n’_avoir',
 'ce_n’_être',
 'être_-ce_que',
 'je_ne_savoir',
 'c’_être_le',
 'ne_savoir_pas',
 'qu’_lui_avoir',
 'tout_à_coup',
 'qu’_il_ne',
 'au_milieu_de',
 'que_je_ne',
 'ne_vouloir_pas',
 'se_mettre_à',
 'qu’_il_n’',
 'de_tout_le',
 'ne_pouvoir_pas',
 'qu’_être_-ce',
 'ce_que_je',
 'qu’_il_être',
 'tout_le_monde',
 'c’_éter_un',
 'le_porte_de',
 'd’_un_voix',
 'n’_avoir_jamais',
 'je_ne_pouvoir',
 'le_jeune_fille',
 'être_-ce_pas',
 'lui_n’_avoir',
 'que_c’_être',
 'qu’_il_y',
 'je_l’_avoir',
 'n’_être_-ce',
 'de_ne_pas',
 'et_de_le',
 'que_je_être',
 'et_qu’_il',
 'un_de_ce',
 'à_l’_heure',
 'je_ne_être',
 'y_avoir_pas',
 'n’_avoir_plus',
 'ce_être_un',
 'je_me_être',
 'que_vous_avoir',
 'au_fond_de',
 'au_bout_de',
 'c’_éter_le',
 'avoi

In [44]:
list_pos_result

['NOUN',
 'VERB',
 'PRON',
 'ADP',
 'DET',
 'ADV',
 'ADJ',
 'PROPN',
 'CCONJ',
 'AUX',
 'SCONJ',
 'INTJ']

In [45]:
list_bigram_pos_result

['DET_NOUN',
 'PRON_VERB',
 'NOUN_ADP',
 'ADP_DET',
 'ADP_NOUN',
 'VERB_ADP',
 'NOUN_PRON',
 'VERB_DET',
 'PRON_PRON',
 'VERB_ADV',
 'NOUN_ADJ',
 'NOUN_VERB',
 'NOUN_DET',
 'NOUN_CCONJ',
 'VERB_PRON',
 'ADJ_NOUN',
 'PRON_AUX',
 'ADV_VERB',
 'NOUN_NOUN',
 'DET_ADJ',
 'ADP_PRON',
 'PRON_ADV',
 'SCONJ_PRON',
 'NOUN_ADV',
 'VERB_VERB',
 'AUX_VERB',
 'ADV_PRON',
 'ADP_VERB',
 'ADV_ADP',
 'ADJ_ADP',
 'CCONJ_PRON',
 'VERB_NOUN',
 'ADJ_PRON',
 'NOUN_PROPN',
 'ADV_DET',
 'ADP_PROPN',
 'ADV_ADV',
 'PROPN_PRON',
 'ADJ_DET',
 'ADV_ADJ',
 'VERB_SCONJ',
 'PROPN_ADP',
 'CCONJ_DET',
 'CCONJ_ADP',
 'PRON_ADP',
 'ADJ_CCONJ',
 'NOUN_SCONJ',
 'VERB_ADJ',
 'SCONJ_DET',
 'DET_PROPN',
 'ADP_ADJ',
 'VERB_PROPN',
 'PROPN_VERB',
 'AUX_ADV',
 'PRON_DET',
 'ADJ_VERB',
 'NOUN_AUX',
 'VERB_CCONJ',
 'ADP_ADV',
 'PROPN_PROPN',
 'PROPN_DET',
 'ADV_NOUN',
 'CCONJ_VERB',
 'ADV_SCONJ',
 'PROPN_NOUN',
 'PRON_NOUN',
 'PROPN_CCONJ',
 'ADV_AUX',
 'PROPN_ADV',
 'PRON_PROPN',
 'ADJ_ADV',
 'ADJ_ADJ',
 'CCONJ_ADV',
 'AUX_ADJ',
 

In [46]:
list_trigram_pos_result

['ADP_DET_NOUN',
 'DET_NOUN_ADP',
 'VERB_DET_NOUN',
 'NOUN_ADP_NOUN',
 'NOUN_ADP_DET',
 'PRON_PRON_VERB',
 'DET_NOUN_PRON',
 'PRON_VERB_ADP',
 'VERB_ADP_DET',
 'NOUN_PRON_VERB',
 'NOUN_DET_NOUN',
 'PRON_VERB_DET',
 'ADP_NOUN_ADP',
 'DET_NOUN_ADJ',
 'DET_NOUN_VERB',
 'DET_ADJ_NOUN',
 'PRON_VERB_ADV',
 'NOUN_PRON_PRON',
 'DET_NOUN_CCONJ',
 'DET_NOUN_DET',
 'VERB_ADP_NOUN',
 'PRON_AUX_VERB',
 'PRON_VERB_PRON',
 'ADP_NOUN_PRON',
 'DET_NOUN_NOUN',
 'VERB_PRON_VERB',
 'ADV_DET_NOUN',
 'PRON_VERB_VERB',
 'DET_NOUN_ADV',
 'ADV_PRON_VERB',
 'ADV_VERB_ADV',
 'PRON_ADV_VERB',
 'ADJ_DET_NOUN',
 'ADP_PRON_VERB',
 'NOUN_VERB_ADP',
 'CCONJ_DET_NOUN',
 'NOUN_ADJ_ADP',
 'ADP_NOUN_DET',
 'ADJ_NOUN_ADP',
 'PRON_PRON_PRON',
 'ADP_NOUN_CCONJ',
 'VERB_ADV_ADP',
 'PRON_PRON_AUX',
 'NOUN_VERB_DET',
 'ADP_DET_ADJ',
 'SCONJ_PRON_VERB',
 'SCONJ_DET_NOUN',
 'ADJ_ADP_DET',
 'NOUN_ADP_PRON',
 'NOUN_PRON_AUX',
 'ADP_VERB_DET',
 'NOUN_CCONJ_PRON',
 'VERB_ADP_PRON',
 'NOUN_ADP_VERB',
 'NOUN_ADP_PROPN',
 'VERB_PRON_PRO

In [50]:
starttime = timeit.default_timer()
list_lemma_result, list_bigram_lemma_result, list_trigram_lemma_result, list_pos_result, list_bigram_pos_result, list_trigram_pos_result = moulinette(path_name, n_most_common_features) 
print("Le temps total d'execution en secondes est de : ", timeit.default_timer() - starttime)



BEGIN PROCESSING CORPUS-----------


BEGIN PROCESSING NOVEL-----------
1829_Hugo-Victor_Le-dernier-jour-d-un-condamne
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  40498
END PROCESSING NOVEL --------------


PROGRESSION  1.0 % COMPLETED



BEGIN PROCESSING NOVEL-----------
1831_Signol-Alphonse-Macaire-Stanislas_Le-Chiffonnier_Tome-2
PIPELINE SPACY ----------- OK
NOMBRE TOKENS =  31732
END PROCESSING NOVEL --------------


PROGRESSION  1.5 % COMPLETED


 GET LISTS RESULTS -----------

 NOMBRE TOTAL TOKENS =  72230

 RETURN LISTS RESULTS -----------


 END PROCESSING CORPUS --------------


Le temps total d'execution en secondes est de :  28.75209702199936


In [51]:
list_trigram_pos_result

['ADP_DET_NOUN',
 'DET_NOUN_ADP',
 'VERB_DET_NOUN',
 'NOUN_ADP_NOUN',
 'NOUN_ADP_DET',
 'DET_NOUN_PRON',
 'PRON_PRON_VERB',
 'NOUN_DET_NOUN',
 'VERB_ADP_DET',
 'PRON_VERB_ADP',
 'NOUN_PRON_VERB',
 'ADP_NOUN_ADP',
 'PRON_VERB_DET',
 'DET_NOUN_ADJ',
 'NOUN_PRON_PRON',
 'DET_NOUN_VERB',
 'PRON_AUX_VERB',
 'DET_ADJ_NOUN',
 'DET_NOUN_DET',
 'DET_NOUN_CCONJ',
 'ADP_NOUN_PRON',
 'VERB_ADP_NOUN',
 'PRON_VERB_PRON',
 'PRON_VERB_ADV',
 'DET_NOUN_NOUN',
 'ADJ_DET_NOUN',
 'VERB_PRON_VERB',
 'NOUN_PRON_AUX',
 'PRON_VERB_VERB',
 'PRON_PRON_AUX',
 'ADV_DET_NOUN',
 'ADP_NOUN_CCONJ',
 'DET_NOUN_ADV',
 'ADP_NOUN_DET',
 'NOUN_VERB_ADP',
 'PRON_PRON_PRON',
 'ADV_PRON_VERB',
 'CCONJ_DET_NOUN',
 'ADJ_NOUN_ADP',
 'ADV_VERB_ADV',
 'ADP_DET_ADJ',
 'SCONJ_PRON_VERB',
 'PRON_ADV_VERB',
 'SCONJ_DET_NOUN',
 'NOUN_ADJ_ADP',
 'PRON_DET_NOUN',
 'NOUN_CCONJ_ADP',
 'AUX_VERB_ADP',
 'NOUN_VERB_DET',
 'VERB_ADV_ADP',
 'NOUN_CCONJ_PRON',
 'ADP_PRON_VERB',
 'NOUN_ADP_PRON',
 'AUX_VERB_DET',
 'VERB_ADP_PRON',
 'DET_NOUN_AUX

In [52]:
list_trigram_lemma_result

['il_y_avoir',
 'le_peine_de',
 'n’_avoir_pas',
 'être_-ce_que',
 'c’_être_un',
 'peine_de_mort',
 'que_c’_être',
 'n’_y_avoir',
 'n’_être_pas',
 'pouvoir_-_être',
 'il_n’_y',
 'y_avoir_un',
 'qu’_être_-ce',
 'avoir_-t_il',
 'qu’_il_avoir',
 'ce_n’_être',
 'qu’_il_y',
 'que_j’_avoir',
 'il_m’_avoir',
 'ce_qu’_il',
 'c’_être_le',
 'je_ne_savoir',
 'ne_être_pas',
 'le_porte_de',
 'il_falloir_que',
 'je_n’_avoir',
 'de_le_peine',
 'que_l’_on',
 'que_je_être',
 'qu’_il_être',
 'et_j’_avoir',
 'lui_avoir_je',
 'de_tout_le',
 'c’_éter_un',
 'vif_le_nation',
 'je_l’_avoir',
 'le_tête_de',
 'en_ce_moment',
 'à_travers_le',
 'de_le_prison',
 'je_me_être',
 'j’_avoir_faire',
 'à_l’_instant',
 'il_s’_être',
 'n’_avoir_pouvoir',
 'un_de_ce',
 'n’_avoir_plus',
 'tout_à_coup',
 'qu’_il_ne',
 'c’_éter_le',
 'c’_être_que',
 'y_avoir_pas',
 'au_milieu_de',
 'm’_avoir_-t',
 'l’_assemblée_national',
 'ne_vouloir_pas',
 'au_nom_de',
 'celui_-_là',
 'c’_être_de',
 'au_moment_où',
 'qu’_on_ne',
 '