In [1]:
import spacy
import timeit
import math
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from collections import Counter
from lxml import etree
from glob import glob
from unicodedata import normalize

In [2]:
%matplotlib inline

In [3]:
nlp = spacy.load('fr_core_news_lg')

In [4]:
def est_canon(tree):
    if tree.find(".//profileDesc") is not None:
        profil = tree.find(".//profileDesc")
        if profil.get("tag") == "canon":
            return True
        else:
            return False

In [120]:
def lemmatize(path):
    list_lemma = []
    with open(path, encoding="utf8") as file:
        tree = etree.parse(file)
        tag = est_canon(tree)
        if tag == True:
            print("canon")
        else:
            print("non_canon")
        if tree.findall(".//p"):
            for paragraphe in tree.findall(".//p"):
                if paragraphe.text:
                    clean_text = normalize("NFKD", paragraphe.text)
                    docs = nlp(clean_text)
                    for token in docs:
                        if token.pos_ != "PUNCT" and "SPACE" and "X" and "SYM":
                            list_lemma.append(token.lemma_)
    return list_lemma, tag

In [124]:
def bigrammize(list_lemma):
    list_bigram = []
    for indice_lemma in range(len(list_lemma)-1):
        bigram = list_lemma[indice_lemma]+'_'+list_lemma[indice_lemma+1]
        list_bigram.append(bigram)
    return list_bigram

In [83]:
def bigrammize_rolling(rolling_list_lemma):
    rolling_list_bigram = []
    list_bigram = []
    for list_lemma in rolling_list_lemma:
        for indice_lemma in range(len(list_lemma)-1):
            bigram = list_lemma[indice_lemma]+'_'+list_lemma[indice_lemma+1]
            list_bigram.append(bigram)
        rolling_list_bigram.append(list_bigram)
    return rolling_list_bigram

In [37]:
def rollingnwords(list_lemma, n):
    i = 0 # i stocke l'indice auquel on est dans le rolling
    list_rolling = []
    while i-n < len(list_lemma):
        list_rolling.append(list_lemma[i:i+n])
        i+=n
    return list_rolling    

In [79]:
def shannon_diversity(rolling_list_bigram):
    shannon_measures = []
    for list_bigram in rolling_list_bigram:
        shannon_sum = 0 # initialisation de l'indice de shannon
        dict_conteur = Counter(list_bigram)
        for bigram in list_bigram:
            # on recupere la proportion pi de chaque bigram par rapport à tous les autres bigrams
            prop = dict_conteur[bigram]/len(list_bigram)
            shannon_courant = prop * (math.log(prop, 2))
            # on met à jour l'indice de shannon
            shannon_sum += shannon_courant
        shannon_measures.append(round(shannon_sum * -1,2))
    return shannon_measures

In [133]:
# non canon
#path_test = 'corpus_test/1834_Foa-Eugenie_La-Femme-a-la-mode.xml'

# canon
#path_test = 'corpus_test/1829_Hugo-Victor_Le-dernier-jour-d-un-condamne.xml'

#non canon
#path_test = 'corpus_test/1857_Aimard-Gustave_Les-trappeurs-de-l-Arkansas.xml'

# canon
path_test = 'corpus_test/1857_Flaubert-Gustave_Madame-Bovary.xml'

In [134]:
window = 1000
list_lemma, canon = lemmatize(path_test)
list_bigram = bigrammize(list_lemma)
rolling_list_bigram = rollingnwords(list_bigram, window)
results = shannon_diversity(rolling_list_bigram)

canon


In [126]:
print(results)# Eugenie Foa

[12.67, 13.27, 12.88, 15.71, 12.35, 12.92, 12.75, 12.58, 12.84, 14.66, 12.51, 13.11, 13.33, 13.35, 12.99, 12.43, 12.22, 13.07, 13.36, 14.0, 12.75, 12.29, 12.72, 13.21, 12.96, 12.58, 12.38, 12.6, 12.51, 12.9, 11.92, 12.61, 13.18, 12.14, 12.33, 12.3, 13.3, 12.59, 12.55, 0]


In [129]:
print(results)# Victor Hugo

[15.24, 13.68, 13.25, 13.77, 13.62, 14.31, 13.92, 13.13, 13.35, 19.9, 16.07, 12.64, 12.88, 12.34, 13.47, 12.74, 13.23, 12.72, 12.72, 12.13, 14.8, 14.68, 14.16, 13.79, 13.78, 13.14, 13.35, 14.0, 13.81, 12.49, 12.95, 13.7, 14.42, 15.51, 15.61, 13.27, 10.94, 0]


In [132]:
print(results)# Gustave Aimard

[13.39, 13.38, 13.22, 14.75, 13.01, 13.4, 13.44, 14.09, 13.45, 14.37, 13.4, 12.51, 13.37, 14.74, 13.57, 13.27, 13.23, 13.15, 12.94, 13.04, 13.27, 13.16, 13.83, 14.6, 12.71, 12.76, 13.56, 14.78, 12.69, 12.88, 13.61, 12.95, 14.81, 13.0, 13.95, 12.58, 13.1, 13.57, 13.74, 13.45, 13.66, 13.98, 13.34, 13.83, 13.83, 12.61, 13.33, 13.43, 15.41, 13.56, 14.49, 14.93, 14.14, 13.99, 13.28, 12.75, 14.5, 14.19, 13.86, 13.02, 14.36, 13.36, 13.04, 14.66, 13.27, 12.96, 13.92, 13.9, 14.92, 14.61, 14.13, 14.34, 12.99, 13.32, 12.76, 13.59, 13.53, 15.06, 13.16, 13.11, 13.15, 12.52, 13.73, 12.85, 13.44, 13.87, 15.72, 14.44, 14.09, 14.48, 13.8, 12.6, 12.59, 13.2, 12.29, 0]


In [135]:
print(results)# Gustave Flaubert

[13.15, 12.59, 12.39, 12.34, 12.38, 13.19, 12.47, 12.63, 13.04, 12.7, 12.26, 12.68, 12.79, 12.32, 12.62, 12.54, 11.97, 13.1, 11.66, 12.1, 12.28, 11.75, 11.9, 12.3, 12.35, 13.65, 12.71, 12.47, 12.49, 12.41, 12.4, 11.88, 12.45, 11.87, 12.97, 12.36, 12.09, 12.39, 11.72, 12.45, 12.47, 11.87, 12.27, 11.84, 12.69, 12.62, 11.71, 12.45, 12.04, 11.84, 12.76, 12.73, 11.85, 11.91, 12.04, 11.93, 12.65, 12.98, 12.03, 11.89, 11.82, 12.36, 11.83, 12.52, 11.68, 12.55, 12.28, 12.03, 12.18, 12.34, 12.04, 11.88, 11.88, 12.24, 12.22, 12.27, 11.93, 11.98, 12.02, 12.87, 12.84, 11.7, 12.42, 12.33, 12.52, 11.88, 11.9, 11.81, 12.44, 11.79, 12.65, 12.48, 12.35, 13.44, 12.14, 12.03, 11.85, 11.87, 12.33, 12.04, 11.55, 12.06, 12.52, 11.85, 12.35, 12.36, 12.23, 12.2, 12.94, 12.67, 12.86, 11.9, 12.72, 12.48, 12.33, 12.44, 12.82, 11.65, 11.64, 12.29, 12.19, 8.56, 0]


test sur les lemmes

In [117]:
# non canon
path_test = 'corpus_test/1834_Foa-Eugenie_La-Femme-a-la-mode.xml'

# canon
#path_test = 'corpus_test/1829_Hugo-Victor_Le-dernier-jour-d-un-condamne.xml'

#non canon
#path_test = 'corpus_test/1857_Aimard-Gustave_Les-trappeurs-de-l-Arkansas.xml'

# canon
#path_test = 'corpus_test/1857_Flaubert-Gustave_Madame-Bovary.xml'

In [121]:
window = 1000
list_lemma, canon = lemmatize_rolling(path_test)
rolling_list_lemma = rollingnwords(list_lemma, window)
results = shannon_diversity(rolling_list_lemma)

non_canon


In [122]:
print(results)# Eugenie Foa

[69.87, 72.64, 69.48, 68.96, 61.84, 60.69, 60.17, 60.87, 67.73, 68.78, 64.19, 67.36, 64.39, 61.4, 65.44, 64.5, 66.36, 65.69, 69.41, 66.79, 68.14, 61.12, 66.86, 65.72, 65.46, 65.86, 64.85, 63.36, 60.63, 63.39, 62.63, 65.67, 73.45, 61.65, 61.1, 66.89, 62.04, 63.2, 55.17, 0]


In [116]:
print(results)# Victor Hugo

[79.2, 76.78, 71.45, 76.36, 77.12, 77.32, 76.16, 73.17, 77.42, 86.42, 82.43, 69.02, 72.06, 67.54, 74.97, 68.42, 69.82, 76.87, 79.27, 71.18, 66.96, 71.51, 75.19, 76.07, 68.78, 67.25, 66.34, 70.53, 75.39, 63.77, 68.68, 72.26, 72.11, 71.54, 76.94, 79.23, 35.09, 0]


In [113]:
print(results)# Gustave Aimard

[85.59, 77.32, 74.99, 84.14, 77.07, 71.49, 67.52, 69.22, 70.85, 70.83, 84.18, 73.46, 73.16, 75.86, 71.1, 65.86, 69.55, 79.79, 74.04, 67.69, 72.96, 73.4, 76.2, 83.1, 71.91, 70.75, 84.66, 79.59, 72.21, 63.85, 76.8, 71.48, 83.76, 72.69, 74.0, 66.14, 69.73, 65.75, 70.32, 73.26, 73.42, 73.26, 75.51, 74.79, 73.44, 72.09, 69.38, 76.57, 75.22, 75.29, 74.78, 70.67, 79.71, 74.14, 78.44, 70.53, 71.17, 80.42, 73.91, 74.55, 71.56, 73.6, 68.89, 74.76, 71.2, 73.35, 72.37, 77.63, 82.93, 78.19, 81.54, 71.63, 64.83, 66.78, 67.33, 70.88, 66.42, 79.19, 63.89, 68.14, 75.06, 70.26, 75.55, 71.76, 70.86, 70.42, 86.16, 81.09, 79.24, 65.94, 65.46, 66.71, 65.37, 67.51, 55.35, 0]


In [110]:
print(results)# Gustave Flaubert

[70.49, 73.83, 76.92, 74.96, 76.32, 76.91, 67.05, 76.34, 71.1, 79.71, 76.99, 78.99, 84.05, 72.3, 76.49, 78.39, 72.95, 83.97, 85.62, 78.5, 76.7, 77.78, 73.49, 76.81, 77.37, 86.64, 73.35, 73.31, 71.5, 62.64, 78.19, 74.58, 70.41, 70.76, 71.35, 76.34, 70.84, 65.25, 72.96, 74.23, 66.41, 67.42, 68.29, 65.46, 75.86, 71.54, 66.09, 76.12, 66.73, 62.93, 67.8, 72.69, 72.59, 61.88, 72.78, 62.37, 77.86, 71.66, 70.36, 66.01, 64.3, 67.6, 61.16, 64.8, 67.69, 58.52, 68.89, 64.45, 64.22, 73.2, 61.48, 61.73, 66.34, 67.62, 66.42, 76.24, 68.8, 64.69, 80.07, 72.05, 76.72, 60.91, 65.44, 62.61, 72.04, 71.66, 76.36, 59.34, 66.89, 66.45, 73.1, 66.54, 82.51, 81.4, 66.58, 64.43, 59.92, 64.92, 69.13, 63.94, 63.04, 76.06, 67.08, 62.72, 65.97, 70.83, 70.78, 65.58, 65.65, 72.94, 67.95, 62.43, 71.0, 65.85, 70.26, 70.28, 77.07, 65.42, 69.87, 74.35, 69.29, 26.17, 0]
