In [4]:
import spacy
import timeit
import math
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from collections import Counter
from lxml import etree
from glob import glob
from unicodedata import normalize
from tqdm import tqdm

In [5]:
nlp = spacy.load('fr_core_news_lg')

In [6]:
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fae5dd37a40>

In [12]:
def clean_text(txt):
    txt_res = normalize("NFKD", str(txt).replace(u'\xa0', u' '))
    txt_res = txt_res.replace(u'\\xa0', u' ')
    txt_res = txt_res.replace(u'\\n', u'')
    txt_res = txt_res.replace(u'\n', u'')
    txt_res = txt_res.replace(u'\\', u'')
    txt_res = txt_res.replace(u'\'', u'')

    return txt_res

In [7]:
def get_n_most_common_features(list_tokens, n):
    table_freq = dict(Counter(list_tokens).most_common(n))
    return list(table_freq.keys())

In [1]:
def bigrammize(list_token):
    """fonction qui prend en parametre une liste de tokens et retourne une liste de bi-grammes"""
    list_bigram = []
    for indice_token in range(len(list_token)-1):
        bigram = list_token[indice_token]+'_'+list_token[indice_token+1]
        list_bigram.append(bigram)
    return list_bigram

In [2]:
def trigrammize(list_token):
    """fonction qui prend en parametre une liste de tokens et retourne une liste de tri-grammes"""
    list_trigram = []
    for indice_token in range(len(list_token)-2):
        trigram = list_token[indice_token]+'_'+list_token[indice_token+1]+'_'+list_token[indice_token+2]
        list_trigram.append(trigram)
    return list_trigram

In [29]:
def get_n_features(path_name, n):
    str_base = " "
    list_illitch = []
    for doc in glob(path_name):
        list_lemma, list_token, nombre_tokens = pipeline_spacy(doc)
        print(doc +' : '+str(nombre_tokens)+' tokens')
        list_bigram_lemma = bigrammize(list_lemma)
        list_trigram_lemma = trigrammize(list_lemma)
        
        list_bigram_token = bigrammize(list_token)
        list_trigram_token = trigrammize(list_token)
        
        
        list_lemma_result = get_n_most_common_features(list_lemma, n)
        list_bigram_lemma_result = get_n_most_common_features(list_bigram_lemma, n)
        list_trigram_lemma_result = get_n_most_common_features(list_trigram_lemma, n)
    
        list_token_result = get_n_most_common_features(list_token, n)
        list_bigram_token_result = get_n_most_common_features(list_bigram_token, n)
        list_trigram_token_result = get_n_most_common_features(list_trigram_token, n)

    return list_lemma_result, list_bigram_lemma_result, list_trigram_lemma_result, list_token_result, list_bigram_token_result, list_trigram_token_result

In [92]:
def chunkise(list_token, n):
    list_chunks = []
    for i in range(0, len(list_token)-n, n):
        list_chunks.append(list_token[i:i+n])
    return list_chunks

In [26]:
def pipeline_spacy(path):
    pos_ko = ["NUM", "X", "SYM", "PUNCT", "SPACE"]
    str_base = " "
    list_lemma = []
    list_token = []
    nombre_tokens = 0
    with open(path, encoding="utf8") as file:
        text_ivan = file.readlines()
        text_ivan_str = str_base.join(text_ivan)
        text_ivan_cleaned = clean_text(text_ivan_str.lower())

        docs = nlp(text_ivan_cleaned)
        nombre_tokens += len(docs)
        
        for token in docs:
            if token.pos_ not in pos_ko:
                list_lemma.append(token.lemma_)
                list_token.append(token.text)

    return list_lemma, list_token, nombre_tokens

In [211]:
n_most_common_features = 1000

In [212]:
path_name = "data/Tolstoi - La mort de Ivan Ilitch.txt"

In [213]:
list_lemma_result, list_bigram_lemma_result, list_trigram_lemma_result, list_token_result, list_bigram_token_result, list_trigram_token_result = get_n_features(path_name, n_most_common_features)

data/Tolstoi - La mort de Ivan Ilitch.txt : 25885 tokens


In [77]:
def dict_freq_token(list_lemma, list_select):
    
    dict_result = dict.fromkeys(list_select)
    
    dict_temp = Counter(list_lemma)
        
    for key in dict_temp.keys():
        if key in dict_result.keys():
            dict_result[key] = dict_temp[key]/len(list_lemma)
    
    return dict_result

In [103]:
def compute_list(list_selected, list_chunks_tokens, doc_name):
    
    df_lemma = pd.DataFrame()
    dict_chunks = {}
    i=1
    
    for chunk in list_chunks_tokens:
                
        dict_chunk = dict_freq_token(chunk, list_selected)
        dict_chunk["index"] = doc_name+'_chunk_'+str(i)

        #update ici pour les autres features
        
        df_temp_lemma = pd.DataFrame(dict_chunk, index=[0])
        
        df_lemma = df_lemma.append(df_temp_lemma, ignore_index = True)
        
        i+=1
    return df_lemma

In [138]:
def mouli_ivan(path_name, n, list_lemma_select):
    
    str_base = " "
    list_illitch = []
    
    dict_results_lemma = {}
    
    df_lemma = pd.DataFrame()
    
    for doc in glob(path_name):
        
        doc_name = path.splitext(path.basename(doc))[0]
        
        list_lemma_temp, list_token_temp, nombre_tokens = pipeline_spacy(doc)
        print(doc +' : '+str(nombre_tokens)+' tokens')

        list_chunks_lemma = chunkise(list_lemma_temp, n)
        
        df_lemma_temp = compute_list(list_lemma_select, list_chunks_lemma, doc_name)
        
        df_lemma = df_lemma.append(df_lemma_temp, ignore_index = True)

    df_lemma.set_index("index", inplace = True)
        
    return df_lemma

In [214]:
n_token_per_chunk = 100

In [215]:
path_name = r'data/chapitres_II/*.txt'

In [216]:
df_lemma_ivan = mouli_ivan(path_name, n_token_per_chunk, list_lemma_result)

data/chapitres_II/chap_10.txt : 965 tokens
data/chapitres_II/chap_03.txt : 3155 tokens
data/chapitres_II/chap_09.txt : 1213 tokens
data/chapitres_II/chap_04.txt : 3186 tokens
data/chapitres_II/chap_02.txt : 3314 tokens
data/chapitres_II/chap_12.txt : 996 tokens
data/chapitres_II/chap_07.txt : 1897 tokens
data/chapitres_II/chap_11.txt : 1170 tokens
data/chapitres_II/chap_08.txt : 2962 tokens
data/chapitres_II/chap_05.txt : 1919 tokens
data/chapitres_II/chap_06.txt : 1375 tokens
data/chapitres_II/chap_01.txt : 3698 tokens


In [210]:
df_lemma_ivan

Unnamed: 0_level_0,le,de,et,il,son,un,à,lui,se,ce,...,soirée,sapprocher,intérieur,partenaire,tendre,meuble,asseoir,lenfance,calmer,pénibl
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chap_10_chunk_1,0.10,0.05,0.04,0.03,0.03,,0.02,,0.01,0.02,...,,,0.01,,,,,,,
chap_10_chunk_2,0.11,0.03,0.07,,0.04,,0.01,,0.01,0.01,...,,,,,,,,,,
chap_10_chunk_3,0.07,0.11,0.01,0.03,0.02,0.02,0.01,0.01,0.03,0.04,...,,,,,,,,,,
chap_10_chunk_4,0.11,0.07,0.03,0.01,0.03,0.01,0.01,0.02,0.01,0.03,...,,,,,,,,0.01,,
chap_10_chunk_5,0.05,0.07,0.04,0.02,0.03,0.03,0.01,,0.01,0.02,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chap_01_chunk_26,0.04,0.07,0.03,0.02,0.02,,0.06,0.07,0.05,0.01,...,,,,,,,,,,0.03
chap_01_chunk_27,0.02,0.03,0.02,0.03,0.04,0.02,0.02,0.03,0.03,0.04,...,,,,,,,,,,
chap_01_chunk_28,0.07,0.05,0.03,0.01,,0.05,0.02,0.05,,,...,,,,,,,,,,
chap_01_chunk_29,0.09,0.06,0.04,0.01,0.02,0.04,0.04,0.01,,,...,,,,,,,,,,


In [172]:
import pandas as pd
import numpy as np
from sklearn import set_config
import sklearn.metrics as metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import matplotlib.pyplot as plt

In [180]:
def get_chap(index):
    list_chapitre = []
    str_base = '_'
    for elem in index:
        chap = elem.split("_")[0:2]
        list_chapitre.append(str_base.join(chap))
    return list_chapitre

In [217]:
list_chapitre = get_chap(df_lemma_ivan.index)

In [218]:
df_lemma_ivan['chapitre']=list_chapitre

In [189]:
def canonizer(data, test_size=0.1, random_state=42, sampling=None, cross_validation=False, cv=5, kernel='rbf', nb_coef=20):
    
    df_results = pd.DataFrame()
    
    if cross_validation == True:
        pipe = make_pipeline(StandardScaler(), SVC(kernel=kernel, probability=True))
        cv_results = cross_validate(pipe, data.drop(['chapitre'], axis=1), data['chapitre'], cv=cv)
        return pipe, cv_results
    
    else:
        if kernel == 'rbf':
            pipe = make_pipeline(StandardScaler(), Normalizer(), SVC(kernel=kernel, probability=True))
        else:
            pipe = make_pipeline(StandardScaler(), SVC(kernel=kernel, probability=True))
    

        X_train, X_test, y_train, y_test = train_test_split(data.drop(['chapitre'], axis=1), data['chapitre'], test_size=test_size, random_state=random_state)
        print('Original dataset shape {}'.format(Counter(y_train)))
                
        if sampling is not None:     
            if sampling == 'over':
                ros = RandomOverSampler(random_state=10)
                X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
                
            elif sampling == 'under':
                rus = RandomUnderSampler(random_state=10)
                X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
                
            elif sampling == "smoteenn":
                smote_enn = SMOTEENN(random_state=10)
                X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
                
            elif sampling == 'smotetomek':
                smote_tomek = SMOTETomek(random_state=10)
                X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
            
            else:
                print('Please follow the sampling possible values : over, under, smoteenn, smotetomek')
                return
                
            print('Resampled dataset shape {}'.format(Counter(y_resampled)))
        
            pipe.fit(X_resampled, y_resampled)
        
        else:
            pipe.fit(X_train, y_train)
        print(metrics.classification_report(y_test, pipe.predict(X_test)))
    
        if kernel == 'linear':
            coefs = pipe.named_steps['svc'].coef_
            return pipe, coefs
            #plot_coefficients(*coefs, data.columns, nb_coef)
            
        #df_results['metadata'] = y_test
        #df_results['proba canon'] = pipe.predict_proba(X_test)[:,0]
        #df_results['proba non-canon'] = pipe.predict_proba(X_test)[:,1]
        #df_results['prediction']= pipe.predict(X_test)
    
        #df_results['accord'] = [True if row['metadata'] == row['prediction'] else False for index, row in df_results.iterrows()]
    
    return pipe


In [219]:
df_lemma_ivan = df_lemma_ivan.replace(np.nan, 0)

In [220]:
pipe_test = canonizer(df_lemma_ivan)

Original dataset shape Counter({'chap_01': 25, 'chap_04': 25, 'chap_02': 25, 'chap_03': 24, 'chap_08': 22, 'chap_07': 14, 'chap_05': 13, 'chap_06': 9, 'chap_11': 9, 'chap_09': 9, 'chap_12': 8, 'chap_10': 8})
              precision    recall  f1-score   support

     chap_01       0.67      0.80      0.73         5
     chap_02       0.80      1.00      0.89         4
     chap_03       1.00      0.67      0.80         3
     chap_04       0.22      1.00      0.36         2
     chap_05       0.00      0.00      0.00         2
     chap_06       0.00      0.00      0.00         2
     chap_07       0.00      0.00      0.00         2
     chap_08       0.00      0.00      0.00         2

    accuracy                           0.55        22
   macro avg       0.34      0.43      0.35        22
weighted avg       0.45      0.55      0.47        22



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [221]:
pipe_test, coefs = canonizer(df_lemma_ivan, kernel='linear')

Original dataset shape Counter({'chap_01': 25, 'chap_04': 25, 'chap_02': 25, 'chap_03': 24, 'chap_08': 22, 'chap_07': 14, 'chap_05': 13, 'chap_06': 9, 'chap_11': 9, 'chap_09': 9, 'chap_12': 8, 'chap_10': 8})
              precision    recall  f1-score   support

     chap_01       1.00      0.80      0.89         5
     chap_02       0.75      0.75      0.75         4
     chap_03       0.67      0.67      0.67         3
     chap_04       0.33      1.00      0.50         2
     chap_05       1.00      1.00      1.00         2
     chap_06       0.00      0.00      0.00         2
     chap_07       1.00      0.50      0.67         2
     chap_08       0.50      0.50      0.50         2

    accuracy                           0.68        22
   macro avg       0.66      0.65      0.62        22
weighted avg       0.71      0.68      0.67        22



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Discussion :

- get more features -> test at least 200 bigrams -> extend n features (only mots outils)
- merge labels chapitres from qualitative insights
- merge labels chapitres from quantitative insights -> Topic Modeling
-