In [56]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn import set_config, metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, ConfusionMatrixDisplay
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, ShuffleSplit, LeaveOneGroupOut, GroupKFold
import random
from collections import Counter

In [76]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SVMSMOTE 

In [133]:
# Author-scale dataset. Alternative input kept for reference:
#   data/unigram_dataset_majed_canon_author_scale.csv
# The "index" column becomes the row index and missing values are replaced by 0.
df_main = (
    pd.read_csv(r'data/main_dataset_majed_canon_author_scale.csv')
    .set_index("index")
    .replace(np.nan, 0)
)

In [136]:
# The author is the second "_"-separated field of each index label.
list_author = [label.split('_')[1] for label in df_main.index]

df_main['auteur'] = list_author

In [201]:
# Put two 800-column slices of df_main side by side, then re-attach the label
# ('canon') and group ('auteur') columns.
# (A third slice, df_main.iloc[:, 3156:3956], is currently disabled.)
df_concat = pd.concat(
    [df_main.iloc[:, 0:800], df_main.iloc[:, 1000:1800]],
    axis=1,
).assign(
    canon=df_main['canon'],
    auteur=df_main['auteur'],
)

In [134]:
# Subset made of the first 2000 columns of df_main (presumably the unigram
# features, going by the variable name -- TODO confirm against the CSV layout).
# .copy() makes this an independent frame, so adding columns below does not
# write into a slice of df_main (avoids pandas' SettingWithCopyWarning).
df_unigram_main = df_main.iloc[:, 0:2000].copy()
df_unigram_main['canon'] = df_main['canon']
# GKF_canonized reads an 'auteur' column as the CV group, so attach it here
# exactly as is done for df_concat.
df_unigram_main['auteur'] = df_main['auteur']

In [137]:
# Number of distinct authors, i.e. distinct values used as CV groups.
pd.Index(list_author).nunique()

714

In [228]:
def GKF_canonized(df_main, n_splits=5, sampling=None):
    """Evaluate a scaled SVC on the canon / non_canon task with author-grouped K-fold CV.

    Rows are split with ``GroupKFold`` using the ``auteur`` column as the
    group label. Optionally the training part of every fold is resampled
    before fitting. Predictions from all folds are pooled and scored once
    with ``classification_report``.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Must contain a ``canon`` column (the target, with the labels
        ``"canon"`` and ``"non_canon"``) and an ``auteur`` column (the
        group). Every other column is used as a feature.
    n_splits : int, default 5
        Number of folds passed to ``GroupKFold``.
    sampling : {None, 'over', 'svm', 'under', 'smoteenn', 'smotetomek'}
        Resampling applied to the training part of each fold only:
        ``RandomOverSampler``, ``SVMSMOTE``, ``RandomUnderSampler``,
        ``SMOTEENN`` or ``SMOTETomek`` (all with ``random_state=10``).
        ``None`` disables resampling.

    Returns
    -------
    pipe : sklearn.pipeline.Pipeline
        The ``StandardScaler`` + ``SVC`` pipeline as fitted on the LAST fold
        only; it has not been refitted on the whole dataset.
    df_scores : pandas.DataFrame
        The transposed ``classification_report`` computed over the pooled
        predictions of all folds.

    Raises
    ------
    ValueError
        If ``sampling`` is not ``None`` and not one of the supported names.
    """
    # Factories are lambdas so that a fresh, identically seeded sampler is
    # created for every fold.
    sampler_factories = {
        'over': lambda: RandomOverSampler(random_state=10),
        'svm': lambda: SVMSMOTE(random_state=10),
        'under': lambda: RandomUnderSampler(random_state=10),
        'smoteenn': lambda: SMOTEENN(random_state=10),
        'smotetomek': lambda: SMOTETomek(random_state=10),
    }
    # Validate before any work is done: failing early gives the caller a clear
    # error instead of a bare None that cannot be unpacked.
    if sampling is not None and sampling not in list(sampler_factories):
        raise ValueError(
            'Unknown sampling value {!r}; expected None or one of: {}'.format(
                sampling, ', '.join(sampler_factories)
            )
        )

    ALL_PREDS, ALL_GT = [], []  # pooled predictions vs. pooled ground truth

    pipe = make_pipeline(StandardScaler(), SVC(class_weight={"canon": 1.5, "non_canon": 1}))
    gkf = GroupKFold(n_splits)

    # The feature / target / group split does not depend on the fold.
    features = df_main.drop(['auteur', 'canon'], axis=1)
    target = df_main['canon']
    groups = df_main['auteur']

    for train_index, test_index in tqdm(gkf.split(features, target, groups), total=n_splits):
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_test, y_test = features.iloc[test_index], target.iloc[test_index]

        if sampling is not None:
            # Only the training part is resampled; the test part is untouched.
            sampler = sampler_factories[sampling]()
            X_train, y_train = sampler.fit_resample(X_train, y_train)
            print('Resampled dataset shape {}'.format(Counter(y_train)))

        pipe.fit(X_train, y_train)

        ALL_PREDS.extend(pipe.predict(X_test))
        ALL_GT.extend(y_test)

    report = metrics.classification_report(ALL_GT, ALL_PREDS, output_dict=True)
    df_scores = pd.DataFrame(report).transpose()

    return pipe, df_scores

In [205]:
# Author-grouped 5-fold evaluation on df_concat, with random under-sampling
# applied to the training part of each fold.
pipe, df_final_scores = GKF_canonized(df_concat, n_splits=5, sampling='under')

  0%|          | 0/5 [00:00<?, ?it/s]

Resampled dataset shape Counter({'canon': 966, 'non_canon': 966})
Resampled dataset shape Counter({'canon': 1018, 'non_canon': 1018})
Resampled dataset shape Counter({'canon': 868, 'non_canon': 868})
Resampled dataset shape Counter({'canon': 923, 'non_canon': 923})
Resampled dataset shape Counter({'canon': 917, 'non_canon': 917})


In [192]:
# Scores for df_concat built from 2 x 800 columns plus an 800-column block
# labelled "POS", with sampling='under'.
# NOTE(review): saved output from an earlier run; the df_concat cell currently
# has its third slice commented out -- re-run to refresh.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.699021,0.730605,0.714464,1173.0
non_canon,0.817762,0.793509,0.805453,1787.0
accuracy,0.768581,0.768581,0.768581,0.768581
macro avg,0.758392,0.762057,0.759959,2960.0
weighted avg,0.770707,0.768581,0.769396,2960.0


In [206]:
# Scores for df_concat built from 2 x 800 columns (no third block).
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.696648,0.726343,0.711185,1173.0
non_canon,0.815199,0.792389,0.803632,1787.0
accuracy,0.766216,0.766216,0.766216,0.766216
macro avg,0.755923,0.759366,0.757409,2960.0
weighted avg,0.768219,0.766216,0.766997,2960.0


In [204]:
# Scores obtained with df_unigram_main as the input frame.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.690049,0.721228,0.705294,1173.0
non_canon,0.811419,0.787353,0.799205,1787.0
accuracy,0.761149,0.761149,0.761149,0.761149
macro avg,0.750734,0.75429,0.752249,2960.0
weighted avg,0.763322,0.761149,0.761989,2960.0


In [141]:
# Scores obtained with the full df_main as the input frame.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.726539,0.613811,0.665434,1173.0
non_canon,0.769934,0.848349,0.807242,1787.0
accuracy,0.755405,0.755405,0.755405,0.755405
macro avg,0.748236,0.73108,0.736338,2960.0
weighted avg,0.752737,0.755405,0.751046,2960.0


In [96]:
# Scores obtained with sampling='under'.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.685668,0.717818,0.701374,1173.0
non_canon,0.808891,0.783996,0.796249,1787.0
accuracy,0.75777,0.75777,0.75777,0.75777
macro avg,0.74728,0.750907,0.748812,2960.0
weighted avg,0.76006,0.75777,0.758652,2960.0


In [207]:
# Show the score table rounded to two decimals.
df_final_scores.round(2)

Unnamed: 0,precision,recall,f1-score,support
canon,0.7,0.73,0.71,1173.0
non_canon,0.82,0.79,0.8,1787.0
accuracy,0.77,0.77,0.77,0.77
macro avg,0.76,0.76,0.76,2960.0
weighted avg,0.77,0.77,0.77,2960.0


In [208]:
# Keep the two-decimal version of the score table for export.
res = df_final_scores.round(2)

In [209]:
# Print the rounded score table as a LaTeX tabular.
print(res.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
\midrule
canon        &       0.70 &    0.73 &      0.71 &  1173.00 \\
non\_canon    &       0.82 &    0.79 &      0.80 &  1787.00 \\
accuracy     &       0.77 &    0.77 &      0.77 &     0.77 \\
macro avg    &       0.76 &    0.76 &      0.76 &  2960.00 \\
weighted avg &       0.77 &    0.77 &      0.77 &  2960.00 \\
\bottomrule
\end{tabular}



  print(res.to_latex())


## NOVEL SCALE

In [210]:
# Novel-scale dataset: the "index" column becomes the row index and missing
# values are replaced by 0. This rebinds df_main (previously the author-scale data).
df_main = (
    pd.read_csv(r'data/main_dataset_majed_canon_novel_scale.csv')
    .set_index("index")
    .replace(np.nan, 0)
)

In [211]:
# The author is the second "_"-separated field of each index label.
list_author = [label.split('_')[1] for label in df_main.index]

df_main['auteur'] = list_author

# Same feature selection as for the author-scale data: two 800-column slices
# plus the label and group columns.
# (A third slice, df_main.iloc[:, 3156:3956], is currently disabled.)
df_concat = pd.concat(
    [df_main.iloc[:, 0:800], df_main.iloc[:, 1000:1800]],
    axis=1,
).assign(
    canon=df_main['canon'],
    auteur=df_main['auteur'],
)

In [219]:
# Class balance of the target at novel scale.
df_concat['canon'].value_counts()

non_canon    2654
canon         306
Name: canon, dtype: int64

In [231]:
# Author-grouped 5-fold evaluation with random over-sampling applied to the
# training part of each fold.
# NOTE(review): this run is given the full df_main, whereas the author-scale
# run above used df_concat -- confirm this is intended.
pipe, df_final_scores = GKF_canonized(df_main, n_splits=5, sampling='over')

  0%|          | 0/5 [00:00<?, ?it/s]

In [232]:
# Display the score table returned by the novel-scale run.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.409091,0.058824,0.102857,306.0
non_canon,0.901235,0.990203,0.943627,2654.0
accuracy,0.893919,0.893919,0.893919,0.893919
macro avg,0.655163,0.524513,0.523242,2960.0
weighted avg,0.850358,0.893919,0.856709,2960.0


## LOGO CV

In [14]:
# NOTE(review): LOGO_canonized is not defined in the visible part of this
# notebook; this cell will fail on a fresh kernel unless the definition is
# restored or imported -- TODO confirm where it lives.
pipe, df_final_scores = LOGO_canonized(df_main)

0it [00:00, ?it/s]


 AUTHOR OUT 0 : A-D-G. 


 PREDS : 
{('1977_A-D-G._Juste-un-rigolo', 'non_canon')}




 AUTHOR OUT 1 : A.D.G. 


 PREDS : 
{('1982_A.D.G._On-n-est-pas-des-chiens', 'non_canon')}




 AUTHOR OUT 2 : Adam-Juliette 


 PREDS : 
{('1876_Adam-Juliette_Jean-et-Pascal', 'non_canon')}




 AUTHOR OUT 3 : Adam-Paul 


 PREDS : 
{('1903_Adam-Paul_Au-soleil-de-juillet', 'canon')}




 AUTHOR OUT 4 : Aimard-Gustave 


 PREDS : 
{('1893_Aimard-Gustave_La-Belle-Riviere', 'non_canon'), ('1882_Aimard-Gustave_Les-bandits-de-l-Arizona', 'non_canon'), ('1876_Aimard-Gustave_Le-Chasseur-de-rats_Tome-1-L-oeil-gris', 'non_canon')}




 AUTHOR OUT 5 : Aimard-Gustave-Auriac-Jules-Berlioz-d- 


 PREDS : 
{('1867_Aimard-Gustave-Auriac-Jules-Berlioz-d-_Oeil-de-feu', 'non_canon'), ('1865_Aimard-Gustave-Auriac-Jules-Berlioz-d-_Les-pieds-fourchus', 'non_canon')}




 AUTHOR OUT 6 : Ajar-Emile-Gary-Romain 


 PREDS : 
{('1975_Ajar-Emile-Gary-Romain_La-Vie-Devant-Soi', 'non_canon')}




 AUTHOR OUT 7 : Alain-Fournier


 PREDS : 
{('1877_Bouvier-Alexis_Le-Domino-rose', 'non_canon'), ('1879_Bouvier-Alexis_La-Belle-Grelee', 'non_canon')}




 AUTHOR OUT 42 : Bouvier-Nicolas 


 PREDS : 
{('1963_Bouvier-Nicolas_L-usage-du-monde', 'canon')}




 AUTHOR OUT 43 : Boylesve-Rene 


 PREDS : 
{('1901_Boylesve-Rene_La-Becquee', 'canon')}




 AUTHOR OUT 44 : Brehat-Alfred-de 


 PREDS : 
{('1876_Brehat-Alfred-de_L-hotel-du-dragon-Souvenirs-de-voyages', 'non_canon')}




 AUTHOR OUT 45 : Buet-Charles 


 PREDS : 
{('1876_Buet-Charles_Philippe-Monsieur', 'non_canon'), ('1881_Buet-Charles_Histoires-a-dormir-debout', 'non_canon')}




 AUTHOR OUT 46 : Calet-Henri 


 PREDS : 
{('1935_Calet-Henri_La-belle-lurette', 'canon')}




 AUTHOR OUT 47 : Camus-Albert 


 PREDS : 
{('1994_Camus-Albert_Le-premier-homme', 'canon'), ('1947_Camus-Albert_La-peste', 'canon'), ('1944_Camus-Albert_Le-premier-homme', 'canon'), ('1942_Camus-Albert_L-etranger', 'non_canon')}




 AUTHOR OUT 48 : Cardinal-Marie 


 PREDS : 
{('1975_Card


 PREDS : 
{('1999_Echenoz-Jean_Je-m-en-vais', 'canon'), ('1983_Echenoz-Jean_Cherokee', 'canon'), ('2014_Echenoz-Jean_Caprice-de-la-reine', 'non_canon')}




 AUTHOR OUT 93 : Enard-Mathias 


 PREDS : 
{('2008_Enard-Mathias_Zone', 'canon')}




 AUTHOR OUT 94 : Erckmann-Chatrian 


 PREDS : 
{('1881_Erckmann-Chatrian_Les-Vieux-de-la-Vieille-Justine-et-Lucien.', 'non_canon'), ('1860_Erckmann-Chatrian_Contes-fantastiques', 'canon'), ('1862_Erckmann-Chatrian_L-Invasion-ou-le-Fou-Yegof', 'canon')}




 AUTHOR OUT 95 : Ernaux-Annie 


 PREDS : 
{('2016_Ernaux-Annie_Memoire-de-fille', 'canon'), ('1981_Ernaux-Annie_La-femme-gelee', 'canon'), ('1983_Ernaux-Annie_La-place', 'canon'), ('2008_Ernaux-Annie_Les-annees', 'canon')}




 AUTHOR OUT 96 : Escoffier-Henri 


 PREDS : 
{('1876_Escoffier-Henri_Les-femmes-fatales', 'non_canon')}




 AUTHOR OUT 97 : Fargue-Leon-Paul 


 PREDS : 
{('1942_Fargue-Leon-Paul_Refuges', 'canon')}




 AUTHOR OUT 98 : Farrenc-Cesarie 


 PREDS : 
{('1863_Farrenc-Ce


 PREDS : 
{('1879_Houssaye-Arsene_Histoires-romanesques', 'non_canon')}




 AUTHOR OUT 139 : Hugo-Victor 


 PREDS : 
{('1834_Hugo-Victor_Claude-Gueux', 'non_canon'), ('1829_Hugo-Victor_Le-dernier-jour-d-un-condamne', 'canon'), ('1866_Hugo-Victor_Les-travailleurs-de-la-mer', 'canon'), ('1831_Hugo-Victor_Notre-Dame-de-Paris_(Tome-2)', 'non_canon'), ('1862_Hugo-Victor_Les-Miserables', 'canon'), ('1869_Hugo-Victor_L-homme-qui-rit', 'canon'), ('1831_Hugo-Victor_Notre-Dame-de-Paris_(Tome-1)', 'non_canon'), ('1874_Hugo-Victor_Quatrevingt-Treize', 'canon')}




 AUTHOR OUT 140 : Huysmans-Joris-Karl 


 PREDS : 
{('1879_Huysmans-Joris-Karl_Les-soeurs-Vatard', 'canon'), ('1891_Huysmans-Joris-Karl_La-bas', 'canon'), ('1884_Huysmans-Joris-Karl_a-rebours', 'canon')}




 AUTHOR OUT 141 : Ionesco-Eugene 


 PREDS : 
{('1973_Ionesco-Eugene_Le-solitaire', 'canon')}




 AUTHOR OUT 142 : Ivoi-Paul-d- 


 PREDS : 
{('1909_Ivoi-Paul-d-_L-Espion-X-323_Volume-I-L-Homme-sans-visage', 'non_canon')}




 A


 PREDS : 
{('1988_Michon-Pierre_Vie-de-Joseph-Roulin', 'canon'), ('1991_Michon-Pierre_Rimbaud-le-fils', 'canon'), ('2009_Michon-Pierre_Les-Onze', 'canon')}




 AUTHOR OUT 190 : Mille-Pierre 


 PREDS : 
{('1912_Mille-Pierre_Louise-et-Barnavaux', 'canon')}




 AUTHOR OUT 191 : Millet-Richard 


 PREDS : 
{('2003_Millet-Richard_Le-renard-dans-le-nom', 'canon'), ('2010_Millet-Richard_Le-Sommeil-Sur-Les-Cendres', 'canon'), ('2007_Millet-Richard_Corps-en-dessous', 'non_canon'), ('1994_Millet-Richard_Un-balcon-a-Beyrouth', 'canon'), ('2006_Millet-Richard_Devorations', 'canon'), ('2005_Millet-Richard_le-gout-des-femmes-laides', 'canon')}




 AUTHOR OUT 192 : Mirbeau-Octave 


 PREDS : 
{('1899_Mirbeau-Octave_Le-Jardin-des-supplices', 'canon'), ('1900_Mirbeau-Octave_Le-journal-d-une-femme-de-chambre', 'non_canon')}




 AUTHOR OUT 193 : Mme-Richomme-Fanny 


 PREDS : 
{('1847_Mme-Richomme-Fanny_Grain-de-sable-ou-le-Sorcier-d-Altenbourg', 'non_canon')}




 AUTHOR OUT 194 : Mme-Tarbe-Des-Sa


 PREDS : 
{('2009_Rouaud-Jean_La-femme-promise', 'canon')}




 AUTHOR OUT 244 : Roussel-Raymond 


 PREDS : 
{('1897_Roussel-Raymond_La-doublure', 'non_canon')}




 AUTHOR OUT 245 : Rousset-Alexis 


 PREDS : 
{('1873_Rousset-Alexis_Derailles-et-declasses-Paris-et-la-province_Tome-2', 'non_canon')}




 AUTHOR OUT 246 : Rufin-Jean-Christophe 


 PREDS : 
{('2001_Rufin-Jean-Christophe_Rouge-Bresil', 'canon')}




 AUTHOR OUT 247 : Saint-Exupery-Antoine-de 


 PREDS : 
{('1939_Saint-Exupery-Antoine-de_Terre-des-hommes', 'canon'), ('1931_Saint-Exupery-Antoine-de_Vol-de-nuit', 'canon')}




 AUTHOR OUT 248 : Saint-Vidal-Mathilde-de 


 PREDS : 
{('1877_Saint-Vidal-Mathilde-de_Amour-et-devoir', 'non_canon')}




 AUTHOR OUT 249 : Sainte-Beuve 


 PREDS : 
{('1834_Sainte-Beuve_Volupte_2', 'canon'), ('1834_Sainte-Beuve_Volupte_1', 'canon')}




 AUTHOR OUT 250 : Saintine-Xavier 


 PREDS : 
{('1836_Saintine-Xavier_Picciola', 'non_canon')}




 AUTHOR OUT 251 : San-Antonio 


 PREDS : 
{('1


 PREDS : 
{("1972_Veuzit-Max-du_Le-coeur-d'ivoire", 'non_canon')}




 AUTHOR OUT 284 : Vian-Boris 


 PREDS : 
{('1950_Vian-Boris_Elles-ne-se-rendent-pas-compte', 'non_canon')}




 AUTHOR OUT 285 : Vidocq-Eugene-François 


 PREDS : 
{('1827_Vidocq-Eugene-François_Memoires-de-Vidocq_Tome-III', 'non_canon'), ('1827_Vidocq-Eugene-François_Memoires-de-Vidocq_Tome-II', 'non_canon')}




 AUTHOR OUT 286 : Viel-Tanguy 


 PREDS : 
{('2013_Viel-Tanguy_La-Disparition-de-Jim-Sullivan', 'canon')}




 AUTHOR OUT 287 : Vigan-Delphine-de 


 PREDS : 
{('2009_Vigan-Delphine-de_Les-heures-souterraines', 'canon')}




 AUTHOR OUT 288 : Vigny-Alfred-de 


 PREDS : 
{('1832_Vigny-Alfred-de_Stello', 'canon')}




 AUTHOR OUT 289 : Villard-Marc 


 PREDS : 
{('1985_Villard-Marc_Le-sentier-de-la-guerre', 'non_canon'), ('1989_Villard-Marc_La-dame-est-une-trainee', 'non_canon')}




 AUTHOR OUT 290 : Villiers-de-l-Isle-Adam 


 PREDS : 
{('1886_Villiers-de-l-Isle-Adam_L-Eve-future', 'non_canon')}




 AU

In [12]:
# Display the score table currently held in df_final_scores.
# NOTE(review): the saved output carries an older execution count than the
# LOGO call above -- re-run to make sure it matches.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.718978,0.643791,0.67931,306.0
non_canon,0.731527,0.794118,0.761538,374.0
accuracy,0.726471,0.726471,0.726471,0.726471
macro avg,0.725253,0.718954,0.720424,680.0
weighted avg,0.72588,0.726471,0.724536,680.0


In [108]:
# Display the score table currently held in df_final_scores.
# NOTE(review): this saved output is identical to the table shown under the
# "#under" cell earlier in the notebook; it may be a leftover duplicate.
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.685668,0.717818,0.701374,1173.0
non_canon,0.808891,0.783996,0.796249,1787.0
accuracy,0.75777,0.75777,0.75777,0.75777
macro avg,0.74728,0.750907,0.748812,2960.0
weighted avg,0.76006,0.75777,0.758652,2960.0
