In [6]:
import pandas as pd
import numpy as np
from sklearn import set_config
import sklearn.metrics as metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [4]:
set_config(display='diagram')

### Canonizer sklearn svm pipeline

In [56]:
def canonizer(data, test_size=0.15, random_state=42, cross_validation=False):
    
    df_results = pd.DataFrame()
    
    pipe = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
    
    if cross_validation == True:
        cv_results = cross_validate(pipe, data.drop(['canon'], axis=1), data['canon'])
        return pipe, cv_results
    else:
        
        X_train, X_test, y_train, y_test = train_test_split(data.drop(['canon'], axis=1), data['canon'], test_size=test_size, random_state=random_state)
        
        pipe.fit(X_train, y_train)
        
        print(metrics.classification_report(y_test, pipe.predict(X_test)))
    
    
        df_results['metadata'] = y_test
        df_results['proba canon'] = pipe.predict_proba(X_test)[:,0]
        df_results['proba non-canon'] = pipe.predict_proba(X_test)[:,1]
        df_results['prediction']= pipe.predict(X_test)
    
        df_results['accord'] = [True if row['metadata'] == row['prediction'] else False for index, row in df_results.iterrows()]
    
    
    return pipe, df_results

### Load df features main metadata

In [15]:
df_main = pd.read_csv(r'data\features_canon.csv')
df_main.set_index("index", inplace = True)
df_main = df_main.replace(np.nan, 0)

In [16]:
pipe_main, df_results_main = canonizer(df_main)

              precision    recall  f1-score   support

       canon       1.00      0.07      0.14        40
   non_canon       0.92      1.00      0.96       404

    accuracy                           0.92       444
   macro avg       0.96      0.54      0.55       444
weighted avg       0.92      0.92      0.88       444



In [21]:
df_results_main

Unnamed: 0_level_0,metadata,proba canon,proba non-canon,prediction,accord
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1973_Vautrin-Jean_a-Bulletins-Rouges,non_canon,0.078429,0.921571,non_canon,True
1903_Chabrier-Rieder-Charlotte_Les-Enfants-du-Luxembourg,non_canon,0.020180,0.979820,non_canon,True
1879_Thuret-Mme-E._La-Guerre-au-chateau,non_canon,0.040088,0.959912,non_canon,True
1954_Reage-Pauline_Histoire-d-O,non_canon,0.246104,0.753896,non_canon,True
1928_Cami-Pierre-Henri_le-jugement-dernier,non_canon,0.024722,0.975278,non_canon,True
...,...,...,...,...,...
2011_Darrieussecq-Marie_Cleves,non_canon,0.068335,0.931665,non_canon,True
1848_Woillez-Catherine_Edma-et-Marguerite-ou-les-Ruines-de-Chatillon-d-Azergues,non_canon,0.021627,0.978373,non_canon,True
1876_Sand-George_Contes-d-une-grand-mere,non_canon,0.061136,0.938864,non_canon,True
1955_Simenon-Georges_Maigret-tend-un-piege,non_canon,0.023912,0.976088,non_canon,True


### Load df features one txt one author main metadata

In [45]:
df_main_ones = pd.read_csv(r'data\df_main_ones.csv')
df_main_ones.set_index("Unnamed: 0", inplace = True)
df_main_ones = df_main_ones.replace(np.nan, 0)

In [46]:
pipe_main_ones, df_results_main_ones = canonizer(df_main_ones)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       canon       0.00      0.00      0.00         8
   non_canon       0.93      1.00      0.96       100

    accuracy                           0.93       108
   macro avg       0.46      0.50      0.48       108
weighted avg       0.86      0.93      0.89       108



In [47]:
df_results_main_ones

Unnamed: 0_level_0,metadata,proba canon,proba non-canon,prediction,accord
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1959_Troyat-Henri_Les-compagnons-du-coquelicot,non_canon,0.070211,0.929789,non_canon,True
1872_Guise-Charles-de_Helika-Memoire-d-un-vieux-maitre-d-ecole,non_canon,0.088156,0.911844,non_canon,True
1931_Simenon-Georges_Au-Rendez-vous-des-Terre-Neuvas,non_canon,0.078666,0.921334,non_canon,True
2013_Ferrier-Michael_Fukushima-Recit-d-un-desastre,non_canon,0.053380,0.946620,non_canon,True
1889_Coppee-François_Henriette,non_canon,0.096119,0.903881,non_canon,True
...,...,...,...,...,...
1949_Benoit-Pierre_La-dame-de-l-ouest,non_canon,0.087757,0.912243,non_canon,True
2012_Deville-Patrick_Peste-et-cholera,canon,0.052699,0.947301,non_canon,False
1867_Aimard-Gustave-Auriac-Jules-Berlioz-d-_Les-Forestiers-du-Michigan,non_canon,0.037480,0.962520,non_canon,True
1855_Roche-L._La-Maison-du-dimanche,non_canon,0.027636,0.972364,non_canon,True


### Load df features fabula metadata

In [35]:
df_fabula = pd.read_csv(r'data\features_fabula.csv')
df_fabula.set_index("index", inplace = True)
df_fabula = df_fabula.replace(np.nan, 0)

In [36]:
df_fabula['canon'].value_counts()

False    1805
True     1155
Name: canon, dtype: int64

In [57]:
pipe_cv, scores_cv = canonizer(df_fabula, cross_validation=True)

In [58]:
scores_cv

{'fit_time': array([58.84632397, 57.07133722, 56.70533705, 62.05453444, 57.57437778]),
 'score_time': array([3.60127449, 3.63727522, 3.77728677, 4.09956574, 3.78728795]),
 'test_score': array([0.90878378, 0.89189189, 0.89864865, 0.91722973, 0.90033784])}

In [61]:
sum(scores_cv['test_score'])/len(scores_cv['test_score'])

0.9033783783783784

In [38]:
pipe_fabula, df_results_fabula = canonizer(df_fabula, test_size=0.15, random_state=42)

              precision    recall  f1-score   support

       False       0.94      0.92      0.93       283
        True       0.87      0.90      0.88       161

    accuracy                           0.91       444
   macro avg       0.91      0.91      0.91       444
weighted avg       0.92      0.91      0.91       444



In [28]:
df_results_fabula

Unnamed: 0_level_0,metadata,proba canon,proba non-canon,prediction,accord
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1973_Vautrin-Jean_a-Bulletins-Rouges,non_canon,0.079270,0.920730,non_canon,True
1903_Chabrier-Rieder-Charlotte_Les-Enfants-du-Luxembourg,non_canon,0.020738,0.979262,non_canon,True
1879_Thuret-Mme-E._La-Guerre-au-chateau,non_canon,0.040854,0.959146,non_canon,True
1954_Reage-Pauline_Histoire-d-O,non_canon,0.245400,0.754600,non_canon,True
1928_Cami-Pierre-Henri_le-jugement-dernier,non_canon,0.025343,0.974657,non_canon,True
...,...,...,...,...,...
2011_Darrieussecq-Marie_Cleves,non_canon,0.069185,0.930815,non_canon,True
1848_Woillez-Catherine_Edma-et-Marguerite-ou-les-Ruines-de-Chatillon-d-Azergues,non_canon,0.022206,0.977794,non_canon,True
1876_Sand-George_Contes-d-une-grand-mere,non_canon,0.061982,0.938018,non_canon,True
1955_Simenon-Georges_Maigret-tend-un-piege,non_canon,0.024523,0.975477,non_canon,True


### Load df features one txt one author fabula metadata

In [42]:
df_fabula_ones = pd.read_csv(r'data\df_fabula_ones.csv')
df_fabula_ones.set_index("Unnamed: 0", inplace = True)
df_fabula_ones = df_fabula_ones.replace(np.nan, 0)

In [43]:
pipe_fabula_ones, df_results_fabula_ones = canonizer(df_fabula_ones)

              precision    recall  f1-score   support

       False       0.81      0.99      0.89        87
        True       0.50      0.05      0.09        21

    accuracy                           0.81       108
   macro avg       0.66      0.52      0.49       108
weighted avg       0.75      0.81      0.73       108



In [44]:
df_results_fabula_ones

Unnamed: 0_level_0,metadata,proba canon,proba non-canon,prediction,accord
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1959_Troyat-Henri_Les-compagnons-du-coquelicot,False,0.978111,0.021889,False,True
1872_Guise-Charles-de_Helika-Memoire-d-un-vieux-maitre-d-ecole,False,0.925608,0.074392,False,True
1931_Simenon-Georges_Au-Rendez-vous-des-Terre-Neuvas,True,0.365732,0.634268,False,False
2013_Ferrier-Michael_Fukushima-Recit-d-un-desastre,False,0.192833,0.807167,True,False
1889_Coppee-François_Henriette,False,0.804036,0.195964,False,True
...,...,...,...,...,...
1949_Benoit-Pierre_La-dame-de-l-ouest,False,0.748989,0.251011,False,True
2012_Deville-Patrick_Peste-et-cholera,False,0.483873,0.516127,False,True
1867_Aimard-Gustave-Auriac-Jules-Berlioz-d-_Les-Forestiers-du-Michigan,False,0.847588,0.152412,False,True
1855_Roche-L._La-Maison-du-dimanche,False,0.822611,0.177389,False,True


In [50]:
df_test = pd.read_csv(r'data/df_main_test_author.csv')
df_test.set_index("index", inplace = True)
df_test = df_test.replace(np.nan, 0)

In [51]:
pipe_test, df_results_test = canonizer(df_test)

              precision    recall  f1-score   support

       False       0.85      0.96      0.90       324
        True       0.84      0.56      0.67       120

    accuracy                           0.85       444
   macro avg       0.85      0.76      0.79       444
weighted avg       0.85      0.85      0.84       444



In [52]:
df_results_test

Unnamed: 0_level_0,metadata,proba canon,proba non-canon,prediction,accord
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1973_Vautrin-Jean_a-Bulletins-Rouges,False,0.779095,0.220905,False,True
1903_Chabrier-Rieder-Charlotte_Les-Enfants-du-Luxembourg,False,0.978908,0.021092,False,True
1879_Thuret-Mme-E._La-Guerre-au-chateau,False,0.960335,0.039665,False,True
1954_Reage-Pauline_Histoire-d-O,False,0.491813,0.508187,False,True
1928_Cami-Pierre-Henri_le-jugement-dernier,False,0.975450,0.024550,False,True
...,...,...,...,...,...
2011_Darrieussecq-Marie_Cleves,True,0.697951,0.302049,False,False
1848_Woillez-Catherine_Edma-et-Marguerite-ou-les-Ruines-de-Chatillon-d-Azergues,False,0.995404,0.004596,False,True
1876_Sand-George_Contes-d-une-grand-mere,True,0.154177,0.845823,True,True
1955_Simenon-Georges_Maigret-tend-un-piege,False,0.947545,0.052455,False,True


In [62]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [None]:
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)



kernel='linear'

In [67]:
def canonizer_imbalanced(data, test_size=0.15, random_state=42, cross_validation=False):
    
    df_results = pd.DataFrame()
    
    pipe = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
    
    if cross_validation == True:
        cv_results = cross_validate(pipe, data.drop(['canon'], axis=1), data['canon'])
        return pipe, cv_results
    else:
        
        X_train, X_test, y_train, y_test = train_test_split(data.drop(['canon'], axis=1), data['canon'], test_size=test_size, random_state=random_state)
        
        #ros = RandomOverSampler(random_state=0) ### = 10 ???
        #X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
        
        smote_enn = SMOTEENN(random_state=0)
        X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

        
        pipe.fit(X_resampled, y_resampled)
        
        print(metrics.classification_report(y_test, pipe.predict(X_test)))
    
    
        df_results['metadata'] = y_test
        df_results['proba canon'] = pipe.predict_proba(X_test)[:,0]
        df_results['proba non-canon'] = pipe.predict_proba(X_test)[:,1]
        df_results['prediction']= pipe.predict(X_test)
    
        df_results['accord'] = [True if row['metadata'] == row['prediction'] else False for index, row in df_results.iterrows()]
    
    
    return pipe, df_results

### NOT BALANCED

                precision    recall  f1-score   support

    canon           1.00      0.07      0.14        40
    non_canon       0.92      1.00      0.96       404

    accuracy                            0.92       444
    macro avg       0.96      0.54      0.55       444
    weighted avg    0.92      0.92      0.88       444


### OverSampling

                   precision    recall  f1-score   support

           canon       0.54      0.33      0.41        40
       non_canon       0.94      0.97      0.95       404

        accuracy                           0.91       444
       macro avg       0.74      0.65      0.68       444
    weighted avg       0.90      0.91      0.90       444

In [66]:
pipe_imb_over, df_results_imb_over = canonizer_imbalanced(df_main)

              precision    recall  f1-score   support

       canon       0.54      0.33      0.41        40
   non_canon       0.94      0.97      0.95       404

    accuracy                           0.91       444
   macro avg       0.74      0.65      0.68       444
weighted avg       0.90      0.91      0.90       444

