In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn import set_config, metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, ConfusionMatrixDisplay
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, ShuffleSplit, LeaveOneGroupOut, LeavePGroupsOut
import seaborn as sns
import matplotlib.pyplot as plt
import random
from collections import Counter

In [24]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SVMSMOTE

In [3]:
# Read the five fold files; each frame is indexed by its "index" column.
df_fold_0, df_fold_1, df_fold_2, df_fold_3, df_fold_4 = [
    pd.read_csv(fold_path).set_index("index")
    for fold_path in (
        r'data/fold_0.csv',
        r'data/fold_1.csv',
        r'data/fold_2.csv',
        r'data/fold_3.csv',
        r'data/fold_4.csv',
    )
]

In [4]:
# Gather the five fold frames, in order, into one list.
data = [df_fold_0, df_fold_1, df_fold_2, df_fold_3, df_fold_4]

In [5]:
# Print the relative frequency of each `canon` label, one fold after another.
for fold in (df_fold_0, df_fold_1, df_fold_2, df_fold_3, df_fold_4):
    print(fold.canon.value_counts(normalize=True))


non_canon    0.587054
canon        0.412946
Name: canon, dtype: float64
canon        0.501773
non_canon    0.498227
Name: canon, dtype: float64
non_canon    0.525046
canon        0.474954
Name: canon, dtype: float64
non_canon    0.581888
canon        0.418112
Name: canon, dtype: float64
non_canon    0.578182
canon        0.421818
Name: canon, dtype: float64


In [6]:
def canonizer(data, test_size=0.2, random_state=42, sampling=None, cross_validation=False, cv=5, kernel='rbf', nb_coef=20):
    """Fit and score an SVM canon / non-canon classifier on each frame of ``data``.

    Each frame is split in row order, without shuffling: the leading
    ``int(len(frame) * (1 - test_size))`` rows are used for training and the
    remaining rows for testing. The same pipeline object
    (StandardScaler -> Normalizer -> SVC) is re-fitted on every frame and the
    classification report of each frame is printed.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        Frames with a ``canon`` label column; all other columns are used as
        features.
    test_size : float
        Fraction of trailing rows held out for testing.
    kernel : str
        Kernel passed to ``SVC``.
    random_state, sampling, cross_validation, cv, nb_coef
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(pipe, df_final_scores)``: the pipeline as fitted on the last frame,
        and the classification-report rows averaged over all frames.
    """
    pipe = make_pipeline(
        StandardScaler(),
        Normalizer(),
        SVC(kernel=kernel, probability=True, class_weight='balanced'),
    )

    per_fold_scores = []
    for fold in data:
        # Positional split: leading rows train, trailing rows test.
        n_train = int(len(fold) * (1 - test_size))
        train = fold.iloc[:n_train]
        test = fold.iloc[len(train.index):]

        pipe.fit(train.drop(columns=['canon']), train['canon'])
        predictions = pipe.predict(test.drop(columns=['canon']))

        report = metrics.classification_report(test['canon'], predictions, output_dict=True)
        fold_scores = pd.DataFrame(report).transpose()
        print(fold_scores)
        per_fold_scores.append(fold_scores)

    # Average every report row (per class, accuracy, macro/weighted avg) across folds.
    df_final_scores = pd.concat(per_fold_scores).groupby(level=0).mean()

    return pipe, df_final_scores

In [6]:
# Score every fold with the default settings; df_final_scores holds the fold-averaged report.
pipe, df_final_scores = canonizer(data)

              precision    recall  f1-score    support
canon          0.800000  0.827586  0.813559  29.000000
non_canon      0.916667  0.901639  0.909091  61.000000
accuracy       0.877778  0.877778  0.877778   0.877778
macro avg      0.858333  0.864613  0.861325  90.000000
weighted avg   0.879074  0.877778  0.878309  90.000000
              precision    recall  f1-score     support
canon          0.969072  0.949495  0.959184   99.000000
non_canon      0.687500  0.785714  0.733333   14.000000
accuracy       0.929204  0.929204  0.929204    0.929204
macro avg      0.828286  0.867605  0.846259  113.000000
weighted avg   0.934187  0.929204  0.931202  113.000000
              precision    recall  f1-score     support
canon          0.809524  0.283333  0.419753   60.000000
non_canon      0.505747  0.916667  0.651852   48.000000
accuracy       0.564815  0.564815  0.564815    0.564815
macro avg      0.657635  0.600000  0.535802  108.000000
weighted avg   0.674512  0.564815  0.522908  108.00000

In [18]:
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
accuracy,0.696886,0.696886,0.696886,0.696886
canon,0.703204,0.478059,0.541577,54.4
macro avg,0.676638,0.669897,0.63758,105.0
non_canon,0.650072,0.861734,0.733583,50.6
weighted avg,0.718791,0.696886,0.673068,105.0


In [12]:
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
accuracy,0.705338,0.705338,0.705338,0.705338
canon,0.697328,0.521607,0.574906,54.4
macro avg,0.683946,0.675949,0.654682,105.0
non_canon,0.670563,0.830292,0.734458,50.6
weighted avg,0.720527,0.705338,0.687724,105.0


### Let's try Leave-One-Group-Out cross-validation (leave one author out, predict canonicity)

In [7]:
# Load the main dataset, index it by its "index" column and replace NaN with 0.
df_main = (
    pd.read_csv(r'data/unigram_dataset_majed_canon_author_scale.csv')
    .set_index("index")
    .replace(np.nan, 0)
)

In [8]:
# Take the second "_"-separated field of every row label as that row's author.
list_author = [row_label.split('_')[1] for row_label in df_main.index]

df_main['auteur'] = list_author

In [9]:
len(set(list_author))

714

In [12]:
# Keep two 800-column windows of df_main (positions 0-799 and 1000-1799) and
# re-attach the label and author columns. A third window (3156:3956) was
# previously concatenated as well and is currently disabled.
df_concat = pd.concat(
    [df_main.iloc[:, :800], df_main.iloc[:, 1000:1800]],
    axis="columns",
).assign(canon=df_main['canon'], auteur=df_main['auteur'])

In [22]:
def _make_sampler(sampling):
    """Return a fresh imbalanced-learn sampler for ``sampling`` (``None`` means no resampling).

    Raises
    ------
    ValueError
        If ``sampling`` is neither ``None`` nor one of the supported keys.
    """
    if sampling is None:
        return None

    # Lambdas defer construction: a new sampler (seeded with 10) is built on each call.
    factories = {
        'over': lambda: RandomOverSampler(random_state=10),
        'svm': lambda: SVMSMOTE(random_state=10),
        'under': lambda: RandomUnderSampler(random_state=10),
        'smoteenn': lambda: SMOTEENN(random_state=10),
        'smotetomek': lambda: SMOTETomek(random_state=10),
    }
    if sampling not in factories:
        raise ValueError(
            "Unknown sampling {!r}; possible values: {}".format(sampling, ', '.join(factories))
        )
    return factories[sampling]()


def LOGO_canonized(df_main, sampling=None):
    """Leave-one-author-out evaluation of an SVM canon / non-canon classifier.

    For every distinct value of ``df_main['auteur']`` the pipeline
    (StandardScaler -> SVC with class weights canon=1.5 / non_canon=1) is
    fitted on all other authors' rows and used to predict the held-out
    author's rows. Predictions from all splits are pooled into a single
    classification report.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Must contain the columns ``canon`` (label) and ``auteur`` (group);
        every other column is used as a feature.
    sampling : {None, 'over', 'svm', 'under', 'smoteenn', 'smotetomek'}
        Optional resampling applied to each training split only. When set,
        the resampled class counts are printed for every split.

    Returns
    -------
    tuple
        ``(pipe, df_scores)``: the pipeline as fitted on the last split, and
        the pooled classification report as a DataFrame.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported values.
    """
    # Fail fast on an unsupported strategy, before any fitting starts.
    _make_sampler(sampling)

    # Slice features / labels / groups once instead of re-dropping columns per split.
    features = df_main.drop(['auteur', 'canon'], axis=1)
    labels = df_main['canon']
    authors = df_main['auteur']

    pipe = make_pipeline(StandardScaler(), SVC(class_weight={"canon": 1.5, "non_canon": 1}))
    logo = LeaveOneGroupOut()

    ALL_PREDS, ALL_GT = [], []  # pooled predictions and ground truth over all splits

    splits = logo.split(features, labels, authors)
    # Ask the splitter itself for the number of splits so the progress bar total always matches.
    for train_index, test_index in tqdm(splits, total=logo.get_n_splits(groups=authors)):
        X_train, y_train = features.iloc[train_index], labels.iloc[train_index]
        X_test, y_test = features.iloc[test_index], labels.iloc[test_index]

        # A fresh sampler per split; only the training data is resampled.
        sampler = _make_sampler(sampling)
        if sampler is not None:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
            print('Resampled dataset shape {}'.format(Counter(y_train)))

        pipe.fit(X_train, y_train)

        ALL_PREDS.extend(pipe.predict(X_test))
        ALL_GT.extend(y_test)

    report = metrics.classification_report(ALL_GT, ALL_PREDS, output_dict=True)
    df_scores = pd.DataFrame(report).transpose()

    return pipe, df_scores

In [23]:
# Leave-one-author-out run on df_concat with random under-sampling of each training split.
# NOTE(review): the imblearn import cell must be executed before this one; the recorded
# NameError on RandomUnderSampler appears to come from running this cell first.
pipe, df_final_scores = LOGO_canonized(df_concat, sampling='under')

  0%|          | 0/714 [00:00<?, ?it/s]

NameError: name 'RandomUnderSampler' is not defined

In [None]:
df_final_scores

In [27]:
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.712209,0.626598,0.666667,1173.0
non_canon,0.772822,0.8338,0.802153,1787.0
accuracy,0.751689,0.751689,0.751689,0.751689
macro avg,0.742515,0.730199,0.73441,2960.0
weighted avg,0.748802,0.751689,0.748462,2960.0


In [None]:
# Filter authors with at least 5 texts

In [41]:
def select_N_novel_per_author(list_author, N):
    """Return the authors that occur at least ``N`` times in ``list_author``.

    Authors are returned in order of first appearance.
    """
    occurrences = Counter(list_author)
    return [author for author, count in occurrences.items() if count >= N]

In [42]:
# Authors that appear at least 5 times in list_author.
author_selected = select_N_novel_per_author(list_author, 5)

In [44]:
# Keep only the rows of df_main whose author is in author_selected.
df_main_selected = df_main.loc[df_main['auteur'].isin(author_selected)]

In [46]:
df_main_selected.canon.value_counts()

non_canon    1041
canon         957
Name: canon, dtype: int64

In [47]:
# Leave-one-author-out evaluation on the filtered rows, without resampling.
pipe, df_final_scores = LOGO_canonized(df_main_selected)

0it [00:00, ?it/s]

In [48]:
df_final_scores

Unnamed: 0,precision,recall,f1-score,support
canon,0.733407,0.69279,0.71252,957.0
non_canon,0.731261,0.768492,0.749415,1041.0
accuracy,0.732232,0.732232,0.732232,0.732232
macro avg,0.732334,0.730641,0.730967,1998.0
weighted avg,0.732289,0.732232,0.731743,1998.0
