In [1]:
import pandas as pd
import numpy as np
import re
from sklearn import set_config
import sklearn.metrics as metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer

In [2]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [3]:
# Ask scikit-learn to render estimators/pipelines as diagrams when they are
# displayed in the notebook.
set_config(display='diagram')

In [4]:
def canonizer(data, test_size=0.1, nb_coef=20, random_state=42, cross_validation=False, kernel='rbf'):
    """Train and evaluate a standard-scaled SVM classifier on the 'canonicity' label.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose 'canonicity' column holds the class labels; every other
        column is used as a feature.
    test_size : float, default 0.1
        Passed to ``train_test_split`` (hold-out mode only).
    nb_coef : int, default 20
        Unused by this function; kept so existing callers keep working.
    random_state : int, default 42
        Seed for the train/test split and for the SVC (which only uses it for
        its probability estimates, enabled here via ``probability=True``).
    cross_validation : bool, default False
        If true, run 5-fold cross-validation instead of a single hold-out split.
    kernel : str, default 'rbf'
        Kernel passed to ``SVC``.

    Returns
    -------
    pipe : sklearn.pipeline.Pipeline
        The StandardScaler + SVC pipeline. It is fitted on the training split
        in hold-out mode; in cross-validation mode it is returned unfitted,
        because ``cross_val_score`` fits clones of it.
    results : str or numpy.ndarray
        Hold-out mode: the text classification report for the test split
        (also printed). Cross-validation mode: the per-fold accuracy scores.

    Raises
    ------
    KeyError
        If ``data`` has no 'canonicity' column.
    """
    # Fail early with an explicit message instead of a late error from drop().
    if 'canonicity' not in data.columns:
        raise KeyError("canonizer expects a 'canonicity' label column in `data`")

    features = data.drop(['canonicity'], axis=1)
    labels = data['canonicity']

    # Seeding the SVC makes predict_proba reproducible between runs.
    pipe = make_pipeline(
        StandardScaler(),
        SVC(kernel=kernel, probability=True, random_state=random_state),
    )

    if cross_validation:
        # The custom scorer prints a classification report for every fold and
        # returns that fold's accuracy as the score.
        cv_results = cross_val_score(
            pipe,
            features,
            labels,
            cv=5,
            scoring=make_scorer(classification_report_with_accuracy_score),
        )
        return pipe, cv_results

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    pipe.fit(X_train, y_train)

    # NOTE: for non-binary (numeric) labels, regression-style metrics
    # (r2, MAE, MSE, RMSE) were considered here instead of this report.
    df_results = metrics.classification_report(y_test, pipe.predict(X_test))
    print(df_results)

    return pipe, df_results

In [5]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the classification report for one set of predictions and return its accuracy.

    Intended to be wrapped with ``make_scorer``: the report is emitted as a
    side effect and the accuracy is the value handed back as the score.
    """
    report = classification_report(y_true, y_pred)
    print(report)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# BoW features 

In [6]:
# Load the bag-of-words feature table, use its "index" column as the row
# index, and replace missing values with 0.
df_main = (
    pd.read_csv(r'../features/chunks_BoW_features2.csv')
    .set_index("index")
    .replace(np.nan, 0)
)

In [7]:
def retrieve_canonicity(index_name):
    """Return the canonicity tag embedded in each entry name.

    Every name is split on '_' and its fourth field (position 3) is taken as
    the tag. The result is a list in the same order as ``index_name``.
    A name with fewer than four '_'-separated fields raises IndexError.
    """
    return [title.split('_')[3] for title in index_name]

In [8]:
# One label per BoW row, taken from the 4th '_'-separated field of each index entry.
list_canonicity = retrieve_canonicity(df_main.index)

In [9]:
# Attach the labels as the 'canonicity' column, which canonizer uses as the target.
df_main['canonicity'] = list_canonicity

In [12]:
# Hold-out evaluation on the BoW features with canonizer's defaults
# (test_size=0.1, random_state=42, RBF kernel); prints the classification report.
pipe, df_results = canonizer(df_main)

              precision    recall  f1-score   support

       canon       0.87      0.90      0.88      1772
    noncanon       0.87      0.83      0.85      1413

    accuracy                           0.87      3185
   macro avg       0.87      0.87      0.87      3185
weighted avg       0.87      0.87      0.87      3185



### cross-validation score

In [None]:
# 5-fold cross-validation on the BoW features; df_results_CV holds the per-fold scores.
pipe_CV, df_results_CV = canonizer(df_main, cross_validation=True)

# LDA features 

In [11]:
# Load the LDA topic features, use the unnamed first CSV column as the row
# index, and replace missing values with 0.
df_lda_50 = (
    pd.read_csv(r'../features/lda_features_50_topics.csv')
    .set_index("Unnamed: 0")
    .replace(np.nan, 0)
)

In [13]:
# Derive the labels from the LDA index entries and attach them as the target column.
# Note: this rebinds the notebook-level name list_canonicity.
list_canonicity = retrieve_canonicity(df_lda_50.index)
df_lda_50['canonicity'] = list_canonicity

In [21]:
# Hold-out evaluation on the LDA topic features with canonizer's defaults.
pipe_lda_50, df_results_lda_50 = canonizer(df_lda_50)

              precision    recall  f1-score   support

       canon       0.60      0.77      0.68      1738
    noncanon       0.59      0.39      0.47      1447

    accuracy                           0.60      3185
   macro avg       0.60      0.58      0.57      3185
weighted avg       0.60      0.60      0.58      3185



### cross-validation score

In [None]:
# 5-fold cross-validation on the LDA features. The scores get their own name
# so they do not overwrite the BoW cross-validation results in df_results_CV.
pipe_lda_CV, df_results_lda_CV = canonizer(df_lda_50, cross_validation=True)

# DBoW neural embeddings features 

#### get and format data

In [14]:
import pickle

In [15]:
# Pickle file to load; the path is relative to the notebook's working directory.
path_name = 'data_df.pkl'

In [16]:
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles that come from a trusted source.
# NOTE(review): encoding='bytes' only matters for strings pickled by Python 2 —
# confirm it is still needed for this file.
with open(path_name, 'rb') as file:
    data_pickled = pickle.load(file, encoding='bytes')

In [17]:
# Integer column labels 0..299, one per embedding dimension.
columns = list(range(300))

In [18]:
# One list entry per row of the 'vecteur' column.
# presumably each entry is a 300-dimensional vector (to match the 300 column labels) — TODO confirm
test = list(data_pickled['vecteur'].values)

In [20]:
# Build the feature table: one row per vector, columns labelled by `columns`.
df_doc2vec = pd.DataFrame(data=test, columns=columns)

In [21]:
# Store the 'oeuvre' identifiers in an 'index' column (positional match with the vectors).
df_doc2vec['index'] = list(data_pickled['oeuvre'])

In [22]:
# Derive the labels from the 'oeuvre' names (4th '_'-separated field).
# Note: this rebinds the notebook-level name list_canonicity.
list_canonicity = retrieve_canonicity(list(data_pickled['oeuvre']))

In [23]:
# Attach the labels as the 'canonicity' column, which canonizer uses as the target.
df_doc2vec['canonicity'] = list_canonicity

In [24]:
# Move the identifiers into the row index so they are not treated as a feature.
# This mutates df_doc2vec in place: re-running this cell on its own raises
# KeyError because the 'index' column no longer exists.
df_doc2vec.set_index("index", inplace = True)

#### training

In [64]:
# Hold-out evaluation on the doc2vec features with canonizer's defaults.
pipe_d2v, df_results_d2v = canonizer(df_doc2vec)

              precision    recall  f1-score   support

       canon       0.92      0.92      0.92      1799
    noncanon       0.90      0.90      0.90      1385

    accuracy                           0.91      3184
   macro avg       0.91      0.91      0.91      3184
weighted avg       0.91      0.91      0.91      3184



### cross-validation score

In [None]:
# 5-fold cross-validation on the doc2vec features; df_results_d2v_CV holds the per-fold scores.
pipe_d2v_CV, df_results_d2v_CV = canonizer(df_doc2vec, cross_validation=True)