In [1]:
import pandas as pd
import numpy as np
import re
from sklearn import set_config
import sklearn.metrics as metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

C:\Users\jeanb\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\jeanb\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\jeanb\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [3]:
# Ask scikit-learn to render estimators as diagrams when a cell displays them.
set_config(display='diagram')

In [4]:
def canonizer(data, test_size=0.1, nb_coef=20, random_state=42, cross_validation=False, kernel='rbf',
              target='canonicity', cv=None):
    """Train a standardised SVC that predicts ``data[target]`` from every other column.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample. The ``target`` column holds the labels; all
        remaining columns are used as features.
    test_size : float or int
        Hold-out size forwarded to ``train_test_split`` (and to the default
        cross-validation splitter).
    nb_coef : int
        Forwarded to ``plot_coefficients`` when ``kernel == 'linear'``.
    random_state : int
        Seed for the train/test split, the default splitter and the SVC's
        probability calibration.
    cross_validation : bool
        When truthy, run ``cross_validate`` on the unfitted pipeline instead of
        a single hold-out evaluation.
    kernel : str
        SVC kernel.
    target : str
        Name of the label column. Defaults to ``'canonicity'``.
    cv : int, splitter or None
        Cross-validation strategy. ``None`` uses
        ``ShuffleSplit(n_splits=5, test_size=test_size, random_state=random_state)``.

    Returns
    -------
    (pipe, cv_results) when ``cross_validation`` is truthy: the *unfitted*
    pipeline and the dict returned by ``cross_validate``.
    (pipe, df_results) otherwise: the fitted pipeline and a frame indexed like
    the test labels with the columns ``metadata`` (true label), one probability
    column per class, ``prediction`` and ``accord`` (prediction == true label).
    For the two classes ``canon`` / ``noncanon`` the probability columns keep
    their historical names ``proba canon`` / ``proba non-canon``; for any other
    label set they are named ``proba <class>``.
    """
    features = data.drop(columns=[target])
    labels = data[target]

    pipe = make_pipeline(
        StandardScaler(),
        # random_state only matters because probability=True adds a randomised
        # internal calibration step; seeding it makes predict_proba repeatable.
        SVC(kernel=kernel, probability=True, random_state=random_state),
    )

    if cross_validation:
        if cv is None:
            # `cv` used to be an undefined name here (NameError on a fresh kernel).
            cv = ShuffleSplit(n_splits=5, test_size=test_size, random_state=random_state)
        cv_results = cross_validate(pipe, features, labels, cv=cv)
        return pipe, cv_results

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    pipe.fit(X_train, y_train)

    # Predict once and reuse the arrays everywhere below.
    canonicity_predictions = pipe.predict(X_test)
    probabilities = pipe.predict_proba(X_test)

    print(metrics.classification_report(y_test, canonicity_predictions))

    # Regression-style scores only make sense for numeric (rating) labels;
    # string labels such as 'canon' cannot be converted by these metrics.
    if pd.api.types.is_numeric_dtype(y_test):
        r2 = r2_score(y_test, canonicity_predictions)
        mae = mean_absolute_error(y_test, canonicity_predictions)
        mse = mean_squared_error(y_test, canonicity_predictions)
        # sqrt(mse) instead of the deprecated `squared=False` argument.
        rmse = np.sqrt(mse)
        print("r2 : %0.1f%% " % (r2 * 100))
        print("mean_absolute_error : ", mae)
        print("mean_squared_error : ", mse)
        print("root mean_squared_error : ", rmse)

    if kernel == 'linear':
        coefs = pipe.named_steps['svc'].coef_
        # Pass the feature names only: data.columns still contains the target
        # column and would be one entry longer than each coefficient vector.
        # NOTE(review): plot_coefficients is not defined in the visible part of
        # the notebook; `*coefs` yields a single vector only for binary labels.
        plot_coefficients(*coefs, features.columns, nb_coef)

    df_results = pd.DataFrame({'metadata': y_test})

    # predict_proba columns follow the classifier's classes_ order, so name
    # each column from the class it belongs to instead of assuming positions.
    classes = list(pipe.named_steps['svc'].classes_)
    legacy_names = {'canon': 'proba canon', 'noncanon': 'proba non-canon'}
    use_legacy_names = set(classes) == set(legacy_names)
    for position, label in enumerate(classes):
        column = legacy_names[label] if use_legacy_names else 'proba %s' % label
        df_results[column] = probabilities[:, position]

    df_results['prediction'] = canonicity_predictions
    df_results['accord'] = df_results['metadata'] == df_results['prediction']

    return pipe, df_results

# BoW features 

In [6]:
# Load the BoW feature CSV, use its "index" column as the row index and
# replace missing values with 0, all in one chain without in-place mutation.
df_main = (
    pd.read_csv(r'../data/01_chunks_BoW_features.csv')
    .set_index("index")
    .replace(np.nan, 0)
)

In [17]:
def retrieve_canonicity(index_name):
    """Return the fourth underscore-separated field of every name in ``index_name``.

    ``index_name`` is any iterable of strings. A name with fewer than four
    ``_``-separated fields raises ``IndexError``.
    """
    return [name.split('_')[3] for name in index_name]

In [18]:
# Take the fourth underscore-separated field of each df_main row label.
list_canonicity = retrieve_canonicity(df_main.index)

In [19]:
# Store the extracted tags as the 'canonicity' column (this mutates df_main in place).
df_main['canonicity'] = list_canonicity

In [52]:
# Run canonizer on df_main with its default arguments (single train/test split);
# it returns the pipeline and a per-sample results frame.
pipe, df_results = canonizer(df_main)

              precision    recall  f1-score   support

       canon       0.90      0.87      0.88       689
    noncanon       0.90      0.92      0.91       899

    accuracy                           0.90      1588
   macro avg       0.90      0.89      0.90      1588
weighted avg       0.90      0.90      0.90      1588



In [53]:
# Display the pipeline returned by canonizer.
pipe

## Get canonicity ratings

In [11]:
# Load the corpus table; get_canonicity_rate reads its 'titre' and 'canonicity_1' columns.
df_corpus = pd.read_csv(r'../../canonization_process/metadata/df_corpus_rate_canonicity.csv')

In [12]:
def title(df_main):
    """Return the normalised title encoded in each row label of ``df_main``.

    The title is the third underscore-separated field of the label; the raw
    values are passed through ``net_title`` before being returned. A label with
    fewer than three fields raises ``IndexError``.
    """
    raw_titles = [label.split("_")[2] for label in df_main.index]
    return net_title(raw_titles)

In [13]:
def net_title(liste_titre):
    """Normalise titles: replace every hyphen with a space, then lowercase.

    Returns a new list with one cleaned string per element of ``liste_titre``.
    """
    return [re.sub("-", " ", raw).lower() for raw in liste_titre]

In [14]:
def get_canonicity_rate(df_main, df_corpus):
    """Return one integer canonicity rating per row of ``df_main``.

    Each row label of ``df_main`` is turned into a normalised title by
    ``title``; the rating is the ``canonicity_1`` value of the first row of
    ``df_corpus`` whose ``titre`` equals that title.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Frame whose index labels encode the titles (see ``title``).
    df_corpus : pandas.DataFrame
        Frame with at least the columns ``titre`` and ``canonicity_1``.

    Returns
    -------
    list of int
        Ratings in the row order of ``df_main``. A title with no match in
        ``df_corpus``, or whose rating is missing (NaN), gets 0.
    """
    # Build the title -> rating lookup once instead of scanning df_corpus for
    # every row. keep="first" mirrors the original "first matching row" rule.
    first_rows = df_corpus.drop_duplicates(subset="titre", keep="first")
    rate_by_title = dict(zip(first_rows["titre"], first_rows["canonicity_1"]))

    canonicity_list = []
    for titre in title(df_main):
        rate = rate_by_title.get(titre)
        # Test for "no match" explicitly rather than through the truthiness of
        # the matched cells; a NaN rating is treated like an unrated title.
        if rate is None or pd.isna(rate):
            canonicity_list.append(0)
        else:
            canonicity_list.append(int(rate))
    return canonicity_list

In [15]:
# One integer rating per df_main row, looked up in df_corpus by title (0 when no title matches).
canonicity_list = get_canonicity_rate(df_main, df_corpus)

In [20]:
# Copy of df_main without the 'canonicity' label column; the rating labels
# are attached to this copy instead.
df_rate = df_main.drop(columns=["canonicity"])

In [21]:
# Attach the ratings as the 'canonicity_rate' column (this mutates df_rate in place).
df_rate["canonicity_rate"] = canonicity_list

In [22]:
# Number of rows per rating value.
df_rate.canonicity_rate.value_counts()

0    10805
3     3631
5     1117
2      327
Name: canonicity_rate, dtype: int64

In [24]:
# Same distribution expressed as a share of all rows.
df_rate.canonicity_rate.value_counts(normalize=True)

0    0.680416
3    0.228652
5    0.070340
2    0.020592
Name: canonicity_rate, dtype: float64

In [None]:
# df_rate keeps its labels in 'canonicity_rate', but canonizer() reads the
# target from a column named 'canonicity' by default, so passing df_rate as-is
# raises KeyError. Hand it a renamed copy; df_rate itself is left untouched.
pipe_rate, df_results_rate = canonizer(df_rate.rename(columns={"canonicity_rate": "canonicity"}))

# LDA features 

# DBoW neural embeddings features 