In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('/home/crazyjeannot/Documents/doctorat/AVENTURES/data/metadata/GPT_ANNOT_OUTPUT_MAIN.csv')
df.set_index('doc', inplace=True)

In [3]:
def classification(df, N_sample=10):
    all_accuracy = []
    for i in tqdm(range(N_sample)):
        X_train, X_test, y_train, y_test = train_test_split(
            df.drop(['label'], axis=1), df['label'], test_size=0.2#, random_state=42
        )

        pipe = make_pipeline(StandardScaler(), SVC(probability=True))

        pipe.fit(X_train, y_train) # clf
        preds = pipe.predict(X_test)
        probas = pipe.predict_proba(X_test)

        accuracy = accuracy_score(y_test, preds)
        report = classification_report(y_test, preds)
        #print(report)
        print(accuracy)
        all_accuracy.append(accuracy)
    print(sum(all_accuracy)/len(all_accuracy))
    return pipe, accuracy

# EMBEDDINGS ALL

In [102]:
df_all_embeddings = pd.read_csv('/home/crazyjeannot/Documents/doctorat/AVENTURES/data/GPT_EMBEDDINGS_OUTPUT_MAIN.csv')
df_all_embeddings.set_index('doc', inplace=True)

In [103]:
df_merged = pd.merge(df_all_embeddings, df, left_index=True, right_index=True, how='left')

In [104]:
pipe, acc = classification(df_merged)

  0%|          | 0/10 [00:00<?, ?it/s]

0.855
0.84
0.835
0.87
0.82
0.84
0.825
0.795
0.865
0.82
0.8365


### DOC2VEC EMBEDDINGS

In [6]:
df_all_embeddings = pd.read_csv('/home/crazyjeannot/Documents/doctorat/AVENTURES/data/DOC2VEC_EMBEDDINGS_OUTPUT_MAIN.csv')
df_all_embeddings.set_index('doc', inplace=True)

In [8]:
df_merged = pd.merge(df_all_embeddings, df, left_index=True, right_index=True, how='left')

In [11]:
pipe, acc = classification(df_merged)

  0%|          | 0/10 [00:00<?, ?it/s]

0.835
0.81
0.785
0.82
0.79
0.82
0.82
0.815
0.83
0.815
0.8140000000000001


## Get Vingt Mille Lieux sous la mer - inferences

In [17]:
df_main = pd.read_csv('GPT_verne_EMBEDDINGS_OUTPUT_MAIN.csv')
df_main.set_index(['doc'], inplace=True)

In [18]:
preds = pipe.predict(df_main)
probas = pipe.predict_proba(df_main)

In [22]:
proba_A, proba_nonA = [proba[0] for proba in probas], [proba[1] for proba in probas]

In [23]:
zipped = list(zip(list(df_main.index), preds, proba_A, proba_nonA))

In [25]:
df_verne = pd.DataFrame(zipped, columns=['Index', 'Preds', 'Proba_A', 'Proba_nonA'])

In [28]:
df_verne.to_csv('preds_Vingt_Milles_Lieux.csv', index=False, header=True)

## Get l'education sentimentale - inferences

In [55]:
df_main = pd.read_csv('GPT_flaubert_EMBEDDINGS_OUTPUT_MAIN.csv')
df_main.set_index(['doc'], inplace=True)

In [56]:
preds = pipe.predict(df_main)
probas = pipe.predict_proba(df_main)

In [57]:
proba_A, proba_nonA = [proba[0] for proba in probas], [proba[1] for proba in probas]

In [58]:
zipped = list(zip(list(df_main.index), preds, proba_A, proba_nonA))

In [59]:
df_flaubert = pd.DataFrame(zipped, columns=['Index', 'Preds', 'Proba_A', 'Proba_nonA'])

In [60]:
df_flaubert.to_csv('preds_Education.csv', index=False, header=True)

### Get 10000 random chunks score

In [137]:
df_main = pd.read_csv('../data/EMBEDDINGS_CHAPITRES_RANDOM.csv')
df_main.set_index(['doc'], inplace=True)

In [138]:
preds = pipe2.predict(df_main)
probas = pipe2.predict_proba(df_main)

In [139]:
proba_A, proba_nonA = [proba[0] for proba in probas], [proba[1] for proba in probas]

In [140]:
zipped = list(zip(list(df_main.index), preds, proba_A, proba_nonA))

In [141]:
df_10000 = pd.DataFrame(zipped, columns=['Index', 'Preds', 'Proba_A', 'Proba_nonA'])

In [142]:
df_10000.to_csv('preds_10000.csv', index=False, header=True)

In [144]:
df_10000.Preds.value_counts()

Preds
NON_ADVENTURE    7787
ADVENTURE        2213
Name: count, dtype: int64

# EMBEDDINGS BOOKNLP

In [105]:
df_booknlp = pd.read_csv('../data/GPT_EMBEDDINGS_1000_BOOKNLP_OUTPUT_MAIN.csv')
df_booknlp.set_index('doc', inplace=True)

In [106]:
df_merged_booknlp = pd.merge(df_booknlp, df, left_index=True, right_index=True, how='left')

In [107]:
pipe_booknlp, acc_booknlp = classification(df_merged_booknlp)

  0%|          | 0/10 [00:00<?, ?it/s]

0.8170731707317073
0.7682926829268293
0.7621951219512195
0.7621951219512195
0.7865853658536586
0.7317073170731707
0.7987804878048781
0.7682926829268293
0.7987804878048781
0.7621951219512195
0.775609756097561


# EMBEDDINGS WITHOUT BOOKNLP

In [108]:
df_non_booknlp = pd.read_csv('../data/GPT_EMBEDDINGS_1000_NONBOOKNLP_OUTPUT_MAIN.csv')
df_non_booknlp.set_index('doc', inplace=True)

In [109]:
df_merged_non_booknlp = pd.merge(df_non_booknlp, df, left_index=True, right_index=True, how='left')

In [110]:
pipe_non_booknlp, acc_non_booknlp = classification(df_merged_non_booknlp)

  0%|          | 0/10 [00:00<?, ?it/s]

0.8170731707317073
0.75
0.7439024390243902
0.7439024390243902
0.8048780487804879
0.8109756097560976
0.7987804878048781
0.7926829268292683
0.8414634146341463
0.8109756097560976
0.7914634146341463


# EMBEDDINGS 150 tokens WITHOUT BOOKNLP

In [114]:
df_random_non_booknlp = pd.read_csv('GPT_EMBEDDINGS_1000_RANDOM_NONBOOKNLP_OUTPUT_MAIN.csv')
df_random_non_booknlp.set_index('doc', inplace=True)

In [115]:
df_merged_random_non_booknlp = pd.merge(df_random_non_booknlp, df, left_index=True, right_index=True, how='left')

In [116]:
pipe_random_non_booknlp, acc_random_non_booknlp = classification(df_merged_non_booknlp)

  0%|          | 0/10 [00:00<?, ?it/s]

0.8170731707317073
0.8475609756097561
0.8048780487804879
0.7804878048780488
0.7682926829268293
0.7378048780487805
0.774390243902439
0.7926829268292683
0.8048780487804879
0.8109756097560976
0.7939024390243903


In [None]:
GPT_EMBEDDINGS_1000_RANDOM_NONBOOKNLP_OUTPUT_MAIN