In [3]:
import time
import os
import glob
import re
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
incerto_dir = '/content/drive/MyDrive/incerto-autore'
poems_dir = os.path.join(incerto_dir, 'data', 'poems')
figures_dir = os.path.join(incerto_dir, 'figures')
output_dir = os.path.join(incerto_dir, 'output')
predictions_dir = os.path.join(incerto_dir, 'output','predictions')

## Binary Classifiers

In [8]:
bin_pred = pd.read_csv(os.path.join(predictions_dir, 'predictions_binary2.csv'))
print(len(bin_pred))
# bin_pred = bin_pred.loc[(bin_pred['cv_f1-score'] > 0.7) & (bin_pred['f1-score'] > 0.7)].drop('cv_f1-score', axis=1)
# print(len(bin_pred))

franco_bin_split = bin_pred[(bin_pred['poem_type'] == 'Split') & (bin_pred['author'] == 'VeronicaFranco')][['label', 'author',  'prediction', 'classifier', 'cv_f1-score', 'poem_type', 'classifier_type']].copy()
print(len(franco_bin_split))
franco_bin_split[:1]

21327
10500


Unnamed: 0,label,author,prediction,classifier,cv_f1-score,poem_type,classifier_type
884,UnknownAuthor_1_1,VeronicaFranco,0,Logit,0.715512,Split,Binary


In [13]:
bin_bert_pred = pd.read_csv(os.path.join(predictions_dir, 'predictions_binary_bertoldo.csv'))
franco_bin_bert = bin_bert_pred[bin_bert_pred['author'] == 'VeronicaFranco'].copy()
franco_bin_bert['classifier_type'] = 'Binary'
franco_bin_bert['poem_type'] = 'Split'

finetuned_path = os.path.join(incerto_dir, 'output','finetuned-models', 'binary-class', 'bertoldo', 'VeronicaFranco', 'classification_report.csv')
df = pd.read_csv(finetuned_path)
franco_bin_bert['f1-score'] = df.loc[:, 'f1-score'][3]
franco_bin_bert = franco_bin_bert.rename(columns={'f1-score':'cv_f1-score'})
print(len(franco_bin_bert))
franco_bin_bert[:1]

60


Unnamed: 0,label,author,classifier,prediction,classifier_type,poem_type,cv_f1-score
300,UnknownAuthor_1_1,VeronicaFranco,BERToldo,0.118942,Binary,Split,0.904452


In [14]:
franco_bin = pd.concat([franco_bin_split, franco_bin_bert])
print(len(franco_bin))
franco_bin[:1]

10560


Unnamed: 0,label,author,prediction,classifier,cv_f1-score,poem_type,classifier_type
884,UnknownAuthor_1_1,VeronicaFranco,0.0,Logit,0.715512,Split,Binary


## Multiclass Classifiers

In [15]:
multi_pred = pd.read_csv(os.path.join(predictions_dir, 'predictions_multiclass2.csv'))
multi_pred.rename(columns={'probability':'prediction'}, inplace=True)
print(len(multi_pred))
multi_pred[:1]

5040


Unnamed: 0,label,prediction,classifier,author,vectorizer,ngram_type,ngram_range,max_df,min_df,max_f,num_f,scaler,poem_type,classifier_type,cv_f1-score
0,UnknownAuthor_1_1,0.0304,Logit,AntonGiacomoCorso,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,Split,Multiclass,0.416496


In [16]:
franco_multi_split = multi_pred[multi_pred['poem_type'] == 'Split'].copy()

franco_multi_split = franco_multi_split[['label', 'author',  'prediction', 'classifier', 'cv_f1-score', 'poem_type', 'classifier_type']]
franco_multi_split[:1]

Unnamed: 0,label,author,prediction,classifier,cv_f1-score,poem_type,classifier_type
0,UnknownAuthor_1_1,AntonGiacomoCorso,0.0304,Logit,0.416496,Split,Multiclass


In [17]:
label2id = {'AntonGiacomoCorso': 0, 'CelioMagno': 1, 'DomenicoVenier': 2, 'Franco': 3, 'GiorgioGradenigo': 4, 'MarcoVenier': 5, 'Petrarca': 6, 'PietroBembo': 7, 'macro avg': 9}

In [18]:
franco_multi = franco_multi_split #pd.concat([new_multi_pred, new_multi_bert_pred], ignore_index=True)
cond = (franco_multi['cv_f1-score'] > 0.7)
franco_multi = franco_multi.loc[cond].reset_index(drop=True).copy()
franco_multi.reset_index()
print(len(franco_multi))
franco_multi

0


Unnamed: 0,label,author,prediction,classifier,cv_f1-score,poem_type,classifier_type


## All

In [19]:
author_UA_dict = {
    "UnknownAuthor_1": "UA11",
    "UnknownAuthor_2": "UA14",
    "UnknownAuthor_3": "UA1",
    "UnknownAuthor_4": "UA4",
    "UnknownAuthor_5": "UA6",
    "UnknownAuthor_6": "UA7",
    "UnknownAuthor_7": "UA9"
}

In [20]:
author_UA_number_dict = {
    "UnknownAuthor_1": 11,
    "UnknownAuthor_2": 14,
    "UnknownAuthor_3": 1,
    "UnknownAuthor_4": 4,
    "UnknownAuthor_5": 6,
    "UnknownAuthor_6": 7,
    "UnknownAuthor_7": 9
}

In [21]:
all_pred = pd.concat([franco_bin, franco_multi], ignore_index=True)
all_pred['group_label'] = ['_'.join(x.split('_')[:-1]) for x in all_pred.label]
all_pred['group_label'] = all_pred['group_label'].map(author_UA_number_dict)
# all_pred['label'] = all_pred['label'].apply(lambda x: author_UA_dict['_'.join(x.split('_')[:2])] + '_' + '_'.join(x.split('_')[2:]) if '_'.join(x.split('_')[:2]) in author_UA_dict else x)
all_pred[:3]

Unnamed: 0,label,author,prediction,classifier,cv_f1-score,poem_type,classifier_type,group_label
0,UnknownAuthor_1_1,VeronicaFranco,0.0,Logit,0.715512,Split,Binary,11
1,UnknownAuthor_1_2,VeronicaFranco,1.0,Logit,0.715512,Split,Binary,11
2,UnknownAuthor_1_3,VeronicaFranco,1.0,Logit,0.715512,Split,Binary,11


## Voting system

In [22]:
votes = []
for poem_n in all_pred.group_label.unique():
  yes = 0
  no = 0
  df = all_pred[all_pred['group_label'] == poem_n]
  N = len(df)
  for prediction in df.prediction:
    if prediction > 0.80:
      yes += 1
    else:
      no += 1
  votes.append({'Poem by Franco?' : f'UA{poem_n}', 'Yes %': round(yes/N*100), 'No %': round(no/N*100)})

votes_df = pd.DataFrame(votes).sort_values('Yes %', ascending=False)
votes_df

Unnamed: 0,Poem by Franco?,Yes %,No %
4,UA6,59,41
1,UA14,58,42
6,UA9,56,44
2,UA1,41,59
5,UA7,39,61
3,UA4,33,67
0,UA11,12,88


## Poem chunk with highest probability

In [None]:
of_interest = all_pred.groupby(['label'], as_index=False).mean(numeric_only=True)
of_interest['prediction'] = of_interest['prediction'].apply(lambda x: round(x*100))
of_interest.sort_values('prediction', ascending=False)[:10]

## Probability by poem chunk

In [None]:
poems_df = pd.read_csv(os.path.join(poems_dir, 'poems_split.csv'))[['label', 'poem']]
poems_df

In [None]:
merged = pd.merge(left=poems_df, on='label', right=of_interest[['label', 'prediction']])
merged

In [None]:
merged['label'] = merged['label'].apply(lambda x: author_UA_dict['_'.join(x.split('_')[:2])] + '_' + '_'.join(x.split('_')[2:]) if '_'.join(x.split('_')[:2]) in author_UA_dict else x)
merged

In [None]:
merged.to_csv(os.path.join(output_dir, 'chunk_probabilities.csv'), index=False)