# Visualizations of the Summarized Predictions Part 2

In [None]:
import time
import os
import glob
import re
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive. The project root configured in this
# notebook (`incerto_dir`) is a path underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Project locations on the mounted Drive. Every directory is derived from the
# project root, and the predictions folder is nested inside the output folder.
incerto_dir = '/content/drive/MyDrive/incerto-autore'
output_dir = os.path.join(incerto_dir, 'output')
predictions_dir = os.path.join(output_dir, 'predictions')
figures_dir = os.path.join(incerto_dir, 'figures')
poems_dir = os.path.join(incerto_dir, 'data', 'poems')

## Binary Classifiers

In [None]:
# Load the binary-classifier predictions and print the row count before filtering.
bin_pred = pd.read_csv(os.path.join(predictions_dir, 'predictions_binary.csv'))
print(len(bin_pred))

# Keep only rows whose `cv_f1-score` and `f1-score` are both above 0.7, then
# drop the `cv_f1-score` column and print the row count after filtering.
passes_cv = bin_pred['cv_f1-score'] > 0.7
passes_f1 = bin_pred['f1-score'] > 0.7
bin_pred = bin_pred.loc[passes_cv & passes_f1].drop(columns='cv_f1-score')
print(len(bin_pred))

# Franco's rows for the 'Split' poem type, reduced to the column layout that
# the multiclass frame in this notebook is also reduced to.
shared_columns = ['label', 'author', 'prediction', 'classifier', 'f1-score', 'poem_type', 'classifier_type']
is_split = bin_pred['poem_type'] == 'Split'
is_franco = bin_pred['author'] == 'Franco'
franco_bin_split = bin_pred.loc[is_split & is_franco, shared_columns].copy()
print(len(franco_bin_split))
franco_bin_split.head(1)

28780
28745
20880


Unnamed: 0,label,author,prediction,classifier,f1-score,poem_type,classifier_type
0,UA11_1,Franco,0,SVM,0.808679,Split,Binary


In [None]:
# F1 value for the fine-tuned binary BERToldo model, read from the row labelled
# 3 in its saved classification report.
# NOTE(review): which report entry row 3 holds is not visible in this
# notebook — confirm it is the intended one.
finetuned_path = os.path.join(
    incerto_dir, 'output', 'finetuned-models', 'binary-class', 'bertoldo', 'Franco',
    'classification_report.csv',
)
df = pd.read_csv(finetuned_path)
bertoldo_binary_f1 = df.loc[3, 'f1-score']

# Franco's rows from the binary BERToldo predictions, tagged with the
# classifier type, the poem type and the F1 value read above.
bin_bert_pred = pd.read_csv(os.path.join(predictions_dir, 'predictions_binary_bertoldo.csv'))
franco_bin_bert = bin_bert_pred.loc[bin_bert_pred['author'] == 'Franco'].assign(
    **{
        'classifier_type': 'Binary',
        'poem_type': 'Split',
        'f1-score': bertoldo_binary_f1,
    }
)
print(len(franco_bin_bert))
franco_bin_bert.head(1)

60


Unnamed: 0,label,author,classifier,prediction,classifier_type,poem_type,f1-score
0,UA11_1,Franco,BERToldo,0.703703,Binary,Split,0.937564


In [None]:
# Stack the filtered rows from predictions_binary.csv and the binary BERToldo
# rows into one table. Each input keeps the row labels of the frame it was
# filtered from, so the index is rebuilt here to avoid duplicate labels; the
# other concatenations in this notebook pass ignore_index=True as well.
franco_bin = pd.concat([franco_bin_split, franco_bin_bert], ignore_index=True)
print(len(franco_bin))
franco_bin[:1]

20940


Unnamed: 0,label,author,prediction,classifier,f1-score,poem_type,classifier_type
0,UA11_1,Franco,0.0,SVM,0.808679,Split,Binary


## Multiclass Classifiers

In [None]:
# Load the multiclass predictions, print the number of rows and preview the first one.
predictions_multiclass_csv = os.path.join(predictions_dir, 'predictions_multiclass.csv')
multi_pred = pd.read_csv(predictions_multiclass_csv)
print(multi_pred.shape[0])
multi_pred.head(1)

360


Unnamed: 0,label,prediction,classifier,vectorizer,ngram_type,ngram_range,max_df,min_df,max_f,num_f,...,cv_f1-score,f1-score,AntonGiacomoCorso_f1,CelioMagno_f1,DomenicoVenier_f1,Franco_f1,GiorgioGradenigo_f1,MarcoVenier_f1,Petrarca_f1,PietroBembo_f1
0,UA11_1,Franco,Logit,Count,Word,Unigrams,0.8,0.0,1000,1000,...,0.438551,0.370928,0.4,0.521739,0.166667,0.848921,0.0,0.0,0.769231,0.26087


In [None]:
# Recast the multiclass predictions for 'Split' poems as a Franco-vs-rest table
# with the same columns as the binary frames:
#   * `prediction` becomes 1 where the predicted author is 'Franco', else 0;
#   * `f1-score` is replaced by the per-author `Franco_f1` value;
#   * `author` is set to 'Franco' on every row.
split_rows = multi_pred.loc[multi_pred['poem_type'] == 'Split']
predicted_franco = split_rows['prediction'] == 'Franco'
franco_multi_split = split_rows.assign(
    **{
        'author': 'Franco',
        'prediction': predicted_franco.astype('int64'),
        'f1-score': split_rows['Franco_f1'],
    }
)
franco_multi_split = franco_multi_split[
    ['label', 'author', 'prediction', 'classifier', 'f1-score', 'poem_type', 'classifier_type']
]
franco_multi_split.head(1)

Unnamed: 0,label,author,prediction,classifier,f1-score,poem_type,classifier_type
0,UA11_1,Franco,1,Logit,0.848921,Split,Multiclass


In [None]:
# Author name -> integer id, numbered 0-7 in the order listed, plus a
# 'macro avg' entry at 9 (id 8 is not assigned).
# NOTE(review): presumably these are row positions in the multiclass
# classification report; no cell shown in this notebook reads the mapping — confirm.
label2id = {
    author: idx
    for idx, author in enumerate([
        'AntonGiacomoCorso',
        'CelioMagno',
        'DomenicoVenier',
        'Franco',
        'GiorgioGradenigo',
        'MarcoVenier',
        'Petrarca',
        'PietroBembo',
    ])
}
label2id['macro avg'] = 9

In [None]:
# F1 value for the fine-tuned multiclass BERToldo model, read from the row
# labelled 3 in its saved classification report. 3 is also the id that
# `label2id` gives to 'Franco'.
# NOTE(review): confirm the report rows follow the `label2id` order.
finetuned_path = os.path.join(
    incerto_dir, 'output', 'finetuned-models', 'multi-class', 'bertoldo',
    'classification_report.csv',
)
df = pd.read_csv(finetuned_path)
bertoldo_multi_f1 = df.loc[3, 'f1-score']

# Franco's rows from the multiclass BERToldo predictions, tagged with the F1
# value read above, the poem type and the classifier type.
multi_bert_pred = pd.read_csv(os.path.join(predictions_dir, 'predictions_multi_bertoldo.csv'))
franco_multi_bert = multi_bert_pred.loc[multi_bert_pred['author'] == 'Franco'].assign(
    **{
        'f1-score': bertoldo_multi_f1,
        'poem_type': 'Split',
        'classifier_type': 'Multiclass',
    }
)
franco_multi_bert.head(1)

Unnamed: 0,label,author,classifier,prediction,f1-score,poem_type,classifier_type
3,UA11_1,Franco,BERToldo,0.822792,0.846847,Split,Multiclass


In [None]:
# Combine the recast multiclass rows with the multiclass BERToldo rows under a
# freshly numbered index, print the row count and preview the first row.
franco_multi = pd.concat((franco_multi_split, franco_multi_bert), ignore_index=True)
print(franco_multi.shape[0])
franco_multi.head(1)

420


Unnamed: 0,label,author,prediction,classifier,f1-score,poem_type,classifier_type
0,UA11_1,Franco,1.0,Logit,0.848921,Split,Multiclass


## All classifiers (binary and multiclass combined)

In [None]:
def _poem_number(label):
    """Return the integer that follows the two-character prefix of *label*.

    Only the text before the first underscore is considered, so a label such
    as 'UA11_1' yields 11.
    """
    prefix, _, _ = label.partition('_')
    return int(prefix[2:])


# Stack the binary and multiclass tables under a freshly numbered index and
# tag every row with the number of the poem it belongs to.
all_pred = pd.concat([franco_bin, franco_multi], ignore_index=True)
all_pred['group_label'] = [_poem_number(label) for label in all_pred['label']]
all_pred.head(3)

Unnamed: 0,label,author,prediction,classifier,f1-score,poem_type,classifier_type,group_label
0,UA11_1,Franco,0.0,SVM,0.808679,Split,Binary,11
1,UA11_2,Franco,0.0,SVM,0.808679,Split,Binary,11
2,UA11_3,Franco,0.0,SVM,0.808679,Split,Binary,11


## Voting system

In [None]:
# Tally, for every poem number, the share of rows that vote "yes" for Franco.
# A row counts as "yes" when its prediction is strictly above 0.80; every
# other row counts as "no". Shares are rounded to whole percentages.
votes = []
for poem_n in all_pred.group_label.unique():
    poem_predictions = all_pred.loc[all_pred['group_label'] == poem_n, 'prediction']
    total = len(poem_predictions)
    yes = int((poem_predictions > 0.80).sum())
    no = total - yes
    votes.append({
        'Poem by Franco?': f'UA{poem_n}',
        'Yes %': round(yes / total * 100),
        'No %': round(no / total * 100),
    })

# Rank the poems from the highest to the lowest "yes" share.
votes_df = pd.DataFrame(votes).sort_values('Yes %', ascending=False)
votes_df

Unnamed: 0,Poem by Franco?,Yes %,No %
3,UA6,82,18
1,UA14,70,30
5,UA9,54,46
4,UA7,50,50
6,UA1,40,60
2,UA4,30,70
0,UA11,17,83


## Probability by poem chunk

In [37]:
# Join the `poem` text from poems_split.csv to the predictions on `label`
# (pandas' default inner join) and write the result to
# chunk_probabilities.csv in the current working directory.
# NOTE(review): `of_interest` is not defined in any cell shown in this
# notebook; it has to exist in the session before this cell runs — confirm
# which frame it is meant to be.
poems_split_csv = os.path.join(poems_dir, 'poems_split.csv')
poems_df = pd.read_csv(poems_split_csv)[['label', 'poem']]
merged = poems_df.merge(of_interest[['label', 'prediction']], on='label')
merged.to_csv('chunk_probabilities.csv', index=False)