# Getting predictions for the unknown author from the best-performing binary models

In [1]:
import time
import os
import glob
import re
import csv
import pandas as pd
import numpy as np

import ray

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm
2025-07-20 21:39:31,248	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Switch for the expensive steps: Ray is started and the prediction tasks are
# submitted only when this is True.
run_predictions = True

In [3]:
# The project root sits two directory levels above the current working directory.
incerto_dir = os.path.join(os.getcwd(), os.pardir, os.pardir)

# Input and output locations, all expressed relative to the project root.
poems_dir = os.path.join(incerto_dir, 'data', 'poems')
figures_dir = os.path.join(incerto_dir, 'figures')
output_dir = os.path.join(incerto_dir, 'output')

# CSV holding the performance of every binary classifier configuration.
classification_path = os.path.join(
    output_dir,
    'classification-performance',
    'binary_classification_performance.csv',
)

In [4]:
# Load both versions of the corpus. Which one a model is run on is chosen from
# the 'type' value of its results row ('Split' or 'Whole').
p_df_split = pd.read_csv(os.path.join(poems_dir, 'poems_split.csv'))
p_df_whole = pd.read_csv(os.path.join(poems_dir, 'poems_whole.csv'))

In [22]:
# keep_default_na=False stops pandas from turning its default NA markers into
# NaN, so those cells stay as the literal strings stored in the CSV.
results_df = pd.read_csv(classification_path, keep_default_na=False)
print(len(results_df))
results_df.head(1)

32400


Unnamed: 0,classifier,author,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
0,Logit,AntonGiacomoCorso,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.649688,Split


In [6]:
# Display the results table (pandas elides the middle rows of a long frame).
results_df

Unnamed: 0,classifier,author,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
0,Logit,AntonGiacomoCorso,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.649688,Split
1,kNN,AntonGiacomoCorso,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.518690,Split
2,SVM,AntonGiacomoCorso,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.486660,Split
3,RandomForest,AntonGiacomoCorso,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.486660,Split
4,Logit,BartolomeoZacco,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.498877,Split
...,...,...,...,...,...,...,...,...,...,...,...,...
32395,RandomForest,ValerioSali,TfIdf,Word,Unigrams,1.0,0.2,1000,64,L2,0.502253,Whole
32396,kNN,VeronicaFranco,TfIdf,Word,Unigrams,1.0,0.2,1000,64,L2,0.672914,Whole
32397,Logit,VeronicaFranco,TfIdf,Word,Unigrams,1.0,0.2,1000,64,L2,0.487926,Whole
32398,SVM,VeronicaFranco,TfIdf,Word,Unigrams,1.0,0.2,1000,64,L2,0.575008,Whole


In [7]:
# Models trained to recognise Veronica Franco that reach the 0.7 F1 threshold.
cond = (results_df['f1-score'] >= 0.7) & (results_df['author'] == 'VeronicaFranco')
# .copy() yields an independent frame, as done for the other "top" subsets.
franco_top = results_df.loc[cond].copy()
# The filter is inclusive, so the label reads ">=" rather than ">".
print('Number of models with F1-score (macro avg) >= 0.7:\t', len(franco_top))
print('Average f1-score:\t\t\t\t\t', round(franco_top['f1-score'].mean(), 2))
print('Number of models using split poems:\t\t\t', len(franco_top.loc[franco_top['type'] == 'Split']))

Number of models with F1-score (macro avg) > 0.7:	 394
Average f1-score:					 0.73
Number of models using split poems:			 175


In [8]:
# Models trained to recognise Petrarca that reach the 0.7 F1 threshold.
cond = (results_df['f1-score'] >= 0.7) & (results_df['author'] == 'Petrarca')
petrarca_top = results_df.loc[cond].copy()
# The filter is inclusive, so the label reads ">=" rather than ">".
print('Number of models with F1-score (macro avg) >= 0.7:\t', len(petrarca_top))
print('Authors the models are trained to classify:\t\t', petrarca_top['author'].unique())
print('Average f1-score:\t\t\t\t\t', round(petrarca_top['f1-score'].mean(), 2))
print('Number of models using split poems:\t\t\t', len(petrarca_top.loc[petrarca_top['type'] == 'Split']))

Number of models with F1-score (macro avg) > 0.7:	 158
Authors the models are trained to classify:		 ['Petrarca']
Average f1-score:					 0.75
Number of models using split poems:			 42


In [9]:
# Models for every author other than Veronica Franco and Petrarca that reach
# the 0.7 F1 threshold.
cond = (results_df['f1-score'] >= 0.7) & (results_df['author'] != 'VeronicaFranco') & (results_df['author'] != 'Petrarca')
other_top = results_df.loc[cond].copy()
# The filter is inclusive, so the label reads ">=" rather than ">".
print('Number of models with F1-score (macro avg) >= 0.7:\t', len(other_top))
print('Authors the models are trained to classify:\t\t', other_top['author'].unique())
print('Average f1-score:\t\t\t\t\t', round(other_top['f1-score'].mean(), 2))
print('Number of models using split poems:\t\t\t', len(other_top.loc[other_top['type'] == 'Split']))

Number of models with F1-score (macro avg) > 0.7:	 140
Authors the models are trained to classify:		 ['MaffioVenier' 'DomenicoVenier' 'OrsattoGiustinian' 'PietroBembo'
 'MuzioManfredi']
Average f1-score:					 0.73
Number of models using split poems:			 94


In [10]:
# Every model, for any author, whose F1-score is at least 0.7; these are the
# configurations that get re-fitted to predict the unknown poems.
top_models = results_df.loc[results_df['f1-score'].ge(0.7)]
print(len(top_models))
top_models.head(1)

692


Unnamed: 0,classifier,author,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
20,Logit,MaffioVenier,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.713763,Split


In [21]:
# Per author, count the models whose F1-score is at least 0.7.
for author in top_models['author'].unique():
    is_author_hit = results_df['author'].eq(author) & results_df['f1-score'].ge(0.7)
    n = int(is_author_hit.sum())
    print(author, n)

MaffioVenier 49
VeronicaFranco 394
Petrarca 158
DomenicoVenier 36
OrsattoGiustinian 9
PietroBembo 4
MuzioManfredi 42


## Set up

In [11]:
# Maps the n-gram range names used in the results table to the (min_n, max_n)
# tuple given to the vectorizer; each name covers exactly one n-gram size.
ngram_range_d = {
    name: (size, size)
    for size, name in enumerate(('Unigrams', 'Bigrams', 'Trigrams'), start=1)
}

In [12]:
# Estimators keyed by the names that appear in the results table's
# 'classifier' column.
classifiers = {
  # The forest is randomised; a fixed seed makes repeated runs give the same
  # predictions.
  'RandomForest': RandomForestClassifier(random_state=42),
  'kNN': KNeighborsClassifier(),
  'Logit': LogisticRegression(),
  'SVM': SVC()}

In [13]:
# Feature scalers keyed by the names that appear in the results table's
# 'scaler' column.
scalers = dict([
    ('StandardScaler', StandardScaler()),
    ('L1', Normalizer(norm='l1')),
    ('L2', Normalizer(norm='l2')),
])

In [14]:
def build_vectorizer(_typ, _ngram, _max, _min, _max_f):
    """Create an unfitted scikit-learn n-gram vectorizer from model settings.

    Parameters
    ----------
    _typ : str
        'Count' selects CountVectorizer, 'TfIdf' selects TfidfVectorizer
        (built with norm=None).
    _ngram : sequence
        Item 0 is the n-gram type; it is lower-cased and passed as
        ``analyzer``. Item 2 is passed as ``ngram_range``. Item 1 is not read.
    _max, _min
        Passed through as ``max_df`` and ``min_df``.
    _max_f
        Feature cap passed as ``max_features``. The string 'None' means no
        cap; an actual None, an empty string or a float NaN are treated the
        same way. Any other value is converted with int().

    Returns
    -------
    CountVectorizer or TfidfVectorizer
        A new, unfitted vectorizer.

    Raises
    ------
    ValueError
        If ``_typ`` is neither 'Count' nor 'TfIdf', or if int() rejects
        ``_max_f``.
    """
    known_types = ('Count', 'TfIdf')
    # Fail early with a clear message instead of reaching the end of the
    # function with no vectorizer built.
    if _typ not in known_types:
        raise ValueError(
            f"Unknown vectorizer type {_typ!r}; expected one of {known_types}")

    # Recognise every way a missing cap can arrive: the literal 'None' string,
    # a real None, an empty string, or NaN (NaN is the only float != itself).
    no_cap = (
        _max_f is None
        or (isinstance(_max_f, str) and _max_f.strip() in ('None', ''))
        or (isinstance(_max_f, float) and _max_f != _max_f)
    )
    max_features = None if no_cap else int(_max_f)

    # Settings shared by both vectorizer classes.
    shared = dict(
        input='content',
        encoding='utf-8',
        lowercase=True,
        analyzer=_ngram[0].lower(),
        ngram_range=_ngram[2],
        max_df=_max,
        min_df=_min,
        max_features=max_features,
    )

    if _typ == 'Count':
        return CountVectorizer(**shared)
    # norm=None leaves the tf-idf rows un-normalised.
    return TfidfVectorizer(norm=None, **shared)

In [15]:
def evaluate(model, test_features, test_labels):
    """Score a fitted model on held-out data.

    Predicts ``test_features`` with ``model`` and builds a scikit-learn
    classification report (as a dict, with zero_division=0) against
    ``test_labels``.

    Returns
    -------
    tuple
        ``(macro_f1, f1_for_label_1)``: the 'macro avg' F1-score and the
        F1-score stored under the report key '1'.
    """
    report = classification_report(
        test_labels,
        model.predict(test_features),
        zero_division=0,
        output_dict=True,
    )
    return report['macro avg']['f1-score'], report['1']['f1-score']

In [16]:
# Start Ray only when predictions are actually going to be computed.
if run_predictions:
    # ignore_reinit_error lets this cell be re-run in a live kernel without
    # raising because Ray has already been initialised.
    ray.init(num_cpus=6, ignore_reinit_error=True)

2025-07-20 21:39:33,971	INFO worker.py:1888 -- Started a local Ray instance.


In [17]:
@ray.remote
def do_predictions(path, row, p_df):
    """Fit one binary author model and append its predictions for the unknown poems.

    Parameters
    ----------
    path : str
        CSV file that the prediction rows are appended to.
    row : mapping
        One model configuration. Keys read: 'ngram type', 'ngram range',
        'author', 'classifier', 'scaler', 'f1-score', 'vectorizer', 'max_df',
        'min_df', 'max_f', 'num_f' and 'type'.
    p_df : pandas.DataFrame
        Poems with at least the columns 'poem', 'author' and 'label'. Rows
        whose author is 'UnknownAuthor' are the ones to predict; all other
        rows are used for training.

    Returns
    -------
    None
        Results are written to ``path``: one row per unknown poem, holding its
        label, the 0/1 prediction and the model settings.

    Raises
    ------
    ValueError
        If ``row['ngram type']`` is neither 'Char' nor 'Word'.
    """
    ngram_type = row['ngram type']
    ngram_range = row['ngram range']
    author = row['author']
    classifier = row['classifier']
    scaler = row['scaler']
    cv_score = row['f1-score']

    if ngram_type == 'Char':
        # Character n-grams are computed on the text with all whitespace removed.
        poems = [re.sub(r'\s+', '', x) for x in p_df['poem'].tolist()]
    elif ngram_type == 'Word':
        poems = p_df['poem'].tolist()
    else:
        raise ValueError(f"Unknown ngram type {ngram_type!r}; expected 'Char' or 'Word'")

    # A boolean mask selects rows by position, so the feature matrix is sliced
    # correctly even if p_df does not carry a default 0..n-1 index.
    is_unknown = (p_df['author'] == 'UnknownAuthor').to_numpy()
    if not is_unknown.any():
        # Nothing to predict for this corpus, so nothing to write.
        return

    # NOTE: the vectorizer and the scaler are fitted on all poems together,
    # known and unknown alike.
    vectorizer = build_vectorizer(row['vectorizer'],
                                  [ngram_type, ngram_range, ngram_range_d[ngram_range]],
                                  row['max_df'], row['min_df'], row['max_f'])
    X = vectorizer.fit_transform(poems)
    scaled_X = scalers[scaler].fit_transform(X.toarray())

    # Train on every known poem: 1 = written by `author`, 0 = anyone else.
    known_X = scaled_X[~is_unknown]
    known_y = [1 if name == author else 0 for name in p_df.loc[~is_unknown, 'author']]
    cl = classifiers[classifier].fit(known_X, known_y)

    # Predict the poems of the unknown author.
    unknown_labels = p_df.loc[is_unknown, 'label'].tolist()
    predictions = cl.predict(scaled_X[is_unknown])

    out_rows = [
        (label, prediction,
         classifier, author, row['vectorizer'],
         ngram_type, ngram_range,
         row['max_df'], row['min_df'], row['max_f'], row['num_f'], row['scaler'],
         row['type'], 'Binary',
         cv_score)
        for label, prediction in zip(unknown_labels, predictions)
    ]

    # Open the file once per task rather than once per row; newline='' is what
    # the csv module expects so that it controls the line endings itself.
    # NOTE(review): several Ray tasks append to this file concurrently; for a
    # strict guarantee against interleaving, return the rows and let the
    # driver write them.
    with open(path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerows(out_rows)

In [18]:
predictions_dir = os.path.join(incerto_dir, 'output', 'predictions')
predictions_path = os.path.join(predictions_dir, 'predictions_binary2.csv')
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(predictions_dir, exist_ok=True)
# Write the header only when the file is new.
# NOTE(review): an existing file is left untouched and prediction rows are
# appended to it, so re-running the predictions adds the same rows again;
# delete the file first for a clean run.
if not os.path.exists(predictions_path):
    # newline='' is what the csv module expects so that it controls the line
    # endings itself.
    with open(predictions_path, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(('label', 'prediction',
                            'classifier', 'author', 'vectorizer',
                            'ngram_type', 'ngram_range',
                            'max_df', 'min_df', 'max_f', 'num_f', 'scaler',
                            'poem_type', 'classifier_type',
                            'cv_f1-score'))

In [19]:
futures = []
if run_predictions:
    # Place each corpus in Ray's object store once and pass tasks a reference,
    # instead of serialising the same DataFrame again for every model. Ray
    # resolves a reference passed as a task argument to the stored value.
    corpus_refs = {'Split': ray.put(p_df_split),
                   'Whole': ray.put(p_df_whole)}

    for _index, _row in top_models.iterrows():
        # An unexpected 'type' raises KeyError here rather than silently
        # reusing the corpus picked on the previous iteration.
        _p_df_ref = corpus_refs[_row['type']]
        futures.append(do_predictions.remote(predictions_path, _row, _p_df_ref))

    # Block until every submitted task has finished.
    results = ray.get(futures)

In [20]:
# Read back everything accumulated in the predictions file and preview it.
predictions_df = pd.read_csv(predictions_path)
print(len(predictions_df))
predictions_df.head()

21327


Unnamed: 0,label,prediction,classifier,author,vectorizer,ngram_type,ngram_range,max_df,min_df,max_f,num_f,scaler,poem_type,classifier_type,cv_f1-score
0,UnknownAuthor_1_1,0,Logit,MaffioVenier,Count,Char,Bigrams,0.8,0.2,,137,StandardScaler,Split,Binary,0.722234
1,UnknownAuthor_1_2,0,Logit,MaffioVenier,Count,Char,Bigrams,0.8,0.2,,137,StandardScaler,Split,Binary,0.722234
2,UnknownAuthor_1_1,0,Logit,MaffioVenier,Count,Char,Bigrams,0.8,0.1,,157,StandardScaler,Split,Binary,0.700967
3,UnknownAuthor_1_1,0,Logit,MaffioVenier,Count,Char,Bigrams,0.8,0.1,1000.0,157,StandardScaler,Split,Binary,0.700967
4,UnknownAuthor_1_3,0,Logit,MaffioVenier,Count,Char,Bigrams,0.8,0.2,,137,StandardScaler,Split,Binary,0.722234
