# Getting predictions for the unknown author from the best-performing binary models

In [1]:
import time
import os
import glob
import re
import csv
import pandas as pd
import numpy as np

import ray

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
# Master switch for the prediction run: the Ray start-up cell and the task
# submission cell only do work when this flag is true. The final cell reads the
# predictions CSV regardless of this flag.
run_predictions = True

In [3]:
# All locations are derived from incerto_dir, given relative to the working directory.
incerto_dir = './'
poems_dir = os.path.join(incerto_dir, 'data', 'poems')
figures_dir = os.path.join(incerto_dir, 'figures')
output_dir = os.path.join(incerto_dir, 'output')
# CSV holding the per-model performance rows (loaded into results_df below).
# A plain string literal is enough here: the name contains no placeholders.
classification_path = os.path.join(output_dir, 'binary_classification_performance.csv')

In [4]:
# Load the two poem tables stored under poems_dir
# (presumably segmented vs. complete poems -- confirm against the data-preparation step).
p_df_split, p_df_whole = (
    pd.read_csv(os.path.join(poems_dir, csv_name))
    for csv_name in ('poems_split.csv', 'poems_whole.csv')
)

In [5]:
# Performance table with one row per trained model configuration;
# show the first row to see which columns are available.
results_df = pd.read_csv(classification_path)
results_df.head(1)

Unnamed: 0,classifier,author,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
0,kNN,Franco,Count,Char,Bigrams,0.8,0.0,,243,StandardScaler,0.579222,Split


In [6]:
# Franco models whose f1-score reaches the 0.7 threshold (inclusive).
cond = (results_df['f1-score'] >= 0.7) & (results_df['author'] == 'Franco')
# .copy() makes the selection an independent frame rather than a slice of results_df.
franco_top = results_df.loc[cond].copy()
# The label says ">=" so that it matches the inclusive filter above.
print('Number of models with F1-score (macro avg) >= 0.7:\t', len(franco_top))
print('Average f1-score:\t\t\t\t\t', round(franco_top['f1-score'].mean(), 2))
print('Number of models using split poems:\t\t\t', int((franco_top['type'] == 'Split').sum()))

Number of models with F1-score (macro avg) > 0.7:	 769
Average f1-score:					 0.77
Number of models using split poems:			 348


In [7]:
# Petrarca models whose f1-score reaches the 0.7 threshold (inclusive).
cond = (results_df['f1-score'] >= 0.7) & (results_df['author'] == 'Petrarca')
petrarca_top = results_df.loc[cond].copy()
# The label says ">=" so that it matches the inclusive filter above.
print('Number of models with F1-score (macro avg) >= 0.7:\t', len(petrarca_top))
print('Authors the models are trained to classify:\t\t', petrarca_top['author'].unique())
print('Average f1-score:\t\t\t\t\t', round(petrarca_top['f1-score'].mean(), 2))
print('Number of models using split poems:\t\t\t', int((petrarca_top['type'] == 'Split').sum()))

Number of models with F1-score (macro avg) > 0.7:	 533
Authors the models are trained to classify:		 ['Petrarca']
Average f1-score:					 0.75
Number of models using split poems:			 14


In [8]:
# Models for every author other than Franco and Petrarca whose f1-score
# reaches the 0.7 threshold (inclusive).
cond = (results_df['f1-score'] >= 0.7) & ~results_df['author'].isin(['Franco', 'Petrarca'])
other_top = results_df.loc[cond].copy()
# The label says ">=" so that it matches the inclusive filter above.
print('Number of models with F1-score (macro avg) >= 0.7:\t', len(other_top))
print('Authors the models are trained to classify:\t\t', other_top['author'].unique())
print('Average f1-score:\t\t\t\t\t', round(other_top['f1-score'].mean(), 2))
print('Number of models using split poems:\t\t\t', int((other_top['type'] == 'Split').sum()))

Number of models with F1-score (macro avg) > 0.7:	 8
Authors the models are trained to classify:		 ['PietroBembo']
Average f1-score:					 0.73
Number of models using split poems:			 8


In [9]:
# Every configuration, regardless of author, whose f1-score is at least 0.7;
# these are the models that get refitted below.
top_models = results_df[results_df['f1-score'].ge(0.7)]
print(top_models.shape[0])
top_models.head(1)

1310


Unnamed: 0,classifier,author,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
2,SVM,Franco,Count,Char,Bigrams,0.8,0.0,,243,StandardScaler,0.708432,Split


## Set up

In [10]:
# Maps the n-gram label stored in the performance table to the (min_n, max_n)
# tuple handed to the vectorizer as ngram_range: label number k -> (k, k).
ngram_range_d = {
    label: (size, size)
    for size, label in enumerate(('Unigrams', 'Bigrams', 'Trigrams'), start=1)
}

In [11]:
# Estimators looked up by the 'classifier' value of a performance-table row.
classifiers = {
    # Seeded so that refitting gives the same forest on every run; the
    # train/test split used later is seeded with the same value.
    'RandomForest': RandomForestClassifier(random_state=42),
    'kNN': KNeighborsClassifier(),
    'Logit': LogisticRegression(),
    'SVM': SVC()}

In [12]:
# Feature scalers looked up by the 'scaler' value of a performance-table row:
# standardisation, or per-sample normalisation with the L1 / L2 norm.
scalers = {
    'StandardScaler': StandardScaler(),
    **{norm_name: Normalizer(norm=norm_name.lower()) for norm_name in ('L1', 'L2')},
}

In [13]:
def build_vectorizer(_typ, _ngram, _max, _min, _max_f):
    """Create an unfitted scikit-learn text vectorizer from model settings.

    Parameters
    ----------
    _typ : str
        'Count' builds a CountVectorizer, 'TfIdf' builds a TfidfVectorizer
        (with ``norm=None``).
    _ngram : sequence
        Element 0 is the n-gram type; its lower-cased form is passed as
        ``analyzer``. Element 2 is passed as ``ngram_range``. Element 1 is
        not used by this function.
    _max, _min
        Passed through unchanged as ``max_df`` and ``min_df``.
    _max_f
        Cap on the number of features. ``None``, NaN, an empty string and the
        string 'None' all mean "no cap"; any other value is converted to int.

    Returns
    -------
    CountVectorizer or TfidfVectorizer

    Raises
    ------
    ValueError
        If ``_typ`` is neither 'Count' nor 'TfIdf'.
    """
    # Fail early with a clear message instead of an UnboundLocalError at return.
    if _typ not in ('Count', 'TfIdf'):
        raise ValueError(f"Unknown vectorizer type {_typ!r}; expected 'Count' or 'TfIdf'")

    # A missing max_f can arrive in several spellings: pandas.read_csv yields NaN
    # for an empty cell (and, depending on the pandas version, for the text
    # "None"), while other callers may pass None or the literal string 'None'.
    # int(NaN) raises, so all of these are mapped to "no cap" first.
    if isinstance(_max_f, str):
        no_cap = _max_f.strip() in ('', 'None')
    else:
        no_cap = _max_f is None or bool(pd.isna(_max_f))
    # int(float(...)) also accepts values such as 500.0 or '500.0'.
    _max_f = None if no_cap else int(float(_max_f))

    # Settings shared by both vectorizer classes.
    shared = dict(input='content',
                  encoding='utf-8',
                  lowercase=True,
                  analyzer=_ngram[0].lower(),
                  ngram_range=_ngram[2],
                  max_df=_max,
                  min_df=_min,
                  max_features=_max_f)

    if _typ == 'Count':
        return CountVectorizer(**shared)
    return TfidfVectorizer(norm=None, **shared)

In [14]:
def evaluate(model, test_features, test_labels):
    """Score a fitted binary model on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    test_features
        Feature matrix passed to ``model.predict``.
    test_labels
        True labels compared against the predictions.

    Returns
    -------
    tuple
        ``(macro_f1, f1_for_class_1)`` taken from scikit-learn's
        classification report. ``f1_for_class_1`` is 0.0 when the report has
        no entry for class 1.
    """
    predictions = model.predict(test_features)
    report_d = classification_report(test_labels, predictions, zero_division=0, output_dict=True)
    f1score = report_d['macro avg']['f1-score']
    # The report only has a '1' entry when label 1 occurs in the true labels or
    # in the predictions. Fall back to 0.0 (in line with zero_division=0)
    # instead of raising KeyError when the positive class is absent from both.
    f1score_1 = report_d.get('1', {}).get('f1-score', 0.0)

    return f1score, f1score_1

In [15]:
# Start a local Ray instance for the prediction tasks. The is_initialized()
# guard lets this cell be re-run in a live kernel without ray.init() raising
# because Ray is already running.
if run_predictions and not ray.is_initialized():
    ray.init(num_cpus=6)

2023-04-30 22:19:59,904	INFO worker.py:1553 -- Started a local Ray instance.


In [16]:
@ray.remote
def do_predictions(path, row, p_df):
    """Refit one binary author model and append its predictions for the unknown poems to a CSV.

    Parameters
    ----------
    path : str
        CSV file the prediction rows are appended to.
    row : mapping
        One row of the performance table. The keys read here are 'classifier',
        'author', 'vectorizer', 'ngram type', 'ngram range', 'max_df',
        'min_df', 'max_f', 'num_f', 'scaler', 'f1-score' and 'type'.
    p_df : pandas.DataFrame
        Poems with 'poem', 'author' and 'label' columns. Rows whose author is
        'Unknown' are predicted; all other rows are used for training/testing.

    Raises
    ------
    ValueError
        If row['ngram type'] is neither 'Char' nor 'Word'.

    Notes
    -----
    Nothing is returned; the only result is the rows appended to ``path``.
    NOTE(review): the vectorizer and the scaler are fitted on all poems
    (known and unknown together) before the train/test split -- confirm that
    this is intended.
    """
    # Imported locally so the block is self-contained; scikit-learn is already
    # a dependency of this notebook.
    from sklearn.base import clone

    ngram_range = row['ngram range']
    author = row['author']
    classifier = row['classifier']
    scaler = row['scaler']
    cv_score = row['f1-score']
    ngram_type = row['ngram type']

    if ngram_type == 'Char':
        # Character n-grams are computed on the text with all whitespace removed.
        poems = [re.sub(r'\s+', '', x) for x in p_df['poem'].tolist()]
    elif ngram_type == 'Word':
        poems = p_df['poem'].tolist()
    else:
        raise ValueError(f"Unsupported ngram type {ngram_type!r}; expected 'Char' or 'Word'")

    # vectorize poems
    vectorizer = build_vectorizer(row['vectorizer'],
                                  [ngram_type, ngram_range, ngram_range_d[ngram_range]],
                                  row['max_df'], row['min_df'], row['max_f'])
    X = vectorizer.fit_transform(poems)
    # clone() yields a fresh, unfitted copy so the shared module-level
    # scaler/classifier objects are never fitted in place.
    scaled_X = clone(scalers[scaler]).fit_transform(X.toarray())

    # Select rows with a positional boolean mask: scaled_X is addressed by
    # position, whereas p_df.index holds labels, and the two only coincide
    # for a default RangeIndex.
    is_unknown = (p_df['author'] == 'Unknown').to_numpy()
    is_known = ~is_unknown

    # select rows for training and testing
    known_X = scaled_X[is_known]
    known_y = [1 if name == author else 0 for name in p_df.loc[is_known, 'author']]
    X_train, X_test, y_train, y_test = train_test_split(known_X, known_y, test_size=0.25, random_state=42)

    # training and testing
    cl = clone(classifiers[classifier]).fit(X_train, y_train)
    f1, f1_1 = evaluate(cl, X_test, y_test)

    # selecting poems to predict
    unknown_labels = p_df.loc[is_unknown, 'label'].tolist()
    if not unknown_labels:
        # Nothing to predict; there are no rows to write.
        return
    predictions = cl.predict(scaled_X[is_unknown])

    out_rows = [(label, prediction,
                 classifier, author, row['vectorizer'],
                 ngram_type, ngram_range,
                 row['max_df'], row['min_df'], row['max_f'], row['num_f'], row['scaler'],
                 row['type'], 'Binary',
                 cv_score, f1, f1_1)
                for label, prediction in zip(unknown_labels, predictions)]

    # Open the file once per task rather than once per row. newline='' is what
    # the csv module expects for files it writes. Several tasks append to the
    # same file; writing a task's rows in one go should reduce, but does not
    # rule out, interleaving between tasks -- TODO confirm if row order matters.
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(out_rows)

In [17]:
predictions_dir = os.path.join(incerto_dir, 'output', 'predictions')
predictions_path = os.path.join(predictions_dir, 'predictions_binary.csv')
# exist_ok=True avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(predictions_dir, exist_ok=True)

# The prediction tasks append to this file. When predictions are about to be
# regenerated, start from a header-only file so that a re-run does not stack a
# second copy of every row onto the previous results. When run_predictions is
# off, an existing file is left untouched and a header is only written if the
# file is missing.
if run_predictions or not os.path.exists(predictions_path):
    # newline='' is what the csv module expects for files it writes.
    with open(predictions_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(('label', 'prediction',
                            'classifier', 'author', 'vectorizer',
                            'ngram_type', 'ngram_range',
                            'max_df', 'min_df', 'max_f', 'num_f', 'scaler',
                            'poem_type', 'classifier_type',
                            'cv_f1-score', 'f1-score', 'f1-score_1'))

In [18]:
futures = []
if run_predictions:
    # Put each poem table into Ray's object store once and hand the tasks a
    # reference, instead of shipping the full DataFrame with every task.
    # Ray resolves a reference passed as a top-level task argument to its
    # value, so do_predictions still receives a DataFrame.
    poems_by_type = {'Split': ray.put(p_df_split),
                     'Whole': ray.put(p_df_whole)}

    for _index, _row in top_models.iterrows():
        poem_type = _row['type']
        if poem_type not in poems_by_type:
            # Refuse unexpected values rather than silently reusing the
            # table chosen for the previous row.
            raise ValueError(f"Unexpected poem type {poem_type!r} in row {_index}; expected 'Split' or 'Whole'")
        futures.append(do_predictions.remote(predictions_path, _row, poems_by_type[poem_type]))

    # Block until every task has finished; a failure inside a task is
    # re-raised here.
    results = ray.get(futures)

In [19]:
# Read the accumulated predictions back, report how many rows there are
# and preview the first five.
predictions_df = pd.read_csv(predictions_path)
print(predictions_df.shape[0])
predictions_df.head(5)

28780


Unnamed: 0,label,prediction,classifier,author,vectorizer,ngram_type,ngram_range,max_df,min_df,max_f,num_f,scaler,poem_type,classifier_type,cv_f1-score,f1-score,f1-score_1
0,UA11_1,0,SVM,Franco,Count,Char,Bigrams,0.9,0.0,,277,StandardScaler,Split,Binary,0.740272,0.808679,0.74
1,UA11_2,0,SVM,Franco,Count,Char,Bigrams,0.9,0.0,,277,StandardScaler,Split,Binary,0.740272,0.808679,0.74
2,UA11_3,0,SVM,Franco,Count,Char,Bigrams,0.9,0.0,,277,StandardScaler,Split,Binary,0.740272,0.808679,0.74
3,UA11_4,0,SVM,Franco,Count,Char,Bigrams,0.9,0.0,,277,StandardScaler,Split,Binary,0.740272,0.808679,0.74
4,UA11_5,0,SVM,Franco,Count,Char,Bigrams,0.9,0.0,,277,StandardScaler,Split,Binary,0.740272,0.808679,0.74
