# Getting predictions for the unknown author from the best-performing multi-class models

In [1]:
import time
import os
import glob
import re
import csv
import pandas as pd
import numpy as np

import ray

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split



In [2]:
# Master switch for the expensive part of the notebook: the prediction loop
# further down only fits models and appends to the output CSV when this is True.
run_predictions = True

In [3]:
# Project layout. Every path is derived from the current working directory,
# so the notebook has to be started from its own folder (two levels below the
# project root) for these to resolve.
incerto_dir = os.path.join(os.getcwd(), '..', '..')
poems_dir = os.path.join(incerto_dir, 'data', 'poems')
figures_dir = os.path.join(incerto_dir, 'figures')
output_dir = os.path.join(incerto_dir, 'output')
# Performance table of the multi-class models, read below to pick the top models.
# Plain string literal: the file name has no placeholders, so no f-string is needed.
classification_path = os.path.join(output_dir, 'classification-performance', 'multi_classification_performance.csv')

In [4]:
# Load both variants of the corpus: poems split into chunks and whole poems.
p_df_split, p_df_whole = (
    pd.read_csv(os.path.join(poems_dir, csv_name))
    for csv_name in ('poems_split.csv', 'poems_whole.csv')
)

In [5]:
# Performance table of every multi-class configuration that was evaluated.
results_df = pd.read_csv(classification_path)
results_df.head(1)

Unnamed: 0,classifier,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
0,kNN,Count,Char,Bigrams,0.8,0.0,,259,StandardScaler,0.113861,Split


In [6]:
# Keep only the configurations whose f1-score is at least 0.4.
top_models = results_df[results_df['f1-score'].ge(0.4)]
print(top_models.shape[0])
top_models.head(1)

6


Unnamed: 0,classifier,vectorizer,ngram type,ngram range,max_df,min_df,max_f,num_f,scaler,f1-score,type
445,Logit,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,0.416496,Split


## Set up

In [7]:
# Maps the n-gram label used in the results table to the (min_n, max_n) tuple
# that is passed to the vectorizers as ngram_range.
ngram_range_d = {
    label: (size, size)
    for size, label in enumerate(('Unigrams', 'Bigrams', 'Trigrams'), start=1)
}

In [8]:
# Seed shared by the stochastic estimators so that re-running the notebook
# yields the same fitted models and therefore the same probabilities.
RANDOM_SEED = 42

# Classifier label (as it appears in the results table) -> estimator instance.
# Every estimator here must expose predict_proba, because do_predictions
# calls it on the fitted model.
classifiers = {
  'RandomForest': RandomForestClassifier(random_state=RANDOM_SEED),
  'kNN': KNeighborsClassifier(),
  'Logit': LogisticRegression(),
  # SVC only provides predict_proba when probability=True; with the default
  # (False) any 'SVM' row among the top models would make do_predictions fail.
  'SVM': SVC(probability=True, random_state=RANDOM_SEED)}

In [9]:
# Scaler label (as it appears in the results table) -> transformer instance:
# per-feature standardisation, or per-sample L1 / L2 normalisation.
scalers = dict(
    StandardScaler=StandardScaler(),
    L1=Normalizer(norm='l1'),
    L2=Normalizer(norm='l2'),
)

In [10]:
def build_vectorizer(_typ, _ngram, _max, _min, _max_f):
    """Create an unfitted text vectorizer for one model configuration.

    Parameters
    ----------
    _typ : str
        'Count' for a CountVectorizer or 'TfIdf' for a TfidfVectorizer
        (the latter is built with norm=None).
    _ngram : sequence
        Three items: the n-gram type ('Char' or 'Word'; lower-cased and used
        as the analyzer), the n-gram label (not used here), and the
        (min_n, max_n) tuple passed as ngram_range.
    _max, _min : float or int
        Passed through as max_df and min_df.
    _max_f : number, str or None
        Vocabulary size limit. None, NaN (what pandas yields for an empty CSV
        cell) and the strings 'None' / 'nan' / '' all mean "no limit"; any
        other value is converted to int, so 1000.0 and '1000.0' become 1000.

    Returns
    -------
    CountVectorizer or TfidfVectorizer

    Raises
    ------
    ValueError
        If _typ is not one of the supported vectorizer types.
    """
    # Values read back from a CSV arrive as floats, with NaN standing in for
    # "no limit"; int(NaN) raises, so the missing markers are normalised first.
    if _max_f is None or str(_max_f).strip().lower() in ('', 'none', 'nan'):
        _max_f = None
    else:
        _max_f = int(float(_max_f))

    if _typ == 'Count':
        vectorizer_cls, extra_kwargs = CountVectorizer, {}
    elif _typ == 'TfIdf':
        # norm=None: rows are left unnormalised by the vectorizer itself.
        vectorizer_cls, extra_kwargs = TfidfVectorizer, {'norm': None}
    else:
        raise ValueError(f"Unknown vectorizer type: {_typ!r} (expected 'Count' or 'TfIdf')")

    return vectorizer_cls(input='content',
                          encoding='utf-8',
                          lowercase=True,
                          analyzer=_ngram[0].lower(),
                          ngram_range=_ngram[2],
                          max_df=_max,
                          min_df=_min,
                          max_features=_max_f,
                          **extra_kwargs)

In [11]:
def evaluate(model, test_features, test_labels):
    """Score a fitted model on held-out data.

    Predicts labels for ``test_features`` with ``model.predict`` and compares
    them with ``test_labels``.

    Returns
    -------
    dict
        The report built by ``classification_report`` with ``output_dict=True``;
        metrics that would divide by zero are reported as 0.
    """
    return classification_report(
        test_labels,
        model.predict(test_features),
        zero_division=0,
        output_dict=True,
    )

In [12]:
# Parallel execution with ray is currently switched off: this ray.init call,
# the @ray.remote decorator on do_predictions and the ray.get call in the run
# loop are all commented out, so the models are fitted one after another.
# if run_predictions == True:
#     ray.init(num_cpus=6)

In [13]:
# @ray.remote
def do_predictions(path, row, p_df):
  """Fit one model configuration and append its predictions to a CSV file.

  The poems are vectorized and scaled together, the classifier is trained on
  every poem whose author is not 'UnknownAuthor', and for each
  'UnknownAuthor' poem one CSV row per candidate author is appended to
  ``path`` holding that author's predicted probability.

  Parameters
  ----------
  path : str
      CSV file to append to (opened in append mode; no header is written here).
  row : pandas.Series or mapping
      Model configuration. Keys read: 'classifier', 'vectorizer',
      'ngram type', 'ngram range', 'max_df', 'min_df', 'max_f', 'num_f',
      'scaler', 'type' and 'f1-score'.
  p_df : pandas.DataFrame
      Poems with the columns 'poem', 'author' and 'label'.

  Returns
  -------
  None

  Raises
  ------
  ValueError
      If row['ngram type'] is neither 'Char' nor 'Word'.
  """
  print(row)
  ngram_range = row['ngram range']
  classifier = row['classifier']
  scaler = row['scaler']
  cv_score = row['f1-score']
  ngram_type = row['ngram type']

  if ngram_type == 'Char':
    # character n-grams are computed on the text with all whitespace removed
    poems = [re.sub(r'\s+', '', x) for x in p_df['poem'].tolist()]
  elif ngram_type == 'Word':
    poems = p_df['poem'].tolist()
  else:
    raise ValueError(f"Unknown ngram type: {ngram_type!r} (expected 'Char' or 'Word')")

  # Boolean row mask instead of index labels: scaled_X is a plain array that
  # is addressed by position, so using p_df's index values as row numbers is
  # only correct when p_df happens to carry a default 0..n-1 index.
  is_unknown = (p_df['author'] == 'UnknownAuthor').to_numpy()
  if not is_unknown.any():
    # nothing to predict for this frame, so nothing to write
    return

  # vectorize poems (known and unknown together, so they share one vocabulary)
  vectorizer = build_vectorizer(row['vectorizer'], [ngram_type, ngram_range, ngram_range_d[ngram_range]], row['max_df'], row['min_df'], row['max_f'])
  X = vectorizer.fit_transform(poems)
  scaled_X = scalers[scaler].fit_transform(X.toarray())

  # train on every poem with a known author
  known_X = scaled_X[~is_unknown]
  known_y = p_df.loc[~is_unknown, 'author'].tolist()
  cl = classifiers[classifier].fit(known_X, known_y)

  # predict per-author probabilities for the unknown poems
  unknown_labels = p_df.loc[is_unknown, 'label'].tolist()
  unknown_X = scaled_X[is_unknown]
  probabilities = cl.predict_proba(unknown_X)

  # one output row per (unknown poem, candidate author) pair; the columns of
  # predict_proba follow the order of cl.classes_
  csv_rows = [
      [chunk, prob,
       classifier, au, row['vectorizer'],
       row['ngram type'], ngram_range,
       row['max_df'], row['min_df'], row['max_f'], row['num_f'], row['scaler'],
       row['type'], 'Multiclass',
       cv_score]
      for chunk, chunk_probabilities in zip(unknown_labels, probabilities)
      for au, prob in zip(cl.classes_, chunk_probabilities)
  ]

  # Open the file once for the whole batch. newline='' is what the csv module
  # asks for, so that it controls the line endings itself.
  with open(path, 'a', newline='') as csvfile:
    csv.writer(csvfile).writerows(csv_rows)

In [14]:
predictions_dir = os.path.join(incerto_dir, 'output', 'predictions')
predictions_path = os.path.join(predictions_dir, 'predictions_multiclass2.csv')
# Per-author f1 column names (currently not part of the header below).
# The placeholder author is spelled 'UnknownAuthor' everywhere else in this
# notebook; filtering on 'Unknown' excluded nothing.
authors = [x+'_f1' for x in sorted(p_df_split[p_df_split['author'] != 'UnknownAuthor'].author.unique())]

# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(predictions_dir, exist_ok=True)

# Write the header only when the file is new.
# NOTE(review): do_predictions appends to this file, so re-running the
# prediction cell adds the same rows again unless the file is deleted first.
if not os.path.exists(predictions_path):
    row = ['label', 'probability',
            'classifier', 'author', 'vectorizer',
            'ngram_type', 'ngram_range',
            'max_df', 'min_df', 'max_f', 'num_f', 'scaler',
            'poem_type', 'classifier_type',
            'cv_f1-score']#, 'f1-score'] #+ authors

    # newline='' lets the csv module control the line endings itself
    with open(predictions_path, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(row)

In [15]:
futures = []
if run_predictions:
    # Which poems frame each configuration was evaluated on.
    _poems_by_type = {'Split': p_df_split, 'Whole': p_df_whole}
    for _index, _row in top_models.iterrows():
        # Fail loudly on an unexpected value instead of silently reusing the
        # frame chosen for the previous row.
        if _row['type'] not in _poems_by_type:
            raise ValueError(f"Unknown poem type: {_row['type']!r} (expected 'Split' or 'Whole')")
        _p_df = _poems_by_type[_row['type']]
        do_predictions(predictions_path, _row, _p_df)
        # futures.append(do_predictions.remote(predictions_path, _row, _p_df))

    # results = ray.get(futures)

classifier              Logit
vectorizer              Count
ngram type               Word
ngram range          Unigrams
max_df                    0.8
min_df                    0.0
max_f                  1000.0
num_f                    1000
scaler         StandardScaler
f1-score             0.416496
type                    Split
Name: 445, dtype: object
classifier              Logit
vectorizer              Count
ngram type               Word
ngram range          Unigrams
max_df                    0.9
min_df                    0.0
max_f                  1000.0
num_f                    1000
scaler         StandardScaler
f1-score              0.41646
type                    Split
Name: 517, dtype: object
classifier              Logit
vectorizer              Count
ngram type               Word
ngram range          Unigrams
max_df                    1.0
min_df                    0.0
max_f                  1000.0
num_f                    1000
scaler         StandardScaler
f1-score            

In [16]:
# Read the accumulated predictions back to check row count and columns.
predictions_df = pd.read_csv(predictions_path)
print(predictions_df.shape[0])
print(predictions_df.columns)
predictions_df.head(5)

5040
Index(['label', 'probability', 'classifier', 'author', 'vectorizer',
       'ngram_type', 'ngram_range', 'max_df', 'min_df', 'max_f', 'num_f',
       'scaler', 'poem_type', 'classifier_type', 'cv_f1-score'],
      dtype='object')


Unnamed: 0,label,probability,classifier,author,vectorizer,ngram_type,ngram_range,max_df,min_df,max_f,num_f,scaler,poem_type,classifier_type,cv_f1-score
0,UnknownAuthor_1_1,0.0304,Logit,AntonGiacomoCorso,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,Split,Multiclass,0.416496
1,UnknownAuthor_1_1,0.000858,Logit,BartolomeoZacco,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,Split,Multiclass,0.416496
2,UnknownAuthor_1_1,0.010046,Logit,CelioMagno,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,Split,Multiclass,0.416496
3,UnknownAuthor_1_1,0.000414,Logit,DomenicoVenier,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,Split,Multiclass,0.416496
4,UnknownAuthor_1_1,0.005832,Logit,GiorgioGradenigo,Count,Word,Unigrams,0.8,0.0,1000.0,1000,StandardScaler,Split,Multiclass,0.416496
