In [None]:
import time
import os
import glob
import re
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Mount Google Drive inside the Colab VM; the project directory configured
# below lives under this mount point.
# NOTE: `google.colab` is only available in a Colab runtime, so this cell is
# expected to fail when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on the mounted Drive; every other location is derived from it.
incerto_dir = '/content/drive/MyDrive/incerto-autore'
# Sibling folders for plots and for generated outputs.
figures_dir, output_dir = (os.path.join(incerto_dir, sub) for sub in ('figures', 'output'))
# Classification scores are stored in a sub-folder of the output folder.
performance_dir = os.path.join(output_dir, 'classification-performance')

## Experiments

In [None]:
def do_classification(_ids, _golden_labels, _classifier_name, cv=3, scoring='f1_macro'):
  """Return the mean cross-validated score of a naive baseline classifier.

  Args:
    _ids: Samples handed to ``cross_val_score`` as ``X``. The dummy
      strategies used here predict from the training labels (or a fixed
      constant) only, so the values merely need one entry per label.
    _golden_labels: Target labels. The 'Minority' baseline always predicts
      the label ``1``, so it presumably expects binary 0/1 targets in which
      1 marks the rarer class -- confirm against the caller.
    _classifier_name: ``'Majority'`` (always predict the most frequent
      training label) or ``'Minority'`` (always predict ``1``).
    cv: Cross-validation splitting strategy forwarded to
      ``cross_val_score``. Defaults to 3, the previously hard-coded value.
    scoring: Scorer forwarded to ``cross_val_score``. Defaults to
      ``'f1_macro'``, the previously hard-coded value.

  Returns:
    The mean of the per-fold scores.

  Raises:
    KeyError: If ``_classifier_name`` is not a known baseline name.
  """
  # Constructor arguments per baseline; only the requested estimator is built.
  naive_strategies = {
      'Majority': {'strategy': 'most_frequent'},
      'Minority': {'strategy': 'constant', 'constant': 1},
  }

  if _classifier_name not in naive_strategies:
    # Same exception type as a plain dict lookup, but with a usable message.
    raise KeyError(
        f'Unknown naive classifier {_classifier_name!r}; '
        f'expected one of {sorted(naive_strategies)}')

  classifier = DummyClassifier(**naive_strategies[_classifier_name])

  fold_scores = cross_val_score(classifier, _ids, _golden_labels, cv=cv, scoring=scoring)

  return fold_scores.mean()

In [None]:
# One record per (poem format, author, naive classifier) is accumulated here
# and turned into a DataFrame afterwards.
results = []
for typ in ('Split', 'Whole'):

    # Each poem format has its own CSV: poems_split.csv / poems_whole.csv.
    poems_path = os.path.join(incerto_dir, 'data', 'poems', f'poems_{typ.lower()}.csv')
    all_poems_df = pd.read_csv(poems_path)
    print(f'All {typ} poems:\t', len(all_poems_df))

    # Keep only poems whose author is not the literal string 'Unknown'.
    poems_df = all_poems_df.loc[all_poems_df['author'] != 'Unknown']
    print(f'Known {typ} poems:\t', len(poems_df))

    for author in poems_df['author'].unique():

        # One-vs-rest targets: 1 for the current author, 0 for everyone else.
        golden_labels = (poems_df['author'] == author).astype(int).tolist()

        # Only the majority baseline is evaluated; 'Minority' is the other
        # name do_classification understands.
        for classifier_name in ['Majority']:

            score = do_classification(poems_df['label'], golden_labels, classifier_name)

            results.append({
                'Author': author,
                'Poem format': typ,
                'Naive Classifier Type': classifier_name,
                'F-1 score': score,
            })

All Split poems:	 682
Known Split poems:	 622
All Whole poems:	 339
Known Whole poems:	 332


In [None]:
# Tabulate the collected result records: one row per record, one column per key.
df = pd.DataFrame(data=results)

In [None]:
# Show the best-scoring naive baseline per (author, poem format).
# A plain groupby(...).max() takes the maximum of every column independently,
# so with more than one classifier type the displayed name would not
# necessarily belong to the displayed score. Selecting the whole top row per
# group keeps name and score together.
group_keys = ['Author', 'Poem format']
(
    df.dropna(subset=group_keys)  # groupby drops missing keys by default; do the same
      .sort_values('F-1 score', ascending=False, kind='stable')  # NaN scores sort last
      .drop_duplicates(subset=group_keys)  # keeps the first, i.e. best, row per group
      .sort_values(group_keys)
      .reset_index(drop=True)
)

Unnamed: 0,Author,Poem format,Naive Classifier Type,F-1 score
0,AntonGiacomoCorso,Split,Majority,0.475105
1,AntonGiacomoCorso,Whole,Majority,0.47634
2,CelioMagno,Split,Majority,0.47599
3,CelioMagno,Whole,Majority,0.471337
4,DomenicoVenier,Split,Majority,0.483389
5,DomenicoVenier,Whole,Majority,0.468799
6,Franco,Split,Majority,0.390195
7,Franco,Whole,Majority,0.484474
8,GiorgioGradenigo,Split,Majority,0.49472
9,GiorgioGradenigo,Whole,Majority,0.490015


In [None]:
# Write the best-scoring naive baseline per (author, poem format) to CSV.
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(performance_dir, exist_ok=True)

# A plain groupby(...).max() takes the maximum of every column independently,
# which can pair one classifier's name with another classifier's score.
# Selecting the whole top row per group keeps name and score together.
group_keys = ['Author', 'Poem format']
(
    df.dropna(subset=group_keys)  # groupby drops missing keys by default; do the same
      .sort_values('F-1 score', ascending=False, kind='stable')  # NaN scores sort last
      .drop_duplicates(subset=group_keys)  # keeps the first, i.e. best, row per group
      .set_index(group_keys)  # same index layout as the groupby result
      .sort_index()
      .to_csv(os.path.join(performance_dir, 'binary_baseline.csv'))
)