In [None]:
# Mount the Google drive for access to files
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Basic Python modules
import os
import re
from collections import defaultdict, Counter
import random
import csv

# For data manipulation and analysis
import pandas as pd
import numpy as np


# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

In [None]:
incerto_dir = '/content/drive/MyDrive/incerto-autore'
new_poems_dir = os.path.join(incerto_dir, 'data', 'poems')
poems_split_df = pd.read_csv(os.path.join(new_poems_dir, 'poems_split.csv'))
len(poems_split_df)

pretrained_path = os.path.join(incerto_dir, 'contbertoldo-all', 'checkpoint')
#pretrained_path = 'dbmdz/bert-base-italian-xxl-uncased'

if 'contbertoldo' in pretrained_path:
  predictions_path = os.path.join(incerto_dir, 'output', 'predictions', 'predictions_binary_bertoldo.csv')
  finetuned_path = os.path.join(incerto_dir, 'output','finetuned-models', 'binary-class', 'bertoldo')
elif 'italian':
  predictions_path = os.path.join(incerto_dir, 'output', 'predictions', 'predictions_binary_bert-ita.csv')
  finetuned_path = os.path.join(incerto_dir, 'output','finetuned-models', 'binary-class', 'bert-ita')

In [None]:
pip install transformers

In [None]:
# using DistilBERT for testing --> can switch to BERT once set up
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
device = 'cuda'

### Set up

In [None]:
# load all paragraphs from the endometriosis reddit communities
unknown_df = poems_split_df.loc[poems_split_df['author'] == 'UnknownAuthor']
unknown_poems = unknown_df['poem'].tolist()
unknown_labels = unknown_df['label'].tolist()

In [None]:
with open(predictions_path, 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(('label', 'author', 'classifier', 'prediction'))

### Make Predictions

In [None]:
annotations_df = poems_split_df.loc[poems_split_df['author'] != 'UnknownAuthor']
len(annotations_df)

1116

In [None]:
for author in annotations_df['author'].unique():

  author_finetuned_path = os.path.join(finetuned_path, author)
  author_finetuned_csv = os.path.join(author_finetuned_path, 'classification_report.csv')
  report_df = pd.read_csv(author_finetuned_csv)
  f1 = report_df['f1-score'].tolist()[3]
  print(author_finetuned_path, f1)

  if f1 > 0.7:
    print('getting predictions')
    # choose batchsize --> reduce this if out of GPUs
    batchsize = 8
    predictions = []
    worklist = unknown_poems

    # load the fine-tuned model from our directory and send it to cuda
    model = BertForSequenceClassification.from_pretrained(author_finetuned_path).to(device)

    # load the tokenizer (make sure this is the same type of tokenizer as what we used when training)
    tokenizer = BertTokenizer.from_pretrained(pretrained_path)

    # get predictions in batches
    for i in range(0, len(worklist), batchsize):
      print(f"Processing batch {i//batchsize + 1}, items {i} to {min(i+batchsize, len(worklist))}")

      batch = worklist[i:i+batchsize] # extract batch from worklist
      test_encodings = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device) # tokenize the posts
      output = model(**test_encodings) # make predictions with model on our test_encodings for this batch
      batch_predictions = torch.softmax(output.logits, dim=1).tolist() # get the predictions result
      predictions.extend(batch_predictions)
      if i % 8 == 0:
        print(str(i)+" in "+str(len(worklist)))

    print(f"Got {len(predictions)} predictions")
    labeled_predictions = list(zip(unknown_labels, predictions))

    with open(predictions_path, 'a') as csvfile:
      csvwriter = csv.writer(csvfile)
      for tup in labeled_predictions:
        csvwriter.writerow((tup[0], author, 'BERToldo', tup[1][1]))

/content/drive/MyDrive/incerto-autore/output/finetuned-models/binary-class/bertoldo/AntonGiacomoCorso 0.7278048780487805
getting predictions
Processing batch 1, items 0 to 8
0 in 60
Processing batch 2, items 8 to 16
8 in 60
Processing batch 3, items 16 to 24
16 in 60
Processing batch 4, items 24 to 32
24 in 60
Processing batch 5, items 32 to 40
32 in 60
Processing batch 6, items 40 to 48
40 in 60
Processing batch 7, items 48 to 56
48 in 60
Processing batch 8, items 56 to 60
56 in 60
Got 60 predictions
/content/drive/MyDrive/incerto-autore/output/finetuned-models/binary-class/bertoldo/BartolomeoZacco 0.4991023339317774
/content/drive/MyDrive/incerto-autore/output/finetuned-models/binary-class/bertoldo/CelioMagno 0.6139298892988929
/content/drive/MyDrive/incerto-autore/output/finetuned-models/binary-class/bertoldo/DomenicoVenier 0.6615902964959569
/content/drive/MyDrive/incerto-autore/output/finetuned-models/binary-class/bertoldo/GiorgioGradenigo 0.4972972972972973
/content/drive/MyDrive

In [9]:
f1_scores = {}

for author in annotations_df['author'].unique():
    author_finetuned_path = os.path.join(finetuned_path, author)
    author_finetuned_csv = os.path.join(author_finetuned_path, 'classification_report.csv')

    report_df = pd.read_csv(author_finetuned_csv)
    f1 = report_df['f1-score'].tolist()[3]

    # Store F1 score in dictionary
    f1_scores[author] = f1
    print(f"Author: {author}, F1 Score: {f1}")

# Load predictions DataFrame and add F1 scores column
predictions_df = pd.read_csv(predictions_path)

# Map F1 scores to each row based on author
predictions_df['f1-score'] = predictions_df['author'].map(f1_scores)

predictions_df.to_csv(os.path.join(incerto_dir, 'output', 'predictions', 'predictions_binary_bertoldo_f1.csv'), index=False)

Author: AntonGiacomoCorso, F1 Score: 0.7278048780487805
Author: BartolomeoZacco, F1 Score: 0.4991023339317774
Author: CelioMagno, F1 Score: 0.6139298892988929
Author: DomenicoVenier, F1 Score: 0.6615902964959569
Author: GiorgioGradenigo, F1 Score: 0.4972972972972973
Author: MaffioVenier, F1 Score: 0.9026993094789704
Author: MarcoStecchini, F1 Score: 0.7176684881602915
Author: MarcoVenier, F1 Score: 0.4972972972972973
Author: MuzioManfredi, F1 Score: 0.8342442965779467
Author: OrsattoGiustinian, F1 Score: 0.6942773441867747
Author: Petrarca, F1 Score: 0.7903656703957255
Author: PietroBembo, F1 Score: 0.6684885931558935
Author: ValerioSali, F1 Score: 0.4842883548983364
Author: VeronicaFranco, F1 Score: 0.9044520547945204
