In [None]:
# Mount Google Drive at /content/drive so that files stored on the Drive can be
# accessed through ordinary file paths. This relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Basic Python modules
import os
import re
from collections import defaultdict, Counter
import random
import csv

# For data manipulation and analysis
import pandas as pd
import numpy as np


# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

In [None]:
# Project root on the mounted Drive and the table of poems used by the rest of the notebook.
incerto_dir = '/content/drive/MyDrive/incerto-autore'
new_poems_dir = os.path.join(incerto_dir, 'data', 'poems')
poems_split_df = pd.read_csv(os.path.join(new_poems_dir, 'poems_split.csv'))

# Checkpoint the tokenizer is loaded from. Swap the two lines below to select the other model.
pretrained_path = os.path.join(incerto_dir, 'contbertoldo-all', 'checkpoint')
#pretrained_path = 'dbmdz/bert-base-italian-xxl-uncased'

# Shared parent folders for the per-model output locations.
predictions_dir = os.path.join(incerto_dir, 'output', 'predictions')
finetuned_root = os.path.join(incerto_dir, 'output', 'finetuned-models', 'multi-class')

# Pick the prediction CSV and fine-tuned model folder that belong to the selected checkpoint.
if 'contbertoldo' in pretrained_path:
  predictions_path = os.path.join(predictions_dir, 'predictions_multi_bertoldo.csv')
  finetuned_path = os.path.join(finetuned_root, 'bertoldo')
elif 'italian' in pretrained_path:
  predictions_path = os.path.join(predictions_dir, 'predictions_multi_bert-ita.csv')
  finetuned_path = os.path.join(finetuned_root, 'bert-ita')
else:
  # Fail loudly instead of silently writing results under another model's name.
  raise ValueError('Unrecognised pretrained_path: ' + pretrained_path)

# Create the folder that will hold the CSV (not a folder named after the CSV itself).
os.makedirs(predictions_dir, exist_ok=True)

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [None]:
# BERT tokenizer and sequence-classification model, plus the Trainer utilities, from Hugging Face Transformers
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Set up

In [None]:
# Keep only the rows whose author is recorded as 'Unknown', then pull out their
# poem texts and their labels as two parallel lists.
unknown_df = poems_split_df[poems_split_df['author'] == 'Unknown']
unknown_poems = unknown_df['poem'].to_list()
unknown_labels = unknown_df['label'].to_list()

In [None]:
# Distinct author names among the attributed poems (everything except 'Unknown'),
# sorted in ascending order.
unique_labels = sorted(set(
    poems_split_df.loc[poems_split_df['author'] != 'Unknown', 'author'].tolist()
))
print(unique_labels)

['AntonGiacomoCorso', 'CelioMagno', 'DomenicoVenier', 'Franco', 'GiorgioGradenigo', 'MarcoVenier', 'Petrarca', 'PietroBembo']


In [None]:
# Start a fresh predictions file that contains only the header row.
# newline='' leaves line endings to the csv module, as its documentation requires.
with open(predictions_path, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(('label', 'author', 'classifier', 'prediction'))

### Make Predictions

In [None]:
# choose batchsize --> reduce this if the GPU runs out of memory
batchsize = 8
predictions = []
worklist = unknown_poems

# load the fine-tuned model from our directory and send it to the selected device;
# eval() explicitly disables dropout for inference
model = BertForSequenceClassification.from_pretrained(finetuned_path).to(device)
model.eval()

# Each probability row is written out against unique_labels by position, so the
# number of model classes has to match the number of candidate authors.
# NOTE(review): this also assumes class i corresponds to unique_labels[i] -- confirm
# against the label encoding used during fine-tuning.
if model.config.num_labels != len(unique_labels):
  raise ValueError(
      'Model has ' + str(model.config.num_labels) + ' classes but there are '
      + str(len(unique_labels)) + ' candidate authors')

# load the tokenizer (make sure this is the same type of tokenizer as what we used when training)
tokenizer = BertTokenizer.from_pretrained(pretrained_path)

# get predictions in batches; no_grad() skips autograd bookkeeping, which is not
# needed for inference and only costs memory
with torch.no_grad():
  for i in range(0, len(worklist), batchsize):
    batch = worklist[i:i+batchsize] # extract batch from worklist
    test_encodings = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device) # tokenize the poems
    output = model(**test_encodings) # run the model on this batch
    batch_predictions = torch.softmax(output.logits, dim=1).tolist() # per-class probabilities for each poem
    predictions.extend(batch_predictions)
    print(str(i)+" in "+str(len(worklist))) # progress: one line per batch

# pair every unknown poem's label with its list of class probabilities
labeled_predictions = list(zip(unknown_labels, predictions))
if labeled_predictions:
  print(labeled_predictions[0])

# append one CSV row per (poem, candidate author) pair
# NOTE(review): the classifier column is always 'BERToldo', regardless of which
# checkpoint is selected -- confirm this is intended.
with open(predictions_path, 'a', newline='') as csvfile:
  csvwriter = csv.writer(csvfile)
  for poem_label, probabilities in labeled_predictions:
    for author, probability in zip(unique_labels, probabilities):
      csvwriter.writerow((poem_label, author, 'BERToldo', probability))

0 in 60
8 in 60
16 in 60
24 in 60
32 in 60
40 in 60
48 in 60
56 in 60
('UA11_1', [0.08376613259315491, 0.027634575963020325, 0.013629131019115448, 0.822791576385498, 0.01671084389090538, 0.015613539144396782, 0.007880816236138344, 0.011973322369158268])


In [None]:
# Read the predictions CSV back into a DataFrame and display its first row.
predictions_df = pd.read_csv(predictions_path)
predictions_df.head(1)

In [None]:
# Attach to every prediction row the F1 score that the fine-tuned model's
# classification report lists for that row's candidate author.

# Map each author to a row index in the report. The mapping follows the sorted
# author list, i.e. the same positional order used when the prediction rows were written.
# NOTE(review): assumes the report rows are in this same order -- confirm against
# the label encoding used when the report was produced.
label2id = {author: idx for idx, author in enumerate(unique_labels)}

# Read the report once, under its own name, so that finetuned_path keeps pointing
# at the model directory and the prediction cell can be re-run afterwards.
report_path = os.path.join(finetuned_path, 'classification_report.csv')
report_df = pd.read_csv(report_path)

f1_scores = {author: report_df.loc[idx, 'f1-score'] for author, idx in label2id.items()}

predictions_df['f1-score'] = predictions_df['author'].map(f1_scores)