### Mount Google drive

*  Mount Google drive in the directory '/content/drive'
*  Drive contains dataset files

In [None]:
# Colab-specific: mount Google Drive at /content/drive so the dataset paths
# used by the later cells (all under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

### Install packages

*  `transformers` package
*  `datasets` package
*  pip installs each package's dependencies automatically; the pretrained model weights are downloaded later by `from_pretrained`.

In [None]:
!pip install transformers

In [None]:
!pip install datasets

### Imports

In [None]:
import os
from collections import Counter

import csv
import pandas as pd

from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import tensorflow as tf

import datasets
from datasets import Dataset
from datasets import ClassLabel, Value

import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

### Preprocessing dataset

- Read Datasets
- Change label types

In [None]:
# Path to datasets
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/augmented_training_data/augmented_training.tsv"
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/augmented_synonym_training_data/augmented_synonym_training.tsv"
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/oversampled_training_data/oversampled_training.tsv"
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/augmented_oversampled_training_data/augmented_oversampled_training.tsv"
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/undersampled_training_data/undersampled_training.tsv"
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/merged_train_validation_data/augmented_train_val_data.tsv"
training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/merged_training_dataset.tsv"
validation_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/validation_data/merged_validation_dataset.tsv"
reference_set = "/content/drive/MyDrive/Dissertacao/IMI_WEBRADR_Reference_Dataset/T1_MOESM_dataset.tsv"

# Integer id of each class name, used when aligning the dataset labels.
label_ids = {"NoADE": 0, "ADE": 1}


def _encode_labels(frame, negative_label, source):
    """Replace the string labels in ``frame["label"]`` with 0/1 integers.

    Args:
        frame: DataFrame with a ``label`` column. The string labels are
            overwritten in place before an integer-typed copy is returned.
        negative_label: Spelling of the negative class to match in this frame
            (this cell matches "noADE" for the challenge files and "NoADE"
            for the reference file).
        source: Path the frame was read from; only used in the error message.

    Returns:
        A DataFrame whose ``label`` column has dtype int (1 = ADE, 0 = negative).

    Raises:
        ValueError: If some label cannot be converted to an integer, for
            example because it is spelled differently from ``negative_label``.
    """
    frame.loc[frame["label"] == "ADE", "label"] = 1  # Positive classification
    frame.loc[frame["label"] == negative_label, "label"] = 0  # Negative classification
    try:
        return frame.astype({"label": int})
    except ValueError as err:
        # Name the file and the offending values instead of surfacing the
        # bare int() conversion error.
        leftovers = [value for value in frame["label"].unique() if value not in (0, 1)]
        raise ValueError(
            f"Unexpected label value(s) {leftovers!r} in {source}"
        ) from err


def _to_classlabel_dataset(frame):
    """Build a Dataset from ``frame`` whose ``label`` feature is a ClassLabel."""
    hf_dataset = Dataset.from_pandas(frame)
    features = hf_dataset.features.copy()
    features["label"] = ClassLabel(names=['NoADE', 'ADE'])
    hf_dataset = hf_dataset.cast(features)
    return hf_dataset.align_labels_with_mapping(label_ids, "label")


df = pd.read_csv(training_set, sep='\t', quoting=csv.QUOTE_NONE)
validation_df = pd.read_csv(validation_set, sep='\t', quoting=csv.QUOTE_NONE)
reference_df = pd.read_csv(reference_set, sep='\t', quoting=csv.QUOTE_NONE, engine='python')

# Raw (not yet encoded) label of one reference row, printed as a spot check.
print(reference_df["label"][225])

df = _encode_labels(df, "noADE", training_set)
validation_df = _encode_labels(validation_df, "noADE", validation_set)
reference_df = _encode_labels(reference_df, "NoADE", reference_set)

dataset = _to_classlabel_dataset(df)
validation_dataset = _to_classlabel_dataset(validation_df)
reference_dataset = _to_classlabel_dataset(reference_df)

# Feature schemas, kept under the names this cell has always defined.
new_features = dataset.features
val_new_features = validation_dataset.features
ref_new_features = reference_dataset.features

print(dataset.features)
print(validation_dataset.features)
print(reference_dataset.features)

### Validation with the challenge validation Dataset (Trainer/Pytorch)

- Define transformer model to be used in classification
- Encode the dataset with the embeddings related to the used model
- Define model training parameters
- Write a log with the model, some parameters and the calculated metrics (not implemented in the cells below: `output_log_file` is defined but nothing is written to it)
- Save the trained models on Google Drive

In [None]:
# If using bertweet-base
!pip3 install emoji==0.6.0

In [None]:
output_log_file = "log.csv"

# Checkpoint to fine-tune: leave exactly one assignment uncommented.
#model_checkpoint = "bert-base-uncased"
#model_checkpoint = "bert-large-uncased"
#model_checkpoint = "roberta-base"
#model_checkpoint = "roberta-large"
model_checkpoint = "vinai/bertweet-base"
#model_checkpoint = "vinai/bertweet-large"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_data(examples):
    """Tokenize the ``text`` entries of a batch with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


encoded_dataset = dataset.map(preprocess_data, batched=True)
encoded_val_dataset = validation_dataset.map(preprocess_data, batched=True)

# Columns present after tokenization that the raw dataset did not have.
pre_tokenizer_columns = set(dataset.features)
tokenizer_columns = list(
    set(encoded_dataset.features).difference(pre_tokenizer_columns)
)

In [None]:
# Fine-tuning hyperparameters consumed by the TrainingArguments in the training cell.
num_epochs = 3  # passed as num_train_epochs
batch_size = 32  # used for both per-device train and eval batch sizes
init_lr = 2e-5  # passed as learning_rate
num_warmup_steps = 0  # passed as warmup_steps
fold = 0  # not read by any of the cells visible in this notebook

In [None]:
import inspect


def _accepted_kwarg(target, preferred, legacy):
    """Return ``preferred`` if ``target`` accepts it as a keyword, else ``legacy``."""
    return preferred if preferred in inspect.signature(target).parameters else legacy


print("\nTraining Model\n")

encoded_training_dataset = encoded_dataset
encoded_validation_dataset = encoded_val_dataset

# transformers is installed without a version pin, and two keywords used here
# were renamed in newer releases (evaluation_strategy -> eval_strategy,
# Trainer's tokenizer -> processing_class). Pick whichever name the installed
# version accepts so the cell runs on both old and new releases.
eval_strategy_kwarg = _accepted_kwarg(TrainingArguments, "eval_strategy", "evaluation_strategy")
tokenizer_kwarg = _accepted_kwarg(Trainer, "processing_class", "tokenizer")

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/results",
    learning_rate=init_lr,
    do_train=True,
    do_eval=True,
    save_strategy="epoch",  # one checkpoint per epoch under output_dir
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    warmup_steps=num_warmup_steps,
    logging_dir="/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/logs",
    **{eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_training_dataset,
    eval_dataset=encoded_validation_dataset,
    data_collator=data_collator,
    # The tokenizer is still handed to the Trainer: the later cells load it
    # back from the checkpoint directory with AutoTokenizer.from_pretrained.
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()
    

### Loading model from Google drive and predict validation data

- Load model from Drive
- Insert model into text classification pipeline
- Output the values of the predictions against the validation set

In [None]:
# Checkpoint directory located under the Trainer's output_dir; the step number
# in its name is hard-coded.
model_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/results/checkpoint-1632"

# Tokenizer and model are both restored from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
#model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

In [None]:
from transformers import TextClassificationPipeline

# The results cell compares two scores per tweet, so the pipeline is built
# with return_all_scores=True instead of the top_k=1 variant kept below.
#pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=1)
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

In [None]:
l = []
validation_texts = validation_df["text"]

# Classify the validation tweets one at a time, collecting the raw pipeline output.
for i in range(len(validation_df)):
    # Progress indicator: echo the row number every 100 rows, except at row 0.
    if i and not i % 100:
        print(i)
    l.append(pipe(validation_texts[i]))

In [None]:
#Run if previous cell has error
# Each pipe(text) call may return the per-label scores wrapped in an extra
# list ([[{...}, {...}]]). Unwrap only entries that are still nested, so
# running this cell more than once leaves already-flat entries untouched.
l = [
    item[0] if isinstance(item, list) and item and isinstance(item[0], list) else item
    for item in l
]
l[1][1]["score"]

In [None]:
#validation_df

results_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/challenge_valdation_results.tsv"

# newline='' lets the csv module control the line endings itself, as the csv
# documentation requires for files handed to csv.writer.
with open(results_file, 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for i in range(len(validation_df)):
        scores = l[i]
        # pipe(text) may wrap the per-label scores in an extra list; accept
        # both the nested and the already-flattened shape.
        if isinstance(scores[0], list):
            scores = scores[0]
        # Index 0 is treated as the noADE score and index 1 as the ADE score;
        # a tie falls back to "noADE".
        label = "ADE" if scores[1]["score"] > scores[0]["score"] else "noADE"
        tsv_writer.writerow([validation_df["tweet_id"][i], label, "0", "0", "span"])



### Loading model from Google drive and predict test data

- Read test data
- Load model from Drive
- Insert model into text classification pipeline
- Output the values of the predictions against the test set

In [None]:
test_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/test_data/test_tweets_unannotated.tsv"

# Read with header=None, so the columns are addressed by position (0, 1, ...)
# in the cells that follow.
test_df = pd.read_csv(
    test_set,
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Checkpoint directory located under the Trainer's output_dir; the step number
# in its name is hard-coded.
model_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/results/checkpoint-2202"

# Tokenizer and model are both restored from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
#model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

In [None]:
from transformers import TextClassificationPipeline

# The results cell compares two scores per tweet, so the pipeline is built
# with return_all_scores=True instead of the top_k=1 variant kept below.
#pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=1)
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

In [None]:
l = []
test_texts = test_df[1]  # column 1 is the one passed to the classifier

# Classify the test tweets one at a time, collecting the raw pipeline output.
for i in range(len(test_df)):
    # Progress indicator: echo the row number every 100 rows, except at row 0.
    if i and not i % 100:
        print(i)
    l.append(pipe(test_texts[i]))

In [None]:
#test_df

results_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/challenge_test_results.tsv"

# newline='' lets the csv module control the line endings itself, as the csv
# documentation requires for files handed to csv.writer.
with open(results_file, 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for i in range(len(test_df)):
        scores = l[i]
        # pipe(text) may wrap the per-label scores in an extra list, and no
        # cell in the test section flattens it; unwrap here so both the
        # nested and the flat shape work.
        if isinstance(scores[0], list):
            scores = scores[0]
        # Index 0 is treated as the noADE score and index 1 as the ADE score;
        # a tie falls back to "noADE".
        label = "ADE" if scores[1]["score"] > scores[0]["score"] else "noADE"
        # Column 0 of the header-less test file is written out as the tweet id.
        tsv_writer.writerow([test_df[0][i], label, "0", "0", "span"])



-------------------------------------------------------

### Reset Log file

- Reset the log file
- Only uncomment and run this cell to reset the log file

In [None]:
# Reset Log File
# The writer code below is deliberately left commented out. Uncommenting it
# opens log.csv in 'w' mode (overwriting it) and writes only the header row.
import csv

output_log_file = "log.csv"
#with open(output_log_file, 'w') as log:
#        csv_writer = csv.writer(log)
#        csv_writer.writerow(['Model', 'Batch_size', 'Init_lr', 'Warmup_steps', 'Fold', 'Epochs', 'Precision', 'Recall', 'F1-score' ])