### Mount Google drive

*  Mount Google Drive at the directory '/content/drive'
*  The Drive contains the dataset files

In [None]:
# Mounting Google Drive files
# Colab-only step: exposes the Drive contents under /content/drive, which is
# the prefix of the dataset and log paths used later in this notebook.

from google.colab import drive
drive.mount('/content/drive')

### Install packages

*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. It is currently not needed when running the script in Kaggle.
*  `transformers` package
*  `datasets` package
*  pip will install all models and dependencies automatically.

In [None]:
# Provides the tokenizer, model, TrainingArguments and Trainer classes imported in the cells below.
!pip install transformers

In [None]:
# Provides Dataset, ClassLabel and the metric loaders used in the cells below.
!pip install datasets

In [None]:
# Presumably required by the HyperOptSearch import in the Trainer cell — TODO confirm it is still needed,
# since the search below does not pass a search algorithm.
!pip install hyperopt

In [None]:
!pip install -U ray

In [None]:
# NOTE(review): pickle5 is not imported directly anywhere in this notebook; presumably a
# transitive requirement of the Ray setup on older Python versions — confirm it is still needed.
!pip install pickle5

### Imports

In [None]:
import os

import csv
import pandas as pd

from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback

import tensorflow as tf

import datasets
from datasets import Dataset
from datasets import ClassLabel, Value

import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

### Preprocessing dataset

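- Read the training dataset (merged, augmented or oversampled variant) and the reference dataset
- Convert the string labels (`NoADE` / `ADE`) to integer class ids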

In [None]:
# Path to datasets
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/augmented_training_data/augmented_training.tsv"
#training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/oversampled_training_data/oversampled_training.tsv"
training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/merged_training_dataset.tsv"
reference_set = "/content/drive/MyDrive/Dissertacao/IMI_WEBRADR_Reference_Dataset/T1_MOESM_dataset.tsv"

# Class name -> integer id (1 = positive classification, 0 = negative classification).
label_ids = {"NoADE": 0, "ADE": 1}


def read_labelled_tsv(path, **read_csv_kwargs):
    """Read a tab-separated dataset and encode its "label" column as integers.

    Args:
        path: Location of the TSV file.
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A DataFrame whose "label" column has integer dtype, with "ADE" mapped
        to 1 and "NoADE" mapped to 0.

    Raises:
        ValueError: If a label is neither a known class name nor convertible
            to an integer (raised by the final ``astype``).
    """
    frame = pd.read_csv(path, sep='\t', quoting=csv.QUOTE_NONE, **read_csv_kwargs)
    for class_name, class_id in label_ids.items():
        frame.loc[frame["label"] == class_name, "label"] = class_id
    return frame.astype({"label": int})


def to_classlabel_dataset(frame):
    """Convert a label-encoded DataFrame into a Dataset with a ClassLabel feature.

    Args:
        frame: DataFrame with an integer "label" column.

    Returns:
        A ``datasets.Dataset`` whose "label" feature is a ``ClassLabel`` with
        names ['NoADE', 'ADE'], aligned with ``label_ids``.
    """
    converted = Dataset.from_pandas(frame)
    features = converted.features.copy()
    # Order the class names by id so that name index == integer label.
    features["label"] = ClassLabel(names=sorted(label_ids, key=label_ids.get))
    converted = converted.cast(features)
    return converted.align_labels_with_mapping(label_ids, "label")


df = read_labelled_tsv(training_set)
# The reference file is parsed with the slower but more tolerant python engine.
reference_df = read_labelled_tsv(reference_set, engine='python')

dataset = to_classlabel_dataset(df)
reference_dataset = to_classlabel_dataset(reference_df)

# Feature schemas after the ClassLabel cast, kept under their original names.
new_features = dataset.features.copy()
ref_new_features = reference_dataset.features.copy()

print(dataset.features)
print(reference_dataset.features)

### Validation with the Reference Dataset (with hyperparameter optimization)

- Define transformer model to be used in classification
- Encode the dataset with the embeddings related to the used model

In [None]:
# CSV file that compute_metrics appends one row to per evaluation.
# Switch to the local path when not running with Google Drive mounted.
#output_log_file = "log.csv"
output_log_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/log_test.csv"

# Pretrained checkpoint to fine-tune; uncomment exactly one alternative to switch models.
model_checkpoint = "bert-base-uncased"
#model_checkpoint = "bert-large-uncased"
#model_checkpoint = "roberta-base"
#model_checkpoint = "roberta-large"
#model_checkpoint = "vinai/bertweet-base"
#model_checkpoint = "vinai/bertweet-large"

# Tokenizer matching the selected checkpoint; used by preprocess_data and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_data(examples, max_length=160):
    """Tokenize a batch of examples into fixed-length model inputs.

    Args:
        examples: Batch mapping with a "text" entry holding the raw strings
            (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_length: Number of tokens every encoded sequence is padded or
            truncated to.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        # Without truncation, texts longer than max_length keep their full
        # length and the encoded batch is no longer rectangular.
        truncation=True,
        max_length=max_length,
    )

# Tokenize the training split first, then the reference split, in batches.
encoded_dataset, encoded_ref_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (dataset, reference_dataset)
)

# Columns that exist only after tokenization (i.e. those the tokenizer added).
pre_tokenizer_columns = set(dataset.features)
tokenizer_columns = list(
    set(encoded_dataset.features).difference(pre_tokenizer_columns)
)

# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(return_tensors="tf", tokenizer=tokenizer)

- Define metrics to be calculated
- Define model training parameters
- Write a log with the model, some parameters and the calculated metrics

In [None]:
# Metric objects used by compute_metrics.
# NOTE(review): recent `datasets` releases deprecate `load_metric` in favour of the
# separate `evaluate` package — confirm the installed version still provides it.
metric_f1 = datasets.load_metric('f1')
metric_precision = datasets.load_metric('precision')
metric_recall = datasets.load_metric('recall')

# These values are written to each log row by compute_metrics. The Trainer cell
# further down configures training through its own TrainingArguments and does not
# read them, so they describe the run only if kept in sync by hand.
num_epochs = 3
batch_size = 32
init_lr = 2e-5
num_warmup_steps = 0
fold = 0
# Not referenced by any other cell visible in this notebook.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def compute_metrics(eval_predictions):
    """Compute F1, precision and recall and append them to the CSV log.

    Args:
        eval_predictions: Pair ``(predictions, labels)`` where ``predictions``
            holds per-class scores (the argmax over axis 1 is taken as the
            predicted class) and ``labels`` holds the reference class ids.

    Returns:
        Dict with the keys "f1", "precision" and "recall".

    Side effects:
        Appends one row to ``output_log_file`` containing the module-level
        run description (model, batch size, learning rate, warmup steps, fold,
        epochs) followed by precision, recall and F1.
    """
    scores, labels = eval_predictions
    predicted_classes = np.argmax(scores, axis=1)

    f1 = metric_f1.compute(predictions=predicted_classes, references=labels)["f1"]
    precision = metric_precision.compute(predictions=predicted_classes, references=labels)["precision"]
    recall = metric_recall.compute(predictions=predicted_classes, references=labels)["recall"]

    # Save performance in log. newline='' lets the csv module control line
    # endings itself, as its documentation requires for files it writes.
    with open(output_log_file, 'a', newline='') as log:
        csv.writer(log).writerow([
            model_checkpoint, batch_size, init_lr, num_warmup_steps, fold,
            num_epochs, precision, recall, f1,
        ])

    return {"f1": f1, "precision": precision, "recall": recall}

- Train the model using the reference dataset as the validation dataset
- Use the `Trainer` to try to optimize the hyperparameters
- Right now, in Google Colab, this uses up all the available memory and the runtime shuts down

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

import ray
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler
from ray import tune
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import (
    download_data,
    build_compute_metrics_fn,
)
from ray.tune.schedulers import PopulationBasedTraining

#metric_eval_acc = datasets.load_metric('eval_acc')

#def compute_metrics_eval_acc(eval_predictions): 
#    predictions, labels = eval_predictions
#    predictions = np.argmax(predictions, axis=1)
#    eval_d = metric_eval_acc.compute(predictions=predictions, references=labels)
#    d = {"eval_acc": eval_d["eval_acc"]}
#    return d

#training_args = TrainingArguments(output_dir=".", evaluation_strategy="epoch")

# Baseline training arguments. Values marked "config" are starting points that
# the hyperparameter search may override for each trial.
training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
        skip_memory_metrics=True,
        report_to="none",
    )

# Population Based Training: after every evaluation, trials are compared on
# eval_f1 and the listed hyperparameters are perturbed or resampled.
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_f1",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": [16, 32, 64],
    },
)

encoded_training_dataset = encoded_dataset
encoded_validation_dataset = encoded_ref_dataset


def model_init():
    """Build a fresh classifier for one trial.

    Uses ``model_checkpoint`` so the model always matches the tokenizer that
    encoded the datasets.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, return_dict=True, num_labels=2)


def f1_objective(metrics):
    """Return the value the search optimizes: F1 on the validation set.

    The Trainer reports the keys returned by compute_metrics with an "eval_"
    prefix, which is also the name the scheduler above is configured with.
    """
    return metrics["eval_f1"]


# No model is loaded eagerly here: the Trainer instantiates one per trial
# through model_init, which keeps an unused copy out of memory.
trainer = Trainer(
    model_init=model_init,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=encoded_training_dataset,
    eval_dataset=encoded_validation_dataset,
    compute_metrics=compute_metrics,
)

# direction and compute_objective are set explicitly: the defaults minimize the
# sum of all reported metrics, which would pick the worst-scoring trial here.
best_run = trainer.hyperparameter_search(
    compute_objective=f1_objective,
    direction="maximize",
    backend="ray",
    n_trials=3,
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    log_to_file=True,
)

# Display the winning trial (run id, objective value and hyperparameters).
best_run



-------------------------------------------------

### Reset Log file

- Reset the log file
- Only uncomment and run this cell to reset the log file

In [None]:
# Reset Log File
import csv

# Column names, in the same order as the rows appended by compute_metrics.
LOG_HEADER = ['Model', 'Batch_size', 'Init_lr', 'Warmup_steps', 'Fold', 'Epochs', 'Precision', 'Recall', 'F1-score']


def reset_log_file(path):
    """Truncate the log at ``path`` and write only the header row.

    Args:
        path: CSV file to reset. Any existing content is discarded.
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='') as log:
        csv.writer(log).writerow(LOG_HEADER)


# Uncomment to reset the log. Passing output_log_file resets the same file the
# metrics are appended to, instead of rebinding the path to a different file.
#reset_log_file(output_log_file)