### Mount Google drive

*  Mount Google drive in the directory '/content/drive'
*  Drive contains dataset files

In [None]:
from google.colab import drive

# Every dataset, model and result path used in this notebook lives under this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

### Install packages

*  `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. It is currently not needed when running the script in Kaggle.
*  `transformers` package
*  `datasets` package
*  pip will install all models and dependencies automatically.

In [None]:
!pip install transformers

In [None]:
!pip install datasets

### Imports

In [None]:
import os
from collections import Counter

import csv
import pandas as pd

from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import tensorflow as tf

import datasets
from datasets import Dataset
from datasets import ClassLabel, Value

import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

### Preprocessing dataset

- Read the augmented dataset
- Change label types

In [None]:
# Locations of the merged training and validation TSV files on the mounted Drive
training_set = "/content/drive/MyDrive/Dissertacao/Subtask_1c/training_data/merged_training_dataset.tsv"
validation_set = "/content/drive/MyDrive/Dissertacao/Subtask_1c/validation_data/merged_validation_dataset.tsv"

# Both files are tab-separated; QUOTE_NONE makes the parser treat quote characters as plain text
tsv_read_options = {"sep": '\t', "quoting": csv.QUOTE_NONE}

# Load each split and cast its "label" column to int
df = pd.read_csv(training_set, **tsv_read_options).astype({"label": int})
validation_df = pd.read_csv(validation_set, **tsv_read_options).astype({"label": int})

# Map every raw MedDRA ID to a contiguous class index (0, 1, 2, ...).
# The ID -> index table is completed first and each label column is then
# translated in a single pass. Rewriting the column in place one ID at a time
# could re-map a row twice whenever a freshly assigned index happens to equal a
# raw ID that has not been processed yet, and it scans the column once per ID.
med_id_dict = {}

train_id_set = set(Counter(df["label"]).keys())
map_id = 0
# The loop variable is deliberately not called `id`: that would shadow the
# builtin for every later cell of the notebook.
for meddra_id in train_id_set:
  med_id_dict[meddra_id] = map_id
  map_id = map_id + 1
df["label"] = df["label"].map(med_id_dict).astype(int)

print("Training set has", len(train_id_set), "unique MEDDRA IDs.")

# IDs that only occur in the validation split continue the numbering;
# IDs already seen in training reuse their existing index.
val_id_set = set(Counter(validation_df["label"]).keys())
for meddra_id in val_id_set:
  if meddra_id not in med_id_dict:
    med_id_dict[meddra_id] = map_id
    map_id = map_id + 1
validation_df["label"] = validation_df["label"].map(med_id_dict).astype(int)

print("Validation set has", len(val_id_set), "unique MEDDRA IDs.")

# Wrap both (already re-labelled) frames as Hugging Face datasets
dataset = Dataset.from_pandas(df)
validation_dataset = Dataset.from_pandas(validation_df)

# map_id is the next free class index, i.e. the number of distinct labels so far
num_labels = map_id
print("Both sets have", (num_labels), "unique MEDDRA IDs.")

# Show the inferred schema of each split
for split in (dataset, validation_dataset):
  print(split.features)

### WEBRADR Reference Dataset Pre-processing

- Processing MedDRA dataset
- Matching WEBRADR mentions to the MedDRA IDs

In [None]:
meddra_path = "/content/drive/MyDrive/Dissertacao/Subtask_1c/meddra/meddra.tsv"

# The file has no header row, so columns are addressed by position
meddra_df = pd.read_csv(meddra_path, sep='\t', quoting=csv.QUOTE_NONE, header=None)

# Lower-cased term -> ID, built only from rows whose column 1 equals "PT"
meddra_dict = {}

meddra_rows = zip(
  meddra_df[1].to_numpy(),
  meddra_df[2].to_numpy(),
  meddra_df[3].to_numpy(),
)
for row_kind, term_id, raw_term in meddra_rows:
  if row_kind != "PT":
    continue
  term = raw_term.lower()
  if term in meddra_dict:
    # A later duplicate overwrites the earlier entry; just report it
    print("Repeated term appeared")
  meddra_dict[term] = term_id



In [None]:
reference_set = "/content/drive/MyDrive/Dissertacao/IMI_WEBRADR_Reference_Dataset/T2_MOESM_dataset.tsv"

reference_df = pd.read_csv(reference_set, sep='\t', quoting=csv.QUOTE_NONE)

# Bookkeeping for preferred terms that have no entry in meddra_dict
not_found = 0
not_found_set = set()

# Column-wise store of the reference rows whose preferred term could be resolved
reference_dataset_dict = {"tweet_id":[], "text":[], "label":[], "span":[]}

reference_rows = zip(
  reference_df["tweet_id"].to_numpy(),
  reference_df["text"].to_numpy(),
  reference_df["preferred_term"].to_numpy(),
  reference_df["span"].to_numpy(),
)
for ref_tweet_id, ref_text, ref_term, ref_span in reference_rows:
  lookup_term = ref_term.lower()
  if lookup_term in meddra_dict:
    reference_dataset_dict["tweet_id"].append(ref_tweet_id)
    reference_dataset_dict["text"].append(ref_text)
    reference_dataset_dict["label"].append(meddra_dict[lookup_term])
    reference_dataset_dict["span"].append(ref_span)
  else:
    # Unresolvable rows are counted and skipped
    not_found = not_found + 1
    not_found_set.add(lookup_term)

print("Found:", len(reference_df) - not_found)
print("Not found:", not_found)
print("Unique not found:", len(not_found_set))


### (Pre-requisite to) Adding other datasets to training data

In [None]:
# Column-wise copy of the training frame, kept as plain lists so that other
# datasets can be appended to it later
dataset_dict = {
  column: list(df[column].to_numpy())
  for column in ("text", "label", "span")
}


#### Adding validation dataset to training data

In [None]:
# Column-wise copy of the validation frame, in the same layout as dataset_dict
val_dataset_dict = {
  column: list(validation_df[column].to_numpy())
  for column in ("text", "label", "span")
}

In [None]:
# Append the validation rows after the training rows, column by column
for column in ("text", "label", "span"):
  dataset_dict[column] = dataset_dict[column] + val_dataset_dict[column]

# Rebuild the training dataset from the enlarged columns and show its schema
dataset = Dataset.from_dict(dataset_dict)

print(dataset.features)

#### Adding WEBRADR data to training data

In [None]:
# Translate the WEBRADR labels from raw MedDRA IDs to class indices.
# The ID -> index table is extended first and the label list is then translated
# in one pass. Rebuilding the whole list once per unique ID was O(rows * IDs)
# and could re-map an entry twice when an assigned index equalled a raw ID that
# was processed later.
ref_id_set = set(Counter(reference_dataset_dict["label"]).keys())

# The loop variable is deliberately not called `id`, which would shadow the builtin
for meddra_id in ref_id_set:
  if meddra_id not in med_id_dict:
    # ID unseen in the training/validation data: give it the next free index
    med_id_dict[meddra_id] = map_id
    map_id = map_id + 1

reference_dataset_dict["label"] = [med_id_dict[raw_id] for raw_id in reference_dataset_dict["label"]]

print("WEBRADR reference dataset has", len(ref_id_set), "unique MEDDRA IDs.")
print("All sets have", (map_id), "unique MEDDRA IDs.")

In [None]:
# map_id is the next free class index, so it equals the total number of classes
num_labels = map_id

# Append the WEBRADR rows after the rows already collected, column by column
for column in ("text", "label", "span"):
  dataset_dict[column] = dataset_dict[column] + reference_dataset_dict[column]

# Rebuild the training dataset from the enlarged columns and show its schema
dataset = Dataset.from_dict(dataset_dict)

print(dataset.features)

### Validation with the challenge validation Dataset (Trainer)

- Define transformer model to be used in classification
- Encode the dataset with the embeddings related to the used model

In [None]:
# If using bertweet-base
!pip3 install emoji==0.6.0

In [None]:
# Checkpoint to fine-tune. To try another model, comment out the active line
# and uncomment one of the alternatives.
#model_checkpoint = "bert-base-uncased"
#model_checkpoint = "bert-large-uncased"
#model_checkpoint = "roberta-large"
#model_checkpoint = "vinai/bertweet-base"
#model_checkpoint = "vinai/bertweet-large"
model_checkpoint = "roberta-base"

# Tokenizer that matches the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Name of the CSV log file
output_log_file = "log.csv"

def preprocess_data_span(examples):
  """Tokenize the "span" field of `examples` with truncation enabled."""
  spans = examples["span"]
  return tokenizer(spans, truncation=True)

def preprocess_data(examples):
    """Tokenize the "text" field of `examples` with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Alternative: encode the full tweet text instead of the annotated span
#encoded_dataset = dataset.map(preprocess_data, batched = True)
#encoded_val_dataset = validation_dataset.map(preprocess_data, batched = True)

# Active setup: both splits are encoded from their "span" column
encoded_dataset, encoded_val_dataset = (
  split.map(preprocess_data_span, batched=True)
  for split in (dataset, validation_dataset)
)

# Columns that exist only after tokenization
pre_tokenizer_columns = set(dataset.features)
tokenizer_columns = list(set(encoded_dataset.features).difference(pre_tokenizer_columns))

# Collator built from the same tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



- Define model training parameters
- Train the model
- Write a log with the model, some parameters and the calculated metrics
- Save the trained models on Google Drive

In [None]:
# Fine-tuning hyper-parameters read by the training cell
init_lr = 2e-5
batch_size = 32
num_epochs = 3
num_warmup_steps = 0

# Fold index (not referenced by the training cell below)
fold = 0

#num_labels = len(Counter(df['label']))
print("Number of labels:", num_labels)

In [None]:
# Banner framed by one blank line above and below
print("", "Training Model", "", sep="\n")

encoded_training_dataset = encoded_dataset
encoded_validation_dataset = encoded_val_dataset

# Drive folders that receive the Trainer outputs, logs and the final checkpoint
results_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results"
logs_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/logs"
checkpoint_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results/checkpoint"

# Evaluate once per epoch and write no intermediate checkpoints.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir=results_dir,
    logging_dir=logs_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="no",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=init_lr,
    warmup_steps=num_warmup_steps,
    weight_decay=0.01,
)

# Classification model sized to the number of distinct labels
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_training_dataset,
    eval_dataset=encoded_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Store the fine-tuned model where the prediction cells load it from
trainer.save_model(checkpoint_dir)



- Save the dictionary that maps each MedDRA ID (key) to its numeric class ID, so that it can be loaded in the next step to decode the predictions

In [None]:
# Persist the raw MedDRA ID -> class index mapping as a two-column TSV
# (no header) so that the prediction cells can translate model outputs back.
# newline='' leaves line endings entirely to the csv writer, as the csv module
# documentation recommends for files handed to csv.writer.
with open("/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results/med_label_dict.tsv", 'wt', newline='') as out_file:
  tsv_writer = csv.writer(out_file, delimiter='\t')
  # One row per (MedDRA ID, class index) pair
  tsv_writer.writerows(med_id_dict.items())

### Loading model from Google drive and predict validation data

- Load model from Drive
- Load MedDRA label mapping from Drive
- Insert model into text classification pipeline
- Output the values of the predictions against the validation set

In [None]:
med_id_dict_file = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results/med_label_dict.tsv"

# Two header-less columns: column 0 is the MedDRA ID, column 1 the class index
med_id_dict_df = pd.read_csv(med_id_dict_file, sep='\t', quoting=csv.QUOTE_NONE, header=None)

stored_meddra_ids = med_id_dict_df[0].to_numpy()
stored_class_ids = med_id_dict_df[1].to_numpy()

# Forward (MedDRA ID -> class index) and inverse (class index -> MedDRA ID) lookups
med_id_dict = {
  meddra_id: int(class_id)
  for meddra_id, class_id in zip(stored_meddra_ids, stored_class_ids)
}
inv_med_id_dict = {
  class_id: int(meddra_id)
  for meddra_id, class_id in zip(stored_meddra_ids, stored_class_ids)
}

In [None]:
# Same folder that the training cell passed to trainer.save_model
model_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results/checkpoint"

# Tokenizer and model are both read from that folder
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
#model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

In [None]:
from transformers import TextClassificationPipeline

# Alternative that requests the scores of every class:
#pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

# Active setup: request only the single best label per input (top_k=1)
pipe = TextClassificationPipeline(
  model=model,
  tokenizer=tokenizer,
  top_k=1,
)



In [None]:
# Raw pipeline output for each validation row, in row order
l = []

for row_number, span_text in enumerate(validation_df["span"]):
  # Progress indicator every 100 rows (row 0 is skipped)
  if row_number and row_number % 100 == 0:
    print(row_number)
  # Use when model was trained with encoded tweets
  #l.append(pipe(validation_df["text"][i]))
  # Use when model was trained with encoded spans
  l.append(pipe(span_text))

In [None]:
# Each raw entry is indexed with [0][0] to reach the dict that holds the 'label' string
l = [raw_prediction[0][0] for raw_prediction in l]
# Sanity check: the class index is whatever follows the first six characters of the label
int(l[1]['label'][6:])

In [None]:
#validation_df

results_file = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/challenge_valdation_results.tsv"

# Write one TSV row per validation example:
# tweet_id, the constant "ADE", start, end, span, predicted MedDRA ID.
# newline='' leaves line endings entirely to the csv writer, as the csv module
# documentation recommends for files handed to csv.writer.
with open(results_file, 'wt', newline='') as out_file:
  tsv_writer = csv.writer(out_file, delimiter='\t')
  for i in range(len(validation_df)):
    label = "ADE"
    # The class index is the part of the predicted label after its first six characters
    med_id = int(l[i]['label'][6:])
    # Translate the class index back to the original MedDRA ID
    true_med_id = inv_med_id_dict[med_id]
    tsv_writer.writerow([validation_df["tweet_id"][i], label, validation_df["start"][i], validation_df["end"][i], validation_df["span"][i], true_med_id])



### Loading model from Google drive and predict test data

- Read test data
- Load model from Drive
- Load MedDRA label mapping from Drive
- Insert model into text classification pipeline
- Output the values of the predictions against the test set

In [None]:
# Test rows are read from the Subtask 1b output file
test_set = "/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/final_test_results.tsv"

# The file has no header row, so columns are addressed by position
test_df = pd.read_csv(
  test_set,
  sep='\t',
  quoting=csv.QUOTE_NONE,
  header=None,
)

In [None]:
med_id_dict_file = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results/med_label_dict.tsv"

# Two header-less columns: column 0 is the MedDRA ID, column 1 the class index
med_id_dict_df = pd.read_csv(med_id_dict_file, sep='\t', quoting=csv.QUOTE_NONE, header=None)

stored_meddra_ids = med_id_dict_df[0].to_numpy()
stored_class_ids = med_id_dict_df[1].to_numpy()

# Forward (MedDRA ID -> class index) and inverse (class index -> MedDRA ID) lookups
med_id_dict = {
  meddra_id: int(class_id)
  for meddra_id, class_id in zip(stored_meddra_ids, stored_class_ids)
}
inv_med_id_dict = {
  class_id: int(meddra_id)
  for meddra_id, class_id in zip(stored_meddra_ids, stored_class_ids)
}

In [None]:
# Same folder that the training cell passed to trainer.save_model
model_dir = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/results/checkpoint"

# Tokenizer and model are both read from that folder
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
#model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

In [None]:
from transformers import TextClassificationPipeline

# Alternative that requests the scores of every class:
#pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

# Active setup: request only the single best label per input (top_k=1)
pipe = TextClassificationPipeline(
  model=model,
  tokenizer=tokenizer,
  top_k=1,
)

In [None]:
# Raw pipeline output for each test row, in row order
l = []

# Column 4 of the header-less test file is the text passed to the pipeline
for row_number, span_text in enumerate(test_df[4]):
  # Progress indicator every 100 rows (row 0 is skipped)
  if row_number and row_number % 100 == 0:
    print(row_number)
  # Use when model was trained with encoded tweets
  #l.append(pipe(validation_df["text"][i]))
  # Use when model was trained with encoded spans
  l.append(pipe(span_text))

In [None]:
# Each raw entry is indexed with [0][0] to reach the dict that holds the 'label' string
l = [raw_prediction[0][0] for raw_prediction in l]
# Sanity check: the class index is whatever follows the first six characters of the label
int(l[8]['label'][6:])

In [None]:
#test_df
results_file = "/content/drive/MyDrive/Dissertacao/Subtask_1c/outputs/final_test_results.tsv"

# Write one TSV row per test example: columns 0, 2, 3 and 4 of the test file
# are copied through, with the constant "ADE" in second position and the
# predicted MedDRA ID last.
# newline='' leaves line endings entirely to the csv writer, as the csv module
# documentation recommends for files handed to csv.writer.
with open(results_file, 'wt', newline='') as out_file:
  tsv_writer = csv.writer(out_file, delimiter='\t')
  for i in range(len(test_df)):
    label = "ADE"
    # The class index is the part of the predicted label after its first six characters
    med_id = int(l[i]['label'][6:])
    # Translate the class index back to the original MedDRA ID
    true_med_id = inv_med_id_dict[med_id]
    tsv_writer.writerow([test_df[0][i], label, test_df[2][i], test_df[3][i], test_df[4][i], true_med_id])



-------------------------------------------------------

### Reset Log file

- Reset the log file
- Only uncomment and run this cell to reset the log file

In [None]:
# Reset Log File
# As written, this cell only imports csv and sets the log file name.
# Uncommenting the three lines below would open the log in 'w' mode and
# write a single header row to it.
import csv

output_log_file = "log.csv"
#with open(output_log_file, 'w') as log:
#        csv_writer = csv.writer(log)
#        csv_writer.writerow(['Model', 'Batch_size', 'Init_lr', 'Warmup_steps', 'Fold', 'Epochs', 'Precision', 'Recall', 'F1-score' ])