### Mount Google drive

*  Mount Google drive in the directory '/content/drive'
*  Drive contains dataset files

In [None]:
# Mount Google Drive under /content/drive; every dataset path used below
# starts with /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

### Install packages

*  `transformers` package
*  `datasets` package
*  `nltk` and `seqeval` packages
*  pip will install all the packages and their dependencies automatically.

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install nltk

In [None]:
!pip install seqeval

### Imports

In [None]:
import os
import re

import csv
import pandas as pd

import nltk
from nltk import TweetTokenizer, wordpunct_tokenize, TreebankWordTokenizer

from transformers import DataCollatorWithPadding, DataCollatorForTokenClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

import tensorflow as tf

import datasets
from datasets import Dataset
from datasets import ClassLabel, Value

import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

### Preprocessing training dataset

- Read dataset
- Replace the HTML entity "&amp;" with "&"
- Group the ADE spans per tweet
- Tokenize the tweets
- Locate each ADE mention in the tweet and build a label array with the same length as the tokenized tweet
- The labels used were:
    - 0 - Not ADE mention
    - 1 - First token of ADE mention
    - 2 - Tokens inside of ADE mention


In [None]:
# Subtask 1b training files on the mounted Drive (both are header-less TSVs).
training_set_spans = "/content/drive/MyDrive/Dissertacao/Subtask_1b/training_data/train_spans.tsv"

training_set_tweets = "/content/drive/MyDrive/Dissertacao/Subtask_1b/training_data/train_tweets.tsv"

# QUOTE_NONE: quote characters inside a tweet are read as ordinary text.
df_spans = pd.read_csv(training_set_spans, sep='\t', quoting=csv.QUOTE_NONE, header=None)
df_tweets = pd.read_csv(training_set_tweets, sep='\t', quoting=csv.QUOTE_NONE, header=None)

# Lookup table: tweet_id (column 0) -> tweet text (column 1, coerced to str).
train_id_column = df_tweets[0]
train_text_column = df_tweets[1]
tweet_dict = {
    train_id_column[row]: str(train_text_column[row])
    for row in range(len(df_tweets))
}

In [None]:
# Group the annotated spans by tweet:
#   positive_tweets[tweet_id] = {"text": normalised tweet, "spans": [normalised span, ...]}
# "Normalised" means lower-cased with "&amp;" replaced by "&".
positive_tweets = {}

train_span_ids = df_spans[0]
train_span_texts = df_spans[4]

for row in range(len(df_spans)):
    tweet_id = train_span_ids[row]
    span_text = train_span_texts[row].lower().replace("&amp;", "&")

    entry = positive_tweets.get(tweet_id)
    if entry is None:
        # First span seen for this tweet: store its text alongside the span.
        positive_tweets[tweet_id] = {
            "text": tweet_dict[tweet_id].lower().replace("&amp;", "&"),
            "spans": [span_text],
        }
    else:
        entry["spans"].append(span_text)

In [None]:
def contains_sublist(lst, sublst):
    """Return every start index at which ``sublst`` occurs contiguously in ``lst``.

    Args:
        lst: List of tokens to search.
        sublst: List of tokens to look for.

    Returns:
        The start indices in ascending order (overlapping matches included).
        The result is empty when ``sublst`` is empty or longer than ``lst``.
    """
    n = len(sublst)
    # An empty pattern would otherwise "match" at every position, including
    # len(lst); callers use the indices to write labels[s], which would then
    # mark every token and finally raise IndexError.
    if n == 0:
        return []
    return [i for i in range(len(lst) - n + 1) if sublst == lst[i:i + n]]

In [None]:
# Build the token-level training set from the positive (ADE) tweets.
# dataset_dict holds three parallel lists: one entry per tweet.
dataset_dict = {}
dataset_dict["tokens"] = []
dataset_dict["tweet_id"] = []
dataset_dict["labels"] = []

tweet_tokenizer = TweetTokenizer()

for id in positive_tweets.keys():
    # A space is inserted after every '#' before tokenizing, in the tweet and
    # in the spans alike, so both are tokenized from the same form.
    tweet_tokens = tweet_tokenizer.tokenize(positive_tweets[id]["text"].replace("#", "# "))
    # Label scheme: 0 = not ADE, 1 = first token of a mention, 2 = inside a mention.
    labels = [0]*len(tweet_tokens)

    for span in positive_tweets[id]["spans"]:
        span_tokens = tweet_tokenizer.tokenize(span.replace("#", "# "))

        # Span tokens do not line up with tweet tokens: try to split tweet
        # tokens that merely *contain* a span token so that they can match.
        if (contains_sublist(tweet_tokens, span_tokens)==[]):
            i = 0
            while i < len(tweet_tokens):
                word = tweet_tokens[i]
                # Span tokens that occur as substrings of this tweet token.
                sps = [s for s in span_tokens if s in word]
                # Nothing to do: no span token inside, or the token already
                # has the same length as the single span token it contains.
                if len(sps)==0 or (len(sps)==1 and len(sps[0])==len(word)):
                    i = i + 1
                    continue
                elif len(sps)==1 and len(sps[0])!=len(word):
                    # Split the token in two at the boundary of the span token:
                    # [span_token, rest] when it is a prefix, otherwise
                    # [prefix, span_token + rest].
                    l = []
                    index = word.find(sps[0])
                    if (index==0):
                        l = [word[0:len(sps[0])], word[len(sps[0]):len(word)]]
                    else:
                        l = [word[0:index], word[index:len(word)]]
                    tweet_tokens.insert(i, l[0])
                    tweet_tokens[i+1] = l[1]
                    # The new first piece gets label 0; the label previously at
                    # position i now belongs to the second piece.
                    labels.insert(i,0)
                    # Advance by one so the second piece is examined next and
                    # can be split again if needed.
                    i = i + 1
                else:
                    # Reached when two or more span tokens occur inside this
                    # one tweet token; the token is left unsplit.
                    print("Span found in more than one token")
                    i = i + 1

        # Label every occurrence of the span; later spans overwrite earlier
        # labels where they overlap.
        for s in contains_sublist(tweet_tokens, span_tokens):
            labels[s:s+len(span_tokens)] = [2]*len(span_tokens)
            labels[s] = 1

        # Report spans that could not be located even after splitting.
        if (contains_sublist(tweet_tokens, span_tokens)==[]):
            print("Span not detected in text.")
            print("Text -> ", positive_tweets[id]["text"])
            print("Tokenized text -> ", tweet_tokens)
            print("Tokenized Span -> ",span_tokens)

    dataset_dict["labels"].append(labels)
    dataset_dict["tweet_id"].append(id)
    dataset_dict["tokens"].append(tweet_tokens)

#### Add negative examples to the dataset (Optional)

In [None]:
# Append the Subtask 1a tweets classified "noADE" to the training set as
# negative examples (every token labelled 0).
task_1a_training_class_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/train_class.tsv"
task_1a_training_tweet_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/training_data/train_tweets.tsv"

task_1a_class_df = pd.read_csv(task_1a_training_class_file, sep='\t', header=None)
# NOTE(review): this frame is loaded but the texts below are taken from
# tweet_dict (built from the Subtask 1b tweet file) -- confirm that file
# contains every "noADE" tweet id.
task_1a_tweet_df = pd.read_csv(task_1a_training_tweet_file, sep='\t', quoting=csv.QUOTE_NONE, header=None)

tweet_tokenizer = TweetTokenizer()

# Number of negative tweets that had to be truncated to 40 tokens.
max_l = 0

for i in range(len(task_1a_class_df)):
    if task_1a_class_df[1][i] != "noADE":
        continue

    negative_id = task_1a_class_df[0][i]
    # Same normalisation as the positive tweets and as the text sent to the
    # model at prediction time: lower-case, "&amp;" -> "&", '#' detached.
    negative_text = tweet_dict[negative_id].lower().replace("&amp;", "&").replace("#", "# ")
    tweet_tokens = tweet_tokenizer.tokenize(negative_text)
    tweet_labels = [0]*len(tweet_tokens)
    if len(tweet_tokens) > 40:
        tweet_tokens = tweet_tokens[0:40]
        tweet_labels = tweet_labels[0:40]
        max_l = max_l + 1

    # Append to all three columns together, only after the lookup succeeded,
    # so the parallel lists cannot get out of step.
    dataset_dict["tweet_id"].append(negative_id)
    dataset_dict["labels"].append(tweet_labels)
    dataset_dict["tokens"].append(tweet_tokens)


if max_l != 0:
    print(max_l, "tweets with more than 40 tokens")


In [None]:
# A tweet is negative when its label sequence sums to zero (no ADE token).
neg = sum(1 for example_labels in dataset_dict["labels"] if sum(example_labels) == 0)

print("Detected", neg, "negative examples.")

### Preprocessing validation dataset

- Read dataset
- Replace the HTML entity "&amp;" with "&"
- Group the ADE spans per tweet
- Tokenize the tweets
- Locate each ADE mention in the tweet and build a label array with the same length as the tokenized tweet
- The labels used were:
    - 0 - Not ADE mention
    - 1 - First token of ADE mention
    - 2 - Tokens inside of ADE mention

In [None]:
# Subtask 1b validation files on the mounted Drive (both are header-less TSVs).
validation_set_spans = "/content/drive/MyDrive/Dissertacao/Subtask_1b/validation_data/spans.tsv"

validation_set_tweets = "/content/drive/MyDrive/Dissertacao/Subtask_1b/validation_data/tweets.tsv"

# QUOTE_NONE: quote characters inside a tweet are read as ordinary text.
df_val_spans = pd.read_csv(validation_set_spans, sep='\t', quoting=csv.QUOTE_NONE, header=None)
df_val_tweets = pd.read_csv(validation_set_tweets, sep='\t', quoting=csv.QUOTE_NONE, header=None)

# Lookup table: tweet_id (column 0) -> tweet text (column 1, coerced to str).
val_id_column = df_val_tweets[0]
val_text_column = df_val_tweets[1]
val_tweet_dict = {
    val_id_column[row]: str(val_text_column[row])
    for row in range(len(df_val_tweets))
}

In [None]:
# Group the validation spans by tweet (this rebinds positive_tweets):
#   positive_tweets[tweet_id] = {"text": normalised tweet, "spans": [normalised span, ...]}
# "Normalised" means lower-cased with "&amp;" replaced by "&".
positive_tweets = {}

val_span_ids = df_val_spans[0]
val_span_texts = df_val_spans[4]

for row in range(len(df_val_spans)):
    tweet_id = val_span_ids[row]
    span_text = val_span_texts[row].lower().replace("&amp;", "&")

    entry = positive_tweets.get(tweet_id)
    if entry is None:
        # First span seen for this tweet: store its text alongside the span.
        positive_tweets[tweet_id] = {
            "text": val_tweet_dict[tweet_id].lower().replace("&amp;", "&"),
            "spans": [span_text],
        }
    else:
        entry["spans"].append(span_text)

In [None]:
def contains_sublist(lst, sublst):
    """Return every start index at which ``sublst`` occurs contiguously in ``lst``.

    Args:
        lst: List of tokens to search.
        sublst: List of tokens to look for.

    Returns:
        The start indices in ascending order (overlapping matches included).
        The result is empty when ``sublst`` is empty or longer than ``lst``.
    """
    n = len(sublst)
    # An empty pattern would otherwise "match" at every position, including
    # len(lst); callers use the indices to write labels[s], which would then
    # mark every token and finally raise IndexError.
    if n == 0:
        return []
    return [i for i in range(len(lst) - n + 1) if sublst == lst[i:i + n]]

In [None]:
# Build the token-level validation set from the positive (ADE) tweets.
# val_dataset_dict holds three parallel lists: one entry per tweet.
val_dataset_dict = {}
val_dataset_dict["tokens"] = []
val_dataset_dict["tweet_id"] = []
val_dataset_dict["labels"] = []

tweet_tokenizer = TweetTokenizer()

for id in positive_tweets.keys():
    # A space is inserted after every '#' before tokenizing, in the tweet and
    # in the spans alike, so both are tokenized from the same form.
    tweet_tokens = tweet_tokenizer.tokenize(positive_tweets[id]["text"].replace("#", "# "))
    # Label scheme: 0 = not ADE, 1 = first token of a mention, 2 = inside a mention.
    labels = [0]*len(tweet_tokens)

    for span in positive_tweets[id]["spans"]:
        span_tokens = tweet_tokenizer.tokenize(span.replace("#", "# "))

        # Span tokens do not line up with tweet tokens: try to split tweet
        # tokens that merely *contain* a span token so that they can match.
        if (contains_sublist(tweet_tokens, span_tokens)==[]):
            i = 0
            while i < len(tweet_tokens):
                word = tweet_tokens[i]
                # Span tokens that occur as substrings of this tweet token.
                sps = [s for s in span_tokens if s in word]
                # Nothing to do: no span token inside, or the token already
                # has the same length as the single span token it contains.
                if len(sps)==0 or (len(sps)==1 and len(sps[0])==len(word)):
                    i = i + 1
                    continue
                elif len(sps)==1 and len(sps[0])!=len(word):
                    # Split the token in two at the boundary of the span token:
                    # [span_token, rest] when it is a prefix, otherwise
                    # [prefix, span_token + rest].
                    l = []
                    index = word.find(sps[0])
                    if (index==0):
                        l = [word[0:len(sps[0])], word[len(sps[0]):len(word)]]
                    else:
                        l = [word[0:index], word[index:len(word)]]
                    tweet_tokens.insert(i, l[0])
                    tweet_tokens[i+1] = l[1]
                    # The new first piece gets label 0; the label previously at
                    # position i now belongs to the second piece.
                    labels.insert(i,0)
                    # Advance by one so the second piece is examined next and
                    # can be split again if needed.
                    i = i + 1
                else:
                    # Reached when two or more span tokens occur inside this
                    # one tweet token; the token is left unsplit.
                    print("Span found in more than one token")
                    i = i + 1

        # Label every occurrence of the span; later spans overwrite earlier
        # labels where they overlap.
        for s in contains_sublist(tweet_tokens, span_tokens):
            labels[s:s+len(span_tokens)] = [2]*len(span_tokens)
            labels[s] = 1

        # Report spans that could not be located even after splitting.
        if (contains_sublist(tweet_tokens, span_tokens)==[]):
            print("Span not detected in text.")
            print("Text -> ", positive_tweets[id]["text"])
            print("Tokenized text -> ", tweet_tokens)
            print("Tokenized Span -> ",span_tokens)

    val_dataset_dict["labels"].append(labels)
    val_dataset_dict["tweet_id"].append(id)
    val_dataset_dict["tokens"].append(tweet_tokens)

#### Add negative examples to the dataset (Optional)

In [None]:
# Append the Subtask 1a validation tweets classified "noADE" to the
# validation set as negative examples (every token labelled 0).
task_1a_validation_class_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/validation_data/class.tsv"
task_1a_validation_tweet_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/validation_data/tweets.tsv"

task_1a_val_class_df = pd.read_csv(task_1a_validation_class_file, sep='\t', header=None)
# NOTE(review): this frame is loaded but the texts below are taken from
# val_tweet_dict (built from the Subtask 1b tweet file) -- confirm that file
# contains every "noADE" tweet id.
task_1a_val_tweet_df = pd.read_csv(task_1a_validation_tweet_file, sep='\t', quoting=csv.QUOTE_NONE, header=None)

tweet_tokenizer = TweetTokenizer()

# Number of negative tweets that had to be truncated to 40 tokens.
max_l = 0

for i in range(len(task_1a_val_class_df)):
    if task_1a_val_class_df[1][i] != "noADE":
        continue

    negative_id = task_1a_val_class_df[0][i]
    # Same normalisation as the positive tweets and as the text sent to the
    # model at prediction time: lower-case, "&amp;" -> "&", '#' detached.
    negative_text = val_tweet_dict[negative_id].lower().replace("&amp;", "&").replace("#", "# ")
    tweet_tokens = tweet_tokenizer.tokenize(negative_text)
    tweet_labels = [0]*len(tweet_tokens)
    if len(tweet_tokens) > 40:
        tweet_tokens = tweet_tokens[0:40]
        tweet_labels = tweet_labels[0:40]
        max_l = max_l + 1

    # Append to all three columns together, only after the lookup succeeded,
    # so the parallel lists cannot get out of step.
    val_dataset_dict["tweet_id"].append(negative_id)
    val_dataset_dict["labels"].append(tweet_labels)
    val_dataset_dict["tokens"].append(tweet_tokens)


if max_l != 0:
    print(max_l, "tweets with more than 40 tokens")


In [None]:
# A tweet is negative when its label sequence sums to zero (no ADE token).
neg = sum(1 for example_labels in val_dataset_dict["labels"] if sum(example_labels) == 0)

print("Detected", neg, "negative examples.")

### Reference Dataset Processing

- Read dataset
- Group the ADE spans per tweet
- Tokenize the tweets
- Locate each ADE mention in the tweet and build a label array with the same length as the tokenized tweet
- The labels used were:
    - 0 - Not ADE mention
    - 1 - First token of ADE mention
    - 2 - Tokens inside of ADE mention

In [None]:
# Reference dataset on the mounted Drive. Unlike the challenge files it is
# read with its first row as the header; the cells below access the columns
# "tweet_id", "span" and "text" by name.
reference_set = "/content/drive/MyDrive/Dissertacao/IMI_WEBRADR_Reference_Dataset/T2_MOESM_dataset.tsv"

# QUOTE_NONE: quote characters inside a field are read as ordinary text.
df_reference = pd.read_csv(reference_set, sep='\t', quoting=csv.QUOTE_NONE)
# Alternative read with pandas' default quote handling:
#df_reference = pd.read_csv(reference_set, sep='\t')

def contains_sublist(lst, sublst):
    """Return every start index at which ``sublst`` occurs contiguously in ``lst``.

    Args:
        lst: List of tokens to search.
        sublst: List of tokens to look for.

    Returns:
        The start indices in ascending order (overlapping matches included).
        The result is empty when ``sublst`` is empty or longer than ``lst``.
    """
    n = len(sublst)
    # An empty pattern would otherwise "match" at every position, including
    # len(lst); callers use the indices to write labels[s], which would then
    # mark every token and finally raise IndexError.
    if n == 0:
        return []
    return [i for i in range(len(lst) - n + 1) if sublst == lst[i:i + n]]

In [None]:
# Group the reference-dataset spans by tweet:
#   positive_tweets_ref[tweet_id] = {"text": lower-cased tweet, "spans": [lower-cased span, ...]}
positive_tweets_ref = {}

ref_id_column = df_reference["tweet_id"]
ref_span_column = df_reference["span"]
ref_text_column = df_reference["text"]

for row in range(len(df_reference)):
    tweet_id = ref_id_column[row]
    span_text = ref_span_column[row].lower()

    entry = positive_tweets_ref.get(tweet_id)
    if entry is None:
        # First row for this tweet: its text is taken from this same row.
        positive_tweets_ref[tweet_id] = {
            "text": ref_text_column[row].lower(),
            "spans": [span_text],
        }
    else:
        entry["spans"].append(span_text)


In [None]:
# Build the token-level examples for the reference dataset.
# reference_dataset_dict holds three parallel lists: one entry per tweet.
reference_dataset_dict = {}
reference_dataset_dict["tokens"] = []
reference_dataset_dict["tweet_id"] = []
reference_dataset_dict["labels"] = []

tweet_tokenizer = TweetTokenizer()

for ref_id in positive_tweets_ref.keys():
    # A space is inserted after every '#' before tokenizing, in the tweet and
    # in the spans alike, so both are tokenized from the same form.
    tweet_tokens = tweet_tokenizer.tokenize(positive_tweets_ref[ref_id]["text"].replace("#", "# "))
    # Label scheme: 0 = not ADE, 1 = first token of a mention, 2 = inside a mention.
    labels = [0]*len(tweet_tokens)

    for span in positive_tweets_ref[ref_id]["spans"]:
        span_tokens = tweet_tokenizer.tokenize(span.replace("#", "# "))

        # Span tokens do not line up with tweet tokens: split tweet tokens
        # that merely *contain* a span token so that they can match.
        if not contains_sublist(tweet_tokens, span_tokens):
            i = 0
            while i < len(tweet_tokens):
                word = tweet_tokens[i]
                # Distinct span tokens found inside this tweet token, ignoring
                # single characters and tokens contained in a longer candidate.
                candidates = {s for s in span_tokens if s in word}
                sps = [
                    s for s in candidates
                    if len(s) > 1 and not any(s2 != s and s in s2 for s2 in candidates)
                ]

                if len(sps) == 1 and len(sps[0]) != len(word):
                    # Split in two at the boundary of the span token:
                    # [span_token, rest] when it is a prefix, otherwise
                    # [prefix, span_token + rest].
                    piece = sps[0]
                    index = word.find(piece)
                    cut = len(piece) if index == 0 else index
                    tweet_tokens[i:i + 1] = [word[:cut], word[cut:]]
                    # The new first piece gets label 0; the label previously
                    # at position i now belongs to the second piece.
                    labels.insert(i, 0)
                elif len(sps) > 1:
                    # Two or more span tokens inside one tweet token: left unsplit.
                    print("Span found in more than one token")

                # Advance exactly one position in every case. After a split
                # this lands on the second piece, so "prefix+span+suffix" is
                # split a second time; the former extra increment skipped it.
                i = i + 1

        # Retry without surrounding quote tokens. The length guard keeps the
        # stripped span non-empty and avoids indexing an empty token list.
        if (not contains_sublist(tweet_tokens, span_tokens)
                and len(span_tokens) > 2
                and span_tokens[0] == '"' and span_tokens[-1] == '"'):
            span_tokens = span_tokens[1:-1]

        # Still no match: break the first ".." token into two "." tokens.
        if (not contains_sublist(tweet_tokens, span_tokens) and ".." in tweet_tokens):
            index = tweet_tokens.index("..")
            tweet_tokens[index] = "."
            tweet_tokens.insert(index, ".")
            labels.insert(index, 0)

        # Label every occurrence of the span; later spans overwrite earlier
        # labels where they overlap.
        for s in contains_sublist(tweet_tokens, span_tokens):
            labels[s:s+len(span_tokens)] = [2]*len(span_tokens)
            labels[s] = 1

        # Report spans that could not be located even after the repairs above.
        if not contains_sublist(tweet_tokens, span_tokens):
            print("Span not detected in text.")
            print("ID -> ", ref_id)
            print("Text -> ", positive_tweets_ref[ref_id]["text"])
            print("Tokenized text -> ", tweet_tokens)
            print("Span -> ", span)
            print("Tokenized Span -> ",span_tokens)

    reference_dataset_dict["labels"].append(labels)
    # Ids are stored as strings here.
    reference_dataset_dict["tweet_id"].append(str(ref_id))
    reference_dataset_dict["tokens"].append(tweet_tokens)

---------------------------------------------------

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split tweets and project word-level labels onto sub-tokens.

    Uses the module-level ``tokenizer``. For every example, the first
    sub-token of each word receives that word's label; special tokens and the
    remaining sub-tokens of a word receive -100.

    Args:
        examples: Batch with a "tokens" column (lists of words) and a
            "labels" column (one label per word).

    Returns:
        The tokenizer output with an added "labels" entry holding the aligned
        label lists.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        # word_ids maps each sub-token to the index of its source word
        # (None for special tokens).
        word_ids = encoded.word_ids(batch_index=row)
        row_labels = []
        for pos, word_idx in enumerate(word_ids):
            starts_word = word_idx is not None and (pos == 0 or word_idx != word_ids[pos - 1])
            row_labels.append(word_labels[word_idx] if starts_word else -100)
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Number of examples in the training set.
len(dataset_dict["labels"])

In [None]:
# Number of examples in the validation set.
len(val_dataset_dict["labels"])

In [None]:
# Number of examples in the reference set.
len(reference_dataset_dict["labels"])

#### Add validation dataset examples to training dataset

In [None]:
# Extend every column of the training set with the validation examples.
for column in ("tokens", "tweet_id", "labels"):
    dataset_dict[column] = dataset_dict[column] + val_dataset_dict[column]

#### Add reference dataset examples to training dataset

In [None]:
# Extend every column of the training set with the reference examples.
for column in ("tokens", "tweet_id", "labels"):
    dataset_dict[column] = dataset_dict[column] + reference_dataset_dict[column]

#### Random oversampler

In [None]:
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from collections import Counter

# Tweet-level class used for resampling: 1 = at least one ADE token, 0 = none.
label_list = [1 if sum(l)!=0 else 0 for l in dataset_dict["labels"]]
# The sampler only sees example positions; the examples themselves are
# gathered from dataset_dict afterwards.
x_list = list(range(len(label_list)))

# NOTE(review): a float sampling_strategy is the target minority/majority
# ratio after resampling -- confirm against the imbalanced-learn docs.
oversampler = RandomOverSampler(sampling_strategy=0.3)

X_oversampled, y_oversampled = oversampler.fit_resample(np.asarray(x_list).reshape(-1,1), label_list)

# Gather the selected examples in one pass per column (the previous
# `lst = lst + [item]` rebuild copied the whole list on every iteration).
selected_positions = [int(row[0]) for row in X_oversampled]
new_dataset_dict = {"tokens":[], "tweet_id":[], "labels":[]}
for column in new_dataset_dict:
    source_column = dataset_dict[column]
    new_dataset_dict[column] = [source_column[pos] for pos in selected_positions]

dataset_dict = new_dataset_dict


#### Random undersampler

In [None]:
# Tweet-level class per example (1 = contains an ADE token, 0 = does not),
# followed by the count of each class.
label_list = [int(sum(example_labels) != 0) for example_labels in dataset_dict["labels"]]
Counter(label_list)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from collections import Counter

# Tweet-level class used for resampling: 1 = at least one ADE token, 0 = none.
label_list = [1 if sum(l)!=0 else 0 for l in dataset_dict["labels"]]
# The sampler only sees example positions; the examples themselves are
# gathered from dataset_dict afterwards.
x_list = list(range(len(label_list)))

# NOTE(review): a float sampling_strategy is the target minority/majority
# ratio after resampling -- confirm against the imbalanced-learn docs.
undersampler = RandomUnderSampler(sampling_strategy=0.1)

X_undersampled, y_undersampled = undersampler.fit_resample(np.asarray(x_list).reshape(-1,1), label_list)

# Gather the selected examples in one pass per column (the previous
# `lst = lst + [item]` rebuild copied the whole list on every iteration).
selected_positions = [int(row[0]) for row in X_undersampled]
new_dataset_dict = {"tokens":[], "tweet_id":[], "labels":[]}
for column in new_dataset_dict:
    source_column = dataset_dict[column]
    new_dataset_dict[column] = [source_column[pos] for pos in selected_positions]

dataset_dict = new_dataset_dict


## Tensorflow

- Tokenize the datasets with the tokenizer that belongs to the selected model
- Define hyperparameters
- Define performance measures to calculate in testing the models with each epoch
- Train the model
- Save the model and the tokenizer to the Drive

In [None]:
# If using bertweet-base
!pip3 install emoji==0.6.0

In [None]:
# Wrap the three column dictionaries in `datasets.Dataset` objects.
dataset = Dataset.from_dict(dataset_dict)

validation_dataset = Dataset.from_dict(val_dataset_dict)

reference_dataset = Dataset.from_dict(reference_dataset_dict)

# CSV file that compute_metrics appends one row to per evaluation.
output_log_file = "./log.csv"

# Tag names indexed by label id (0, 1, 2); used by compute_metrics.
# Note: this rebinds the `label_list` name used by the resampling cells.
label_list = ['O', 'B-ADE', 'I-ADE']

# Checkpoint to fine-tune; uncomment exactly one.
#model_checkpoint = "bert-base-uncased"
#model_checkpoint = "bert-large-uncased"
#model_checkpoint = "roberta-base"
model_checkpoint = "roberta-large"
#model_checkpoint = "vinai/bertweet-base" # Raises an error because of the tokenize_and_align_labels function
#model_checkpoint = "vinai/bertweet-large"


# add_prefix_space=True is for the RoBERTa-related models (RoBERTa and BERTweet checkpoints)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)


# Sub-word tokenization plus label alignment for each dataset.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

tokenized_validation_dataset = validation_dataset.map(tokenize_and_align_labels, batched=True)

tokenized_reference_dataset = reference_dataset.map(tokenize_and_align_labels, batched=True)

# Collator passed to to_tf_dataset in the training cell; returns TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Training hyperparameters.
batch_size = 32
# Epoch count used for the learning-rate schedule length (num_train_steps).
num_train_epochs = 3
# Epoch count passed to model.fit and written to the log; keep it equal to
# num_train_epochs so the schedule spans the whole run.
num_epochs = 3
# Full batches per epoch times epochs.
num_train_steps = (len(tokenized_dataset) // batch_size) * num_train_epochs
init_lr = 2e-5
weight_decay_rate = 0.01
num_warmup_steps = 0

# seqeval metric used by compute_metrics.
# NOTE(review): `datasets.load_metric` is not available in recent `datasets`
# releases (moved to the `evaluate` package) -- confirm the installed version.
metric = datasets.load_metric("seqeval")

def compute_metrics(p):
    """Score predictions with seqeval and append the result to the log file.

    Args:
        p: Pair ``(predictions, labels)``; predictions are per-token class
            scores (argmax is taken over axis 2), labels are the aligned
            label ids with -100 marking positions to ignore.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".

    Side effects:
        Appends one row to ``output_log_file`` with the run settings read
        from module-level variables and the precision, recall and F1 values.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions with a real label and map ids to tag names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    res = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

    log_row = [
        model_checkpoint, batch_size, init_lr, num_warmup_steps, fold, num_epochs,
        res["precision"], res["recall"], res["f1"],
    ]
    with open(output_log_file, 'a') as log:
        csv.writer(log).writerow(log_row)
    return res


In [None]:
print()
# Value written to the "Fold" column of the log by compute_metrics.
# NOTE(review): -1 presumably marks a run without cross-validation folds.
fold = -1

encoded_training_dataset = tokenized_dataset
encoded_validation_dataset = tokenized_validation_dataset

# Use the configured batch_size rather than a literal, so the batches match
# the value that drives num_train_steps and is written to the log.
tf_train_set = encoded_training_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
tf_validation_set = encoded_validation_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Runs compute_metrics on the validation set during training.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=tf_validation_set
)

# Three output classes: O, B-ADE, I-ADE.
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=3)

callbacks = [metric_callback]

optimizer, lr_schedule = create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps,
)

# No loss is passed here.
# NOTE(review): presumably relies on the model's built-in loss -- confirm.
model.compile(optimizer=optimizer)

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

# Separator row marking the end of this run in the log.
with open(output_log_file, 'a') as log:
    csv_writer = csv.writer(log)
    csv_writer.writerow(['###', '###', '###', '###', '###', '###', '###', '###', '###'])

In [None]:
# Save the fine-tuned model to Drive (the directory name refers to roberta-large).
model.save_pretrained("/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/results/final_model_roberta_large")


In [None]:
# Save the matching tokenizer next to the model.
tokenizer.save_pretrained("/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/results/final_tokenizer_roberta_large/")

### Predict validation set spans

- Load model and tokenizer from the Drive
- Initialize pipeline
- Generate predictions for the validation set
- Write predictions to a file

In [None]:
# Optional: load a previously saved model instead of the one trained above.
# NOTE(review): this reads "model_02", not the "final_model_roberta_large"
# directory written by the save cell -- confirm this is the intended checkpoint.
model = TFAutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/results/model_02", num_labels=3)

In [None]:
# Tokenizer saved together with "model_02"; add_prefix_space=True as in training.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/results/tokenizer_02/", add_prefix_space=True)

In [None]:
from transformers import TokenClassificationPipeline

# Token-classification pipeline over the loaded model and tokenizer. The cells
# below read the "entity", "start" and "end" fields of each returned token.
# Alternative construction with top_k=1:
#pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer, top_k=1)
pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer)


In [None]:
# tweet_id -> normalised text (lower-cased, "&amp;" replaced by "&") for every
# validation tweet that has at least one annotated span.
val_positive_tweets = {}

val_span_tweet_ids = df_val_spans[0]

for row in range(len(df_val_spans)):
    tweet_id = val_span_tweet_ids[row]
    if tweet_id in val_positive_tweets:
        # Later spans of an already-seen tweet add nothing here.
        continue
    val_positive_tweets[tweet_id] = val_tweet_dict[tweet_id].lower().replace("&amp;", "&")

In [None]:
# Run the pipeline on every positive validation tweet.
# l collects (tweet_id, pipeline output) pairs for the results writer below.
l = []

for i, tweet_key in enumerate(val_positive_tweets):
    # Progress indicator every 100 tweets.
    if i and i % 100 == 0:
        print(i)
    l.append((tweet_key, pipe(val_positive_tweets[tweet_key])))

In [None]:
#validation_df

results_file = "/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/challenge_validation_results.tsv"

# Turn the per-token predictions into character spans and write one TSV row
# per span: tweet_id, "ADE", start, end, span text.
# newline='' is what the csv module documentation asks for on files handed to
# csv.writer.
with open(results_file, 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for tweet_id, tokens in l:
        # The pipeline received the tweet with "&amp;" replaced by "&", so the
        # predicted offsets refer to that text. The same replacement is applied
        # here (original casing kept) so the slice matches the offsets.
        # NOTE(review): start/end are therefore relative to the replaced text,
        # not to the raw tweet -- confirm what the evaluation expects.
        source_text = val_tweet_dict[tweet_id].replace("&amp;", "&")

        spans = []
        open_span = None  # [start, end] of the mention being assembled
        for token in tokens:
            tag = token["entity"]
            if tag == 'LABEL_1':
                # Beginning of a mention: close the previous one, if any.
                if open_span is not None:
                    spans.append(open_span)
                open_span = [token["start"], token["end"]]
            elif tag == 'LABEL_2':
                # Continuation: extend the open mention; ignored when none is open.
                if open_span is not None:
                    open_span[1] = token["end"]
            elif tag == 'LABEL_0' and open_span is not None:
                # Outside token ends the open mention.
                spans.append(open_span)
                open_span = None
        if open_span is not None:
            spans.append(open_span)

        for start, end in spans:
            tsv_writer.writerow([tweet_id, "ADE", start, end, source_text[start:end]])



### Predict and write test set results

- Import results from test set obtained from model used in task 1a
- Load model
- Create pipe from model
- Predict spans for positive labeled tweets
- Write results

In [None]:
# Unannotated test tweets and the Subtask 1a predictions made for them.
test_set = "/content/drive/MyDrive/Dissertacao/Subtask_1a/test_data/test_tweets_unannotated.tsv"
test_result_file = "/content/drive/MyDrive/Dissertacao/Subtask_1a/outputs/challenge_test_results.tsv"

test_df = pd.read_csv(test_set, sep='\t', quoting=csv.QUOTE_NONE, header=None)

# tweet_id (column 0) -> normalised text (column 1 lower-cased, "&amp;" -> "&").
test_id_column = test_df[0]
test_text_column = test_df[1]
test_tweet_dict = {
    test_id_column[row]: test_text_column[row].lower().replace("&amp;", "&")
    for row in range(len(test_df))
}

# Column 0 = tweet id, column 1 = predicted class ("ADE" / "noADE").
test_result_df = pd.read_csv(test_result_file, sep='\t', header=None)

In [None]:
# Class distribution of the Subtask 1a predictions on the test set.
predicted_ade_rows = test_result_df[test_result_df[1]=="ADE"]
predicted_no_ade_rows = test_result_df[test_result_df[1]=="noADE"]

print("There are ", len(predicted_ade_rows) , "positive examples (ADE) in this dataset.")
print("There are ", len(predicted_no_ade_rows), "negative examples (NoADE) in this dataset.")

In [None]:
# Optional: reload the model and tokenizer written by the save cells above
# (same "final_*_roberta_large" directories).
model = TFAutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/results/final_model_roberta_large", num_labels=3)

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/results/final_tokenizer_roberta_large/", add_prefix_space=True)

In [None]:
from transformers import TokenClassificationPipeline

# Token-classification pipeline over the loaded model and tokenizer. The cells
# below read the "entity", "start" and "end" fields of each returned token.
# Alternative construction with top_k=1:
#pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer, top_k=1)
pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer)


In [None]:
# Run the pipeline on every test tweet that Subtask 1a classified as "ADE".
# l collects (tweet_id, pipeline output) pairs for the results writer below.
l = []

# Number of tweets processed so far (drives the progress print).
i = 0

predicted_id_column = test_result_df[0]
predicted_class_column = test_result_df[1]

for index in range(len(test_result_df)):
    if predicted_class_column[index] != "ADE":
        continue
    id = predicted_id_column[index]
    tweet = test_tweet_dict[id]
    if i != 0 and i % 100 == 0:
        print(i)
    l.append((id, pipe(tweet)))
    i += 1

In [None]:
# Inspect the first (tweet_id, pipeline output) pair.
l[0]

In [None]:

results_file = "/content/drive/MyDrive/Dissertacao/Subtask_1b/outputs/final_test_results.tsv"

# Turn the per-token predictions into character spans and write one TSV row
# per span: tweet_id, "ADE", start, end, span text.
with open(results_file, 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for tweet_id, tokens in l:
        spans = []
        open_span = None  # [start, end] of the mention being assembled
        for token in tokens:
            tag = token["entity"]
            if tag == 'LABEL_1':
                # Beginning of a mention: close the previous one, if any.
                if open_span is not None:
                    spans.append(open_span)
                open_span = [token["start"], token["end"]]
            elif tag == 'LABEL_2':
                # Continuation: extend the open mention; ignored when none is open.
                if open_span is not None:
                    open_span[1] = token["end"]
            elif tag == 'LABEL_0' and open_span is not None:
                # Outside token ends the open mention.
                spans.append(open_span)
                open_span = None
        if open_span is not None:
            spans.append(open_span)

        # test_tweet_dict holds the same normalised text that was given to
        # the pipeline, so the offsets index directly into it.
        for start, end in spans:
            tsv_writer.writerow([tweet_id, "ADE", start, end, test_tweet_dict[tweet_id][start:end]])



## Reset Log file

In [None]:
# Reset the log: truncate ./log.csv and write only the header row. The column
# order matches the rows appended by compute_metrics.
output_log_file = "./log.csv"
log_header = ['Model', 'Batch_size', 'Init_lr', 'Warmup_steps', 'Fold', 'Epochs', 'Precision', 'Recall', 'F1-score' ]
with open(output_log_file, 'w') as log:
    csv.writer(log).writerow(log_header)