### Train and test BERT model
#### Code borrowed from NLP Lab 8 (Prud'hommeaux)

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the files
# under `path` (defined in a later cell) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Hugging Face `datasets` and `transformers` packages imported by
# the cells below (IPython shell magics; versions are not pinned).
!pip install datasets
!pip install transformers

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
# Drive folder holding the input CSVs and receiving the files written below.
# The trailing slash matters: file names are appended with plain `+`.
path = '/content/drive/MyDrive/nlp/'

In [None]:
# Read the comma-separated row numbers that were selected for the cluster
# test set. Only the first line of the file is used.

with open(path + 'test_cluster_lines', 'r') as clusterlines:
    first_line = clusterlines.readline()

# An empty list here would mean no rows get excluded from the training data,
# i.e. the cluster test set would leak into training. Fail loudly instead.
if not first_line.strip():
    raise ValueError("test_cluster_lines is empty; no cluster test rows to exclude")

# Skip blank tokens rather than unconditionally discarding the last one, so the
# final row number is kept whether or not the list ends with a trailing comma
# (or newline).
test_cluster_lines = [int(tok) for tok in first_line.split(',') if tok.strip()]

drop_lines = test_cluster_lines

In [None]:
# Build the training CSV: load the full sample, remove the rows reserved for
# the cluster test set (matched by row label), then discard the "index" column.

train_raw = (
    pd.read_csv(path + 'sample_subtitles_data.csv')
    .drop(index=drop_lines)
    .drop(columns="index")
)
train_raw.to_csv(path + 'train.csv', index=None)

# Quick sanity check of what was written
print(train_raw.columns)
print(len(train_raw))

In [None]:
# Convert decade labels to integer codes, using ONE mapping for both files.
#
# Encoding each file separately with `astype('category').cat.codes` numbers the
# labels by the values present in that file only, so a label missing from
# either file would shift the codes and make train and test labels disagree.

train_df = pd.read_csv(path + 'train.csv')
cluster_df = pd.read_csv(path + 'test_cluster.csv')

# Sorted union of the labels seen in either file; a label's position in this
# list is its integer code. Missing labels (NaN) are excluded and encode as -1,
# which is also what `.cat.codes` produces.
label_categories = sorted(
    set(train_df['label'].dropna()) | set(cluster_df['label'].dropna())
)

train_df['label'] = pd.Categorical(train_df['label'], categories=label_categories).codes
cluster_df['label'] = pd.Categorical(cluster_df['label'], categories=label_categories).codes

# Both files are overwritten in place with the encoded labels
train_df.to_csv(path + 'train.csv', index=None)
cluster_df.to_csv(path + 'test_cluster.csv', index=None)

In [None]:
# Uses DistilBERT (uncased). MODEL_NAME is shared by the tokenizer loaded here
# and the sequence-classification model loaded in a later cell.

MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Preprocessing function (tokenizes and truncates lines in dataset; padding is
# left to the data collator)

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Over-long sequences are truncated to the tokenizer's maximum length. No
    padding is applied here: ``Dataset.map`` calls this once per example in
    this notebook, where "pad to the longest in the batch" has nothing to pad
    against, and ``DataCollatorWithPadding`` already pads every batch
    dynamically when batches are assembled.

    Args:
        examples: A dataset row (or batch of rows) exposing a ``"text"`` entry.

    Returns:
        The tokenizer output for that text.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Load the train and cluster-test CSVs as Hugging Face datasets. The locations
# are derived from `path` so they cannot drift from the files written earlier.

dataset = load_dataset(
    'csv',
    data_files={
        'train': path + 'train.csv',
        'test_cluster': path + 'test_cluster.csv',
    },
)


In [None]:
# 80/20 train/test split (per the original note: 80% train, 10% random test,
# 10% cluster test). The 500,000-row cut-off is hard-coded, not computed from
# the dataset size.

train_test = dataset["train"].shuffle(seed=42)

# After the seeded shuffle, rows from 500,000 onward form the training set ...
train_dataset = train_test.select(range(500000, len(train_test)))
# ... and the first 500,000 rows form the random test set.
test_random_dataset = train_test.select(range(500000))

# The pre-selected cluster test set is shuffled with the same seed
test_cluster_dataset = dataset["test_cluster"].shuffle(seed=42)

In [None]:
# Discard rows whose "text" field is falsy (missing/None or an empty string)

def _has_text(row):
    """Return True when the row carries a non-empty "text" value."""
    return bool(row["text"])

train_dataset = train_dataset.filter(_has_text)
test_cluster_dataset = test_cluster_dataset.filter(_has_text)
test_random_dataset = test_random_dataset.filter(_has_text)

In [None]:
### CUT TRAIN AND TEST DATA TO SMALLER SAMPLES
# Keep the first 5,000 rows of each test set and the first 50,000 training
# rows. The datasets were shuffled with a fixed seed earlier, so these
# prefixes serve as random samples.

test_random_dataset = test_random_dataset.select(range(5000))
test_cluster_dataset = test_cluster_dataset.select(range(5000))
train_dataset = train_dataset.select(range(50000))

In [None]:
# Sanity check: number of rows currently in the training set
print(len(train_dataset))

In [None]:
# Tokenize all three splits with preprocess_function. map() is called without
# batched=True, so the function receives one example per call.

train, test_cluster, test_random = (
    split.map(preprocess_function)
    for split in (train_dataset, test_cluster_dataset, test_random_dataset)
)


In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Collator that pads the examples of each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Pretrained checkpoint with a 9-class sequence-classification head.
# NOTE(review): num_labels=9 must equal the number of distinct label codes in
# the CSVs -- confirm against the data.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=9)

In [None]:
# Define function to report performance metrics with scikit-learn
# (load_metric() was not used because the data is multi-class)

import numpy as np
try:
    # Nothing in this cell uses load_metric; the import is kept only so the
    # name stays available where it exists. It was deprecated in favour of the
    # separate `evaluate` package and is missing from recent `datasets`
    # releases, so a failed import must not break the cell.
    from datasets import load_metric
except ImportError:
    load_metric = None
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.metrics import classification_report


def compute_metrics(eval_pred, average='micro'):
    """Compute evaluation metrics for the Trainer and print a per-class report.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by ``Trainer``; the
            predicted class is the argmax of ``logits`` over the last axis.
        average: Averaging mode forwarded to scikit-learn's precision, recall
            and F1 scorers. Defaults to ``'micro'``. With single-label
            multi-class data, micro-averaged precision, recall and F1 all
            equal accuracy; pass e.g. ``'macro'`` (via ``functools.partial``)
            for scores that weight every class equally.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.

    Side effects:
        Prints scikit-learn's ``classification_report`` for the predictions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Per-class breakdown; the aggregate numbers are returned below
    print(classification_report(labels, predictions))

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average=average),
        "recall": recall_score(labels, predictions, average=average),
        "f1": f1_score(labels, predictions, average=average),
    }

In [None]:
# Define training hyperparameters and assemble the Trainer

training_args = TrainingArguments(
    # Trainer output is written under the Drive folder
    output_dir= path + "results",
    # Schedule
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
)

# test_random is the default evaluation set; evaluate() can be given another
# dataset explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test_random,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train (fine-tune) the model on the `train` dataset passed to the Trainer

trainer.train()

In [None]:
# Test model on random test set (compute_metrics also prints a per-class
# classification report)

trainer.evaluate(eval_dataset=test_random)

In [None]:
# Test model on cluster test set (the rows that were excluded from the
# training CSV by line number)

trainer.evaluate(eval_dataset=test_cluster)