In [None]:
# Mount Google Drive at /content/drive so the CSV inputs and the prediction
# outputs stored under MyDrive are reachable from this Colab runtime.
# NOTE(review): later cells use paths relative to the working directory
# ('drive/MyDrive/...'); presumably this relies on the cwd being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

# load model, tokenizer
!pip install transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import BertTokenizer, BertForSequenceClassification

# Base checkpoint name shared by the tokenizer and the classifier below.
model_name = "bert-base-uncased"

# Tokenizer used to encode every test set further down.
tokenizer = BertTokenizer.from_pretrained(model_name)

# Two-label sequence classifier initialised from the same base checkpoint.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
import torch


In [None]:
# Read the four preprocessed test sets (comma-separated, header in the first row).

# In-domain Twitter test set
test_in_preprocessed_path = 'drive/MyDrive/NLP_Final_Assignment/data/test_IN_preprocessed.csv'
test_in = pd.read_csv(test_in_preprocessed_path)

# Out-of-domain test set 1
test_out1_preprocessed_path = 'drive/MyDrive/NLP_Final_Assignment/data/test_OUT1_preprocessed.csv'
test_out1 = pd.read_csv(test_out1_preprocessed_path)

# Out-of-domain test set 2
test_out2_preprocessed_path = 'drive/MyDrive/NLP_Final_Assignment/data/test_OUT2_preprocessed.csv'
test_out2 = pd.read_csv(test_out2_preprocessed_path)

# TextGain test set
test_textgain_preprocessed_path = 'drive/MyDrive/NLP_Final_Assignment/data/test_TEXTGAIN_preprocessed.csv'
test_textgain = pd.read_csv(test_textgain_preprocessed_path)

In [None]:
# For every test set: keep only the id and the cleaned tweet, index the frame
# by id, and expose the cleaned tweet under the column name 'text'.
test_in = (
    test_in[['id', 'tweet_clean']]
    .set_index('id')
    .rename(columns={'tweet_clean': 'text'})
)

test_out1 = (
    test_out1[['id', 'tweet_clean']]
    .set_index('id')
    .rename(columns={'tweet_clean': 'text'})
)

test_out2 = (
    test_out2[['id', 'tweet_clean']]
    .set_index('id')
    .rename(columns={'tweet_clean': 'text'})
)

test_textgain = (
    test_textgain[['id', 'tweet_clean']]
    .set_index('id')
    .rename(columns={'tweet_clean': 'text'})
)

In [None]:
# Preprocess data
# Turn each 'text' column into a plain list of Python strings.
# A tweet that is empty after cleaning is read back from CSV as NaN (a float),
# which is not valid tokenizer input. Such rows are mapped to '' instead of
# being dropped, so every id still receives exactly one prediction.
test_in_test = test_in['text'].fillna('').astype(str).tolist()
test_out1_test = test_out1['text'].fillna('').astype(str).tolist()
test_out2_test = test_out2['text'].fillna('').astype(str).tolist()
test_textgain_test = test_textgain['text'].fillna('').astype(str).tolist()

# Encode each set; padding=True pads to the longest sequence in that set and
# truncation=True cuts sequences that exceed the tokenizer's maximum length.
test_in_test_tokenized = tokenizer(test_in_test, padding=True, truncation=True)
test_out1_test_tokenized = tokenizer(test_out1_test, padding=True, truncation=True)
test_out2_test_tokenized = tokenizer(test_out2_test, padding=True, truncation=True)
test_textgain_test_tokenized = tokenizer(test_textgain_test, padding=True, truncation=True)

In [None]:
# Number of texts per test set, in the order: in-domain, out1, out2, textgain
for texts in (test_in_test, test_out1_test, test_out2_test, test_textgain_test):
    print(len(texts))

In [None]:
# Number of input ids in the first encoded in-domain example
len(test_in_test_tokenized['input_ids'][0])

In [None]:
# Show the first in-domain text that was passed to the tokenizer
test_in_test[0]

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Minimal torch dataset over a tokenizer output, consumable by `Trainer`.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to one entry
            per example, as returned by calling the tokenizer on a list of texts.
        labels: Optional sequence with one label per example. Left as ``None``
            for inference, in which case no ``labels`` field is produced.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per encoded field for the requested example.
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


# Wrap every tokenized test set (no labels: these sets are only predicted on).
test_in_dataset = Dataset(test_in_test_tokenized)
test_out1_dataset = Dataset(test_out1_test_tokenized)
test_out2_dataset = Dataset(test_out2_test_tokenized)
test_textgain_dataset = Dataset(test_textgain_test_tokenized)

In [None]:
# Same check through the dataset wrapper: number of input ids in its first example
len(test_in_dataset.encodings['input_ids'][0])

In [None]:
# Load trained model
# Rebinds `model`, replacing the base classifier created in the first cell
# with the weights stored in the checkpoint directory below.
# NOTE(review): the path is relative to the working directory; presumably the
# checkpoint comes from a separate training run — confirm it exists in this runtime.
model_path = "bertmodel/checkpoint-1000"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

**In-domain Twitter Data**

In [None]:
# Define test trainer
# Build the arguments here so the cell runs on a fresh kernel instead of
# depending on an `args` object left over from a training session.
# Only an output directory is set; everything else keeps the library defaults.
test_args = TrainingArguments(output_dir="tmp_trainer")
test_trainer = Trainer(model=model, args=test_args)

# Make prediction - in domain dataset
# `predict` returns (predictions, label_ids, metrics); only the raw scores are used.
test_in_raw_pred, _, _ = test_trainer.predict(test_in_dataset)

# Preprocess raw predictions: index of the highest-scoring class per example
test_in_y_pred = np.argmax(test_in_raw_pred, axis=1)

In [None]:
# Attach the predicted class to each tweet id and convert it to an OFF/NOT label.

# Expose `id` as a regular column. Guarded so that re-running the cell does not
# push the positional index into a stray `index` column.
if 'id' not in test_in.columns:
    test_in = test_in.reset_index()

# Flatten to 1-D so values are assigned by position rather than aligned on an
# index, and fail loudly if predictions and rows ever get out of step.
test_in_y_pred = np.asarray(test_in_y_pred).ravel()
assert len(test_in_y_pred) == len(test_in), "expected exactly one prediction per test row"
test_in['test_in_y_pred'] = test_in_y_pred

# Plain column selection: masking a frame with `frame[cols].notnull()` is
# `DataFrame.where`, which never removes rows, so no row filtering is implied here.
test_in_neural = test_in[['id', 'test_in_y_pred']].copy()
test_in_neural['test_in_y_pred'] = test_in_neural['test_in_y_pred'].map({1: 'OFF', 0: 'NOT'})
test_in_neural = test_in_neural.rename(columns={'test_in_y_pred': 'label'})
test_in_neural.head()

In [None]:
# Distribution of predicted labels on the in-domain set
test_in_neural['label'].value_counts()

In [None]:
# Save the in-domain predictions (id, label) as UTF-8 CSV, without the row index
test_in_neural_path = 'drive/MyDrive/NLP_Final_Assignment/predictions/test_in_neural.csv'
test_in_neural.to_csv(path_or_buf=test_in_neural_path, index=False, encoding='utf-8')

**Out-of-domain Data No. 1**

In [None]:
# Run the trainer over the first out-of-domain set; only the raw scores are kept
test_out1_raw_pred, _, _ = test_trainer.predict(test_dataset=test_out1_dataset)

# Index of the highest-scoring class per example
test_out1_y_pred = test_out1_raw_pred.argmax(axis=1)

In [None]:
# Attach the predicted class to each id and convert it to an OFF/NOT label.

# Expose `id` as a regular column. Guarded so that re-running the cell does not
# push the positional index into a stray `index` column.
if 'id' not in test_out1.columns:
    test_out1 = test_out1.reset_index()

# Flatten to 1-D so values are assigned by position rather than aligned on an
# index, and fail loudly if predictions and rows ever get out of step.
test_out1_y_pred = np.asarray(test_out1_y_pred).ravel()
assert len(test_out1_y_pred) == len(test_out1), "expected exactly one prediction per test row"
test_out1['test_out1_y_pred'] = test_out1_y_pred

# Plain column selection: masking a frame with `frame[cols].notnull()` is
# `DataFrame.where`, which never removes rows, so no row filtering is implied here.
test_out1_neural = test_out1[['id', 'test_out1_y_pred']].copy()
test_out1_neural['test_out1_y_pred'] = test_out1_neural['test_out1_y_pred'].map({1: 'OFF', 0: 'NOT'})
test_out1_neural = test_out1_neural.rename(columns={'test_out1_y_pred': 'label'})
test_out1_neural.head()

In [None]:
# Distribution of predicted labels on out-of-domain set 1
test_out1_neural['label'].value_counts()

In [None]:
# Save the out1 predictions (id, label) as UTF-8 CSV, without the row index
test_out1_neural_path = 'drive/MyDrive/NLP_Final_Assignment/predictions/test_out1_neural.csv'
test_out1_neural.to_csv(path_or_buf=test_out1_neural_path, index=False, encoding='utf-8')

**Out-of-domain Data No. 2**

In [None]:
# Run the trainer over the second out-of-domain set; only the raw scores are kept
test_out2_raw_pred, _, _ = test_trainer.predict(test_dataset=test_out2_dataset)

# Index of the highest-scoring class per example
test_out2_y_pred = test_out2_raw_pred.argmax(axis=1)

In [None]:
# Attach the predicted class to each id and convert it to an OFF/NOT label.

# Expose `id` as a regular column. Guarded so that re-running the cell does not
# push the positional index into a stray `index` column.
if 'id' not in test_out2.columns:
    test_out2 = test_out2.reset_index()

# Flatten to 1-D so values are assigned by position rather than aligned on an
# index, and fail loudly if predictions and rows ever get out of step.
test_out2_y_pred = np.asarray(test_out2_y_pred).ravel()
assert len(test_out2_y_pred) == len(test_out2), "expected exactly one prediction per test row"
test_out2['test_out2_y_pred'] = test_out2_y_pred

# Plain column selection: masking a frame with `frame[cols].notnull()` is
# `DataFrame.where`, which never removes rows, so no row filtering is implied here.
test_out2_neural = test_out2[['id', 'test_out2_y_pred']].copy()
test_out2_neural['test_out2_y_pred'] = test_out2_neural['test_out2_y_pred'].map({1: 'OFF', 0: 'NOT'})
test_out2_neural = test_out2_neural.rename(columns={'test_out2_y_pred': 'label'})
test_out2_neural.head()

In [None]:
# Distribution of predicted labels on out-of-domain set 2
test_out2_neural['label'].value_counts()

In [None]:
# Save the out2 predictions (id, label) as UTF-8 CSV, without the row index
test_out2_neural_path = 'drive/MyDrive/NLP_Final_Assignment/predictions/test_out2_neural.csv'
test_out2_neural.to_csv(path_or_buf=test_out2_neural_path, index=False, encoding='utf-8')

**TextGain Test Data**

In [None]:
# Run the trainer over the TextGain set; only the raw scores are kept
test_textgain_raw_pred, _, _ = test_trainer.predict(test_dataset=test_textgain_dataset)

# Index of the highest-scoring class per example
test_textgain_y_pred = test_textgain_raw_pred.argmax(axis=1)

In [None]:
# Attach the predicted class to each id and convert it to an OFF/NOT label.

# Expose `id` as a regular column. Guarded so that re-running the cell does not
# push the positional index into a stray `index` column.
if 'id' not in test_textgain.columns:
    test_textgain = test_textgain.reset_index()

# Flatten to 1-D so values are assigned by position rather than aligned on an
# index, and fail loudly if predictions and rows ever get out of step.
test_textgain_y_pred = np.asarray(test_textgain_y_pred).ravel()
assert len(test_textgain_y_pred) == len(test_textgain), "expected exactly one prediction per test row"
test_textgain['test_textgain_y_pred'] = test_textgain_y_pred

# Plain column selection: masking a frame with `frame[cols].notnull()` is
# `DataFrame.where`, which never removes rows, so no row filtering is implied here.
test_textgain_neural = test_textgain[['id', 'test_textgain_y_pred']].copy()
test_textgain_neural['test_textgain_y_pred'] = test_textgain_neural['test_textgain_y_pred'].map({1: 'OFF', 0: 'NOT'})
test_textgain_neural = test_textgain_neural.rename(columns={'test_textgain_y_pred': 'label'})
test_textgain_neural.head()

In [None]:
# Distribution of predicted labels on the TextGain set.
# Observation: a lot more 1 predicted compared to SVM
test_textgain_neural['label'].value_counts()

In [None]:
# Save the TextGain predictions (id, label) as UTF-8 CSV, without the row index
test_textgain_neural_path = 'drive/MyDrive/NLP_Final_Assignment/predictions/test_textgain_neural.csv'
test_textgain_neural.to_csv(path_or_buf=test_textgain_neural_path, index=False, encoding='utf-8')

In [None]:
# Peek at the first six TextGain texts
test_textgain['text'].values[:6]