# W266 Project

### Adam Sayre & Erin Werner

## BERT 

In [1]:
# Adding the emoji and wordcloud packages to PATH
import sys
sys.path.insert(0,r"./anaconda3/lib/python3.7/site-packages")

In [2]:
import numpy as np
import csv
import pandas as pd 
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import importlib
import emoji
import tensorflow as tf
import nltk
import re
from nltk.corpus import brown
nltk.download('stopwords')
from nltk.corpus import stopwords
assert(nltk.download("treebank"))
from nltk.corpus import europarl_raw
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter
from sklearn.model_selection import train_test_split




[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package treebank to /home/ubuntu/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import random

In [4]:
# Load the tweet corpus and preview the first rows.
data = pd.read_csv(filepath_or_buffer="tweet_data.csv")
data.head(n=5)

Unnamed: 0,Emotion,Content,Original Content
0,disappointed,oh fuck did i wrote fil grinningfacewithsweat ...,b'RT @Davbingodav: @mcrackins Oh fuck.... did ...
1,disappointed,i feel nor am i shamed by it,i feel nor am i shamed by it
2,disappointed,i had been feeling a little bit defeated by th...,i had been feeling a little bit defeated by th...
3,happy,imagine if that reaction guy that called jj kf...,"b""@KSIOlajidebt imagine if that reaction guy t..."
4,disappointed,i wouldnt feel burdened so that i would live m...,i wouldnt feel burdened so that i would live m...


In [5]:
# Load the first custom-cleaned variant; its cleaned text lives in E_Content.
data_e = pd.read_csv(filepath_or_buffer="dataset(clean)_e.csv")
data_e[['Emotion', 'Content', 'Original Content', 'E_Content']].head(n=5)

Unnamed: 0,Emotion,Content,Original Content,E_Content
0,disappointed,oh fuck did i wrote fil grinningfacewithsweat ...,b'RT @Davbingodav: @mcrackins Oh fuck.... did ...,rt usertaginstance usertaginstance oh fuck wro...
1,disappointed,i feel nor am i shamed by it,i feel nor am i shamed by it,feel shamed
2,disappointed,i had been feeling a little bit defeated by th...,i had been feeling a little bit defeated by th...,feeling little bit defeated steps faith would ...
3,happy,imagine if that reaction guy that called jj kf...,"b""@KSIOlajidebt imagine if that reaction guy t...",usertaginstance imagine reaction guy called jj...
4,disappointed,i wouldnt feel burdened so that i would live m...,i wouldnt feel burdened so that i would live m...,wouldnt feel burdened would live life testamen...


In [6]:
# Load the second custom-cleaned variant; its cleaned text lives in A_Content.
data_a = pd.read_csv(filepath_or_buffer="dataset(clean)_a.csv")
data_a[['Emotion', 'Content', 'Original Content', 'A_Content']].head(n=5)

Unnamed: 0,Emotion,Content,Original Content,A_Content
0,disappointed,oh fuck did i wrote fil grinningfacewithsweat ...,b'RT @Davbingodav: @mcrackins Oh fuck.... did ...,b rt davbingodav mcrackins oh fuck wrote fil g...
1,disappointed,i feel nor am i shamed by it,i feel nor am i shamed by it,feel shamed
2,disappointed,i had been feeling a little bit defeated by th...,i had been feeling a little bit defeated by th...,feeling little bit defeated steps faith would ...
3,happy,imagine if that reaction guy that called jj kf...,"b""@KSIOlajidebt imagine if that reaction guy t...",b ksiolajidebt imagine reaction guy called jj ...
4,disappointed,i wouldnt feel burdened so that i would live m...,i wouldnt feel burdened so that i would live m...,wouldnt feel burdened would live life testamen...


In [7]:
# Class-balance check: number of rows per emotion label.
data.Emotion.value_counts()

disappointed    313714
happy           301871
angry           300990
Name: Emotion, dtype: int64

In [8]:
# Word-piece tokenizer matching the 'bert-base-uncased' checkpoint used for every model below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [9]:
def f1_score_func(preds, labels):
    """Weighted F1 between arg-max class predictions and the true labels.

    Args:
        preds: Array of per-class scores with classes along axis 1; the
            arg-max along that axis is taken as the predicted class.
        labels: NumPy array of true class ids (flattened before scoring).

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels, label_map=None):
    """Print, for every class present in ``labels``, how many of its rows were predicted correctly.

    Args:
        preds: Array of per-class scores with classes along axis 1; the
            arg-max along that axis is taken as the predicted class.
        labels: NumPy array of true integer class ids.
        label_map: Optional ``{class_name: class_id}`` mapping used to print
            readable class names. When omitted, the notebook-level
            ``label_dict`` is used, which matches the original behaviour.

    Returns:
        None. One ``Class: <name>`` line and one ``Accuracy: <hits>/<total>``
        line (followed by a blank line) are printed per class.
    """
    # Resolve the mapping lazily so callers can pass label_dict_e / label_dict_a
    # without depending on whichever label_dict happens to be in the kernel.
    name_to_id = label_dict if label_map is None else label_map
    id_to_name = {class_id: name for name, class_id in name_to_id.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        hits = int(np.sum(predicted[in_class] == class_id))
        total = int(np.sum(in_class))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {hits}/{total}\n')

In [10]:
def evaluate(dataloader_val, eval_model=None, eval_device=None):
    """Run a gradient-free pass over ``dataloader_val`` and collect loss, logits and labels.

    Args:
        dataloader_val: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        eval_model: Model to score. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments and must return
            a sequence whose first two items are the loss and the logits.
            Defaults to the notebook-level ``model``.
        eval_device: Device the batch tensors are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        Tuple ``(loss_val_avg, predictions, true_vals)``: the mean of the
        per-batch losses, the logits of all batches concatenated along axis 0
        as a NumPy array, and the labels concatenated the same way.

    Raises:
        ValueError: If ``dataloader_val`` contains no batches.
    """
    # Fall back to the notebook globals only when the caller did not pass
    # explicit objects, so existing one-argument calls keep working.
    net = model if eval_model is None else eval_model
    target_device = device if eval_device is None else eval_device

    num_batches = len(dataloader_val)
    if num_batches == 0:
        # Previously this surfaced as a ZeroDivisionError at the averaging step.
        raise ValueError('dataloader_val is empty; nothing to evaluate')

    net.eval()
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(target_device) for b in batch)

        with torch.no_grad():
            outputs = net(input_ids=batch[0],
                          attention_mask=batch[1],
                          labels=batch[2])

        loss, logits = outputs[0], outputs[1]
        loss_val_total += loss.item()

        # Move results to host memory so they can be concatenated with NumPy.
        predictions.append(logits.detach().cpu().numpy())
        true_vals.append(batch[2].cpu().numpy())

    loss_val_avg = loss_val_total / num_batches

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [11]:
def train_bert_model(model, dataloader_train, dataloader_validation, optimizer, scheduler, epochs): 
    """Fine-tune ``model`` for ``epochs`` passes and report validation metrics after each pass.

    Args:
        model: Network called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose first output is the training loss.
        dataloader_train: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        dataloader_validation: Validation batches, forwarded to ``evaluate``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        epochs: Number of passes over ``dataloader_train``; must be at least 1.

    Returns:
        Tuple ``(loss_train_avg, val_f1, val_loss)`` from the final epoch.

    Raises:
        ValueError: If ``epochs`` is smaller than 1 (there would be nothing to return).

    Note:
        Batches are moved to the notebook-level ``device``, and ``evaluate`` is
        called with only the validation dataloader, so the ``model`` argument
        should be the same object that is bound to the notebook-level ``model``.
    """
    if epochs < 1:
        raise ValueError(f'epochs must be >= 1, got {epochs}')

    for epoch in tqdm(range(1, epochs+1)):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            labels=batch[2])

            loss = outputs[0]
            batch_loss = loss.item()
            loss_train_total += batch_loss
            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Show the batch loss as-is: len(batch) is the number of tensors in
            # the batch tuple (3), not the batch size, and Hugging Face
            # sequence-classification heads already return a mean-reduced loss,
            # so the former division only distorted the displayed value.
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')

    return loss_train_avg, val_f1, val_loss

In [12]:
# Fix every random number generator used in the notebook for repeatable runs.
seed_val = 17

# PyTorch (CPU generator, then every CUDA device)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# NumPy and the Python standard library
np.random.seed(seed_val)
random.seed(seed_val)

#### Original Cleaned Data

In [13]:
# Give each distinct emotion an integer class id, numbered in the order
# returned by Series.unique().
possible_labels = data.Emotion.unique()

label_dict = {emotion: class_id for class_id, emotion in enumerate(possible_labels)}
label_dict

{'disappointed': 0, 'happy': 1, 'angry': 2}

In [14]:
# Encode the emotion strings as integer class ids.
# Series.map performs a direct dictionary lookup per row. Series.replace with a
# str -> int dict relies on implicit object-to-int downcasting (deprecated in
# recent pandas) and is much slower on a column of this size.
data['label'] = data.Emotion.map(label_dict)

In [15]:
# Stratified 70/30 split over the row index, so both partitions keep the
# emotion proportions of the full data set.
# (A further 50% stratified subsample of the training split was used during
# experimentation and is currently disabled.)
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(
    data.index.values,
    data.label.values,
    test_size=0.3,
    random_state=42,
    stratify=data.label.values,
)

In [16]:
# Tag every row with the partition it landed in, then verify the per-class
# counts of each partition.
data['data_type'] = 'not_set'

data.loc[X_val_c, 'data_type'] = 'val'
data.loc[X_train_c, 'data_type'] = 'train'

data.groupby(by=['Emotion', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Content,Original Content
Emotion,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
angry,2,train,210693,210693
angry,2,val,90297,90297
disappointed,0,train,219600,219600
disappointed,0,val,94114,94114
happy,1,train,211309,211309
happy,1,val,90562,90562


In [17]:
# Tokenize the cleaned text of each partition: special tokens added,
# sequences truncated to at most 256 tokens, padded, returned as PyTorch tensors.
encoded_data_train_c = tokenizer.batch_encode_plus(
    data.loc[data.data_type == 'train', 'Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

encoded_data_val_c = tokenizer.batch_encode_plus(
    data.loc[data.data_type == 'val', 'Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [18]:
# Training partition: token ids, attention masks and labels -> TensorDataset
input_ids_train_c = encoded_data_train_c['input_ids']
attention_masks_train_c = encoded_data_train_c['attention_mask']
labels_train_c = torch.tensor(data.loc[data.data_type == 'train', 'label'].values)
dataset_train_c = TensorDataset(input_ids_train_c, attention_masks_train_c, labels_train_c)

# Validation partition, assembled the same way
input_ids_val_c = encoded_data_val_c['input_ids']
attention_masks_val_c = encoded_data_val_c['attention_mask']
labels_val_c = torch.tensor(data.loc[data.data_type == 'val', 'label'].values)
dataset_val_c = TensorDataset(input_ids_val_c, attention_masks_val_c, labels_val_c)

In [19]:
# Fresh pretrained BERT with a classification head sized to the number of emotions.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(device)

cuda


In [21]:
batch_size = 3

# Training batches are drawn in random order; validation keeps dataset order.
dataloader_train_c = DataLoader(
    dataset_train_c,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train_c),
)
dataloader_validation_c = DataLoader(
    dataset_val_c,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val_c),
)

In [22]:
# Single fine-tuning epoch; linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run.
epochs = 1
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

scheduler_c = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train_c),
)

In [23]:
# Fine-tune on the originally cleaned text and keep the final-epoch metrics.
bert_tr_loss_c, bert_f1_c, bert_val_loss_c = train_bert_model(
    model=model,
    dataloader_train=dataloader_train_c,
    dataloader_validation=dataloader_validation_c,
    optimizer=optimizer,
    scheduler=scheduler_c,
    epochs=epochs,
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=213868.0, style=ProgressStyle(description_w…


Epoch 1
Training loss: 0.31075673452883357
Validation loss: 0.28929742172868833
F1 Score (Weighted): 0.9207116909580677



Weighted F1 score of about 0.92 on the validation set!

#### Original Uncleaned Data

In [44]:
# Mirror the raw-text column under a name without a space so it can be
# reached through attribute access (data.Original_Content).
data["Original_Content"] = data.loc[:, "Original Content"]

In [45]:
# Stratified 70/30 split over the row index (same seed as the other experiments).
# (A further 50% stratified subsample of the training split was used during
# experimentation and is currently disabled.)
X_train_oc, X_val_oc, y_train_oc, y_val_oc = train_test_split(
    data.index.values,
    data.label.values,
    test_size=0.3,
    random_state=42,
    stratify=data.label.values,
)

In [46]:
# Re-tag every row with its partition for this experiment and check the
# per-class counts.
data['data_type'] = 'not_set'

data.loc[X_val_oc, 'data_type'] = 'val'
data.loc[X_train_oc, 'data_type'] = 'train'

data.groupby(by=['Emotion', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Content,Original Content,Original_Content
Emotion,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
angry,2,train,210693,210693,210693
angry,2,val,90297,90297,90297
disappointed,0,train,219600,219600,219600
disappointed,0,val,94114,94114,94114
happy,1,train,211309,211309,211309
happy,1,val,90562,90562,90562


In [47]:
# Tokenize the raw (uncleaned) text of each partition: special tokens added,
# sequences truncated to at most 256 tokens, padded, returned as PyTorch tensors.
encoded_data_train_oc = tokenizer.batch_encode_plus(
    data.loc[data.data_type == 'train', 'Original_Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

encoded_data_val_oc = tokenizer.batch_encode_plus(
    data.loc[data.data_type == 'val', 'Original_Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [48]:
# Training partition: token ids, attention masks and labels -> TensorDataset
input_ids_train_oc = encoded_data_train_oc['input_ids']
attention_masks_train_oc = encoded_data_train_oc['attention_mask']
labels_train_oc = torch.tensor(data.loc[data.data_type == 'train', 'label'].values)
dataset_train_oc = TensorDataset(input_ids_train_oc, attention_masks_train_oc, labels_train_oc)

# Validation partition, assembled the same way
input_ids_val_oc = encoded_data_val_oc['input_ids']
attention_masks_val_oc = encoded_data_val_oc['attention_mask']
labels_val_oc = torch.tensor(data.loc[data.data_type == 'val', 'label'].values)
dataset_val_oc = TensorDataset(input_ids_val_oc, attention_masks_val_oc, labels_val_oc)

In [49]:
# Start again from the pretrained checkpoint so this run does not inherit
# weights fine-tuned in a previous experiment.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [50]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(device)

cuda


In [51]:
batch_size = 3

# Training batches are drawn in random order; validation keeps dataset order.
dataloader_train_oc = DataLoader(
    dataset_train_oc,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train_oc),
)
dataloader_validation_oc = DataLoader(
    dataset_val_oc,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val_oc),
)

In [52]:
# Single fine-tuning epoch; linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run.
epochs = 1
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

scheduler_oc = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train_oc),
)

In [53]:
# Fine-tune on the raw (uncleaned) text and keep the final-epoch metrics.
bert_tr_loss_oc, bert_f1_oc, bert_val_loss_oc = train_bert_model(
    model=model,
    dataloader_train=dataloader_train_oc,
    dataloader_validation=dataloader_validation_oc,
    optimizer=optimizer,
    scheduler=scheduler_oc,
    epochs=epochs,
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=213868.0, style=ProgressStyle(description_w…


Epoch 1
Training loss: 0.29519668717924713
Validation loss: 0.27512661494702917
F1 Score (Weighted): 0.9246183270067194



#### Custom Cleaned Data #1

In [54]:
# Give each distinct emotion in data_e an integer class id, numbered in the
# order returned by Series.unique().
possible_labels_e = data_e.Emotion.unique()

label_dict_e = {emotion: class_id for class_id, emotion in enumerate(possible_labels_e)}
label_dict_e

{'disappointed': 0, 'happy': 1, 'angry': 2}

In [55]:
# Encode the emotion strings as integer class ids.
# Series.map performs a direct dictionary lookup per row. Series.replace with a
# str -> int dict relies on implicit object-to-int downcasting (deprecated in
# recent pandas) and is much slower on a column of this size.
data_e['label'] = data_e.Emotion.map(label_dict_e)

In [56]:
# Stratified 70/30 split over the row index of data_e.
# (A further 50% stratified subsample of the training split was used during
# experimentation and is currently disabled.)
X_train_e, X_val_e, y_train_e, y_val_e = train_test_split(
    data_e.index.values,
    data_e.label.values,
    test_size=0.3,
    random_state=42,
    stratify=data_e.label.values,
)

In [57]:
# Keep only the columns this experiment needs, in a separate frame.
data__e = data_e[['Emotion', 'label', 'E_Content']].copy()

# Tag every row with its partition, then check the per-class counts.
data__e['data_type'] = 'not_set'

data__e.loc[X_val_e, 'data_type'] = 'val'
data__e.loc[X_train_e, 'data_type'] = 'train'

data__e.groupby(by=['Emotion', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,E_Content
Emotion,label,data_type,Unnamed: 3_level_1
angry,2,train,210693
angry,2,val,90297
disappointed,0,train,219600
disappointed,0,val,94114
happy,1,train,211309
happy,1,val,90562


In [58]:
# Tokenize the E_Content text of each partition: special tokens added,
# sequences truncated to at most 256 tokens, padded, returned as PyTorch tensors.
encoded_data_train_e = tokenizer.batch_encode_plus(
    data__e.loc[data__e.data_type == 'train', 'E_Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

encoded_data_val_e = tokenizer.batch_encode_plus(
    data__e.loc[data__e.data_type == 'val', 'E_Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [59]:
# Training partition: token ids, attention masks and labels -> TensorDataset
input_ids_train_e = encoded_data_train_e['input_ids']
attention_masks_train_e = encoded_data_train_e['attention_mask']
labels_train_e = torch.tensor(data__e.loc[data__e.data_type == 'train', 'label'].values)
dataset_train_e = TensorDataset(input_ids_train_e, attention_masks_train_e, labels_train_e)

# Validation partition, assembled the same way
input_ids_val_e = encoded_data_val_e['input_ids']
attention_masks_val_e = encoded_data_val_e['attention_mask']
labels_val_e = torch.tensor(data__e.loc[data__e.data_type == 'val', 'label'].values)
dataset_val_e = TensorDataset(input_ids_val_e, attention_masks_val_e, labels_val_e)

In [60]:
# Start again from the pretrained checkpoint so this run does not inherit
# weights fine-tuned in a previous experiment.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict_e),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [61]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(device)

cuda


In [62]:
batch_size = 3

# Training batches are drawn in random order; validation keeps dataset order.
dataloader_train_e = DataLoader(
    dataset_train_e,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train_e),
)
dataloader_validation_e = DataLoader(
    dataset_val_e,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val_e),
)

In [63]:
# Single fine-tuning epoch; linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run.
epochs = 1
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

scheduler_e = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train_e),
)

In [64]:
# Fine-tune on the custom-cleaned #1 text and keep the final-epoch metrics.
bert_tr_loss_e, bert_f1_e, bert_val_loss_e = train_bert_model(
    model=model,
    dataloader_train=dataloader_train_e,
    dataloader_validation=dataloader_validation_e,
    optimizer=optimizer,
    scheduler=scheduler_e,
    epochs=epochs,
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=213868.0, style=ProgressStyle(description_w…


Epoch 1
Training loss: 0.3167731010424299
Validation loss: 0.29601037087735566
F1 Score (Weighted): 0.9154768232720168



#### Custom Cleaned Data #2

In [65]:
# Give each distinct emotion in data_a an integer class id, numbered in the
# order returned by Series.unique().
possible_labels_a = data_a.Emotion.unique()

label_dict_a = {emotion: class_id for class_id, emotion in enumerate(possible_labels_a)}
label_dict_a

{'disappointed': 0, 'happy': 1, 'angry': 2}

In [66]:
# Encode the emotion strings as integer class ids.
# Series.map performs a direct dictionary lookup per row. Series.replace with a
# str -> int dict relies on implicit object-to-int downcasting (deprecated in
# recent pandas) and is much slower on a column of this size.
data_a['label'] = data_a.Emotion.map(label_dict_a)

In [67]:
# Stratified 70/30 split over the row index of data_a.
# (A further 50% stratified subsample of the training split was used during
# experimentation and is currently disabled.)
X_train_a, X_val_a, y_train_a, y_val_a = train_test_split(
    data_a.index.values,
    data_a.label.values,
    test_size=0.3,
    random_state=42,
    stratify=data_a.label.values,
)

In [68]:
# Keep only the columns this experiment needs, in a separate frame.
data__a = data_a[['Emotion', 'label', 'A_Content']].copy()

# Tag every row with its partition, then check the per-class counts.
data__a['data_type'] = 'not_set'

data__a.loc[X_val_a, 'data_type'] = 'val'
data__a.loc[X_train_a, 'data_type'] = 'train'

data__a.groupby(by=['Emotion', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A_Content
Emotion,label,data_type,Unnamed: 3_level_1
angry,2,train,210693
angry,2,val,90297
disappointed,0,train,219600
disappointed,0,val,94114
happy,1,train,211309
happy,1,val,90562


In [69]:
# Tokenize the A_Content text of each partition: special tokens added,
# sequences truncated to at most 256 tokens, padded, returned as PyTorch tensors.
encoded_data_train_a = tokenizer.batch_encode_plus(
    data__a.loc[data__a.data_type == 'train', 'A_Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

encoded_data_val_a = tokenizer.batch_encode_plus(
    data__a.loc[data__a.data_type == 'val', 'A_Content'].values,
    max_length=256,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

In [70]:
# Training partition: token ids, attention masks and labels -> TensorDataset
input_ids_train_a = encoded_data_train_a['input_ids']
attention_masks_train_a = encoded_data_train_a['attention_mask']
labels_train_a = torch.tensor(data__a.loc[data__a.data_type == 'train', 'label'].values)
dataset_train_a = TensorDataset(input_ids_train_a, attention_masks_train_a, labels_train_a)

# Validation partition, assembled the same way
input_ids_val_a = encoded_data_val_a['input_ids']
attention_masks_val_a = encoded_data_val_a['attention_mask']
labels_val_a = torch.tensor(data__a.loc[data__a.data_type == 'val', 'label'].values)
dataset_val_a = TensorDataset(input_ids_val_a, attention_masks_val_a, labels_val_a)

In [71]:
# Start again from the pretrained checkpoint so this run does not inherit
# weights fine-tuned in a previous experiment.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict_a),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [72]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(device)

cuda


In [73]:
batch_size = 3

# Training batches are drawn in random order; validation keeps dataset order.
dataloader_train_a = DataLoader(
    dataset_train_a,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train_a),
)
dataloader_validation_a = DataLoader(
    dataset_val_a,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val_a),
)

In [74]:
# Single fine-tuning epoch; linear learning-rate schedule without warm-up,
# spanning every optimizer step of the run.
epochs = 1
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

scheduler_a = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train_a),
)

In [75]:
# Fine-tune on the custom-cleaned #2 text and keep the final-epoch metrics.
bert_tr_loss_a, bert_f1_a, bert_val_loss_a = train_bert_model(
    model=model,
    dataloader_train=dataloader_train_a,
    dataloader_validation=dataloader_validation_a,
    optimizer=optimizer,
    scheduler=scheduler_a,
    epochs=epochs,
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=213868.0, style=ProgressStyle(description_w…


Epoch 1
Training loss: 0.309894040836024
Validation loss: 0.2894129561177539
F1 Score (Weighted): 0.9158085377793592



#### Results

In [76]:
# Collect the final-epoch metrics of the four experiments, one row per cleaning method.
b_values = ['Orig. Uncleaned', 'Orig. Cleaned', 'Custom Cleaned #1', 'Custom Cleaned #2']
b_f1 = [bert_f1_oc, bert_f1_c, bert_f1_e, bert_f1_a]
b_tr_loss = [bert_tr_loss_oc, bert_tr_loss_c, bert_tr_loss_e, bert_tr_loss_a]
b_val_loss = [bert_val_loss_oc, bert_val_loss_c, bert_val_loss_e, bert_val_loss_a]

b_df = pd.DataFrame({
    'Cleaning Method': b_values,
    'F1 Score': b_f1,
    'Training Loss': b_tr_loss,
    'Validation Loss': b_val_loss,
})
b_df

Unnamed: 0,Cleaning Method,F1 Score,Training Loss,Validation Loss
0,Orig. Uncleaned,0.924618,0.295197,0.275127
1,Orig. Cleaned,0.920712,0.310757,0.289297
2,Custom Cleaned #1,0.915477,0.316773,0.29601
3,Custom Cleaned #2,0.915809,0.309894,0.289413
