In [1]:
# Step 1: Data collection and preprocessing

import pandas as pd
import numpy as np
from transformers import BertTokenizer


# Read the whole corpus once, then select each partition by its 'split' tag.
all_df = pd.read_csv('../data/EmoBank/corpus/emobank.csv')
train_df = all_df[all_df['split'].eq('train')]
test_df = all_df[all_df['split'].eq('test')]
dev_df = all_df[all_df['split'].eq('dev')]

# Pretrained uncased BERT word-piece tokenizer; the bare name below displays its configuration.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [2]:
# Renumber the rows of every split from zero and keep only the columns used downstream.
train_df, test_df, dev_df = (
    frame.reset_index(drop=True)[['id', 'split', 'V', 'A', 'D', 'text']]
    for frame in (train_df, test_df, dev_df)
)

In [3]:
from textblob import TextBlob
from tqdm import tqdm

def add_rb_to_df(df):
    """Attach a rule-based (TextBlob) polarity score to every row of ``df``.

    The raw polarity ``p`` is rescaled as ``((p + 1) * 4) / 2 + 1``, so a
    polarity of -1 maps to 1 and a polarity of +1 maps to 5 (assuming TextBlob
    reports polarity in [-1, 1]).

    Args:
        df: DataFrame with a ``text`` column holding one string per row.

    Returns:
        The same ``df`` object, with a ``pol`` column added in place.
    """
    pol = []
    # Score the text of the frame that was passed in. The previous version read
    # from the global ``train_df``, so the test/dev splits received polarity
    # values of unrelated training sentences. Iterating over the column values
    # (rather than looking rows up by label) also keeps this independent of
    # the frame's index.
    for text in tqdm(df['text'], total=len(df)):
        polarity = TextBlob(text).sentiment.polarity
        pol.append((((polarity + 1) * 4) / 2) + 1)
    df['pol'] = pol
    return df

# Add the rule-based polarity column to each split, in train / test / dev order.
train_df, test_df, dev_df = map(add_rb_to_df, (train_df, test_df, dev_df))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8062/8062 [00:02<00:00, 3296.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2848.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3082.71it/s]


In [4]:
# Tokenize the training sentences; each sequence is padded or truncated to 300 ids.
encoded_train = tokenizer(
    text=train_df['text'].tolist(),
    max_length=300,
    padding='max_length',
    truncation='longest_first',
    add_special_tokens=True,
    return_attention_mask=True,
)
train_input_ids, train_attention_mask = (
    encoded_train['input_ids'],
    encoded_train['attention_mask'],
)

In [5]:
# Tokenize the test sentences with the same settings (fixed length of 300 ids).
encoded_test = tokenizer(
    text=test_df['text'].tolist(),
    max_length=300,
    padding='max_length',
    truncation='longest_first',
    add_special_tokens=True,
    return_attention_mask=True,
)
test_input_ids, test_attention_mask = (
    encoded_test['input_ids'],
    encoded_test['attention_mask'],
)

In [6]:
# Tokenize the dev sentences with the same settings (fixed length of 300 ids).
encoded_dev = tokenizer(
    text=dev_df['text'].tolist(),
    max_length=300,
    padding='max_length',
    truncation='longest_first',
    add_special_tokens=True,
    return_attention_mask=True,
)
dev_input_ids, dev_attention_mask = (
    encoded_dev['input_ids'],
    encoded_dev['attention_mask'],
)

In [7]:
# Training split: id/mask lists become arrays; valence (V) and polarity become float32 vectors.
train_input_ids, train_attention_mask = (
    np.array(train_input_ids),
    np.array(train_attention_mask),
)
train_labels = train_df['V'].to_numpy(dtype='float32')
train_pol = train_df['pol'].to_numpy(dtype='float32')
train_labels.dtype

dtype('float32')

In [8]:
# Test split: id/mask lists become arrays; valence (V) and polarity become float32 vectors.
test_input_ids, test_attention_mask = (
    np.array(test_input_ids),
    np.array(test_attention_mask),
)
test_labels = test_df['V'].to_numpy(dtype='float32')
test_pol = test_df['pol'].to_numpy(dtype='float32')
test_labels.dtype

dtype('float32')

In [9]:
# Dev split: id/mask lists become arrays; valence (V) and polarity become float32 vectors.
dev_input_ids, dev_attention_mask = (
    np.array(dev_input_ids),
    np.array(dev_attention_mask),
)
dev_labels = dev_df['V'].to_numpy(dtype='float32')
dev_pol = dev_df['pol'].to_numpy(dtype='float32')
dev_labels.dtype

dtype('float32')

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the standardizer on the training labels only, then reuse it everywhere.
label_scaler = StandardScaler().fit(train_labels.reshape(-1, 1))

# Every target vector is reshaped to a single column before being transformed.
train_labels, test_labels, dev_labels = (
    label_scaler.transform(values.reshape(-1, 1))
    for values in (train_labels, test_labels, dev_labels)
)

# The rule-based polarity scores go through the same label scaler.
train_pol, test_pol, dev_pol = (
    label_scaler.transform(values.reshape(-1, 1))
    for values in (train_pol, test_pol, dev_pol)
)

print(train_labels.dtype, test_labels.dtype, dev_labels.dtype)

float32 float32 float32


In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 16

def create_dataloader(inputs, masks, labels, pol, batch_size, shuffle=True):
    """Bundle the four aligned arrays into a ``DataLoader``.

    Args:
        inputs: Token ids, one row per example.
        masks: Attention masks aligned with ``inputs``.
        labels: Regression targets aligned with ``inputs``.
        pol: Rule-based polarity scores aligned with ``inputs``.
        batch_size: Number of examples per batch.
        shuffle: Whether batches are drawn in random order. Defaults to True
            (the previous, hard-coded behaviour); pass False for evaluation
            data so that outputs stay in the same order as the source rows.

    Returns:
        A ``DataLoader`` whose batches are
        ``(input_ids, attention_mask, labels, polarity)`` tuples.
    """
    input_tensor = torch.tensor(inputs)
    mask_tensor = torch.tensor(masks)
    labels_tensor = torch.tensor(labels)
    rb_tensor = torch.tensor(pol)  # rb = rule-based
    print(len(labels), len(input_tensor))
    dataset = TensorDataset(input_tensor,
                            mask_tensor,
                            labels_tensor,
                            rb_tensor)
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle)

train_dataloader = create_dataloader(train_input_ids, train_attention_mask, train_labels, train_pol, batch_size)
# Evaluation loaders keep the row order so predictions can be compared
# position-by-position with test_df / dev_df.
test_dataloader = create_dataloader(test_input_ids, test_attention_mask, test_labels, test_pol, batch_size, shuffle=False)
dev_dataloader = create_dataloader(dev_input_ids, dev_attention_mask, dev_labels, dev_pol, batch_size, shuffle=False)

8062 8062
1000 1000
1000 1000


In [12]:
import torch.nn as nn
from transformers import BertModel

class BertRegression(nn.Module):
    """BERT encoder followed by an LSTM and a linear head emitting one score per sequence.

    Args:
        dropout: Dropout probability applied before the final linear layer.
        freeze_bert: When True, BERT's parameters are excluded from gradient
            updates so that only the LSTM and the regression head are trained.
    """

    def __init__(self, dropout=0.1, freeze_bert=False):
        super(BertRegression, self).__init__()
        D_in, D_layer, D_out = 768, 256, 1

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        if freeze_bert:
            # The flag used to be accepted but silently ignored; honour it.
            for param in self.bert.parameters():
                param.requires_grad = False
        self.LSTM = nn.LSTM(D_in, D_layer, batch_first=True)
        self.regressor = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(D_layer, D_out)
        )

    def forward(self, input_ids, attention_masks):
        """Return one regression value per sequence, shape ``(batch, 1)``.

        Args:
            input_ids: Token ids, shape ``(batch, seq_len)``.
            attention_masks: Same shape as ``input_ids``; 1 marks a real token
                and 0 marks padding. Assumes right padding, i.e. all real
                tokens come first.
        """
        outputs = self.bert(input_ids, attention_mask=attention_masks)
        lstm_outputs, _ = self.LSTM(outputs[0])
        # Sequences are padded to a fixed length, so the LSTM output at the
        # final time step has been driven through padding positions. Read the
        # output at each sequence's last real token instead.
        lengths = attention_masks.long().sum(dim=1).clamp(min=1)
        batch_index = torch.arange(lstm_outputs.size(0), device=lstm_outputs.device)
        last_token_output = lstm_outputs[batch_index, lengths - 1]
        return self.regressor(last_token_output)
    
# Build the network; 0.1 is the dropout probability used in front of the output layer.
model = BertRegression(
    dropout=0.1,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
import torch

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using CUDA' if device.type == 'cuda' else 'Using CPU')

# Move the model's parameters to the chosen device (the module repr is displayed).
model.to(device)

Using CPU


BertRegression(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [14]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Training hyper-parameters and objective.
epochs = 5
loss_function = nn.MSELoss()

optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# The schedule spans every batch of every epoch: no warm-up, then linear decay.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [15]:
from torch.nn.utils.clip_grad import clip_grad_norm
import math

def train(model, optimizer, scheduler, loss_function, epochs, train_dataloader, device, clip_value=2):
    """Train ``model`` for ``epochs`` passes over ``train_dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_masks)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        loss_function: Callable ``(predictions, targets) -> scalar loss``.
        epochs: Number of passes over the data.
        train_dataloader: Iterable of 4-tuples
            ``(input_ids, attention_masks, labels, polarity)``; the polarity
            entry is not used by the loss.
        device: Device every batch tensor is moved to.
        clip_value: Maximum gradient norm passed to gradient clipping.

    Returns:
        The same ``model`` object, trained in place.
    """
    # ``clip_grad_norm`` (no trailing underscore) is deprecated in PyTorch in
    # favour of the in-place ``clip_grad_norm_``; import it locally so the
    # function does not depend on the deprecated name being in scope.
    from torch.nn.utils import clip_grad_norm_

    for epoch in range(epochs):
        print(epoch)
        print('=' * 15)
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_inputs, batch_masks, batch_labels, _ = tuple(b.to(device) for b in batch)
            model.zero_grad()
            outputs = model(batch_inputs, batch_masks)
            # Flatten both sides to 1-D so predictions and targets always have
            # matching shapes, whatever the batch size.
            loss = loss_function(outputs.reshape(-1), batch_labels.reshape(-1))
            loss.backward()
            clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            scheduler.step()
            print(str(step) + ' ' * 10 + str(loss.item()))
        print('-' * 15)
    return model

# Run the training loop; train() hands back the same model object it was given.
model = train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_function=loss_function,
    epochs=epochs,
    train_dataloader=train_dataloader,
    device=device,
    clip_value=2,
)

0


KeyboardInterrupt: 

In [None]:
def r2_score(outputs, labels):
    """Coefficient of determination, ``1 - SS_res / SS_tot``, of two tensors.

    Args:
        outputs: Predicted values.
        labels: Reference values; their mean defines the total sum of squares.

    Returns:
        A scalar tensor holding the R^2 value.
    """
    centred = labels - labels.mean()
    residuals = labels - outputs
    total_sum_sq = (centred ** 2).sum()
    residual_sum_sq = (residuals ** 2).sum()
    return 1 - residual_sum_sq / total_sum_sq

def evaluate(model, loss_function, test_dataloader, device):
    """Score ``model`` batch by batch without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_masks)``; it is
            switched to eval mode.
        loss_function: Callable ``(predictions, targets) -> scalar loss``.
        test_dataloader: Iterable of 4-tuples whose first three entries are
            input ids, attention masks and labels; the fourth is ignored.
        device: Device every batch tensor is moved to.

    Returns:
        ``(losses, r2_values)``: two lists of Python floats, one entry per batch.
    """
    model.eval()
    batch_losses = []
    batch_r2s = []
    for batch in test_dataloader:
        inputs, masks, targets, _ = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            predictions = model(inputs, masks)
            batch_losses.append(loss_function(predictions, targets).item())
            batch_r2s.append(r2_score(predictions, targets).item())
    return batch_losses, batch_r2s

# Per-batch losses and R^2 values on the test split, reported as their means.
loss, r2 = evaluate(
    model=model,
    loss_function=loss_function,
    test_dataloader=test_dataloader,
    device=device,
)
print(np.mean(loss), np.mean(r2))

In [None]:
def predict(model, dataloader, device):
    """Collect the model's outputs over ``dataloader`` as one flat list.

    Args:
        model: Module called as ``model(input_ids, attention_masks)``; it is
            switched to eval mode.
        dataloader: Iterable of 4-tuples; only the first two entries
            (input ids and attention masks) are used.
        device: Device every batch tensor is moved to.

    Returns:
        A list of Python floats, in the order the batches were yielded.
    """
    model.eval()
    predictions = []
    for batch in dataloader:
        inputs, masks, _, _ = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            batch_out = model(inputs, masks)
        # Flatten the batch output and append its values.
        predictions.extend(batch_out.view(-1).tolist())
    return predictions

# Predict over a sequential view of the dev data so that prediction i lines up
# with row i of dev_df, regardless of whether dev_dataloader shuffles.
ordered_dev_dataloader = DataLoader(dev_dataloader.dataset,
                                    batch_size=dev_dataloader.batch_size,
                                    shuffle=False)
y_pred = predict(model, ordered_dev_dataloader, device)
y_test = dev_df.V.to_numpy()
# The scaler was fitted on a single-column 2-D array, so give inverse_transform
# the same (n_samples, 1) layout (newer scikit-learn versions reject 1-D
# input), then flatten back to 1-D for the metric functions.
y_pred = label_scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).ravel()
print(len(y_pred))

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Regression metrics comparing the dev targets (y_test) with the predictions (y_pred).
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mdae = median_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Median absolute percentage error, computed by hand with pandas.
mdape = (
    pd.Series(y_test)
    .sub(pd.Series(y_pred))
    .div(pd.Series(y_test))
    .abs()
    .median()
)

mae, mdae, mse, mape, mdape, r_squared