Some useful links (delete before submitting):
    
https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification

https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics import accuracy_score

### Load and check the data

In [2]:
# Read the three SST-2 splits (tab-separated files) into DataFrames
df_train, df_dev, df_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('SST-2/train.tsv', 'SST-2/dev.tsv', 'SST-2/test.tsv')
)

In [3]:
# Preview the first five rows of the training split
df_train.head(n=5)

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


In [4]:
# Preview the first five rows of the dev split
df_dev.head(n=5)

Unnamed: 0,sentence,label
0,it 's a charming and often affecting journey .,1
1,unflinchingly bleak and desperate,0
2,allows us to hope that nolan is poised to emba...,1
3,"the acting , costumes , music , cinematography...",1
4,"it 's slow -- very , very slow .",0


In [5]:
# Report how many rows each split contains
for caption, frame in (
    ("Training data: ", df_train),
    ("Dev data: ", df_dev),
    ("Test data: ", df_test),
):
    print(caption, len(frame))

Training data:  67349
Dev data:  872
Test data:  1821


### Load the model

In [6]:
# A single checkpoint name drives both loads, so the tokenizer and the
# encoder weights are fetched from the same pretrained source.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

### Preprocessing before sending to the model

In [7]:
def get_max_len(tokenized):
    """Return the length of the longest item in ``tokenized``.

    Args:
        tokenized: object whose ``.values`` iterates over sized items
            (in this notebook, a pandas Series of token-id lists).

    Returns:
        The largest ``len(item)`` over all items, or 0 when there are none.
    """
    # `default=0` keeps the empty-input result of the original manual loop.
    return max((len(ids) for ids in tokenized.values), default=0)

In [8]:
# This turns every sentence into the list of ids
### NOTE!!! USING PARTIAL DATA FOR TRYING FIRST -- change later
# Number of leading rows taken from each split. Set an entry to None to use
# the whole split (slicing with [:None] keeps every row).
# NOTE(review): any label slices elsewhere in the notebook presumably need to
# use the same sizes — confirm when changing these.
N_TRAIN, N_DEV, N_TEST = 1000, 100, 100


def _tokenize(sentences):
    """Encode each sentence into a list of token ids, special tokens included.

    Args:
        sentences: pandas Series of raw sentence strings.

    Returns:
        A Series with the same index whose values are lists of token ids.
    """
    return sentences.apply(lambda text: tokenizer.encode(text, add_special_tokens=True))


tokenized_train = _tokenize(df_train['sentence'][:N_TRAIN])
tokenized_dev = _tokenize(df_dev['sentence'][:N_DEV])
tokenized_test = _tokenize(df_test['sentence'][:N_TEST])

In [9]:
# Padding
def _pad_to_length(token_lists, length):
    """Right-pad each id list with zeros up to ``length`` and stack the rows into an array."""
    return np.array([ids + [0] * (length - len(ids)) for ids in token_lists])


# One shared width for all splits: the longest tokenized sentence seen anywhere
max_len = max(get_max_len(split) for split in (tokenized_train, tokenized_dev, tokenized_test))
padded_train = _pad_to_length(tokenized_train.values, max_len)
padded_dev = _pad_to_length(tokenized_dev.values, max_len)
padded_test = _pad_to_length(tokenized_test.values, max_len)

print("Padded shape (train): ", padded_train.shape)
print("Padded shape (dev): ", padded_dev.shape)
print("Padded shape (test): ", padded_test.shape)

Padded shape (train):  (1000, 58)
Padded shape (dev):  (100, 58)
Padded shape (test):  (100, 58)


In [10]:
# Tell BERT to ignore padding: 1 where the id is non-zero, 0 where it is the pad value 0
attention_mask_train = (padded_train != 0).astype(int)
attention_mask_dev = (padded_dev != 0).astype(int)
attention_mask_test = (padded_test != 0).astype(int)

### Using BERT to encode

In [11]:
# Convert the padded id matrices and their masks to tensors. torch.as_tensor
# accepts NumPy arrays and tensors alike, so re-running this cell (when the
# attention_mask_* names already hold tensors) does not re-wrap a tensor with
# torch.tensor and trigger its copy-construct warning.
input_ids_train = torch.as_tensor(padded_train)
attention_mask_train = torch.as_tensor(attention_mask_train)
input_ids_dev = torch.as_tensor(padded_dev)
attention_mask_dev = torch.as_tensor(attention_mask_dev)
input_ids_test = torch.as_tensor(padded_test)
attention_mask_test = torch.as_tensor(attention_mask_test)

BATCH_SIZE = 64  # sentences per forward pass; lower it if memory runs out


def _encode_in_batches(input_ids, attention_mask, batch_size=BATCH_SIZE):
    """Run ``model`` over the inputs in slices of ``batch_size`` rows.

    Sending a whole split through the model in one call needs activation
    memory proportional to the split size; slicing bounds it by ``batch_size``.

    Args:
        input_ids: tensor of padded token ids, one row per sentence.
        attention_mask: tensor of the same shape; 1 for real tokens, 0 for padding.
        batch_size: number of rows passed to the model per forward pass.

    Returns:
        A 1-tuple ``(hidden,)`` where ``hidden`` is the concatenation along
        dimension 0 of element 0 of the model output for every slice, so
        ``result[0]`` is indexed the same way as the output of a single call.
    """
    pieces = []
    with torch.no_grad():  # feature extraction only, no gradients needed
        for start in range(0, input_ids.shape[0], batch_size):
            rows = slice(start, start + batch_size)
            batch_output = model(input_ids[rows], attention_mask=attention_mask[rows])
            pieces.append(batch_output[0])
    return (torch.cat(pieces, dim=0),)


last_hidden_states_train = _encode_in_batches(input_ids_train, attention_mask_train)
last_hidden_states_dev = _encode_in_batches(input_ids_dev, attention_mask_dev)
last_hidden_states_test = _encode_in_batches(input_ids_test, attention_mask_test)

In [12]:
# Sentence features: from element 0 of each model output keep, for every
# sentence, all hidden units at sequence position 0 (the [CLS] slot).
def _first_token_features(model_output):
    """Return the position-0 vectors of ``model_output[0]`` as a NumPy array."""
    hidden = model_output[0]
    return hidden[:, 0, :].numpy()


features_train = _first_token_features(last_hidden_states_train)
features_dev = _first_token_features(last_hidden_states_dev)
features_test = _first_token_features(last_hidden_states_test)

In [15]:
# Show the shape of the extracted feature matrix for each split
for caption, matrix in (
    ("train features: ", features_train),
    ("dev features: ", features_dev),
    ("test features: ", features_test),
):
    print(caption, matrix.shape)

train features:  (1000, 768)
dev features:  (100, 768)
test features:  (100, 768)


In [13]:
# Disabled experiment: score the first 100 dev sentences directly from classifier logits.
# NOTE(review): `model` is loaded in this notebook as DistilBertModel and its output is
# indexed for hidden states; it presumably has no `.logits`, so re-enabling this would
# need a model with a classification head — confirm before uncommenting.
# y_pred = torch.argmax(last_hidden_states_dev.logits, dim = 1).numpy()
# y_true = df_dev['label'][:100].to_numpy()
# accuracy_score(y_true, y_pred)

### Use training set to grid search for parameters