In [4]:
import torch

In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-2.4.1-py3-none-any.whl (475 kB)
[K     |████████████████████████████████| 475 kB 5.1 MB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.38.tar.gz (860 kB)
[K     |████████████████████████████████| 860 kB 23.8 MB/s eta 0:00:01
Collecting tokenizers==0.0.11
  Downloading tokenizers-0.0.11-cp37-cp37m-manylinux1_x86_64.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 38.9 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2020.1.8-cp37-cp37m-manylinux2010_x86_64.whl (690 kB)
[K     |████████████████████████████████| 690 kB 47.1 MB/s eta 0:00:01
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.85-cp37-cp37m-manylinux1_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 101.9 MB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting requests
  Downloading requests-2.22.0-py2.py3-none-any.whl (57 kB)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [6]:
df.head(), len(df)

(                                                   0  1
 0  a stirring , funny and finally transporting re...  1
 1  apparently reassembled from the cutting room f...  0
 2  they presume their audience wo n't sit still f...  0
 3  this is a visually stunning rumination on love...  1
 4  jonathan parker 's bartleby should have been t...  1, 6920)

In [9]:
batch_1= df[:2000]

In [10]:
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

## Loading pre-trained BERT

In [3]:
# for distillBERT
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


# for BERT
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

#load pretrained , model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [11]:
# tokenization
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [12]:
#padding
max_len = 0
for i in tokenized.values:
    if len(i)>max_len:
        max_len = len(i)
        
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

In [13]:
padded[0]

array([  101,  1037, 18385,  1010,  6057,  1998,  2633, 18276,  2128,
       16603,  1997,  5053,  1998,  1996,  6841,  1998,  5687,  5469,
        3152,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])

In [14]:
np.array(padded).shape

(2000, 59)

In [15]:
# masking
attention_mask = np.where(padded != 0, 1,0)
attention_mask.shape

(2000, 59)

In [20]:
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [22]:
attention_mask.shape
input_ids.shape

torch.Size([2000, 59])

In [16]:
input_ids = torch.tensor(padded)

attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.IntTensor instead (while checking arguments for embedding)

In [30]:
last_hidden_states[0].shape

torch.Size([2000, 59, 768])

In [31]:
features = last_hidden_states[0][:,0,:].numpy()
#(all sentences, only first position [CLS], all hidden states)


In [32]:
labels = batch_1[1]

In [33]:
labels.shape

(2000,)

In [34]:
# train test split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [35]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
lr_clf.score(test_features, test_labels)

0.832

In [37]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.504 (+/- 0.04)


## trying BERT

In [23]:
# for BERT
model_class1, tokenizer_class1, pretrained_weights1 = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

#load pretrained , model/tokenizer
tokenizer1 = tokenizer_class1.from_pretrained(pretrained_weights1)
model1 = model_class1.from_pretrained(pretrained_weights1)

HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [38]:
# tokenization
tokenized1 = batch_1[0].apply((lambda x: tokenizer1.encode(x, add_special_tokens=True)))

In [39]:
#padding
max_len = 0
for i in tokenized1.values:
    if len(i)>max_len:
        max_len = len(i)
        
padded1 = np.array([i + [0]*(max_len - len(i)) for i in tokenized1.values])

In [40]:
attention_mask1 = np.where(padded1 != 0, 1,0)
attention_mask1.shape

(2000, 59)

In [41]:
input_ids1 = torch.tensor(padded1)

attention_mask1 = torch.tensor(attention_mask1)

with torch.no_grad():
    last_hidden_states1 = model1(input_ids1, attention_mask=attention_mask1)

In [42]:
last_hidden_states1[0].shape

torch.Size([2000, 59, 768])

In [43]:
features1 = last_hidden_states1[0][:,0,:].numpy()

In [44]:
# train test split
train_features1, test_features1, train_labels1, test_labels1 = train_test_split(features1, labels)

In [48]:
lr_clf1 = LogisticRegression(C=5.263252631578947)
lr_clf1.fit(train_features1, train_labels1)

LogisticRegression(C=5.263252631578947, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
lr_clf1.score(test_features1, test_labels1)

0.844

## grid search for bert features

In [47]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features1, train_labels1)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.8080000000000002


In [50]:
## reference: https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb


## Results


|MODEL|RESULTS|
|------|------|
|DistilBERT + Logistic|0.832|
|BERT + Logistic|0.85|
|BERT + gridsearch + Logistic|0.844|
