In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('train.tsv',delimiter='\t',header=None)

In [3]:
df

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
6915,"painful , horrifying and oppressively tragic ,...",1
6916,take care is nicely performed by a quintet of ...,0
6917,"the script covers huge , heavy topics in a bla...",0
6918,a seriously bad film with seriously warped log...,0


In [4]:
batch_1 = df[:2000]

In [5]:
batch_1

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
1995,too bland and fustily tasteful to be truly pru...,0
1996,it does n't work as either,0
1997,this one aims for the toilet and scores a dire...,0
1998,in the name of an allegedly inspiring and easi...,0


In [6]:
batch_1[1].value_counts()

1
1    1041
0     959
Name: count, dtype: int64

In [7]:
model_class,tokenizer_class,pretrained_weights = (ppb.DistilBertModel,ppb.DistilBertTokenizer,'distilbert-base-uncased')

In [8]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [9]:
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x,add_special_tokens=True)))

In [10]:
tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object

In [11]:
batch_1

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
1995,too bland and fustily tasteful to be truly pru...,0
1996,it does n't work as either,0
1997,this one aims for the toilet and scores a dire...,0
1998,in the name of an allegedly inspiring and easi...,0


In [12]:
max_len = 0
for i in tokenized.values:
    if len(i)>max_len:
        max_len = len(i)

In [13]:
max_len

59

In [14]:
padded = np.array([i+[0]*(max_len-len(i)) for i in tokenized.values])

In [15]:
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [16]:
np.array(padded).shape

(2000, 59)

In [20]:
attention_mask = np.where(padded !=0,1,0)

In [21]:
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [22]:
attention_mask.shape

(2000, 59)

In [23]:
input_ids = torch.tensor(padded)

In [24]:
input_ids

tensor([[  101,  1037, 18385,  ...,     0,     0,     0],
        [  101,  4593,  2128,  ...,     0,     0,     0],
        [  101,  2027,  3653,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  2028,  ...,     0,     0,     0],
        [  101,  1999,  1996,  ...,     0,     0,     0],
        [  101,  1996,  3185,  ...,     0,     0,     0]], dtype=torch.int32)

In [25]:
attention_mask = torch.tensor(attention_mask)

In [26]:
attention_mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)

In [27]:
with torch.no_grad():
    last_hidden_states = model(input_ids,attention_mask=attention_mask)

In [32]:
last_hidden_states[0].shape

torch.Size([2000, 59, 768])

In [33]:
features = last_hidden_states[0][:,0,:].numpy()

In [34]:
features

array([[-0.21593437, -0.14028907,  0.00831087, ..., -0.13694851,
         0.5867003 ,  0.20112714],
       [-0.17262718, -0.14476174,  0.00223434, ..., -0.1744254 ,
         0.21386452,  0.37197474],
       [-0.05063344,  0.07203951, -0.0295976 , ..., -0.07148926,
         0.71852416,  0.26225466],
       ...,
       [-0.27829772, -0.248036  ,  0.13585785, ..., -0.1903915 ,
         0.13099578,  0.3497837 ],
       [-0.03667728,  0.10638569, -0.01111017, ..., -0.11206636,
         0.41619483,  0.5033802 ],
       [ 0.12402627,  0.0142516 ,  0.01038403, ..., -0.11606539,
         0.5345917 ,  0.27495342]], dtype=float32)

In [35]:
features.shape

(2000, 768)

In [36]:
labels = batch_1[1]

In [37]:
labels

0       1
1       0
2       0
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: 1, Length: 2000, dtype: int64

In [38]:
train_features,test_features,train_labels,test_labels = train_test_split(features,labels)

In [39]:
train_features.shape

(1500, 768)

In [40]:
test_features.shape

(500, 768)

In [41]:
lr_clf = LogisticRegression()

In [43]:
lr_clf.fit(train_features,train_labels)

In [44]:
lr_clf.score(test_features,test_labels)

0.834

In [45]:
from sklearn.dummy import DummyClassifier

In [46]:
clf = DummyClassifier()

In [47]:
scores = cross_val_score(clf,train_features,train_labels)

In [49]:
print('Dummy classifier score: %0.3f(+/- %0.2f)' %(scores.mean(),scores.std()*2))

Dummy classifier score: 0.524(+/- 0.00)
