In [133]:
import numpy as np
import pandas as pd
import warnings
import datetime
import re

import torch # a tensor library
# the huggingface transformers library (pre-trained deep learning for NLP models)
# run !pip install transformers in a Jupyter Notebook cell
import transformers as ppb 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

Load the dataset into a pandas dataframe

In [134]:
train_data = pd.read_csv("../w266_project/data/train.csv")
test_data = pd.read_csv("../w266_project/data/test.csv")
df = pd.concat([train_data,test_data])
df['len_txt'] =df.cleaned_contents.apply(lambda x: len(x.split()))
df = train_raw[train_raw.len_txt >249]
df = train_raw[train_raw.len_txt <20000]
df = df[['cleaned_contents', 'Discrimination_Label']]
df = df.rename(columns = {'cleaned_contents':'text', 'Discrimination_Label':'label'})
#lower case to help remove dates
df['text'] = df['text'].str.lower()
#remove dates
df['text'] = pd.Series(re.sub(r'(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)([\s]{1,3})?([0-9]{1,2})(.{1,3})?((,)|(.))?([\s]{1,3})?([0-9]{4})|([0-9]{1,2})(.{1,3})?([\s]{1,3})?(day)?([\s]{1,3})?(of)?([\s]{1,3})?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)((,)|(.))?(\s{1,3})?([0-9]{4})|(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first)([\s]{1,3})?(day)?([\s]{1,3})?(of)?([\s]{1,3})?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)((,)|(.))?(\s{1,3})?([0-9]{4})|(\b[0-9]{1,2}(\-|\/)[0-9]{1,2}(\-|\/)[0-9]{2,4}\b)|(\b[0-9]{2,4}(\-|\/)[0-9]{1,2}(\-|\/)[0-9]{1,2}\b)', '[DATE]', i) for i in df['text'])
#remove special character
df['text'] = pd.Series(re.sub("'", "", i) for i in df['text'])
df['text'] = pd.Series(re.sub("(\\W)+", " ", i) for i in df['text'])

df.reset_index(inplace=True, drop=True)
df.head()


Unnamed: 0,text,label
0,sentence 1 you are charged as follows first co...,0
1,sentence 1 josefa kotobalavu you were charged ...,1
2,sentence 1 the director of public prosecution ...,1
3,sentence 1 mohommed nabi ud dean you were conv...,1
4,judgment of the court background 1 the appella...,0


In [135]:
len(df['text'][0])

10184

In [136]:
# REduce to 512 tokens - does not work as some sentences are still longer)
#df['start_txt'] = df['text'].apply(lambda x: ' '.join(x.split()[:512]))

Load the pre-trained BERT model and Bert tokenizer
vocab, config, and model files will be downloaded from https://s3.amazonaws.com/models.huggingface.co/bert/ to /home/rdadmin/.cache/torch/transformers/

In [137]:


# we need a BERT model and a BERT tokenizer
# initialize the empty model and tokenizer objects
# we are going to use the Hugging Face's DistilBert model
BERT_model_class,BERT_tokenizer_class,BERT_pre_trained_weights = (ppb.DistilBertModel, # the pre-trained DistillBERT model
                                                                  ppb.DistilBertTokenizer,
                                                                  'distilbert-base-cased') # the type of DistilBERT model

# use the next line instead, if you want (Google's) BERT instead of DistillBERT
#BERT_model_class,BERT_tokenizer_class,BERT_pre_trained_weights = (ppb.BertModel, ppb.BertTokenizer,'bert-base-uncased')

# Load pretrained model/tokenizer weights/values for the desired type of BERT model into their respective objects
tokenizer = BERT_tokenizer_class.from_pretrained(BERT_pre_trained_weights)

#model1 is a pytorch BERT model
model1 = BERT_model_class.from_pretrained(BERT_pre_trained_weights)




Before DistilBERT can process this as input, we’ll need to make all the vectors the same size by padding 
shorter sentences with the token id 0. After the padding, we have a matrix/tensor that is ready to be passed to BERT:

In [140]:
tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512)))
# This turns every sentence into a list of IDs
# tokenized is apandas Series object: <class 'pandas.core.series.Series'>
print(df.head())
print('\n')
print(tokenized.head())
print('\n')
print(tokenized.shape) #(6920,) a 1D pandas Series
print(type(tokenized)) #<class 'pandas.core.series.Series'>

                                                text  label
0  sentence 1 you are charged as follows first co...      0
1  sentence 1 josefa kotobalavu you were charged ...      1
2  sentence 1 the director of public prosecution ...      1
3  sentence 1 mohommed nabi ud dean you were conv...      1
4  judgment of the court background 1 the appella...      0


0    [101, 5650, 122, 1128, 1132, 4601, 1112, 3226,...
1    [101, 5650, 122, 179, 6787, 8057, 180, 12355, ...
2    [101, 5650, 122, 1103, 1900, 1104, 1470, 12369...
3    [101, 5650, 122, 182, 10559, 4165, 4611, 9468,...
4    [101, 9228, 1104, 1103, 2175, 3582, 122, 1103,...
Name: text, dtype: object


(806,)
<class 'pandas.core.series.Series'>


In [141]:
print(f'tokenized.values.shape: {tokenized.values.shape}')
# find the length of the longest sentence in the dataset
max_len = 0
for i in tokenized.values:  #tokenized.values is of type #<class 'numpy.ndarray'>
    if len(i) > max_len:
        max_len = len(i)
print(f'max sentence length is : {max_len}')
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
print(f'padded.shape: {padded.shape}')

tokenized.values.shape: (806,)
max sentence length is : 512
padded.shape: (806, 512)


In [142]:
print(tokenized[0])
print('\n')
print(padded[0])
print(f'\nlen(padded[0]): {len(padded[0])}')

[101, 5650, 122, 1128, 1132, 4601, 1112, 3226, 1148, 5099, 4195, 1104, 16273, 9372, 11565, 1106, 4886, 18513, 1105, 4214, 1104, 1103, 8228, 1348, 3463, 6707, 1542, 2440, 1116, 1104, 16273, 14516, 16996, 20991, 3687, 1161, 1206, 1103, 141, 13821, 2036, 1106, 141, 13821, 2036, 1120, 9468, 6738, 6851, 1491, 191, 2980, 7563, 6094, 1742, 27629, 25247, 1161, 1107, 1103, 2466, 2417, 1125, 26079, 1610, 7050, 3044, 1104, 183, 1179, 1443, 1123, 9635, 1248, 5099, 4195, 1104, 16273, 9372, 11565, 1106, 2237, 21606, 122, 1105, 123, 170, 1104, 1103, 6969, 11903, 3140, 1104, 1371, 2440, 1116, 1104, 16273, 14516, 16996, 20991, 3687, 1161, 1206, 1103, 141, 13821, 2036, 1106, 1103, 141, 13821, 2036, 1120, 9468, 6738, 6851, 1491, 191, 2980, 7563, 6094, 1742, 27629, 25247, 1161, 1107, 1103, 2466, 2417, 23937, 1103, 191, 27547, 1605, 1104, 183, 1179, 1114, 1117, 21504, 1443, 1123, 9635, 122, 1113, 141, 13821, 2036, 1128, 12404, 5425, 1106, 1241, 4917, 1222, 1128, 1105, 4120, 1103, 14940, 1104, 9193, 1113, 1

Masking - If we directly send padded to BERT, that would slightly confuse it. We need to create another
variable to tell it to ignore (mask) the padding we've added when it's processing its input. That's what
attention_mask is:

Tokenize the dataset

In [143]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(806, 512)

In [144]:
print(attention_mask[0])
print(attention_mask[1])

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

converting from numpy.ndarray to pytorch tensor

In [145]:
#create a tensor for the attention_mask
attention_mask = torch.tensor(attention_mask)
#We now create an input tensor out of the padded token matrix, and send that to DistilBERT
input_ids = torch.tensor(padded) 
#the model() function runs our sentences through BERT. The results of the processing will be returned into last_hidden_states.

start_time = datetime.datetime.now()
with torch.no_grad(): #deactivates autograd engine
    last_hidden_states = model1(input_ids, attention_mask=attention_mask) #transformers.modeling_distilbert.DistilBertModel
    # we could also simply do this
    #last_hidden_states = model1(input_ids)
print(f'BERT Model Runtime: {datetime.datetime.now() - start_time}')

BERT Model Runtime: 0:19:38.247636


After running this step, last_hidden_states holds the outputs of DistilBERT. It is a tuple with the shape
(number of examples, max number of tokens in the sequence, number of hidden units in the DistilBERT model).
In our case, this will be 806 , 512 (which is the number of tokens in the longest sequence from the 806 examples),
768 (the number of hidden units in the DistilBERT model). T

In [146]:
type(last_hidden_states) # tuple - the model output is a tuple
type(last_hidden_states[0]) # torch.Tensor - First element of that tuple is the output tensor 
last_hidden_states[0].shape # 806 X 512 X 768 cube

torch.Size([806, 512, 768])

Let's slice only the part of the output that we need. That is the output corresponding the first token of each sentence.
The way BERT does sentence classification, is that it adds a token called [CLS] (for classification) at the beginning
of every sentence. The output corresponding to that token can be thought of as an embedding for the entire sentence.
so we select that slice of the cube and discard everything else.

In [147]:
#cube slicing
# [:,0,:] means we need all rows,column 0 (first output - for the CLS token),all depth of the cube (all hidden units)
#also convert back from torch tensors to numpy because sklearn works with numpy.ndarray and not tensors
features = last_hidden_states[0][:,0,:].numpy() # features for the logistic regression model - look at the image

print(type(features))
print(features.shape) # a slice of the cube depth (3D) is 2D - (806, 768)
# we have 806  sentences and so we will have 806 sentence embeddings
# each sentence embedding will have the size of the hidden units for each token in the last layer of BERT, which is 768

<class 'numpy.ndarray'>
(806, 768)


In [148]:
labels = df['label']

print(type(labels))
print(len(labels)) #806 labels as expected 

<class 'pandas.core.series.Series'>
806


In [149]:
#MODEL # 2 - train/test split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42, shuffle=True) # split is 20% test and 80% train

In [150]:
type(train_features) #numpy.ndarray
print(f'train_features.shape: {train_features.shape}') # 75%
print(f'test_features.shape: {test_features.shape}') # 25%
print(f'train_labels.shape: {train_labels.shape}')
print(f'test_labels.shape: {test_labels.shape}')


train_features.shape: (644, 768)
test_features.shape: (162, 768)
train_labels.shape: (644,)
test_labels.shape: (162,)


## Second Model  - sklearn Classifier

In [151]:
from sklearn.model_selection import GridSearchCV
parameters = {'C': np.linspace(0.0001, 100, 20)} #try 20 values between 1e-4 and 1e+2
print(f'parameters: {parameters}')
grid_search = GridSearchCV(LogisticRegression(), parameters)

begin_time = datetime.datetime.now()
grid_search.fit(train_features, train_labels)
print(f'Grid Search time: {datetime.datetime.now() - begin_time}')

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

parameters: {'C': array([1.00000000e-04, 5.26325263e+00, 1.05264053e+01, 1.57895579e+01,
       2.10527105e+01, 2.63158632e+01, 3.15790158e+01, 3.68421684e+01,
       4.21053211e+01, 4.73684737e+01, 5.26316263e+01, 5.78947789e+01,
       6.31579316e+01, 6.84210842e+01, 7.36842368e+01, 7.89473895e+01,
       8.42105421e+01, 8.94736947e+01, 9.47368474e+01, 1.00000000e+02])}
Grid Search time: 0:00:03.620024
best parameters:  {'C': 5.263252631578947}
best scrores:  0.5978561046511628


train a sklearn model with the best parameter values

In [152]:
# train a logistic regression model
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
start_time = datetime.datetime.now()
lr_clf.fit(train_features, train_labels)
print(f'Logistic Regression Model Runtime: {datetime.datetime.now() - start_time}')

Logistic Regression Model Runtime: 0:00:00.046943


In [153]:
# evaluate MODEL #2
lr_clf.score(test_features, test_labels)

0.6172839506172839

In [154]:
#How good is this score? What can we compare it against? Let's first look at a dummy classifier:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.511 (+/- 0.08)
