# BERT Classifier - Implement a Logistic Regression classifier using BERT embeddings

In [1]:
# Install the google-colab package, which provides the `google.colab` module imported in the next cell
!pip install google-colab



In [2]:
# Colab Setup
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# After downloading the shared starting point folder as a Zip
# Unzip it and re-upload it to a location on your GDrive

# This command copies the contents from the folder you uploaded to GDrive, to the colab working dir
!cp -r /content/drive/My\ Drive/ProjectoRI2020 /content

# Add working dir to the sys path, so that we can find the aux python files when running the Notebook
import sys
if not '/content/ProjectoRI2020' in sys.path:
  sys.path += ['/content/ProjectoRI2020']

# Finally install required dependencies to run the notebook
!pip install elasticsearch
!pip install bert-serving-client
!pip install transformers

Mounted at /content/drive
Collecting elasticsearch
[?25l  Downloading https://files.pythonhosted.org/packages/86/74/054342aa07121f7c82e30ae63e3f257a793a69ddc11c5065449252dcd8af/elasticsearch-7.10.1-py2.py3-none-any.whl (322kB)
[K     |████████████████████████████████| 327kB 9.6MB/s 
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.10.1
Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 7.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhos

In [18]:
# Imports
import TRECCASTeval as trec
import numpy as np
import ElasticSearchSimpleAPI as es
from sklearn.utils import resample
import pandas as pd
import pickle

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# Get the interactive Tools for Matplotlib
%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from transformers import BertTokenizer, BertTokenizerFast, BertModel
from bert_serving.client import BertClient
import torch

# Instantiate the project helper from the ElasticSearchSimpleAPI module
# (presumably a thin Elasticsearch client wrapper — confirm in that module)
elastic = es.ESSimpleAPI()
# Instantiate the TRECCASTeval helper; its `test_topics` and `relevance_judgments`
# attributes are read by the cells below
test_bed = trec.ConvSearchEvaluation()

## BERT embeddings

### Initialization, constants, training set

In [4]:
# Checkpoint used for the tokenizer (and intended for the encoder as well).
# 'bert-base-cased' was tried first; the nboost MS MARCO checkpoint below replaced it.
bert_model_name = 'nboost/pt-bert-base-uncased-msmarco'
CLS_token = "[CLS]"
SEP_token = "[SEP]"

# Get relevance judgments from training set.
# Work on a copy: scaling the frame owned by `test_bed` in place would divide the
# labels again every time this cell is re-run.
train_relevant = test_bed.relevance_judgments.copy()

# Convert relevance labels to 0-1 (assumes the raw scale is 0-4 — TODO confirm)
train_relevant['rel'] = train_relevant['rel'] / 4

In [7]:
# Collect one ("<conversation>_<turn>", raw utterance) pair per turn of every test topic
turn_records = [
    (f"{topic['number']}_{turn['number']}", turn['raw_utterance'])
    for topic in test_bed.test_topics
    for turn in topic['turn']
]

# Parallel lists of ids / utterances, plus an id -> utterance lookup
turn_ids = [turn_key for turn_key, _ in turn_records]
utterances = [text for _, text in turn_records]
dic = dict(turn_records)

# Tabular view of the same data, persisted for later use
test = pd.DataFrame({'topic_turn': turn_ids, 'utterance': utterances})

test.to_csv("/content/ProjectoRI2020/results/topic_turn_test.csv")

### Pre-processing

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the encoder from the same checkpoint name used for the tokenizer,
# so the two cannot drift apart
model = BertModel.from_pretrained(bert_model_name, return_dict=True)
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=408.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437982986.0, style=ProgressStyle(descri…




In [9]:
# inspired by: https://github.com/nyu-dl/dl4marco-bert/blob/e6fdc7fd13ade0a8d7cde8c232529c8542a93549/tokenization.py

def convert_to_bert_input(sentences, max_seq_length, tokenizer, add_cls, return_tensors="pt"):
    """Pack one or more sentences into a single BERT input sequence.

    Each sentence is tokenized with a trailing [SEP]; the i-th sentence gets
    token type id ``i``. When ``add_cls`` is true a [CLS] token (type id 0) is
    prepended.

    Args:
        sentences: iterable of strings, in the order they should be packed.
        max_seq_length: length budget; sequences longer than
            ``max_seq_length - 3`` tokens are cut to that length, keeping the
            final [SEP].
        tokenizer: object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        add_cls: whether to prepend the [CLS] token.
        return_tensors: ``"pt"`` returns ``torch.long`` tensors of shape
            ``(1, seq_len)`` placed on the module-level ``device``; any other
            value returns plain Python lists.

    Returns:
        dict with keys ``"input_ids"``, ``"token_type_ids"`` and
        ``"attention_mask"``, all of the same length.
    """

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0   0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector).
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector".

    # Tokenize every sentence together with its closing [SEP]
    sentences_tokens = [tokenizer.tokenize(s + SEP_token) for s in sentences]

    # Flatten tokens and build the matching segment id for each token
    tokens = [tok for sent in sentences_tokens for tok in sent]
    token_type_ids = [seg for seg, sent in enumerate(sentences_tokens) for _ in sent]

    # Only account for [CLS] in the type ids when it is actually added;
    # otherwise the two lists would differ in length by one
    if add_cls:
        tokens = [CLS_token] + tokens
        token_type_ids = [0] + token_type_ids

    # Remove tokens if max_seq_length is exceeded
    # Account for [CLS] and [SEP] with "- 3"
    if len(tokens) > max_seq_length - 3:
        # Keep the final [SEP] and the segment id that belongs to it
        tokens = tokens[:max_seq_length - 4] + [tokens[-1]]
        token_type_ids = token_type_ids[:max_seq_length - 4] + [token_type_ids[-1]]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Create Attention mask -> every token is real (no padding), so BERT attends to all
    attention_mask = [1] * len(tokens)

    if return_tensors == "pt":
        # Wrap in a list to add the batch dimension (batch size 1)
        input_ids = torch.tensor([input_ids], dtype=torch.long, device=device)
        token_type_ids = torch.tensor([token_type_ids], dtype=torch.long, device=device)
        attention_mask = torch.tensor([attention_mask], dtype=torch.long, device=device)

    data = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask
    }

    return data

In [12]:
# Import triplets from file
triplets = pd.read_csv('/content/ProjectoRI2020/results/triplets.csv')

# Tokenize query/passage pairs
max_length = 512  # length budget passed to convert_to_bert_input
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)

# One dict of model inputs (input_ids / token_type_ids / attention_mask) per triplet row
inputs_qa = triplets.apply(lambda x : convert_to_bert_input( [ x["Query"] , x["Passage"]], max_length, tokenizer, True, return_tensors="pt"), axis=1)

# Encode every pair and keep the hidden state at position 0 (the [CLS] token).
# Inference only: no_grad avoids building an autograd graph for each forward pass.
cls_embeddings = []
with torch.no_grad():
    for bert_input in inputs_qa:
        cls_embeddings.append(model(**bert_input).last_hidden_state[:,0,:].cpu().numpy()[0])

# Same index as the inputs, so outputs_qa[i] still lines up with triplets row i
outputs_qa = pd.Series(cls_embeddings, index=inputs_qa.index)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=58.0, style=ProgressStyle(description_w…




In [13]:
# Stack the per-pair embeddings into one (n_pairs, embedding_dim) float64 matrix.
# The row count comes from the data instead of a hard-coded 1535.
n_pairs = len(outputs_qa)
features = np.stack([outputs_qa[i] for i in range(n_pairs)]).astype(np.float64)

# Standardize each embedding dimension to zero mean / unit variance
means = np.mean(features,axis=0)
stdevs = np.std(features,axis=0)
stdevs[stdevs == 0] = 1.0  # a constant dimension would otherwise divide by zero
features = (features-means)/stdevs
# NOTE(review): `means`/`stdevs` are not persisted here; anything scoring new
# pairs with the saved classifier needs the same standardization — confirm.

# Get labels
labels = triplets['rel']

# Re-sample with stratification (5x the original size, with replacement)
# NOTE(review): duplicates created here can land in both the train and the
# validation fold of the cross-validation below, which inflates validation scores.
features, labels = resample(features, labels, n_samples=n_pairs*5, replace=True, stratify=labels, random_state=0)
labels = labels.to_numpy()

### Cross-validation to choose best parameters for logistic regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split

In [15]:
# Stratified k-fold cross-validation over the regularisation strength C
folds = 4
kf = StratifiedKFold(n_splits=folds)
cvalue = [0.0001, 0.001, 0.05, 0.07, 0.01, 0.1, ]
print('cvalue: ', cvalue)
# NOTE: despite the names, these lists hold mean *accuracies* (the value
# returned by LogisticRegression.score), not error rates
errorTrain = []
errorValidation = []
for C in cvalue:

    # Running sums of the per-fold scores for this C
    tr_err = va_err = 0
    for tr_ix,va_ix in kf.split(features,labels):

        # Get current train and valid feature matrixes
        X_train = features[tr_ix]
        X_valid = features[va_ix]
        # Get current train and valid classifications
        y_train = labels[tr_ix]
        y_valid = labels[va_ix]

        # Fit on the training fold and score both folds
        reg = LogisticRegression(random_state=0, C=C, tol=0.000001, max_iter=1000000, class_weight='balanced', solver='liblinear')
        reg.fit(X_train , y_train)
        fold_va = reg.score(X_valid, y_valid)
        fold_tr = reg.score(X_train, y_train)
        print('C,', C,fold_tr, fold_va)

        # Accumulate, so that dividing by `folds` below yields the mean over
        # all folds rather than a quarter of the last fold's score
        va_err += fold_va
        tr_err += fold_tr

    # Add mean scores to correspondent list
    errorValidation.append(va_err/folds)
    errorTrain.append(tr_err/folds)

print('errorValidation:', errorValidation)
print('errorTrain: ', errorTrain)

cvalue:  [0.0001, 0.001, 0.05, 0.07, 0.01, 0.1]
C, 0.0001 0.7124739402362752 0.7128712871287128
C, 0.0001 0.711952744961779 0.7003647733194373
C, 0.0001 0.7138637943015983 0.7003647733194373
C, 0.0001 0.7121764808059753 0.7122002085505735
C, 0.001 0.8200138985406532 0.815528921313184
C, 0.001 0.8078526754690758 0.795205836373111
C, 0.001 0.8141070187630299 0.8082334549244398
C, 0.001 0.8224769845405593 0.7977059436913452
C, 0.05 0.9477067407922168 0.9244398124022929
C, 0.05 0.9444058373870744 0.9260031266284523
C, 0.05 0.9444058373870744 0.9275664408546118
C, 0.05 0.9454577036651034 0.9191866527632951
C, 0.07 0.957088255733148 0.9338196977592496
C, 0.07 0.9517025712300209 0.9353830119854091
C, 0.07 0.9527449617790132 0.928087545596665
C, 0.07 0.9503213479242661 0.9249217935349322
C, 0.01 0.9007991660875608 0.8764981761334029
C, 0.01 0.9021890201528839 0.8895257946847316
C, 0.01 0.9025364836692147 0.8858780614903595
C, 0.01 0.9006426958485322 0.8753910323253389
C, 0.1 0.9623002084781098

In [19]:
# Fit the final LR model on the full (re-sampled) feature set, using C=0.001 and
# max_iter=100000 (this setting gave better results afterwards)
reg = LogisticRegression(random_state=0, C=0.001, max_iter=100000, solver='liblinear')
reg.fit(features, labels)

# Save classifier to pickle; the context manager closes the file even if dumping fails
with open('/content/ProjectoRI2020/pickles/bert_classifier2.pkl', 'wb') as filehandler:
    pickle.dump(reg, filehandler)