In [1]:

!pip install pytorch_pretrained_bert pytorch-nlp

Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch_pretrained_bert)
  Downloading boto3-1.34.84-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=0.4.1->pytorch_pretrained_bert)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=0.4.1->pytorch_pretrained_bert)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64

In [2]:
import sys
import numpy as np
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from torchnlp.datasets import imdb_dataset
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

In [3]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

# Prepare the Data

In [5]:
train_data, test_data = imdb_dataset(train=True, test=True)
rn.shuffle(train_data)
rn.shuffle(test_data)
train_data = train_data[:1000]
test_data = test_data[:100]

In [43]:
train_data[999]

{'text': "Thomas Archer (Ron Eldard) has his child killed and his wife viciously attacked in a home invasion. Dr. Heller (Christopher Plummer) tries to help him through the post traumatic stress. Then Archer finds himself confronted with a man (Til Schweiger) bound and gagged to a chair. He is told this is the man who killed his child and attacked his wife and he can do whatever he likes to him. And there's a large assortment of instruments there to help him...<br /><br />Film is interesting at first (and shows real restraint in terms of blood and gore) but gets stupider by the minute and has some highly unlikely plot twists and turns. It all ends in a final twist that was so old and stupid that I was shocked anyone would actually think of using it anymore. How such talented actors like Eldard, Schweiger and Plummer got involved in crap like this is beyond me. This gets three stars for the acting but the stupid plot and truly unbelievable twists make this a chore to sit through.",
 'se

In [42]:
test_data[99]

{'text': "It's initial premise is based on the American Civil War but it's ultimately a love story. We start at the beginning of the war where the main characters (Kidman & Law) are obviously aware of each other and there's an obvious attraction, they have a passionate kiss on the day he leaves for the war. The main thrust of this film is for Law's character to return to Kidman's and his struggles to achieve that and her struggles to survive until he returns. The reason it fails to convince is that we don't see enough of this relationship before Law's character leaves for battle - it's difficult to believe the premise that 2 people yearn for each other so much given they've had so little contact. Everything else is just about fine, Renee Zellweger and her incumbent father and his entourage are lovely additions as is the threat from the gang chasing deserters. Sure it's a long film but it does hold the interest and the cinematography is great. An honourable attempt that doesn't quite ma

In [6]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(1000, 1000, 100, 100)

In [46]:
train_texts[999]

"Thomas Archer (Ron Eldard) has his child killed and his wife viciously attacked in a home invasion. Dr. Heller (Christopher Plummer) tries to help him through the post traumatic stress. Then Archer finds himself confronted with a man (Til Schweiger) bound and gagged to a chair. He is told this is the man who killed his child and attacked his wife and he can do whatever he likes to him. And there's a large assortment of instruments there to help him...<br /><br />Film is interesting at first (and shows real restraint in terms of blood and gore) but gets stupider by the minute and has some highly unlikely plot twists and turns. It all ends in a final twist that was so old and stupid that I was shocked anyone would actually think of using it anymore. How such talented actors like Eldard, Schweiger and Plummer got involved in crap like this is beyond me. This gets three stars for the acting but the stupid plot and truly unbelievable twists make this a chore to sit through."

In [48]:
train_labels[999]

'neg'

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# BERT is trained using the WordPiece tokenizations; it means it can be broken down into more than one sub-words.

100%|██████████| 231508/231508 [00:00<00:00, 1233288.54B/s]


In [8]:
tokenizer.tokenize('Hi my name is Dima')


['hi', 'my', 'name', 'is', 'dim', '##a']

In [9]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))
# since max sequence size for BERT IS 512, truncate any review that is longer than this

len(train_tokens), len(test_tokens)

(1000, 100)

In [51]:
train_tokens[999]

['[CLS]',
 'thomas',
 'archer',
 '(',
 'ron',
 'el',
 '##dar',
 '##d',
 ')',
 'has',
 'his',
 'child',
 'killed',
 'and',
 'his',
 'wife',
 'vicious',
 '##ly',
 'attacked',
 'in',
 'a',
 'home',
 'invasion',
 '.',
 'dr',
 '.',
 'heller',
 '(',
 'christopher',
 'plum',
 '##mer',
 ')',
 'tries',
 'to',
 'help',
 'him',
 'through',
 'the',
 'post',
 'traumatic',
 'stress',
 '.',
 'then',
 'archer',
 'finds',
 'himself',
 'confronted',
 'with',
 'a',
 'man',
 '(',
 'til',
 'sc',
 '##h',
 '##weig',
 '##er',
 ')',
 'bound',
 'and',
 'gag',
 '##ged',
 'to',
 'a',
 'chair',
 '.',
 'he',
 'is',
 'told',
 'this',
 'is',
 'the',
 'man',
 'who',
 'killed',
 'his',
 'child',
 'and',
 'attacked',
 'his',
 'wife',
 'and',
 'he',
 'can',
 'do',
 'whatever',
 'he',
 'likes',
 'to',
 'him',
 '.',
 'and',
 'there',
 "'",
 's',
 'a',
 'large',
 'assortment',
 'of',
 'instruments',
 'there',
 'to',
 'help',
 'him',
 '.',
 '.',
 '.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'film',
 'is',
 'intere

In [None]:
# # convert each token in each review to an id as present in the tokenizer vocab.
# train_token_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
# test_token_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))

# # pad 0 so we have same size of 512
# train_token_ids = pad_sequences(train_token_ids, maxlen=512, truncating='post', padding='post', dtype='int')
# test_token_ids = pad_sequences(test_token_ids, maxlen=512, truncating='post', padding='post', dtype='int')

# Below code, in one code

In [10]:
train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, test_tokens_ids.shape

((1000, 512), (100, 512))

In [53]:
train_tokens_ids[999]

array([  101,  2726, 11024,  1006,  6902,  3449,  7662,  2094,  1007,
        2038,  2010,  2775,  2730,  1998,  2010,  2564, 13925,  2135,
        4457,  1999,  1037,  2188,  5274,  1012,  2852,  1012, 25038,
        1006,  5696, 22088,  5017,  1007,  5363,  2000,  2393,  2032,
        2083,  1996,  2695, 19686,  6911,  1012,  2059, 11024,  4858,
        2370, 12892,  2007,  1037,  2158,  1006, 18681,  8040,  2232,
       27204,  2121,  1007,  5391,  1998, 18201,  5999,  2000,  1037,
        3242,  1012,  2002,  2003,  2409,  2023,  2003,  1996,  2158,
        2040,  2730,  2010,  2775,  1998,  4457,  2010,  2564,  1998,
        2002,  2064,  2079,  3649,  2002,  7777,  2000,  2032,  1012,
        1998,  2045,  1005,  1055,  1037,  2312, 26285,  1997,  5693,
        2045,  2000,  2393,  2032,  1012,  1012,  1012,  1026,  7987,
        1013,  1028,  1026,  7987,  1013,  1028,  2143,  2003,  5875,
        2012,  2034,  1006,  1998,  3065,  2613, 19355,  1999,  3408,
        1997,  2668,

In [11]:
train_y = np.array(train_labels) == 'pos'
test_y = np.array(test_labels) == 'pos'
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)
#our target variable is currently a list of neg and pos strings. convert it to numpy arrays of booleans


((1000,), (100,), 0.489, 0.5)

In [56]:
train_y

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False, False,  True, False,  True, False, False, False,
       False,  True, False,  True,  True,  True, False, False,  True,
       False, False, False,  True,  True,  True, False, False, False,
        True, False,  True, False,  True,  True,  True, False, False,
        True, False, False,  True,  True, False, False,  True,  True,
        True, False,  True,  True,  True,  True, False,  True, False,
        True, False,  True, False, False, False, False,  True, False,
        True, False,  True,  True,  True, False,  True, False,  True,
       False, False, False,  True,  True, False, False, False, False,
        True, False,  True,  True, False,  True, False,  True, False,
       False,  True,  True,  True, False,  True, False, False,  True,
        True, False,  True, False,  True,  True, False,  True, False,
       False, False, False,  True, False, False,  True, False, False,
       False, False,

In [12]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]


In [58]:
train_masks[0]

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [60]:
len(train_masks[0])

512

In [61]:
len(train_masks)

1000

# Baseline

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [14]:
baseline_model = make_pipeline(CountVectorizer(ngram_range=(1,3)), LogisticRegression()).fit(train_texts, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
baseline_predicted = baseline_model.predict(test_texts)

In [16]:
print(classification_report(test_labels, baseline_predicted))

              precision    recall  f1-score   support

         neg       0.82      0.82      0.82        50
         pos       0.82      0.82      0.82        50

    accuracy                           0.82       100
   macro avg       0.82      0.82      0.82       100
weighted avg       0.82      0.82      0.82       100



# Bert Model

In [17]:

#Ech model in PyTorch is a nn.Module object. Every model we built must provide 2 methods.
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        # declare BERT model that we will fine tune, the linear layer, the sigmoid actiavtion
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    #Forward method is the actual code that runs during the forward pass(like the predict method in sklearn or keras)
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba
    # here we take the token input and pass it to the BERT model. The output of BERT is 2 variables.
    # we use only the second one (the _name is used to emphasize that this variable is not used)
    # take the pooled output and pass it to the linear layer.
    # sigmoid activate

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [19]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [20]:
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.cuda() # GPU

100%|██████████| 407873900/407873900 [00:11<00:00, 35883556.70B/s]


In [21]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [22]:

x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape


(torch.Size([3, 512]), torch.Size([3, 512, 768]), torch.Size([3, 768]))

In [23]:
y = bert_clf(x)
y.cpu().detach().numpy()

array([[0.40444466],
       [0.49784023],
       [0.36928013]], dtype=float32)

In [24]:

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'5799.899648M'

In [25]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'5799.899648M'

# Fine-tune BERT

In [26]:

BATCH_SIZE = 4
EPOCHS = 10

In [27]:

train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'447.584768M'

In [28]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [29]:
param_optimizer = list(bert_clf.sigmoid.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [30]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [31]:
torch.cuda.empty_cache()

In [32]:

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        logits = bert_clf(token_ids, masks)

        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()


        bert_clf.zero_grad()
        batch_loss.backward()


        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))


Epoch:  10
249/250.0 loss: 0.04551121397386305 


In [33]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()

        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])


In [34]:

np.mean(bert_predicted)


0.46

In [35]:

print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.89      0.96      0.92        50
        True       0.96      0.88      0.92        50

    accuracy                           0.92       100
   macro avg       0.92      0.92      0.92       100
weighted avg       0.92      0.92      0.92       100

