In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import torch
import transformers
import inspect
import time
import logging

from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui
from datetime import datetime
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_constant_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


'GeForce RTX 2080 Ti'

# Load data & pre-processing

In [2]:
def preprocessing(df):
    """
    Preprocessing step
    As above dataframe heads, there is a lot of <br /> character 
    """
    df.sentence = df.sentence.str.replace('<br />','')
    df.sentence = df.sentence.str.lower()
    return df

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train = preprocessing(train)
test = preprocessing(test)

print('Train data:\n{}\n\nTest data:\n{}'.format(train.head(5), test.head(5)))

Train data:
                                            sentence  sentiment  polarity
0  this is a very bland and inert production of o...          2         0
1  i've seen this film in avant-premiere at imagi...          7         1
2  revolt of the zombies (2 outta 5 stars) no, th...          4         0
3  may contain minor spoilers.dressed to kill, ha...          7         1
4  (spoilers)i shoulda figured. the dvd didn't ev...          2         0

Test data:
                                            sentence  sentiment  polarity
0  i loved this movie so much. i'm a big fan of a...         10         1
1  the stark, cold landscape of big sky country, ...          9         1
2  this cheapo exploitation flick is some genuine...          2         0
3  this movie has been promoting in everywhere in...          1         0
4  this is a great off-the-wall romantic comedy a...          8         1


# Tokenization & Create inputs for model

In [15]:
class BertModelBonz():
    def __init__(self, model='bert-base-uncased', max_len=512, batch_size=6):
        self.pre_trained_model = model
        self.max_len = max_len
        self.batch_size = batch_size
        self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_model)
        self.tokenizer.max_len = max_len
        # Setting model
        self.model = BertForSequenceClassification.from_pretrained(self.pre_trained_model)
        self.model.cuda()
        self.optimizer = AdamW(params = self.model.parameters(), lr=1e-5)
    
    def create_ids(self, sentences):
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) #Disable tokenizer logs, it's really annoy
        input_ids = []
        for sen in tqdm_notebook(sentences):
            tmp = self.tokenizer.encode(sen)
            input_ids.append(tmp)
        input_ids = pad_sequences(input_ids, 
                                  maxlen=self.max_len, 
                                  dtype='int64', 
                                  truncating='post', 
                                  padding='post')
        return input_ids
    
    def prepare_data(self, input_ids, input_labels=None):
        input_ids = torch.tensor(self.create_ids(input_ids))
        if input_labels is None:
            return DataLoader(TensorDataset(input_ids), 
                              batch_size=self.batch_size)
        else:
            input_labels = torch.tensor(input_labels)
            return DataLoader(TensorDataset(input_ids, input_labels), 
                              batch_size=self.batch_size)
        
    def flat_accuracy(self, preds, labels):
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)
    
    def train(self, dataloader, epochs=4):
        self.train_loss_set =[]
        for _ in trange(epochs, desc="Epoch"):
            # Training model
            self.model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for input_ids, input_labels in tqdm_notebook(dataloader):
                self.optimizer.zero_grad()
                loss = self.model(input_ids=input_ids.cuda(), labels=input_labels.cuda())[0]
                self.train_loss_set.append(loss)    
                loss.backward()
                self.optimizer.step()
                
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
            print("Train loss: {}".format(tr_loss/nb_tr_steps))

            # Evaluation
            self.model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_labels in dataloader:
                with torch.no_grad():
                    logits = self.model(input_ids.cuda())[0]
                logits = logits.detach().cpu().numpy()
                label_ids = input_labels.to('cpu').numpy()
                tmp_eval_accuracy = self.flat_accuracy(logits, label_ids)
                eval_accuracy += tmp_eval_accuracy
                nb_eval_steps += 1

            print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

        


In [16]:
bert_model = BertModelBonz()
train.dataloader = bert_model.prepare_data(input_ids=train['sentence'], input_labels=train['polarity'])

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




In [17]:
bert_model.train(train.dataloader)

Epoch:   0%|                                                                                     | 0/4 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, max=4167), HTML(value='')))


Train loss: 0.2487156836560498
Validation Accuracy: 0.9609631229501715


Epoch:  25%|██████████████████▎                                                      | 1/4 [31:07<1:33:21, 1867.15s/it]

HBox(children=(IntProgress(value=0, max=4167), HTML(value='')))


Train loss: 0.12560456689024305
Validation Accuracy: 0.9865610751139933


Epoch:  50%|███████████████████████████████████▌                                   | 2/4 [1:02:17<1:02:15, 1868.00s/it]

HBox(children=(IntProgress(value=0, max=4167), HTML(value='')))


Train loss: 0.06755735430474685
Validation Accuracy: 0.9911207103431747


Epoch:  75%|██████████████████████████████████████████████████████▊                  | 3/4 [1:33:26<31:08, 1868.54s/it]

HBox(children=(IntProgress(value=0, max=4167), HTML(value='')))


Train loss: 0.04795780858196882
Validation Accuracy: 0.9886809055275604


Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 4/4 [2:04:38<00:00, 1869.53s/it]


In [18]:
torch.save(bert_model.model, 'bert_eb1024_1e5_e4.pth')

In [146]:
"""
Constructs a BertTokenizer.
:class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece

Args:
    vocab_file: Path to a one-wordpiece-per-line vocabulary file
    do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
    max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
        minimum of this value (if specified) and the underlying BERT model's sequence length.
    never_split: List of tokens which will never be split during tokenization. Only has an effect when
        do_wordpiece_only=False
"""
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 512
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) #Disable tokenizer logs, it's really annoy

def create_ids(sentences):
    input_ids = []
    for sen in tqdm_notebook(sentences):
        tmp = tokenizer.encode(sen)
        #tmp = tokenizer.build_inputs_with_special_tokens(tmp)
        input_ids.append(tmp)
    input_ids = pad_sequences(input_ids, 
                              maxlen=MAX_LEN, 
                              dtype='int64', 
                              truncating='post', 
                              padding='post')
    return input_ids

"""
def create_attention_masks(input_ids):
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    return attention_masks
"""

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_)
tokenizer.max_len = 1024

input_ids = create_ids(train['sentence'])

#attention_masks = create_attention_masks(input_ids)

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))










 73%|██████████████████████████████████████████████████████▋                    | 18220/25000 [01:15<00:20, 325.19it/s]
























Exception ignored in: <function tqdm.__del__ at 0x000001CDA78D0400>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 931, in __del__
    self.close()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 1133, in close
    self._decr_instances(self)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 496, in _decr_instances
    cls.monitor.exit()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tqdm\_monitor.py", line 52, in exit
    self.join()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 1029, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread





# Build Model

In [241]:
# Need to convert to torch.tensor since Pytorch only accept that inputs :|

train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(train.polarity)

"""
# This is for testing only, please comment it if un-needed

train_inputs = train_inputs[:20]
train_labels = train_labels[:20]

"""


In [242]:
BATCH_SIZE = 3

train_data = TensorDataset(train_inputs,
                           train_labels)

train_dataloader = DataLoader(train_data,  
                              batch_size=BATCH_SIZE)


In [247]:
train_dataloader = DataLoader(TensorDataset(train_inputs), batch_size=3)
for t in train_dataloader:
    print(t[0])
    print(t[1])

tensor([[  101,  2023,  2003,  ...,     0,     0,     0],
        [  101,  1045,  1005,  ...,     0,     0,     0],
        [  101, 10073,  1997,  ...,     0,     0,     0]])


IndexError: list index out of range

In [204]:
config = BertConfig(max_position_embeddings=1024)

In [177]:
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) #default with 512
model = BertForSequenceClassification(config) #test 1024 embedding
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [178]:
param = model.parameters()

"""
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
"""

optimizer = AdamW(params = param, lr=2e-5)

#scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)

In [180]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for batch in tqdm_notebook(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids, labels=b_labels)[0]
        train_loss_set.append(loss)    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()


        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in train_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
          # Forward pass, calculate logit predictions
          logits = model(b_input_ids)[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    
#torch.save(model, 'bert_2e5_e4.pth')


Epoch:   0%|                                                                                     | 0/4 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, max=8334), HTML(value='')))





Train loss: 0.704124375137174
Validation Accuracy: 0.4999600031997444


Epoch:  25%|██████████████████▎                                                      | 1/4 [39:58<1:59:56, 2398.86s/it]

HBox(children=(IntProgress(value=0, max=8334), HTML(value='')))


Train loss: 0.7008407298633742
Validation Accuracy: 0.4999600031997444


Epoch:  50%|███████████████████████████████████▌                                   | 2/4 [1:19:56<1:19:57, 2398.59s/it]

HBox(children=(IntProgress(value=0, max=8334), HTML(value='')))


Train loss: 0.7007369220914788
Validation Accuracy: 0.4999600031997444


Epoch:  75%|██████████████████████████████████████████████████████▊                  | 3/4 [1:59:48<39:56, 2396.49s/it]

HBox(children=(IntProgress(value=0, max=8334), HTML(value='')))


Train loss: 0.69972404903049
Validation Accuracy: 0.4999600031997444


Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 4/4 [2:39:46<00:00, 2396.88s/it]


# Predict test data

In [10]:
model = bert_model.model
tokenizer = bert_model.tokenizer

In [21]:
def create_ids(sentences):
    input_ids = []
    for sen in tqdm_notebook(sentences):
        tmp = bert_model.tokenizer.encode(sen)
        #tmp = tokenizer.build_inputs_with_special_tokens(tmp)
        input_ids.append(tmp)
    input_ids = pad_sequences(input_ids, 
                              maxlen=512, 
                              dtype='int64', 
                              truncating='post', 
                              padding='post')
    return input_ids


# Prepare test data

test_ids = create_ids(test.sentence)






HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




In [22]:
test_inputs = torch.tensor(test_ids)
test_labels = torch.tensor(test.polarity)

test_dataset = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=6)
# Prediction on test set

# Put model in evaluation mode
bert_model.model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in tqdm_notebook(test_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = bert_model.model(b_input_ids)[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

HBox(children=(IntProgress(value=0, max=4167), HTML(value='')))




In [23]:
pred = [j for i in predictions for j in i]
pred = np.argmax(pred, axis=1)

In [24]:
print(classification_report(pred, test.polarity))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92     11488
           1       0.97      0.89      0.93     13512

   micro avg       0.92      0.92      0.92     25000
   macro avg       0.92      0.93      0.92     25000
weighted avg       0.93      0.92      0.92     25000



In [83]:
config = BertConfig()
#config.output_hidden_states=True
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}