In [5]:
import tensorflow as tf

In [6]:
!pip install sklearn
!pip install matplotlib
!pip install transformers
!pip install tqdm
!pip install pandas
!pip install numpy
!pip install torch



In [7]:
!pip install ipywidgets



In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1080'

In [10]:
df = pd.read_csv("./IMDB Dataset.csv", delimiter=',')

In [11]:
df.shape

(49999, 2)

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
sentences = df.review.values

In [14]:
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]

In [15]:
labels = df.sentiment.values
labels = [1 if label == 'positive' else 0 for label in labels]

In [16]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [17]:
tokenized_texts = [tokenizer.tokenize(sentence) for sentence in sentences]

In [18]:
print (tokenized_texts[0])

['▁one', '▁of', '▁the', '▁other', '▁reviewer', 's', '▁has', '▁mentioned', '▁that', '▁after', '▁watching', '▁just', '▁1', '▁', 'oz', '▁episode', '▁you', "'", 'll', '▁be', '▁hooked', '.', '▁they', '▁are', '▁right', ',', '▁as', '▁this', '▁is', '▁exactly', '▁what', '▁happened', '▁with', '▁me', '.', '<', 'br', '▁', '/', '>', '<', 'br', '▁', '/', '>', 'the', '▁first', '▁thing', '▁that', '▁struck', '▁me', '▁about', '▁', 'oz', '▁was', '▁its', '▁brutality', '▁and', '▁un', 'fli', 'nch', 'ing', '▁scenes', '▁of', '▁violence', ',', '▁which', '▁set', '▁in', '▁right', '▁from', '▁the', '▁word', '▁go', '.', '▁trust', '▁me', ',', '▁this', '▁is', '▁not', '▁a', '▁show', '▁for', '▁the', '▁faint', '▁', 'hearted', '▁or', '▁timid', '.', '▁this', '▁show', '▁pulls', '▁no', '▁punches', '▁with', '▁regards', '▁to', '▁drugs', ',', '▁sex', '▁or', '▁violence', '.', '▁its', '▁is', '▁hardcore', ',', '▁in', '▁the', '▁classic', '▁use', '▁of', '▁the', '▁word', '.', '<', 'br', '▁', '/', '>', '<', 'br', '▁', '/', '>', 'it',

In [19]:
MAX_LEN = 128

In [20]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [21]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [22]:
input_ids

array([[  65,   20,   18, ...,  167, 3151,  669],
       [  24, 3239,  293, ..., 1062,   21, 2062],
       [  17,  150,  449, ...,   65,   20, 3302],
       ...,
       [  17,  150,  569, ...,   27,  116, 5182],
       [  17,  150,   26, ...,   19,   33,  106],
       [ 116,   65, 5883, ...,   74,  248,   52]])

In [23]:
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [24]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=1029, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=1029, test_size=0.2)

In [25]:
# split the data set into training, validation and test set
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [33]:
batch_size = 8

In [34]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [35]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [36]:
model.cuda()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [37]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [38]:
# This variable contains all of the hyperparemeter information and the learning rate.
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

In [39]:
train_loss_set = []

epochs = 10

for _ in trange(epochs, desc="Epoch"):
  # Set our model to training mode (as opposed to evaluation mode)
    model.train()
  
  # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss_set.append(loss.item())    

        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
    
    
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:  10%|███████▏                                                                | 1/10 [19:13<2:53:01, 1153.45s/it]

Train loss: 0.3051495655851439


Epoch:  20%|██████████████▍                                                         | 2/10 [38:28<2:33:57, 1154.67s/it]

Train loss: 0.20469167952947318


Epoch:  30%|█████████████████████▌                                                  | 3/10 [57:25<2:13:43, 1146.21s/it]

Train loss: 0.13702648311513477


Epoch:  40%|████████████████████████████                                          | 4/10 [1:16:00<1:53:24, 1134.15s/it]

Train loss: 0.0898479012012016


Epoch:  50%|███████████████████████████████████                                   | 5/10 [1:34:48<1:34:19, 1131.84s/it]

Train loss: 0.06475000863778987


Epoch:  60%|██████████████████████████████████████████                            | 6/10 [1:53:32<1:15:16, 1129.19s/it]

Train loss: 0.05123644689866633


Epoch:  70%|██████████████████████████████████████████████████▍                     | 7/10 [2:12:10<56:16, 1125.38s/it]

Train loss: 0.044708164950605714


Epoch:  80%|█████████████████████████████████████████████████████████▌              | 8/10 [2:30:48<37:26, 1123.09s/it]

Train loss: 0.04086637857187597


Epoch:  90%|████████████████████████████████████████████████████████████████▊       | 9/10 [2:49:30<18:42, 1122.86s/it]

Train loss: 0.0381726009403239


Epoch: 100%|███████████████████████████████████████████████████████████████████████| 10/10 [3:08:08<00:00, 1128.89s/it]

Train loss: 0.03348584321711096





In [40]:
torch.save(model.state_dict(), './xlnet_model_1029_100_epoch.ckpt')

In [41]:
with torch.no_grad():
    correct = 0
    total = 0
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        # print (outputs)
        prediction = torch.argmax(outputs[0],dim=1)
        total += b_labels.size(0)
        correct+=(prediction==b_labels).sum().item()

In [42]:
print('Test Accuracy of the model on vla data is: {} %'.format(100 * correct / total))

Test Accuracy of the model on vla data is: 88.71 %
