**Installing All Necessary Libraries**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The CSV paths used further down ('drive/MyDrive/IMDB/...') are relative,
# so they presumably rely on the working directory being /content — TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the runtime dependencies (versions are NOT pinned here).
# NOTE(review): the config printed further down reports transformers 4.11.2, and later
# cells import `transformers.AdamW` and `datasets.load_metric`; newer releases may no
# longer ship these names — consider pinning versions for reproducibility (TODO confirm).
!pip3 install torch numpy datasets transformers   --quiet

[K     |████████████████████████████████| 270 kB 27.6 MB/s 
[K     |████████████████████████████████| 2.9 MB 53.4 MB/s 
[K     |████████████████████████████████| 125 kB 59.4 MB/s 
[K     |████████████████████████████████| 52 kB 1.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 53.6 MB/s 
[K     |████████████████████████████████| 243 kB 71.8 MB/s 
[K     |████████████████████████████████| 895 kB 35.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 43.3 MB/s 
[K     |████████████████████████████████| 636 kB 41.0 MB/s 
[K     |████████████████████████████████| 294 kB 74.1 MB/s 
[K     |████████████████████████████████| 160 kB 70.4 MB/s 
[?25h

**Importing all required functions and classes**

In [None]:
from transformers import AutoModelForSequenceClassification,BertModel
from torch.utils.data import DataLoader,Dataset,IterableDataset
from transformers import AdamW
from transformers import get_scheduler
import torch


**We load the pretrained `bert-base-cased` tokenizer with `AutoTokenizer`, and we also download the pretrained BERT model in order to look at its config.**

In [None]:
from transformers import AutoTokenizer

# Tokenizer shared by the rest of the notebook (the dataset class and the smoke test call it).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# Load the bare encoder here only to print its configuration
# (hidden size, dropout probabilities, vocabulary size, ...).
bert_model = BertModel.from_pretrained('bert-base-cased')
print(bert_model.config)





Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



## Dataloader

**This is the custom dataset class. It inherits from the torch `Dataset` class and overrides the required methods (`__len__` and `__getitem__`). The `get_csv` method reads the rows from the CSV file, and the `prepare` method calls the tokenizer on each text review; the inputs are finally returned in the format the model expects.**

In [None]:
import csv
from itertools import cycle



class IMDb_loader(Dataset):
  """Map-style dataset of tokenized IMDb reviews read from a CSV file.

  Every CSV row is expected to carry the review text in column 0 and the label
  ("0" or "1") in column 1. Rows whose second column is anything else (for
  example a header row) and rows with fewer than two columns (for example
  blank lines) are skipped. All usable rows are tokenized eagerly when the
  dataset is constructed.
  """

  def __init__(self,name,tokenizer_fn=None):
    """Read the CSV at ``name`` and tokenize every usable row.

    Args:
      name: path of the CSV file to load.
      tokenizer_fn: optional callable invoked as
        ``tokenizer_fn(text, padding='max_length', truncation=True)`` and
        returning a mapping of name -> list of ints. When omitted, the
        notebook-level ``tokenizer`` is used.
    """
    self.tokenizer_fn = tokenizer_fn
    self.rows = self.get_csv(name)
    self.data = []
    self.len = 0  # kept for backward compatibility; mirrors len(self.data)
    self.prepare()

  def get_csv(self,file_path):
    """Return every row of the CSV file at ``file_path`` as a list of lists of strings."""
    with open(file_path,'r',newline='') as cfile:
      return list(csv.reader(cfile,delimiter=','))

  def __len__(self):
    """Number of tokenized examples."""
    # Derived from the data itself so it can never be stale or undefined.
    return len(self.data)

  def __getitem__(self,idx):
    """Return the example at ``idx``: the tokenizer's tensors plus an int ``'labels'`` entry."""
    return self.data[idx]

  def prepare(self):
    """Tokenize ``self.rows`` into ``self.data``, skipping rows that are not labelled examples."""
    # Fall back to the notebook-level tokenizer when none was injected.
    tokenize = self.tokenizer_fn if self.tokenizer_fn is not None else tokenizer

    for row in self.rows:
      # Blank lines come back from csv.reader as [] and would raise IndexError on row[1].
      if len(row) < 2:
        continue
      # Anything that is not a "0"/"1" label (e.g. the header row) is not an example.
      if row[1] not in ["1","0"]:
        continue
      encoded = tokenize(row[0],padding='max_length',truncation=True)
      example = {k:torch.tensor(v) for k,v in encoded.items()}
      example['labels'] = int(row[1])
      self.data.append(example)

    self.len = len(self.data)



**We build the dataset for each of the train, dev (validation) and test CSV files, print the total number of records in each of them, and wrap each one in a `DataLoader`.**

In [None]:

# Tokenize the three splits up front (each constructor reads and tokenizes its whole CSV).
train_datax = IMDb_loader('drive/MyDrive/IMDB/data/Train.csv')
eval_datax = IMDb_loader('drive/MyDrive/IMDB/data/Valid.csv')
test_datax = IMDb_loader('drive/MyDrive/IMDB/data/Test.csv')

# Report how many usable records each split holds.
for split in (train_datax, eval_datax, test_datax):
    print(len(split))

# All three loaders share the same batching options.
loader_options = dict(shuffle=True, batch_size=8)
train_dataloaderx = DataLoader(train_datax, **loader_options)
eval_dataloaderx = DataLoader(eval_datax, **loader_options)
test_dataloader = DataLoader(test_datax, **loader_options)

40000
5000
5000


## Model

**This is the custom model class. It inherits from `nn.Module` and initializes the BERT model, a dense layer and a dropout layer. The `forward` function defines how the inputs are processed and in which order the layers are called: BERT's pooled output is passed through the dropout layer and then the dense layer, after which the loss is computed; the loss and the logits are returned.**




In [None]:
from torch import nn
from torch.nn import CrossEntropyLoss

class BertForIMDB(nn.Module):
  """BERT encoder followed by a dropout + linear classification head."""

  def __init__(self,name,num_labels,dropout,trained=False):
    """Build the encoder and the classification head.

    Args:
      name: checkpoint name or path handed to ``BertModel.from_pretrained``.
      num_labels: number of output classes of the dense layer.
      dropout: dropout probability applied to the pooled output; ``None``
        falls back to the encoder config's ``hidden_dropout_prob``.
      trained: currently unused; kept so existing call sites keep working.
    """
    super(BertForIMDB, self).__init__()
    self.num_labels = num_labels
    self.model = BertModel.from_pretrained(name)
    self.dense = nn.Linear(self.model.config.hidden_size,num_labels)
    # Honour the caller-supplied rate instead of silently ignoring the argument.
    drop_prob = self.model.config.hidden_dropout_prob if dropout is None else dropout
    self.dropout = nn.Dropout(drop_prob)

  def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,):
      """Run the encoder and the head.

      Returns:
        A ``(loss, logits)`` tuple. ``loss`` is the cross-entropy against
        ``labels``, or ``None`` when no labels are given (plain inference).
      """
      outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

      # Second element of the encoder output is used as the pooled representation.
      pooled_outputs = outputs[1]

      pooled_outputs = self.dropout(pooled_outputs)
      logits = self.dense(pooled_outputs)

      # labels defaults to None, so the loss must be optional rather than crash.
      loss = None
      if labels is not None:
        loss_fn = CrossEntropyLoss()
        loss = loss_fn(logits.view(-1,self.num_labels),labels.view(-1))
      return loss,logits

# Smoke test: push one hand-written sentence through the model and print (loss, logits).
bert_my = BertForIMDB('bert-base-cased',2,0.1)
inputs = tokenizer("hello, I have bought a New Car",padding='max_length',truncation=True)
print(inputs.keys())
inputs['labels']=[0]
# Turn every field into a tensor with a leading batch dimension of 1.
inputs = {
    key: torch.tensor(value).unsqueeze(0)
    for key, value in inputs.items()
}
print(bert_my(**inputs))

# torch.save(bert_my.state_dict(), 'drive/MyDrive/IMDB/test.pt')



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
(tensor(0.9703, grad_fn=<NllLossBackward>), tensor([[0.1833, 0.6773]], grad_fn=<AddmmBackward>))


**Preparing the optimizer and the scheduler, and printing the number of training steps the model will perform, the total number of epochs and the batch size.**

In [None]:
from transformers import AdamW
from transformers import get_scheduler
# Training hyper-parameters.
num_epochs = 1
batch_size = 8

optimizer = AdamW(bert_my.parameters(),lr=5e-5)

# One scheduler step per optimizer step: batches per epoch times epochs.
num_training_steps = len(train_dataloaderx)*num_epochs

# Linear decay of the learning rate over the whole run, without warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps,num_epochs,batch_size)


5000 1 8


**Shifting the model to CUDA**

In [None]:
import torch

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
bert_my.to(device)
print(device)

cuda


**This is the training loop. Here we take the prepared batched inputs, pass them through the model, calculate the loss, backpropagate the gradients, and save the loss and accuracy for future visualization. After each epoch the model is evaluated on the validation split.**

In [None]:
from tqdm.auto import tqdm
from datasets import load_metric
import itertools



# Fine-tune `bert_my` for `num_epochs` epochs, logging the per-step loss and the
# running training accuracy, then report F1 on the validation loader after each epoch.
bert_my.train()
metric = load_metric('f1')
step=0
running_corrects = 0
samples_seen = 0  # actual number of examples processed so far
logs = {}
losses = []
accs =[]
progress_bar = tqdm(range(num_training_steps))


for _ in range(num_epochs):
  bert_my.train()
  for batch in train_dataloaderx:
    step+=1
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = bert_my(**batch)
    loss,logits = outputs
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

    preds = torch.argmax(logits,dim=-1)
    running_corrects += (preds == batch['labels']).sum().item()
    # Divide by the examples actually seen rather than step*batch_size: the
    # latter is wrong for a smaller final batch or if the loader's batch size
    # differs from the `batch_size` variable.
    samples_seen += batch['labels'].size(0)
    accuracy = running_corrects/samples_seen
    logs['step']=step
    logs['log loss'] = loss.item()
    logs['accuracy'] = accuracy

    losses.append(loss.item())
    accs.append(accuracy)

    print(logs)
    progress_bar.update(1)

  # Validation pass: no gradients, dropout disabled via eval().
  bert_my.eval()
  print("performing eval...>")
  for ebatch in eval_dataloaderx:
    batch = {k: v.to(device) for k, v in ebatch.items()}
    with torch.no_grad():
      outputs = bert_my(**batch)

    _,logits = outputs
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

  print(metric.compute())
print("training complete")
# torch.save(bert_my.state_dict(),'drive/MyDrive/IMDB/mybert/saved_3.pt')

  0%|          | 0/5000 [00:00<?, ?it/s]

{'step': 1, 'log loss': 0.6407774686813354, 'accuracy': 0.75}
{'step': 2, 'log loss': 0.7802416682243347, 'accuracy': 0.5625}
{'step': 3, 'log loss': 0.7236126065254211, 'accuracy': 0.5}
{'step': 4, 'log loss': 0.7639678716659546, 'accuracy': 0.5}
{'step': 5, 'log loss': 0.6835905909538269, 'accuracy': 0.5}
{'step': 6, 'log loss': 0.6443446278572083, 'accuracy': 0.5416666666666666}
{'step': 7, 'log loss': 0.8907490372657776, 'accuracy': 0.48214285714285715}
{'step': 8, 'log loss': 0.6509594321250916, 'accuracy': 0.5}
{'step': 9, 'log loss': 0.6403478980064392, 'accuracy': 0.5277777777777778}
{'step': 10, 'log loss': 0.8023250699043274, 'accuracy': 0.5125}
{'step': 11, 'log loss': 0.614784836769104, 'accuracy': 0.5227272727272727}
{'step': 12, 'log loss': 0.7423601746559143, 'accuracy': 0.5208333333333334}
{'step': 13, 'log loss': 0.5169732570648193, 'accuracy': 0.5384615384615384}
{'step': 14, 'log loss': 0.7427998781204224, 'accuracy': 0.5267857142857143}
{'step': 15, 'log loss': 0.58

## Inference block

**Loading the trained model from storage so that it can be used for evaluation or inference, and preparing the inputs**

In [None]:
from torch import nn
from torch.nn import CrossEntropyLoss
import torch
from transformers import AdamW
from transformers import get_scheduler


# Pick the device first so the checkpoint can be mapped straight onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the three splits and their loaders for evaluation.
train_datax = IMDb_loader('data/Train.csv')
eval_datax = IMDb_loader('data/Valid.csv')
test_datax = IMDb_loader('data/Test.csv')


train_dataloaderx = DataLoader(train_datax,shuffle=True,batch_size=8)
eval_dataloaderx = DataLoader(eval_datax,shuffle=True,batch_size=8)
test_dataloader = DataLoader(test_datax,shuffle=True,batch_size=8)



# Re-create the architecture, then restore the fine-tuned weights.
my_trained =  BertForIMDB('bert-base-cased',2,0.1)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime too;
# without it torch.load tries to restore tensors onto the device they were saved from.
my_trained.load_state_dict(torch.load('best_model.pt',map_location=device))
my_trained.to(device)

**Evaluating Result on the TEST split**

In [None]:
# Score the restored model on the TEST split (F1 and accuracy).
from datasets import load_metric
# Imported here as well so the inference section does not depend on the training cell having run.
from tqdm.auto import tqdm

my_trained.eval()
f1_metric = load_metric('f1')
accuracy = load_metric('accuracy')
# tqdm needs an iterable (or total=...), not a bare int; mirror the other evaluation cells.
progress_bar = tqdm(range(len(test_dataloader)))

print("performing eval...>")
step = 0 
for ebatch in test_dataloader:
    batch = {k: v.to(device) for k, v in ebatch.items()}
    with torch.no_grad():
        outputs = my_trained(**batch)
    step+=1
    _,logits = outputs
    predictions = torch.argmax(logits, dim=-1)
    f1_metric.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    # Advance the bar once per batch; previously it was never updated.
    progress_bar.update(1)

print("results on Test split")
print(f1_metric.compute())
print(accuracy.compute())
# torch.save(model.state_dict(),'drive/MyDrive/IMDB/models/saved.pt')

performing eval...>
results on Test split
{'f1': 0.9371131962467559}
{'accuracy': 0.937}


**Evaluating the model on VALIDATION/DEV split**

In [None]:
# Score the restored model on the validation/dev split (F1 and accuracy).
my_trained.eval()
f1_metric = load_metric('f1')
accuracy = load_metric('accuracy')
progress_bar = tqdm(range(len(eval_dataloaderx)))
print("performing eval...>")
step = 0
for ebatch in eval_dataloaderx:
    # Move every tensor of the batch onto the evaluation device.
    batch = {key: tensor.to(device) for key, tensor in ebatch.items()}
    with torch.no_grad():
        outputs = my_trained(**batch)
    step += 1
    _, logits = outputs
    predictions = logits.argmax(dim=-1)
    # Feed the same predictions to both metrics.
    for scorer in (f1_metric, accuracy):
        scorer.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)

print("results on evaluation split")
for scorer in (f1_metric, accuracy):
    print(scorer.compute())

  0%|          | 0/625 [00:00<?, ?it/s]

performing eval...>
results on evaluation split
{'f1': 0.9338146811070999}
{'accuracy': 0.934}


**Evaluating the model on TRAIN split**

In [None]:
# Score the restored model on the TRAIN split (F1 and accuracy).
my_trained.eval()
f1_metric = load_metric('f1')
accuracy = load_metric('accuracy')
progress_bar = tqdm(range(len(train_dataloaderx)))
print("performing eval...>")
step = 0
for ebatch in train_dataloaderx:
    # Move every tensor of the batch onto the evaluation device.
    batch = {key: tensor.to(device) for key, tensor in ebatch.items()}
    with torch.no_grad():
        outputs = my_trained(**batch)
    step += 1
    _, logits = outputs
    predictions = logits.argmax(dim=-1)
    # Feed the same predictions to both metrics.
    for scorer in (f1_metric, accuracy):
        scorer.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)

print("results on train split")
for scorer in (f1_metric, accuracy):
    print(scorer.compute())

  0%|          | 0/5000 [00:00<?, ?it/s]

performing eval...>
results on train split
{'f1': 0.9692083584488649}
{'accuracy': 0.96935}
