In [3]:
!pip install -q kaggle
from google.colab import files
uploaded = files.upload()

In [6]:
 ! mkdir ~/.kaggle
 ! cp kaggle.json ~/.kaggle/
 ! chmod 600 ~/.kaggle/kaggle.json

In [7]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip '/content/imdb-dataset-of-50k-movie-reviews.zip'

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 47% 12.0M/25.7M [00:00<00:00, 47.8MB/s]
100% 25.7M/25.7M [00:00<00:00, 102MB/s] 
Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [8]:
!kaggle datasets download -d abhishek/bert-base-uncased
!unzip '/content/bert-base-uncased.zip'

Downloading bert-base-uncased.zip to /content
 97% 379M/389M [00:04<00:00, 99.5MB/s]
100% 389M/389M [00:04<00:00, 97.6MB/s]
Archive:  /content/bert-base-uncased.zip
  inflating: config.json             
  inflating: pytorch_model.bin       
  inflating: vocab.txt               


In [9]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 15.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 57.6MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 51.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=e1bd3555053f915514

In [16]:
import transformers
from tqdm import tqdm
import pandas as pd
import torch

In [11]:
df_main = pd.read_csv('/content/IMDB Dataset.csv')
df_main.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [42]:
df_positive = df_main[df_main['sentiment'] == 'positive'].head(5000)
df_negative = df_main[df_main['sentiment'] == 'negative'].head(5000)
df_final = pd.concat([df_positive, df_negative])
df_final.to_csv('imdb_short.csv', index=None)

In [43]:
df = pd.read_csv('/content/imdb_short.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive
4,"Probably my all-time favorite movie, a story o...",positive


In [53]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LEN = 100
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
BERT_PATH= '/content/'
MODEL_PATH = '/content/model.bin'
TRAINING_FILE = '/content/imdb_short.csv'
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH, do_lower_case=True
)

In [18]:
import torch.nn as nn
import torch

In [19]:
class BERTBasedUncased(nn.Module):
  def __init__(self):
    super(BERTBasedUncased, self).__init__()
    self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
    self.bert_drop = nn.Dropout(0.3)
    self.out = nn.Linear(768, 1)

  def forward(self, ids, mask, token_type_ids):
    _, o2 = self.bert(
        ids, 
        attention_mask = mask,
        token_type_ids = token_type_ids, return_dict=False
    )
    #print("hello",o2)
    bo = self.bert_drop(o2)
    output = self.out(bo)
    return output

In [20]:
class BERTDataset:
  def __init__(self, review, target):
    self.review = review
    self.target = target
    self.tokenizer = TOKENIZER
    self.max_len = MAX_LEN
    
  def __len__(self):
    return len(self.review)
  
  def __getitem__(self, item):
    review = str(self.review[item])
    review = " ".join(review.split())

    inputs = self.tokenizer.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length = self.max_len,
        truncation=True
    )
    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    padding_length = self.max_len - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'target': torch.tensor(self.target[item], dtype=torch.float)
    }

In [21]:
def loss_fn(outputs, targets):
  return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def train_fn(data_loader, model, optimizer, device, scheduler):
  model.train()

  for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
    ids = d["ids"]
    token_type_ids = d["token_type_ids"]
    mask = d["mask"]
    targets = d["target"]

    ids = ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    targets = targets.to(device, dtype=torch.float)

    optimizer.zero_grad()
    outputs = model(
        ids = ids,
        mask = mask,
        token_type_ids = token_type_ids
    )

    loss = loss_fn(outputs, targets)
    loss.backward()
    optimizer.step()
    scheduler.step()

def eval_fn(data_loader, model, device):
  model.eval()
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
      ids = d['ids']
      token_type_ids = d['token_type_ids']
      mask = d['mask']
      targets = d['target']

      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.float)

      outputs = model(
          ids = ids,
          mask = mask,
          token_type_ids = token_type_ids
      )
      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
  return fin_outputs, fin_targets
    

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from transformers import AdamW, get_linear_schedule_with_warmup

In [44]:
def run():
  df = pd.read_csv(TRAINING_FILE).fillna("none")
  df.sentiment = df.sentiment.apply(
      lambda x: 1 if x == "positive" else 0
  )
  df_train, df_valid = train_test_split(
      df,
      test_size = 0.1,
      random_state=42,
      stratify=df.sentiment.values
      )
  
  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  train_dataset = BERTDataset(
      review=df_train.review.values,
      target = df_train.sentiment.values
  )
  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=TRAIN_BATCH_SIZE,
      num_workers=4
  )
  valid_dataset = BERTDataset(
      review=df_valid.review.values,
      target = df_valid.sentiment.values
      ) 
  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=VALID_BATCH_SIZE,
      num_workers=4
  )
  device = torch.device(DEVICE)
  model = BERTBasedUncased()
  model.to(device)
  
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_parameters = [
    {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.001},
    {'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0},
  ]

  num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
  optimizer = AdamW(optimizer_parameters, lr=3e-5)
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps = num_train_steps
  )

  

  best_accuracy = 0
  for epoch in range(EPOCHS):
    train_fn(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval_fn(valid_data_loader, model, device)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    print(f"Accuracy Score = {accuracy}")
    if accuracy > best_accuracy:
      torch.save(model.state_dict(), MODEL_PATH)
      best_accuracy = accuracy

    

In [38]:
def sentence_prediction(sentence, model): 
  tokenizer = TOKENIZER
  max_length = MAX_LEN
  review = str(sentence)
  review = " ".join(review.split())

  inputs = tokenizer.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length = MAX_LEN,
        truncation=True
    )
  ids = inputs["input_ids"]
  mask = inputs["attention_mask"]
  token_type_ids = inputs["token_type_ids"]

  padding_length = MAX_LEN - len(ids)
  ids = ids + ([0] * padding_length)
  mask = mask + ([0] * padding_length)
  token_type_ids = token_type_ids + ([0] * padding_length)

  ids =  torch.tensor(ids, dtype=torch.long).unsqueeze(0)
  mask = torch.tensor(mask, dtype=torch.long).unsqueeze(0)
  token_type_ids = torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0)

  ids = ids.to(DEVICE, dtype=torch.long)
  token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
  mask = mask.to(DEVICE, dtype=torch.long)
  #targets = targets.to(DEVICE, dtype=torch.float)

  outputs = model(
      ids = ids,
      mask = mask,
      token_type_ids = token_type_ids
  )

  outputs = torch.sigmoid(outputs).cpu().detach().numpy()
  return outputs[0][0]

In [54]:
if __name__=='__main__': 
  run()
  model = BERTBasedUncased()
  model.load_state_dict(torch.load(MODEL_PATH))
  model.to(DEVICE)
  sentence = "The weather is nice today"
  prediction = sentence_prediction(sentence, model)
  print(prediction)
  sentence = "The weather is bad"
  prediction = sentence_prediction(sentence, model)
  print(prediction)




  0%|          | 0/1125 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1125 [00:00<08:17,  2.26it/s][A[A[A


  0%|          | 2/1125 [00:00<06:45,  2.77it/s][A[A[A


  0%|          | 3/1125 [00:00<05:42,  3.27it/s][A[A[A


  0%|          | 4/1125 [00:00<04:59,  3.74it/s][A[A[A


  0%|          | 5/1125 [00:01<04:28,  4.17it/s][A[A[A


  1%|          | 6/1125 [00:01<04:07,  4.53it/s][A[A[A


  1%|          | 7/1125 [00:01<03:53,  4.78it/s][A[A[A


  1%|          | 8/1125 [00:01<03:43,  5.00it/s][A[A[A


  1%|          | 9/1125 [00:01<03:35,  5.17it/s][A[A[A


  1%|          | 10/1125 [00:02<03:29,  5.32it/s][A[A[A


  1%|          | 11/1125 [00:02<03:26,  5.40it/s][A[A[A


  1%|          | 12/1125 [00:02<03:23,  5.46it/s][A[A[A


  1%|          | 13/1125 [00:02<03:22,  5.50it/s][A[A[A


  1%|          | 14/1125 [00:02<03:20,  5.54it/s][A[A[A


  1%|▏         | 15/1125 [00:02<03:18,  5.58it/s][A[A[A


  1%|▏         | 16/1125 [00:03<03:18, 

Accuracy Score = 0.879





  0%|          | 0/1125 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1125 [00:00<07:47,  2.41it/s][A[A[A


  0%|          | 2/1125 [00:00<06:32,  2.86it/s][A[A[A


  0%|          | 3/1125 [00:00<05:36,  3.33it/s][A[A[A


  0%|          | 4/1125 [00:00<04:58,  3.75it/s][A[A[A


  0%|          | 5/1125 [00:01<04:30,  4.14it/s][A[A[A


  1%|          | 6/1125 [00:01<04:13,  4.42it/s][A[A[A


  1%|          | 7/1125 [00:01<04:02,  4.62it/s][A[A[A


  1%|          | 8/1125 [00:01<03:52,  4.80it/s][A[A[A


  1%|          | 9/1125 [00:01<03:45,  4.94it/s][A[A[A


  1%|          | 10/1125 [00:02<03:40,  5.05it/s][A[A[A


  1%|          | 11/1125 [00:02<03:36,  5.16it/s][A[A[A


  1%|          | 12/1125 [00:02<03:32,  5.23it/s][A[A[A


  1%|          | 13/1125 [00:02<03:29,  5.30it/s][A[A[A


  1%|          | 14/1125 [00:02<03:27,  5.35it/s][A[A[A


  1%|▏         | 15/1125 [00:03<03:26,  5.37it/s][A[A[A


  1%|▏         | 16/1125 [00:03<03:25, 

Accuracy Score = 0.863






  0%|          | 0/1125 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1125 [00:00<05:15,  3.56it/s][A[A[A


  0%|          | 2/1125 [00:00<04:54,  3.81it/s][A[A[A


  0%|          | 3/1125 [00:00<04:29,  4.17it/s][A[A[A


  0%|          | 4/1125 [00:00<04:12,  4.44it/s][A[A[A


  0%|          | 5/1125 [00:01<04:03,  4.60it/s][A[A[A


  1%|          | 6/1125 [00:01<03:56,  4.73it/s][A[A[A


  1%|          | 7/1125 [00:01<03:52,  4.82it/s][A[A[A


  1%|          | 8/1125 [00:01<03:43,  5.00it/s][A[A[A


  1%|          | 9/1125 [00:01<03:37,  5.12it/s][A[A[A


  1%|          | 10/1125 [00:02<03:34,  5.19it/s][A[A[A


  1%|          | 11/1125 [00:02<03:34,  5.19it/s][A[A[A


  1%|          | 12/1125 [00:02<03:32,  5.25it/s][A[A[A


  1%|          | 13/1125 [00:02<03:29,  5.30it/s][A[A[A


  1%|          | 14/1125 [00:02<03:27,  5.34it/s][A[A[A


  1%|▏         | 15/1125 [00:02<03:28,  5.32it/s][A[A[A


  1%|▏         | 16/1125 [00:03<03:27,

Accuracy Score = 0.857






  0%|          | 0/1125 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1125 [00:00<06:09,  3.04it/s][A[A[A


  0%|          | 2/1125 [00:00<05:27,  3.43it/s][A[A[A


  0%|          | 3/1125 [00:00<04:50,  3.86it/s][A[A[A


  0%|          | 4/1125 [00:00<04:25,  4.23it/s][A[A[A


  0%|          | 5/1125 [00:01<04:12,  4.44it/s][A[A[A


  1%|          | 6/1125 [00:01<04:04,  4.57it/s][A[A[A


  1%|          | 7/1125 [00:01<03:52,  4.80it/s][A[A[A


  1%|          | 8/1125 [00:01<03:49,  4.86it/s][A[A[A


  1%|          | 9/1125 [00:01<03:42,  5.02it/s][A[A[A


  1%|          | 10/1125 [00:02<03:40,  5.07it/s][A[A[A


  1%|          | 11/1125 [00:02<03:40,  5.06it/s][A[A[A


  1%|          | 12/1125 [00:02<03:36,  5.15it/s][A[A[A


  1%|          | 13/1125 [00:02<03:35,  5.16it/s][A[A[A


  1%|          | 14/1125 [00:02<03:32,  5.23it/s][A[A[A


  1%|▏         | 15/1125 [00:03<03:29,  5.30it/s][A[A[A


  1%|▏         | 16/1125 [00:03<03:27,

Accuracy Score = 0.862






  0%|          | 0/1125 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1125 [00:00<07:23,  2.53it/s][A[A[A


  0%|          | 2/1125 [00:00<06:19,  2.96it/s][A[A[A


  0%|          | 3/1125 [00:00<05:26,  3.44it/s][A[A[A


  0%|          | 4/1125 [00:00<04:49,  3.87it/s][A[A[A


  0%|          | 5/1125 [00:01<04:37,  4.04it/s][A[A[A


  1%|          | 6/1125 [00:01<04:19,  4.32it/s][A[A[A


  1%|          | 7/1125 [00:01<04:03,  4.59it/s][A[A[A


  1%|          | 8/1125 [00:01<03:57,  4.71it/s][A[A[A


  1%|          | 9/1125 [00:01<03:53,  4.78it/s][A[A[A


  1%|          | 10/1125 [00:02<03:54,  4.76it/s][A[A[A


  1%|          | 11/1125 [00:02<03:50,  4.83it/s][A[A[A


  1%|          | 12/1125 [00:02<03:44,  4.96it/s][A[A[A


  1%|          | 13/1125 [00:02<03:40,  5.04it/s][A[A[A


  1%|          | 14/1125 [00:02<03:38,  5.09it/s][A[A[A


  1%|▏         | 15/1125 [00:03<03:39,  5.05it/s][A[A[A


  1%|▏         | 16/1125 [00:03<03:35,

Accuracy Score = 0.863
0.84317344
0.4332454
