#### Sentiment classification with DistilBERT

In [1]:
### Import and prepare dataset

from torchtext.datasets import IMDB
# Materialise the IMDB training split into a list so it can be indexed and split.
train_ds = list(IMDB('./data/imdb/train', split='train'))

# The test split is kept both as the raw dataset object (`test_ds`) and as a
# materialised list (`test_dataset`).
test_ds = IMDB('./data/imdb/test', split='test')
test_dataset = list(test_ds)

In [2]:
from torch.utils.data.dataset import random_split
# Hold out a fixed-size validation subset and train on the remainder. The
# training size is derived from the data so the split does not raise when the
# dataset does not contain exactly 25000 samples.
imdb_valid_size = 5000
train_dataset, valid_dataset = random_split(
    train_ds, [len(train_ds) - imdb_valid_size, imdb_valid_size]
)

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
)
# num_labels=2: the classifier is configured with two output classes.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [4]:
import torch
# Module-level debugging slot for the most recently collated raw texts.
last_texts = None


def label_pipeline(label):
    """Convert a dataset label to a class index.

    Returns 1 when ``label`` equals the string ``'pos'`` and 0 for anything else.

    NOTE(review): every value other than ``'pos'`` maps to 0, so a dataset that
    encodes labels differently would silently become all-negative; confirm the
    label format of each dataset fed through here.
    """
    if label == 'pos':
        return 1
    return 0

def collate_batch(batch):
    """Collate ``(label, text)`` samples into tokenizer output and a label tensor.

    Args:
        batch: Iterable of ``(label, text)`` pairs; each ``text`` must be a ``str``.

    Returns:
        Tuple ``(encoded, labels)`` where ``encoded`` is the result of calling the
        module-level ``tokenizer`` on the batch texts (PyTorch tensors, padded,
        truncated to at most 512 tokens) and ``labels`` is a ``torch.long`` tensor
        of class indices produced by ``label_pipeline``.

    Raises:
        ValueError: If any text in the batch is not a string.

    Side effects:
        Stores the raw texts of the batch in the module-level ``last_texts`` for
        debugging. When the DataLoader uses ``num_workers > 0`` this function runs
        in worker processes, so the variable in the main process is not updated.
    """
    # Without this declaration the assignment below only bound a local name and
    # the module-level variable stayed None.
    global last_texts
    labels, texts = [], []
    for label, text in batch:
        labels.append(label_pipeline(label))
        if not isinstance(text, str):
            # Show the offending value before failing so bad rows are easy to find.
            print('Is not string')
            print(text)
            raise ValueError('Text is not string')
        texts.append(text)
    labels = torch.tensor(labels, dtype=torch.long)
    last_texts = texts
    texts = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
    return texts, labels

In [5]:
from torch.utils.data import DataLoader
batch_size = 32

# Only the IMDB test loader is built here; `train_dataloader` and
# `valid_dataloader` are created further down from the Twitter data.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
    num_workers=20,
)



In [6]:
from torch.functional import F


def train(model, dataloader, optimizer, device, progress_bar):
    """Run one training epoch and return ``(accuracy, mean_loss)``.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``;
            its output must support ``['loss']`` and ``['logits']`` lookups.
        dataloader: Iterable of ``(inputs, labels)`` batches where ``inputs``
            maps ``'input_ids'`` and ``'attention_mask'`` to tensors.
        optimizer: Optimizer whose gradients are reset and stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.
        progress_bar: Object whose ``update(1)`` is called after every batch.

    Returns:
        Tuple ``(accuracy, mean_loss)`` averaged over the samples actually
        processed, or ``(0.0, 0.0)`` when the dataloader yields no batches.
    """
    model.train()
    epoch_loss, epoch_acc = 0., 0.
    # Count the samples that are really seen instead of using
    # len(dataloader.dataset): a loader built with drop_last=True skips the
    # final partial batch, which would otherwise deflate both metrics.
    num_samples = 0
    for input_batch, labels_batch in dataloader:

        text_batch = input_batch['input_ids'].to(device)
        attn_batch = input_batch['attention_mask'].to(device)
        labels_batch = labels_batch.to(device)

        model_output = model(text_batch, attention_mask=attn_batch, labels=labels_batch)
        loss, logits = model_output['loss'], model_output['logits']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        progress_bar.update(1)
        batch_len = labels_batch.size(0)
        num_samples += batch_len
        # The loss is scaled by the batch size so the final division yields a
        # per-sample average.
        epoch_loss += loss.item() * batch_len
        epoch_acc += (torch.argmax(logits, 1) == labels_batch).float().sum().item()
    if num_samples == 0:
        return 0., 0.
    return epoch_acc/num_samples, epoch_loss/num_samples
        
        
def evaluate(model, dataloader, device):
    """Score the model on a dataloader without updating it.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``;
            its output must support ``['loss']`` and ``['logits']`` lookups.
        dataloader: Iterable of ``(inputs, labels)`` batches where ``inputs``
            maps ``'input_ids'`` and ``'attention_mask'`` to tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, mean_loss)`` averaged over the samples actually
        processed, or ``(0.0, 0.0)`` when the dataloader yields no batches.
    """
    model.eval()
    epoch_loss, epoch_acc = 0., 0.
    # Count the samples that are really seen instead of using
    # len(dataloader.dataset): a loader built with drop_last=True skips the
    # final partial batch, which would otherwise deflate both metrics.
    num_samples = 0
    with torch.no_grad():
        for input_batch, labels_batch in dataloader:

            text_batch = input_batch['input_ids'].to(device)
            attn_batch = input_batch['attention_mask'].to(device)
            labels_batch = labels_batch.to(device)

            model_output = model(text_batch, attention_mask=attn_batch, labels=labels_batch)
            loss, logits = model_output['loss'], model_output['logits']

            batch_len = labels_batch.size(0)
            num_samples += batch_len
            # The loss is scaled by the batch size so the final division yields
            # a per-sample average.
            epoch_loss += loss.item() * batch_len
            epoch_acc += (torch.argmax(logits, 1) == labels_batch).float().sum().item()
    if num_samples == 0:
        return 0., 0.
    return epoch_acc/num_samples, epoch_loss/num_samples

In [7]:
from tqdm.auto import tqdm
from torch import nn
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Last expression of the cell: moves the model and displays its architecture.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [9]:
# Set to False to skip training and load previously saved weights instead.
train_bool = True

if train_bool:
    epochs = 5
    # A single progress bar spans all epochs; train() advances it once per batch.
    progress_bar = tqdm(range(epochs*len(train_dataloader)))
    for epoch in range(epochs):
        train_acc, train_loss = train(model, train_dataloader, optimizer, device, progress_bar)
        valid_acc, valid_loss = evaluate(model, valid_dataloader, device)
        print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_acc:.4f}')
        # The checkpoint is overwritten after every epoch, so the file always
        # holds the most recent weights (not necessarily the best ones).
        torch.save(model.state_dict(), 'bert_sentiment_cls_twitter.pt')
else:
    # NOTE(review): this loads 'bert_sentiment_cls.pt' while the training branch
    # saves 'bert_sentiment_cls_twitter.pt' - confirm which checkpoint is intended.
    # map_location lets a checkpoint saved on one device load onto the current one.
    model.load_state_dict(torch.load('bert_sentiment_cls.pt', map_location=device))

  0%|          | 0/16055 [00:00<?, ?it/s]

Epoch: 0, Train Loss: 0.1449, Train Acc: 0.9413, Valid Loss: 0.0625, Valid Acc: 0.9732
Epoch: 1, Train Loss: 0.0567, Train Acc: 0.9788, Valid Loss: 0.0609, Valid Acc: 0.9736
Epoch: 2, Train Loss: 0.0383, Train Acc: 0.9864, Valid Loss: 0.0911, Valid Acc: 0.9712
Epoch: 3, Train Loss: 0.0262, Train Acc: 0.9911, Valid Loss: 0.0665, Valid Acc: 0.9774
Epoch: 4, Train Loss: 0.0184, Train Acc: 0.9934, Valid Loss: 0.0771, Valid Acc: 0.9770


In [53]:
# Debug probe: show the type of the module-level `last_texts` variable.
type(last_texts)

NoneType

In [10]:
# Score the model on the IMDB test split; the displayed tuple is (accuracy, mean loss).
evaluate(model, test_dataloader, device)

(0.6744, 2.3958536054380226)

In [8]:
import pandas as pd
#import pytorch random_split
from torch.utils.data.dataset import random_split
data_file = 'data/twitter/Twitter_Data.csv'
twitter_valid_size = 5000

# Drop rows with missing values, then discard rows whose category is 0.0 so only
# the 1.0 / -1.0 rows remain. The .copy() makes the filtered frame independent,
# so adding a column below does not trigger pandas' SettingWithCopyWarning.
twitter_df = pd.read_csv(data_file).dropna()
twitter_df = twitter_df[twitter_df['category'] != 0.0].copy()
twitter_df['cat'] = twitter_df['category'].map({1.0: 'pos', -1.0: 'neg'})

# (label, text) tuples - the sample layout that collate_batch unpacks.
twitter_list = list(zip(twitter_df['cat'], twitter_df['clean_text']))

# A seeded generator keeps the train/validation partition identical across runs,
# so a reloaded checkpoint is never validated on samples it was trained on.
train_dataset, valid_dataset = random_split(
    twitter_list,
    [len(twitter_list) - twitter_valid_size, twitter_valid_size],
    generator=torch.Generator().manual_seed(42),
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
    num_workers=20,
    drop_last=True,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
    num_workers=20,
    drop_last=True,
)


Unnamed: 0,clean_text,category,cat
0,when modi promised “minimum government maximum...,-1.0,neg
2,what did just say vote for modi welcome bjp t...,1.0,pos
3,asking his supporters prefix chowkidar their n...,1.0,pos
4,answer who among these the most powerful world...,1.0,pos
8,with upcoming election india saga going import...,1.0,pos
...,...,...,...
162972,engine growth modi unveils indias first 12000 ...,1.0,pos
162973,modi promised 2014 lok sabha elections that be...,1.0,pos
162975,why these 456 crores paid neerav modi not reco...,-1.0,neg
162976,dear rss terrorist payal gawar what about modi...,-1.0,neg


In [29]:
# NOTE(review): `twitter_l1` is not defined anywhere in the visible notebook;
# this cell relies on a variable from a deleted or unseen cell - confirm its source.
# Uses the shared `batch_size` so this loader stays consistent with the others.
test_twitter_dl = DataLoader(
    twitter_l1,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
    num_workers=20,
)

In [9]:
# Score the model on the Twitter test loader; evaluate() returns (accuracy, mean loss).
# The cell that defines `test_twitter_dl` must be run first, otherwise this raises NameError.
# NOTE(review): the meaning of the numbers in the next comment is not evident from the notebook.
#01 8000 110211
evaluate(model, test_twitter_dl, device)

NameError: name 'test_twitter_dl' is not defined

In [39]:
# Spot-check: display the `clean_text` of the row at integer position 34578.
twitter_df.iloc[34578]['clean_text']

'many feel that kumar epitomised the ‘real bjp’ with its merits and demerits his relationship with modi the years that followed marks the difference between the bjp under atal bihari vajpayee advani and joshi and under modi and shah writes '