# Assignment3 with BERT

## Task 1: Exploratory Data Analysis and Preprocessing

In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [7]:
dataset = pd.read_csv("dataset_for_bert.csv")

In [8]:
dataset.head()

Unnamed: 0,Text,Label
0,The food at snack is a selection of popular Gr...,0
1,Snack is great place for a Â casual sit down l...,0
2,Love this place! Â Try the Chicken sandwich or...,0
3,My friend and I were intrigued by the nightly ...,0
4,"For lunch, my friend and I had: -Lamb sandwich...",1


In [22]:
dataset["Text"]=dataset["Text"].astype(str)

In [9]:
dataset.Label.value_counts()

1    84008
0     6898
Name: Label, dtype: int64

In [10]:
dataset.Label.value_counts()

1    84008
0     6898
Name: Label, dtype: int64

## Task 2: Training/Validation Split

In [11]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_val, y_train, y_val = train_test_split(
    dataset.index.values,
    dataset.Label.values,
    test_size = 0.2,
    random_state = 17,
    stratify = dataset.Label.values
)

In [14]:
dataset["data_type"]=["not_set"] * dataset.shape[0]

In [15]:
dataset.loc[X_train,"data_type"] = "train"
dataset.loc[X_val,"data_type"] = "val"

In [17]:
dataset.groupby(["Label","data_type"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Text
Label,data_type,Unnamed: 2_level_1
0,train,5518
0,val,1380
1,train,67205
1,val,16802


In [42]:
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')

In [158]:
test_dataframe.head()

Unnamed: 0,id,review
0,1,Human Hurricane!: Would you like to sleep in t...
1,2,A Mom: I bought this with all kinds of expecta...
2,3,Good Read: I judge all books that I read by a ...
3,4,It's awesome: DVD set is exactly what you'd bu...
4,5,Great Movie!!!: This definatly the best Godzil...


In [43]:
test_dataframe["label"]=0

## Task 3: Loading Tokenizer and Encoding our Data

In [18]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [19]:
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased",
    do_lower_case = True
)

In [23]:
encoded_data_train = tokenizer.batch_encode_plus(
    dataset[dataset["data_type"] == "train"].Text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    dataset[dataset["data_type"] == "val"].Text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

In [None]:
encoded_data_test = tokenizer.batch_encode_plus(
    test_dataframe.Text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

In [24]:
input_ids_train  = encoded_data_train["input_ids"]
attention_mask_train = encoded_data_train["attention_mask"]
labels_train  =  torch.tensor(dataset[dataset["data_type"]=="train"].Label.values)

In [25]:
input_ids_val  = encoded_data_val["input_ids"]
attention_mask_val = encoded_data_val["attention_mask"]
labels_val  =  torch.tensor(dataset[dataset["data_type"]=="val"].Label.values)

In [26]:
dataset_train = TensorDataset(input_ids_train,
                            attention_mask_train,  
                            labels_train)
dataset_val = TensorDataset(input_ids_val,
                            attention_mask_val,  
                            labels_val)

In [27]:
len(dataset_train)

72724

In [28]:
len(dataset_val)

18182

## Task 4: Setting up BERT Pretrained Model

In [29]:
from transformers import BertForSequenceClassification

In [30]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False            
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

## Task 5: Creating Data Loaders

In [31]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [32]:
batch_size = 4 #4 
dataloader_train = DataLoader(
    dataset_train,
    sampler = RandomSampler(dataset_train),
    batch_size = batch_size
)
dataloader_val = DataLoader(
    dataset_val,
    sampler = RandomSampler(dataset_val),
    batch_size = batch_size
)

## Task 6: Setting Up Optimizer and Scheduler

In [33]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [34]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)

In [35]:
epochs =10 
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs

)


## Task 7: Defining our Performance Metrics

In [36]:
import numpy as np

In [37]:
from sklearn.metrics import f1_score

In [38]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds,axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average="weighted")

In [62]:
for label in range(2):
    print (label)

0
1


In [39]:
def accuracy_per_class(preds, labels):
    #label_dict_inverse = {v: k for k,v in label_dict.items()}
    
    preds_flat = np.argmax(preds,axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
   #     print(f'class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

## Task 8: Creating our Training Loop

In [40]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [41]:
device = torch.device('cuda'if torch.cuda.is_available else 'cpu')
model.to(device)

print(device)

cuda


In [42]:
def evaluate(dataloader_val):

    model.eval()
    
    predictions, true_vals = [], []
    loss_val_total =0
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [43]:
def predict(dataloader_val):

    model.eval()
    
    predictions, true_vals = [], []
    #review_id = []
    for batch in dataloader_val:
     #   reviewid = batch["id"]

        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],

                 }
        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
       # loss_val_total += loss.item()
    #    _,logits = torch.max(outputs)
        logits = logits.detach().cpu().numpy()
    #    label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
    #    review_id.append(reviewid)
    #    true_vals.append(label_ids)
    
    #loss_val_avg = loss_val_total/len(dataloader_val) 
   # predictions = torch.stack(predictions).cpu()
  #  reviewids = torch.stack(review_id).cpu()
    predictions = np.concatenate(predictions, axis=0)
   # true_vals = np.concatenate(true_vals, axis=0)
            
    return predictions


In [44]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train,
                        desc = 'Epoch {:1d}'.format(epoch),
                        leave = False,
                        disable = False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs={
            'input_ids'      :batch[0],
            'attention_mask':batch[1],
            'labels'        :batch[2]
        }
        
        output= model(**inputs)
        
        loss = output[0]
        loss_train_total += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss':'{:.3f}'.format(loss.item()/len(batch))})
        
    torch.save(model.state_dict(),f'Models/BERT_ft_epoch{epoch}.model')
    tqdm.write('\nEpoch {epoch}')
        
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss:{loss_train_avg}')
    
    val_loss, predictions,true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions,true_vals)
    tqdm.write(f'Valisdation loss:{val_loss}')
    tqdm.write(f'F1 Score (weighted):{val_f1}')
    

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.3274706305983516
Valisdation loss:0.2954352231697989
F1 Score (weighted):0.8877558777791131


Epoch 2:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.30757380781136473
Valisdation loss:0.32050324577379896
F1 Score (weighted):0.8962896632229957


Epoch 3:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.2750112107815792
Valisdation loss:0.36964633153859866
F1 Score (weighted):0.8990723174467825


Epoch 4:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.22127421619915885
Valisdation loss:0.48491282010079845
F1 Score (weighted):0.9022631657442864


Epoch 5:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.1481775517642015
Valisdation loss:0.6278637153181097
F1 Score (weighted):0.8973896347547512


Epoch 6:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.0937353299418662
Valisdation loss:0.6814263397905882
F1 Score (weighted):0.9005479448009565


Epoch 7:   0%|          | 0/18181 [00:00<?, ?it/s]


Epoch {epoch}
Training loss:0.05397778180669602
Valisdation loss:0.8204517117390697
F1 Score (weighted):0.9019345168083256


Epoch 8:   0%|          | 0/18181 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Task 9: Loading and Evaluating our Model

In [45]:
model = BertForSequenceClassification.from_pretrained("bert-base-cased",
                                                      num_labels=2 ,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [46]:
CUDA_LAUNCH_BLOCKING=1
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [47]:
model.load_state_dict(
    torch.load('Models/BERT_ft_epoch7.model',
               map_location=torch.device('cuda')))


<All keys matched successfully>

In [50]:
_,predictions,true_vals= evaluate(dataloader_val)

In [51]:
for i in range(1,8):
    print ("*****************It is {} model".format(i))
    model.load_state_dict(
    torch.load('Models/BERT_ft_epoch{}.model'.format(i),
               map_location=torch.device('cuda')))
    _,predictions,true_vals= evaluate(dataloader_val)
    accuracy_per_class(predictions,true_vals)


*****************It is 1 model
Accuracy: 1/1380

Accuracy: 16801/16802

*****************It is 2 model
Accuracy: 85/1380

Accuracy: 16732/16802

*****************It is 3 model
Accuracy: 149/1380

Accuracy: 16604/16802

*****************It is 4 model
Accuracy: 253/1380

Accuracy: 16412/16802

*****************It is 5 model
Accuracy: 259/1380

Accuracy: 16266/16802

*****************It is 6 model
Accuracy: 228/1380

Accuracy: 16428/16802

*****************It is 7 model
Accuracy: 207/1380

Accuracy: 16520/16802



In [52]:
accuracy_per_class(predictions,true_vals)

Accuracy: 207/1380

Accuracy: 16520/16802



# Task 10: Predicting Test Dataset

In [44]:
encoded_data_test = tokenizer.batch_encode_plus(
    test_dataframe.review.values,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)



In [45]:
input_ids_test  = encoded_data_test["input_ids"]
attention_mask_test = encoded_data_test["attention_mask"]
labels_test  =  torch.tensor(test_dataframe.label.values)

In [46]:
dataset_test = TensorDataset(input_ids_test,
                            attention_mask_test,
                            labels_test)

In [47]:
dataloader_test = DataLoader(
    dataset_test,
    sampler = SequentialSampler(dataset_test),
    batch_size = 1
)

In [56]:
testpredictions= predict(dataloader_test)

In [57]:
test_pred = np.argmax(testpredictions,axis=1).flatten()
test_pred

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [58]:
test_dataframe["label"]=pd.DataFrame(test_pred)

In [51]:
test_dataframe.drop(["review"],inplace=True,axis=1)

In [59]:
test_dataframe[["id","label"]].to_csv("submission_model10.csv",index=False)

In [125]:
len(dataset_test)

6000