DistilBERT fine-tuning with Hugging Face Transformers and PyTorch

In [None]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install transformers

In [None]:
import pandas as pd
from sklearn import metrics
from tqdm import tqdm
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import transformers

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Mount Google Drive (Colab only); later cells read the CSV and the
# DistilBERT files from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training CSV from Drive and keep only the `text` and `target` columns.
df_train = pd.read_csv("/content/drive/My Drive/distill-bert/train.csv")[["text","target"]]

In [None]:
df_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
df_train.text[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [None]:
df_train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Number of characters in the longest tweet of the training data
max(len(tweet) for tweet in df_train['text'])

157

In [None]:
# Setting some configs.
MAX_LEN = 160  # max_length handed to the tokenizer; each example is padded to this many tokens
BATCH_SIZE = 16  # batch size used by both DataLoaders
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer

# Drive folder that both the tokenizer and the DistilBERT model are loaded from.
DISTILL_BERT_PATH = '/content/drive/My Drive/distill-bert'
# NOTE(review): MODEL_PATH is not referenced anywhere in the visible notebook — confirm it is still needed.
MODEL_PATH = "pytorch_model"
# Tokenizer is loaded from the same folder; do_lower_case=True is passed explicitly.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(
    DISTILL_BERT_PATH,
    do_lower_case=True
)

In [None]:
# Creating the dataset object
class tweet_Dataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item on access.

    Args:
        dataframe: pandas DataFrame with a ``text`` column (the tweet) and a
            ``target`` column (the numeric label).
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: length every encoded tweet is padded or truncated to.

    Each item is a dict with:
        ``ids``     - LongTensor of token ids,
        ``mask``    - LongTensor attention mask,
        ``targets`` - float scalar tensor holding the label.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # Positional access: works whatever the frame's index labels are, and
        # raises IndexError past the end so plain iteration terminates cleanly.
        tweet = str(self.data["text"].iloc[index])
        # Collapse runs of whitespace (including newlines) into single spaces.
        tweet = " ".join(tweet.split())
        inputs = self.tokenizer.encode_plus(
            tweet,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag;
            # truncation keeps every item exactly max_len long so batches stack.
            padding='max_length',
            truncation=True,
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            # Float label because the training loss is BCE-with-logits.
            'targets': torch.tensor(self.data["target"].iloc[index], dtype=torch.float)
        }

    def __len__(self):
        return len(self.data)

In [None]:

# Creating the train/validation split and wrapping each part in a tweet_Dataset

train_size = 0.85
# Sample the training rows but keep their original row labels for now:
# the validation set is "everything whose label was not sampled". Resetting
# the index before the drop would remove rows 0..n-1 instead of the sampled
# rows and leak training tweets into the validation set.
train_dataset = df_train.sample(frac=train_size, random_state=200)
valid_dataset = df_train.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# The two parts must partition the full frame.
assert len(train_dataset) + len(valid_dataset) == len(df_train)


print("FULL Dataset: {}".format(df_train.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))

training_set = tweet_Dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = tweet_Dataset(valid_dataset, tokenizer, MAX_LEN)

FULL Dataset: (7613, 2)
TRAIN Dataset: (6471, 2)
VALID Dataset: (1142, 2)


In [None]:
training_set[0]

{'ids': tensor([  101,  1030,  4907,  2854,  9102,  2102,  1030,  3881, 29378,  7811,
          2002,  1005,  1055, 17162,  2083,  2023,  2208,  2007,  1996,  2190,
         22150,  4813,  2664,  1012,  2498, 10299,  1996, 25030,  1049,  2549,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [None]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Validation only aggregates predictions into a single score, so the order is
# irrelevant; keep it fixed instead of shuffling.
valid_params = {'batch_size': BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

train_dl = DataLoader(training_set, **train_params)
valid_dl = DataLoader(testing_set, **valid_params)

In [None]:
class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder topped with dropout and a one-logit linear layer."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder loaded from the folder named by DISTILL_BERT_PATH.
        self.distill_bert = transformers.DistilBertModel.from_pretrained(DISTILL_BERT_PATH)
        self.drop = torch.nn.Dropout(0.3)
        # 768 encoder features in, a single raw logit out.
        self.out = torch.nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Return one raw (pre-sigmoid) logit per example.

        ``ids`` and ``mask`` are forwarded positionally to the encoder.
        """
        encoder_out = self.distill_bert(ids, mask)
        # Element 0 is the hidden state, (bs, seq_len, dim); keep only the
        # vector at sequence position 0 -> (bs, dim).
        first_token = encoder_out[0][:, 0]
        return self.out(self.drop(first_token))

In [None]:
# Build the classifier and move it to the selected device; the trailing
# expression makes the notebook display the module structure.
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (distill_bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Line

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy computed directly on raw logits.

    ``outputs`` are pre-sigmoid scores; ``targets`` are float labels of the
    same shape. Returns a scalar tensor.
    """
    bce_with_logits = nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)
    
# Adam over every parameter of the model (encoder and classification layer alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def eval_fn(data_loader, model):
    """Return the F1 score of ``model`` over every batch in ``data_loader``.

    Args:
        data_loader: iterable of batches, each a dict with ``ids``, ``mask``
            and ``targets`` tensors.
        model: module called as ``model(ids=..., mask=...)`` that returns one
            raw logit per example.

    Returns:
        The value of ``sklearn.metrics.f1_score`` with predictions
        thresholded at probability >= 0.5.

    Side effect: leaves ``model`` in eval mode.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for d in tqdm(data_loader, total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask)
            # Flatten the (batch, 1) logits so the collected predictions are
            # 1-D, matching the shape of the collected targets.
            probs = torch.sigmoid(outputs).reshape(-1)

            # Targets are only compared on the host, so they never need to
            # visit the device.
            fin_targets.extend(d["targets"].tolist())
            fin_outputs.extend(probs.cpu().tolist())
    predictions = np.array(fin_outputs) >= 0.5
    return metrics.f1_score(fin_targets, predictions)

In [None]:
def fit(num_epochs, model, loss_fn, opt, train_dl, valid_dl):
    """Train ``model`` for ``num_epochs`` epochs and print metrics per epoch.

    Args:
        num_epochs: number of passes over ``train_dl``.
        model: module called as ``model(ids, mask)`` returning one logit per
            example.
        loss_fn: callable ``(outputs, targets) -> scalar loss tensor``.
        opt: optimizer bound to the model's parameters.
        train_dl: iterable of training batches (dicts with ``ids``, ``mask``,
            ``targets``).
        valid_dl: validation batches, scored with ``eval_fn`` after each epoch.

    Prints, per epoch, the mean training loss over all batches and the
    validation F1 score returned by ``eval_fn``.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for data in train_dl:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            # Clear gradients up front so leftovers from an interrupted run
            # cannot contaminate the first step.
            opt.zero_grad()
            # Drop only the trailing logit dimension. A bare squeeze() would
            # also drop the batch dimension for a final batch of size 1 and
            # make the loss fail on a shape mismatch.
            outputs = model(ids, mask).squeeze(-1)
            loss = loss_fn(outputs, targets)
            loss.backward()
            opt.step()

            running_loss += loss.item()
            num_batches += 1

        # Mean over the epoch rather than the (noisy) last batch; NaN if the
        # loader produced no batches.
        train_loss = running_loss / num_batches if num_batches else float('nan')
        valid_f1 = eval_fn(valid_dl, model)
        print('Epoch [{}/{}], Train Loss: {:.4f} and Validation F1 {:.4f}'.format(
            epoch + 1, num_epochs, train_loss, valid_f1))

In [None]:
fit(7, model, loss_fn, optimizer, train_dl  , valid_dl)

100%|██████████| 72/72 [00:03<00:00, 18.69it/s]


Epoch [1/7], Train Loss: 0.1868 and Validation acc 0.8503 and loss 1.1000


100%|██████████| 72/72 [00:03<00:00, 18.55it/s]


Epoch [2/7], Train Loss: 0.0505 and Validation acc 0.8640 and loss 1.1000


100%|██████████| 72/72 [00:03<00:00, 18.65it/s]


Epoch [3/7], Train Loss: 0.8142 and Validation acc 0.8996 and loss 1.1000


100%|██████████| 72/72 [00:03<00:00, 18.66it/s]


Epoch [4/7], Train Loss: 0.3129 and Validation acc 0.9331 and loss 1.1000


100%|██████████| 72/72 [00:03<00:00, 18.64it/s]


Epoch [5/7], Train Loss: 0.1110 and Validation acc 0.9486 and loss 1.1000


100%|██████████| 72/72 [00:03<00:00, 18.70it/s]


Epoch [6/7], Train Loss: 0.0615 and Validation acc 0.9462 and loss 1.1000


100%|██████████| 72/72 [00:03<00:00, 18.68it/s]

Epoch [7/7], Train Loss: 0.0856 and Validation acc 0.9486 and loss 1.1000





In [None]:
def sentence_prediction(sentence):
    """Classify a single sentence with the notebook's global ``model``.

    The text is whitespace-normalised and encoded with the global
    ``tokenizer`` to ``MAX_LEN`` tokens, exactly like the training data.

    Args:
        sentence: text to classify (converted with ``str``).

    Returns:
        ``True`` when the predicted probability is at least 0.5 (the same
        threshold ``eval_fn`` uses), otherwise ``False``.

    Side effect: puts ``model`` in eval mode.
    """
    tweet = " ".join(str(sentence).split())
    inputs = tokenizer.encode_plus(
        tweet,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # Explicit replacements for the deprecated pad_to_max_length flag.
        padding='max_length',
        truncation=True,
    )

    # Add a leading batch dimension of 1 and place the tensors on the device.
    ids = torch.tensor(inputs["input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).unsqueeze(0).to(device)

    # Do not rely on whichever mode the last cell left the model in: dropout
    # must be disabled, and no autograd graph is needed for inference.
    model.eval()
    with torch.no_grad():
        logits = model(ids=ids, mask=mask)

    probability = torch.sigmoid(logits)[0, 0].item()
    return probability >= 0.5

In [None]:
sentence_prediction("Lol this movie is an absolute disaster!")

False

In [None]:
sentence_prediction("We are experiencing slight tremors in London right now")

True

In [None]:
1+1

2