**NLP Coursework - Fancy Model - DistilBERT**

In [1]:
#install the dataset and transformer
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=

In [2]:
#PACKAGES USED:

#basic
import numpy as np
import pandas as pd

#to load dataset
from datasets import load_dataset

#for pre-processing
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

#for feature extraction and modeling
import gzip
import shutil
import time

import requests
import torch 
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

In [3]:
#Step 1: Load the dataset and split into train, validation and test dataset
dataset = load_dataset('rotten_tomatoes')

x_data = dataset['train']['text'] + dataset['validation']['text'] + dataset['test']['text']

train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

print(train_dataset)
print(val_dataset)
print(test_dataset)

# Split the dataset into reviews and labels as x and y. 
x_train = train_dataset['text']
y_train = train_dataset['label']

x_val = val_dataset['text']
y_val = val_dataset['label']

x_test = val_dataset['text']
y_test = val_dataset['label']

Downloading builder script:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading and preparing dataset rotten_tomatoes/default to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset rotten_tomatoes downloaded and prepared to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})


PRE_PROCESSING

In [4]:
# Step 2: Pre-processing

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text.lower())
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Stem the tokens
    #stemmer = PorterStemmer()
    #stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    #since stemming is creating some spelling mistakes which may affect the contextual representation, we have not performed this. eg: Centurys ----stemming---> centuri

    #lemmatizing the tokens
    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the stemmed tokens
    preprocessed_text = ' '.join(lemma_tokens)
    
    return preprocessed_text

# apply this in x_train, x_val, and x_test,
x_train_preprocessed = [preprocess_text(text) for text in x_train]
x_val_preprocessed = [preprocess_text(text) for text in x_val]
x_test_preprocessed = [preprocess_text(text) for text in x_test]

print(x_train_preprocessed[0:2])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['rock destined st century new conan he going make splash even greater arnold schwarzenegger jeanclaud van damme steven segal', 'gorgeously elaborate continuation lord ring trilogy huge column word adequately describe cowriterdirector peter jackson expanded vision j r r tolkien middleearth']


**FEATURE EXTRACTION AND MODELING - MODEL 4**

In [16]:
#General Setting
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

CUDA_LAUNCH_BLOCKING=1

NUM_EPOCHS = 3

# Load the tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

#tokenizing the data
train_encodings = tokenizer(list(x_train), truncation=True, padding=True)
valid_encodings = tokenizer(list(x_val), truncation=True, padding=True)
test_encodings = tokenizer(list(x_test), truncation=True, padding=True)


class rtdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = rtdataset(train_encodings, y_train)
valid_dataset = rtdataset(valid_encodings, y_val)
test_dataset = rtdataset(test_encodings, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

# Load Model

#model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [17]:
#Train Model

def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs['loss'], outputs['logits']

            _, predicted_labels = torch.max(logits, 1)

            num_examples += labels.size(0)

            correct_pred += (predicted_labels == labels).sum()
    return correct_pred.float()/num_examples * 100

In [18]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    
    model.train()
    
    for batch_idx, batch in enumerate(train_loader):
        
        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        
        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()
        
        ### Logging
        if not batch_idx % 100:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')
            
    model.eval()

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

#the train and validation codes are refered from hugging face DistlBERT page and https://www.youtube.com/watch?v=emDmznRlsWw.

Epoch: 0001/0003 | Batch 0000/1067 | Loss: 0.6662
Epoch: 0001/0003 | Batch 0100/1067 | Loss: 0.3149
Epoch: 0001/0003 | Batch 0200/1067 | Loss: 0.4098
Epoch: 0001/0003 | Batch 0300/1067 | Loss: 0.1472
Epoch: 0001/0003 | Batch 0400/1067 | Loss: 0.3062
Epoch: 0001/0003 | Batch 0500/1067 | Loss: 0.6221
Epoch: 0001/0003 | Batch 0600/1067 | Loss: 0.2858
Epoch: 0001/0003 | Batch 0700/1067 | Loss: 0.2472
Epoch: 0001/0003 | Batch 0800/1067 | Loss: 0.2759
Epoch: 0001/0003 | Batch 0900/1067 | Loss: 0.4818
Epoch: 0001/0003 | Batch 1000/1067 | Loss: 0.2167
training accuracy: 93.95%
valid accuracy: 83.96%
Time elapsed: 1.57 min
Epoch: 0002/0003 | Batch 0000/1067 | Loss: 0.0855
Epoch: 0002/0003 | Batch 0100/1067 | Loss: 0.1328
Epoch: 0002/0003 | Batch 0200/1067 | Loss: 0.4208
Epoch: 0002/0003 | Batch 0300/1067 | Loss: 0.3094
Epoch: 0002/0003 | Batch 0400/1067 | Loss: 0.3879
Epoch: 0002/0003 | Batch 0500/1067 | Loss: 0.0728
Epoch: 0002/0003 | Batch 0600/1067 | Loss: 0.7728
Epoch: 0002/0003 | Batch 070

In [8]:
#ploting the incorrect predictions for analysis

incorrect_examples = []
for i in range(len(test_dataset)):
    example = test_dataset[i]
    input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(DEVICE)
    attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(DEVICE)
    labels = torch.tensor(example['labels']).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        _, predicted = torch.max(outputs[1], dim=1)
        
    if predicted != labels:
        incorrect_examples.append(example)

incorrect_texts = []
incorrect_labels = []

for example in incorrect_examples:
    text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    labels = example['labels']
    
    incorrect_texts.append(text)
    incorrect_labels.append(labels)

df_incorrect_predictions_DistilBERT = pd.DataFrame({
    'text': incorrect_texts,
    'label': incorrect_labels
})

  input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(DEVICE)
  attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(DEVICE)
  labels = torch.tensor(example['labels']).unsqueeze(0).to(DEVICE)


In [10]:
print(df_incorrect_predictions_DistilBERT)
# Save the DataFrame to a CSV file
df_incorrect_predictions_DistilBERT.to_csv('df_incorrect_predictions_DistilBERT.csv', index=False)
from google.colab import files

# Download the CSV file
files.download('df_incorrect_predictions_DistilBERT.csv')

                                                  text      label
0    the importance of being earnest, so thick with...  tensor(1)
1    made for teens and reviewed as such, this is r...  tensor(1)
2    baby - faced renner is eerily convincing as th...  tensor(1)
3    there's absolutely no reason why blue crush, a...  tensor(1)
4    the best movie in many a moon about the passio...  tensor(1)
..                                                 ...        ...
185  wallace directs with such patronising reverenc...  tensor(0)
186  earnest and tentative even when it aims to shock.  tensor(0)
187  a sometimes incisive and sensitive portrait th...  tensor(0)
188  hilarious musical comedy though stymied by acc...  tensor(0)
189  if you are into splatter movies, then you will...  tensor(0)

[190 rows x 2 columns]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>