# Pytorch DistilBERT Fine-tuning Pretrained

In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
import json
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import BertModel, BertTokenizer

import torch.nn as nn
from transformers import AutoModelForSequenceClassification

from transformers import AutoModel, AutoTokenizer


import logging
from sklearn.metrics import classification_report
logging.basicConfig(level=logging.ERROR)
from sklearn.metrics import confusion_matrix


## 1. Load data

In [3]:
device = torch.device('mps' if torch.backends.mps.is_available() else ('cuda' if torch.cuda.is_available() else 'cpu'))
print(device)

mps


In [4]:
# Collab
collab = False

if collab:
  pat = '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'
  !git clone https://{pat}@github.com/gianclbal/ALMA-TACIT.git
  print("Running in collab")
  training_df = pd.read_csv('/content/ALMA-TACIT/data-analysis/new_data/attainment/attainment_fall_2019_fall_2023_full_sentence_training_data.csv',encoding='utf-8')
  print(training_df.shape)
  test_df = pd.read_csv('/content/ALMA-TACIT/data-analysis/new_data/attainment/attainment_fall_2019_fall_2023_full_sentence_test_data.csv',encoding='utf-8')
  print(test_df.shape)
  augmented_data = pd.read_csv("/content/ALMA-TACIT/data-analysis/new_data/attainment/augmented_dataset/atn_augmented_dataset_1155.csv")
  
  print("Training and test sets loaded.")
else:
  print("Running locally")
  # training_df = pd.read_csv('../../new_data/attainment/attainment_fall_2019_fall_2023_full_sentence_training_data.csv',encoding='utf-8')
  # print(training_df.shape)
  # test_df = pd.read_csv('../../new_data/attainment/attainment_fall_2019_fall_2023_full_sentence_test_data.csv',encoding='utf-8')
  # print(test_df.shape)
  # augmented_data = pd.read_csv("../../new_data/attainment/augmented_dataset/atn_augmented_dataset_1155.csv")
  
  merged_aspirational_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Aspirational_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
  merged_aspirational_df
  training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.2, random_state=18, stratify=merged_aspirational_df['label'])
  print
  print("Training and test sets loaded.")


Running locally
Training and test sets loaded.


In [5]:
training_df.head()

Unnamed: 0,sentence,label,phrase
1341,one is obviously this class is a requirement f...,0,['I am here so that I can enhance my education...
3597,to get a better and broader perspective of life.,1,['I am here because I wanted a significant cha...
389,that is what i am in school for but it is not ...,0,['I would like to be a cardiothoracic surgeon.']
1798,"i should have dropped the class, it was a very...",0,['This semester I want to prove to myself that...
1827,i was apart of a sci course last semester and ...,0,['I am here because I am dedicated to my succe...


In [6]:
test_df.head()

Unnamed: 0,sentence,label,phrase
447,also i want to be an optometrist and this clas...,1,['Also I want to be an optometrist and this cl...
727,unfortunately it is a requirement for biology ...,0,"[""I am here because this class is one of many ..."
2681,i am here to learn beyond my prior knowledge.,0,"[""I'm here because I need this class to gradua..."
1538,i have no specific direction in my life other ...,0,['I want to try to get into veterinary school ...
2807,im here because i want to pursue a career in t...,0,"['More specifically, this class will get me on..."


## 2. Data loader

In [7]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 150
BATCH_SIZE = 6
# WEIGHT_DECAY = 0.01
# EPOCHS = 1
LEARNING_RATE = 2e-5

# model names
roberta_name = "roberta-base"
bert_name = "bert-base-uncased"
distilbert_name = "distilbert-base-uncased"

list_of_model_names =[roberta_name, bert_name, distilbert_name]


In [8]:
class SentenceData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, model):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe["sentence"]
        self.targets = self.data["label"]
        self.max_len = max_len
        self.model = model

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        if self.model == "roberta-base" or self.model == "bert-base-uncased":
            token_type_ids = inputs["token_type_ids"]
        else:
            token_type_ids = None

        return_dict = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

        if token_type_ids is not None:
            return_dict['token_type_ids'] = torch.tensor(token_type_ids, dtype=torch.float)

        return return_dict


In [9]:
def data_loader(train_df, test_df, max_len, list_of_model_names):

    datasets = {}

    X = train_df['sentence']
    y = train_df['label']

    # Split the data
    train_dataset, validation_dataset = train_test_split(train_df, test_size=0.1, random_state=18, stratify=training_df.label)

    train_dataset.reset_index(drop=True, inplace=True)
    validation_dataset.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)

    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("VALIDATION Dataset: {}".format(validation_dataset.shape))
    print("TEST Dataset: {}".format(test_df.shape))

    # data loader parameters
    train_params = {'batch_size': BATCH_SIZE,
                # 'shuffle': True,
                'num_workers': 0
                }

    validate_params = {'batch_size': BATCH_SIZE,
                    # 'shuffle': True,
                    'num_workers': 0
                    }
    test_params = {'batch_size': BATCH_SIZE,
                    # 'shuffle': True,
                    'num_workers': 0
                    }

    for model_name in list_of_model_names:
        training_set = SentenceData(train_dataset, AutoTokenizer.from_pretrained(model_name), max_len, model_name)
        validate_set = SentenceData(validation_dataset, AutoTokenizer.from_pretrained(model_name), max_len, model_name)
        testing_set = SentenceData(test_df, AutoTokenizer.from_pretrained(model_name), max_len, model_name)

        training_loader = DataLoader(training_set, **train_params)
        validate_loader = DataLoader(validate_set, **validate_params)
        testing_loader = DataLoader(testing_set, **test_params)

        datasets[model_name] = {'train': training_loader, 'test': testing_loader, 'validate': validate_loader}

    return datasets

In [10]:
exp_4_datasets = data_loader(train_df=training_df,
            test_df=test_df,
            max_len=MAX_LEN,
            list_of_model_names=list_of_model_names)

TRAIN Dataset: (3386, 3)
VALIDATION Dataset: (377, 3)
TEST Dataset: (941, 3)


In [11]:
exp_4_datasets["distilbert-base-uncased"]["train"].dataset.data.head()

Unnamed: 0,sentence,label,phrase
0,i am majoring in biology and i am planning on ...,1,['planning on declaring a concentration in zoo...
1,it helps in understanding the mechanics of the...,0,"[""I'm in this class because it's a step to ach..."
2,i knew i was going to major in cinema and sfsu...,0,['I am here at SFSU due to its cinema program']
3,i know it takes time and patience to get to wh...,0,"['In the future, Im hoping to be in the medica..."
4,i just hope i can pull it off.,0,['Taking this course is a first step to unders...


## 3. Defining the model

In [12]:
model_name = "distilbert-base-uncased"

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model = model.to(device)

In [15]:
from sklearn.metrics import f1_score
from torch.optim import Adam

In [16]:
# Define the optimizer and loss function
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

In [17]:
EPOCHS = 5

In [18]:
train_loader = exp_4_datasets[model_name]["train"]
val_loader = exp_4_datasets[model_name]["validate"]

: 

In [19]:
for epoch in range(EPOCHS):
    model.train()
    total_loss_train = 0
    correct_predictions_train = 0

    for _, batch in enumerate(train_loader, 0):
        optimizer.zero_grad()

        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        labels = batch['targets'].to(device)

        print("here")

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        total_loss_train += loss.item()
        correct_predictions_train += torch.sum(torch.argmax(logits, dim=1) == labels)

        loss.backward()
        optimizer.step()

    model.eval()
    total_loss_val = 0
    correct_predictions_val = 0

    with torch.no_grad():
        for _, batch in enumerate(val_loader, 0):
            input_ids = batch['ids'].to(device)
            attention_mask = batch['mask'].to(device)
            labels = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, labels)
            total_loss_val += loss.item()
            correct_predictions_val += torch.sum(torch.argmax(logits, dim=1) == labels)

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train Loss: {total_loss_train / len(train_loader)}, Train Accuracy: {correct_predictions_train.double() / len(train_loader.dataset)}')
    print(f'Validation Loss: {total_loss_val / len(val_loader)}, Validation Accuracy: {correct_predictions_val.double() / len(val_loader.dataset)}')

print("Training complete.")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


here
