In [1]:
import os
# Successfully imported
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import download_data, \
    build_compute_metrics_fn
from ray.tune.schedulers import PopulationBasedTraining
from transformers import glue_tasks_num_labels, AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, GlueDataset, \
    GlueDataTrainingArguments, TrainingArguments

from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

from transformers import EarlyStoppingCallback

## 1. Fine-tuning a Transformer for a downstream task

##### Step 1: Defining model for Fine tuning

In [3]:
# One checkpoint name is used for both loads so the tokenizer vocabulary
# always matches the model weights.
model_name = "distilbert-base-uncased"

# Pretrained DistilBERT encoder configured for two output labels.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

##### Step 2: Preparing Dataset for Learning and Testing

In [39]:
# Use the IMDB dataset for the fine-tuning demo.
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

# Randomly select a small subset of the training split so the demo runs quickly.
X = list(train_dataset["text"])
y = list(train_dataset["label"])
df = pd.DataFrame({"text": X, "label": y})
# Fixed random_state: without it every run draws a different 1000-review
# subset, so the reported metrics could not be reproduced.
df = df.sample(n=1000, random_state=0)

In [18]:
# Preprocess data: hold out 20% of the sampled reviews for validation.
# random_state pins the split so a re-run validates on the same reviews.
X_train, X_val, y_train, y_val = train_test_split(
    list(df['text']),
    list(df['label']),
    test_size=0.2,
    random_state=0,
)
# Tokenize each split separately; reviews longer than 512 tokens are truncated.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [19]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence; it must contain the key "input_ids".
        labels: Optional per-example labels. Leave as None for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus "labels" if set."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare with None explicitly: a truthiness test raises for numpy
        # arrays / tensors that hold more than one element.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])

In [20]:
# Wrap each tokenized split in the torch Dataset class so it can be indexed
# example by example.
# NOTE(review): this rebinds `train_dataset`, which earlier held the raw IMDB
# train split returned by load_dataset.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

##### Step 3: Define metrics to be computed

In [21]:
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``; the predicted class is the argmax
            of the predictions along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    raw_scores, labels = p
    predicted = np.argmax(raw_scores, axis=1)
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

##### Step 4: Define Training Arguments
- All parameters provided in TrainingArguments can be used for optimization<br>
- Define the evaluation strategy (the preferred evaluation strategy is "steps" if GPU memory is limited)

In [22]:
# Training configuration shared by every Trainer built in this notebook.
args = TrainingArguments(
    output_dir="imdb_small_model",
    evaluation_strategy="steps",
    eval_steps=400,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    # load_best_model_at_end needs a checkpoint for each evaluation, so the
    # save interval must line up with eval_steps (3000 is not a multiple of 400).
    save_steps=400,
    seed=0,
    load_best_model_at_end=True,
)

##### Step 5: The Trainer class takes a training dataset (for training) and an evaluation dataset (data used for generalization)
- We can define compute metrics which will be used for optimal learning

In [23]:
# Fine-tuning trainer: trains the already-instantiated `model` on train_dataset,
# evaluates on val_dataset and scores each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is disabled; uncomment the next line to enable it.
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

##### Transformer Fine tuning starts....

In [41]:
# Fine-tune the pre-trained model; the returned TrainOutput carries the
# final global step and the training loss.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.528117,0.878333,0.930502,0.814189,0.868468
2,0.144932,0.587792,0.883333,0.906475,0.851351,0.878049


Attempted to log scalar metric eval_loss:
0.5281171798706055
Attempted to log scalar metric eval_accuracy:
0.8783333333333333
Attempted to log scalar metric eval_precision:
0.9305019305019305
Attempted to log scalar metric eval_recall:
0.8141891891891891
Attempted to log scalar metric eval_f1:
0.8684684684684685
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric loss:
0.14493174743652343
Attempted to log scalar metric learning_rate:
5.416666666666667e-06
Attempted to log scalar metric epoch:
1.6666666666666665
Attempted to log scalar metric eval_loss:
0.5877916216850281
Attempted to log scalar metric eval_accuracy:
0.8833333333333333
Attempted to log scalar metric eval_precision:
0.9064748201438849
Attempted to log scalar metric eval_recall:
0.8513513513513513
Attempted to log scalar metric eval_f1:
0.8780487804878048
Attempted to log scalar metric epoch:
2.0
Attempted to log scalar metric total_flos:
987291795456000
Attempted to log scalar metric epoch:
2.0


TrainOutput(global_step=600, training_loss=0.1265499778588613)

In [13]:
#X_test = list(test_dataset["text"])
#y = list(test_dataset["label"])
# Tokenize test data
#X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512) 
# Create torch dataset
#test_dataset = Dataset(X_test_tokenized) 
# Load trained model
#model_path = "output/checkpoint-4000"
#model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2) 
# Define test trainer
#test_trainer = Trainer(model) 
# Make prediction
#raw_pred, _, _ = test_trainer.predict(test_dataset) 
# Preprocess raw predictions
#y_pred = np.argmax(raw_pred, axis=1)

## 2.  Hyperparameter optimization during Fine tuning 
##### Step 1: Initialize model with model_init

In [33]:
def model_init():
    """Load a fresh DistilBERT sequence classifier with two labels.

    Passed to ``Trainer(model_init=...)``; hyperparameter search requires the
    model to be supplied through such a function.
    """
    return DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
    )

##### Step 2: Defining Trainer
- Passing training data (learning data) and eval_dataset (for hyperparameter optimization)

In [31]:
# Trainer for hyperparameter search: the model is supplied through model_init
# (a factory function) rather than as an already-built `model` instance.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is disabled; uncomment the next line to enable it.
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 11.17 GiB total capacity; 10.65 GiB already allocated; 6.81 MiB free; 10.79 GiB reserved in total by PyTorch)

##### Step 3: Defining Computing metrics

In [None]:
# NOTE(review): this repeats the compute_metrics definition from the
# fine-tuning section; running this cell simply rebinds the same name.
def compute_metrics(p):
    """Turn raw predictions and gold labels into classification scores.

    Args:
        p: Pair ``(predictions, labels)``; classes are chosen by argmax over
            axis 1 of the predictions.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    outputs, labels = p
    chosen = np.argmax(outputs, axis=1)
    shared = dict(y_true=labels, y_pred=chosen)

    accuracy = accuracy_score(**shared)
    recall = recall_score(**shared)
    precision = precision_score(**shared)
    f1 = f1_score(**shared)

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

##### Default objective is the sum of all metrics when metrics are provided, so we have to maximize it.
- If compute_objective is not explicitly defined, then during HPO the API tries to maximize the sum of all metrics returned from compute_metrics in key/value form

In [40]:
#torch.cuda.empty_cache()
# Run the search with no hp_space and no compute_objective given, i.e. with
# whatever defaults the Trainer applies.
# NOTE: `trainer` must have been built with model_init=...; otherwise this call
# raises "RuntimeError: To use hyperparameter search, you need to pass your
# model through a model_init function."
best_trial  = trainer.hyperparameter_search(
    direction="maximize",
    n_trials=2 # number of hyperparameter configurations (trials) to evaluate
)

RuntimeError: To use hyperparameter search, you need to pass your model through a model_init function.

In [24]:
# Best hyperparameters found once the HPO run has finished, one
# "name value" pair per line.
for hp_name, hp_value in best_trial.hyperparameters.items():
    print(hp_name, hp_value)

learning_rate 6.443985063216559e-05
num_train_epochs 1
seed 21
per_device_train_batch_size 8


## 3. One can also choose which hyperparameters to optimize

In [37]:
#torch.cuda.memory_summary(device=None, abbreviated=False)
# Ask PyTorch to release its cached GPU memory before the next Trainer is built.
# NOTE(review): memory still referenced by live objects (e.g. the earlier
# `model` / `trainer`) is presumably not released by this call — confirm, since
# the CUDA out-of-memory error still appears in the next cell's output.
torch.cuda.empty_cache()

In [38]:
# Rebuild the Trainer with model_init so it can be used for the
# custom-search-space hyperparameter search below.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is disabled; uncomment the next line to enable it.
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 11.17 GiB total capacity; 10.65 GiB already allocated; 6.81 MiB free; 10.79 GiB reserved in total by PyTorch)

##### Defining hyperparameter search space configuration

In [27]:
def my_hp_space(trial):
    """Sample one hyperparameter configuration from the custom search space.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (presumably an Optuna trial — confirm
            against the search backend in use).

    Returns:
        Dict with the sampled "learning_rate", "num_train_epochs", "seed"
        and "per_device_train_batch_size".
    """
    # Learning rate is drawn on a log scale between 1e-4 and 1e-2.
    lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    epochs = trial.suggest_int("num_train_epochs", 1, 5)
    seed = trial.suggest_int("seed", 1, 40)
    batch_size = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32, 64]
    )
    return dict(
        learning_rate=lr,
        num_train_epochs=epochs,
        seed=seed,
        per_device_train_batch_size=batch_size,
    )



##### To get best hyperparameter configuration among defined hyperparameter spaces

In [None]:
#torch.cuda.empty_cache()
# Search again, this time drawing each trial's hyperparameters from the
# custom my_hp_space function.
best_trial_with_given_configuration  = trainer.hyperparameter_search(
    direction="maximize",
    n_trials=2,
    hp_space=my_hp_space
)

##### Optimizing hyperparameters for a specific objective during tuning (it could be space, time, etc.)
- We can choose any of the metrics in the dictionary returned by compute_metrics and treat it as the sole objective.<br>
- e.g. if your key is accuracy, then you could return metrics["accuracy"] from the compute_objective function.
- Only one objective can be optimized at a time
- We can combine multiple metrics and return one business objective

In [None]:
#defining compute_objective
def business_objective(metric):
    """Return the single score the hyperparameter search should optimize.

    Args:
        metric: Dict of evaluation metrics passed in by the Trainer.

    Returns:
        The accuracy value found in ``metric``.

    Raises:
        KeyError: If neither "eval_accuracy" nor "accuracy" is present.
    """
    # Evaluation metrics are reported with an "eval_" prefix (the training log
    # shows "eval_accuracy"); the bare key is accepted as a fallback so the
    # function also works on the raw compute_metrics dictionary.
    if "eval_accuracy" in metric:
        return metric["eval_accuracy"]
    return metric["accuracy"]

In [None]:
# Fresh Trainer (again built through model_init) for the search that uses a
# custom objective.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is disabled; uncomment the next line to enable it.
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# business_objective picks the value to optimize from the evaluation metrics;
# in this scenario the search maximizes accuracy only.
# (The original call was missing the comma after hp_space, a SyntaxError.)
best_trial_with_given_objective = trainer.hyperparameter_search(
    direction="maximize",
    n_trials=2,
    hp_space=my_hp_space,
    compute_objective=business_objective,
)