In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_checkpoint = 'distilbert-base-uncased'

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2, id2label = id2label, label2id = label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
dataset = load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

def  tokenize_function(examples):
    text = examples["text"]

    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, 
        return_tensors="np",
        truncation = True,
        max_length=512
    )

    return tokenized_inputs

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4865.73 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4206.68 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [6]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis = 1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

# Checking the untrained model performance

In [11]:
text_list = ["It was so so good.", "The movie was terrible.", "It was better than the other movie.", "Not worth watching.", "Poorly directed movie"]

print("Untrained model predictions:")
print("-----------------------")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
-----------------------
It was so so good. - Positive
The movie was terrible. - Positive
It was better than the other movie. - Positive
Not worth watching. - Positive
Poorly directed movie - Positive


# Fine tuning with LoRA

In [12]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r = 4,
                         lora_alpha=32,
                         lora_dropout=-0.01,
                         target_modules=['q_lin']
                         )

In [13]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,221,124 || all params: 67,584,004 || trainable%: 1.8068239934408148


In [14]:
lr = 1e-3
batch_size = 4
num_epochs = 10

training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [15]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 10%|█         | 250/2500 [01:44<11:43,  3.20it/s]

{'eval_loss': 0.32543832063674927, 'eval_accuracy': {'accuracy': 0.886}, 'eval_runtime': 36.9374, 'eval_samples_per_second': 27.073, 'eval_steps_per_second': 6.768, 'epoch': 1.0}


 20%|██        | 500/2500 [02:53<10:34,  3.15it/s]  

{'loss': 0.4157, 'learning_rate': 0.0008, 'epoch': 2.0}


                                                  
 20%|██        | 500/2500 [03:31<10:34,  3.15it/s]

{'eval_loss': 0.4837573766708374, 'eval_accuracy': {'accuracy': 0.871}, 'eval_runtime': 37.5609, 'eval_samples_per_second': 26.623, 'eval_steps_per_second': 6.656, 'epoch': 2.0}


                                                    
 30%|███       | 750/2500 [05:17<06:00,  4.86it/s]

{'eval_loss': 0.5439528822898865, 'eval_accuracy': {'accuracy': 0.88}, 'eval_runtime': 37.2546, 'eval_samples_per_second': 26.842, 'eval_steps_per_second': 6.711, 'epoch': 3.0}


 40%|████      | 1000/2500 [06:25<07:51,  3.18it/s] 

{'loss': 0.1876, 'learning_rate': 0.0006, 'epoch': 4.0}


                                                   
 40%|████      | 1000/2500 [07:02<07:51,  3.18it/s]

{'eval_loss': 0.9312925934791565, 'eval_accuracy': {'accuracy': 0.862}, 'eval_runtime': 36.8873, 'eval_samples_per_second': 27.11, 'eval_steps_per_second': 6.777, 'epoch': 4.0}


                                                     
 50%|█████     | 1250/2500 [08:49<06:33,  3.17it/s]

{'eval_loss': 0.7720233201980591, 'eval_accuracy': {'accuracy': 0.893}, 'eval_runtime': 38.5705, 'eval_samples_per_second': 25.927, 'eval_steps_per_second': 6.482, 'epoch': 5.0}


 60%|██████    | 1500/2500 [09:58<04:32,  3.67it/s]  

{'loss': 0.0777, 'learning_rate': 0.0004, 'epoch': 6.0}


                                                   
 60%|██████    | 1500/2500 [10:41<04:32,  3.67it/s]

{'eval_loss': 0.8918026685714722, 'eval_accuracy': {'accuracy': 0.875}, 'eval_runtime': 42.8093, 'eval_samples_per_second': 23.359, 'eval_steps_per_second': 5.84, 'epoch': 6.0}


                                                     
 70%|███████   | 1750/2500 [12:32<03:43,  3.35it/s]

{'eval_loss': 0.9249178171157837, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 36.7373, 'eval_samples_per_second': 27.22, 'eval_steps_per_second': 6.805, 'epoch': 7.0}


 80%|████████  | 2000/2500 [13:39<02:13,  3.74it/s]  

{'loss': 0.0137, 'learning_rate': 0.0002, 'epoch': 8.0}


                                                   
 80%|████████  | 2000/2500 [14:15<02:13,  3.74it/s]

{'eval_loss': 0.9254177212715149, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 36.7523, 'eval_samples_per_second': 27.209, 'eval_steps_per_second': 6.802, 'epoch': 8.0}


                                                     
 90%|█████████ | 2250/2500 [16:00<00:51,  4.82it/s]

{'eval_loss': 0.9345467686653137, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 36.7036, 'eval_samples_per_second': 27.245, 'eval_steps_per_second': 6.811, 'epoch': 9.0}


100%|██████████| 2500/2500 [17:08<00:00,  3.67it/s]

{'loss': 0.0167, 'learning_rate': 0.0, 'epoch': 10.0}


                                                   
100%|██████████| 2500/2500 [17:45<00:00,  2.35it/s]

{'eval_loss': 0.9756042957305908, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 36.7279, 'eval_samples_per_second': 27.227, 'eval_steps_per_second': 6.807, 'epoch': 10.0}
{'train_runtime': 1065.0825, 'train_samples_per_second': 9.389, 'train_steps_per_second': 2.347, 'train_loss': 0.1422684534072876, 'epoch': 10.0}





TrainOutput(global_step=2500, training_loss=0.1422684534072876, metrics={'train_runtime': 1065.0825, 'train_samples_per_second': 9.389, 'train_steps_per_second': 2.347, 'train_loss': 0.1422684534072876, 'epoch': 10.0})

## Finally testing our fine tuned model

In [18]:
text_list = ["It was so so good.", "The movie was terrible.", "It was better than the other movie.", "Not worth watching.", "Poorly directed movie"]

print("Trained model predictions:")
print("-----------------------")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model.to("cpu")(inputs).logits
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Trained model predictions:
-----------------------
It was so so good. - Positive
The movie was terrible. - Negative
It was better than the other movie. - Positive
Not worth watching. - Negative
Poorly directed movie - Negative
