First approach to check the finetunning workflow explained at https://www.youtube.com/watch?v=eC6Hd1hFvos

In [6]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# Import dataset for training

In [7]:
dataset = load_dataset("imdb")
dataset.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

In [8]:
# extract a subset for training
size = dataset.shape['train'][0]
subsample_size = 1000
rand_index = np.random.randint(size, size=subsample_size)

# # extract train and test data
x_train =dataset['train'][rand_index]['text']
y_train =dataset['train'][rand_index]['label']

x_test = dataset['test'][rand_index]['text']
y_test = dataset['test'][rand_index]['label']

dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

# Import model

In [9]:
model_name = "distilbert-base-uncased" # binary model

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id)

# import tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepare data

In [10]:
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def tokenize_text(examples):
    text = examples["text"]

    tokenizer.truncation_size = "left"

    tokenized_inputs = tokenizer(
        text, 
        return_tensors='np',
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [11]:
tokenized_dataset = dataset.map(tokenize_text, batched=True)

# prepare a data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
# import accuracy
accuracy = evaluate.load("accuracy")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

# Fine tune

In [13]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# hyperparameters
lr = 1e-3
batch_size = 10
num_epochs = 2

training_args = TrainingArguments(
    output_dir= "Personal-model-emotion", # directory to be saved
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch", # models saved after each epoch
    load_best_model_at_end=True,
)

# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [14]:
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 0.29455116391181946, 'eval_accuracy': {'accuracy': 0.879}, 'eval_runtime': 64.2454, 'eval_samples_per_second': 15.565, 'eval_steps_per_second': 1.557, 'epoch': 1.0}




  0%|          | 0/100 [00:00<?, ?it/s]

{'eval_loss': 0.2893153131008148, 'eval_accuracy': {'accuracy': 0.896}, 'eval_runtime': 55.9912, 'eval_samples_per_second': 17.86, 'eval_steps_per_second': 1.786, 'epoch': 2.0}




{'train_runtime': 449.9433, 'train_samples_per_second': 4.445, 'train_steps_per_second': 0.445, 'train_loss': 0.32815303802490237, 'epoch': 2.0}


TrainOutput(global_step=200, training_loss=0.32815303802490237, metrics={'train_runtime': 449.9433, 'train_samples_per_second': 4.445, 'train_steps_per_second': 0.445, 'total_flos': 257685113884320.0, 'train_loss': 0.32815303802490237, 'epoch': 2.0})

# Evaluation

In [7]:
def evaluate_model(text='I am really happy!', model=model, tokenizer=tokenizer):
    model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')

    # define label maps
    id2label = {0: "Negative", 1: "Positive"}

    print("Trained model predictions:")
    print("--------------------------")

    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])
    

In [21]:
evaluate_model("Sometimes I feel depressed")

Trained model predictions:
--------------------------
Sometimes I feel depressed - Negative


# Load model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig
import os

In [9]:
def load_model(checkpoint="2000", model_name="Personal-model-emotion"):
    output_dir = os.path.join(model_name, "checkpoint-" + checkpoint)

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(output_dir)

    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(output_dir)

    # If using PEFT and LoRA, you need to re-apply the LoRA configurations
    peft_config = LoraConfig.from_pretrained(output_dir)
    model = get_peft_model(model, peft_config)

    return model, tokenizer


model, tokenizer = load_model()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
evaluate_model(text=" I feel depressed, but I am always happy!")

Trained model predictions:
--------------------------
 I feel depressed, but I am always happy! - Positive
