# Importing and creating the dataset

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Define dataset splits first
splits = ["train", "test"]

# Load dataset splits
dataset_name = "imdb"
def get_dataset(dataset_name, num_of_records):
    ds = {split: ds for split, ds in zip(splits, load_dataset(dataset_name, split=splits))}
    
    # Thin out the dataset to make it run faster for this example
    for split in splits:
        ds[split] = ds[split].shuffle(seed=42).select(range(num_of_records))
    return ds   

# Show the dataset
ds = get_dataset(dataset_name, 800)
ds

  from .autonotebook import tqdm as notebook_tqdm


{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 800
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 800
 })}

# Tokenizing

In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Preprocessing function
def preprocess_function(examples):
    
    # Tokenize correctly for batched processing
    return tokenizer(examples["text"], padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128)

# Tokenize dataset
tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)

tokenized_ds['train'] = tokenized_ds['train'].rename_column('label', 'labels')
tokenized_ds['test'] = tokenized_ds['test'].rename_column('label', 'labels')

print(tokenized_ds)

{'train': Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 800
}), 'test': Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 800
})}


# Initializing the base model

In [3]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
model.config.pad_token_id = tokenizer.pad_token_id
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training the base model

In [4]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/positive_or_negative",
        # Set the learning rate
        learning_rate= 2e-3,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        # Evaluate and save the model after each epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Set the learning rate
        num_train_epochs=4,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9394,2.191201,0.53
2,1.2197,1.202786,0.66625
3,0.9178,1.280138,0.67
4,0.7295,1.223691,0.6725


TrainOutput(global_step=3200, training_loss=1.1743237590789795, metrics={'train_runtime': 109.3044, 'train_samples_per_second': 29.276, 'train_steps_per_second': 29.276, 'total_flos': 209037400473600.0, 'train_loss': 1.1743237590789795, 'epoch': 4.0})

# Creating a PEFT Config

In [6]:
! pip install peft
! pip install -U peft transformers




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\ahmed\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\ahmed\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# Converting a Transformers Model into a PEFT Mode

In [7]:

from peft import get_peft_model, LoraConfig, TaskType,PeftModel

lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.01,
    #lora_bias
)


In [8]:
lora_model = get_peft_model(model, lora_config)
#model.print_trainable_parameters()



# Training with a PEFT Model

In [9]:
peft_trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/peft_positive_or_negative",
        # Set the learning rate
        #learning_rate= 2e-3,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # Evaluate and save the model after each epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Set the learning rate
        num_train_epochs=4,
        warmup_steps=50,
        weight_decay=0.01,
        label_names=["labels"]
       
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
    
)

  peft_trainer = Trainer(


In [10]:
peft_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.047371,0.6825
2,No log,0.896388,0.7175
3,0.562000,0.829879,0.73625
4,0.562000,0.81498,0.74


TrainOutput(global_step=800, training_loss=0.5178721237182617, metrics={'train_runtime': 71.1512, 'train_samples_per_second': 44.975, 'train_steps_per_second': 11.244, 'total_flos': 211034308608000.0, 'train_loss': 0.5178721237182617, 'epoch': 4.0})

# Saving a Trained PEFT Model

In [11]:
lora_model.save_pretrained("gpt-lora")
tokenizer.save_pretrained("gpt-lora-tokenizer")

('gpt-lora-tokenizer\\tokenizer_config.json',
 'gpt-lora-tokenizer\\special_tokens_map.json',
 'gpt-lora-tokenizer\\vocab.json',
 'gpt-lora-tokenizer\\merges.txt',
 'gpt-lora-tokenizer\\added_tokens.json',
 'gpt-lora-tokenizer\\tokenizer.json')

# Inference with PEFT

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModelForSequenceClassification, AutoPeftModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lora_model = AutoPeftModelForSequenceClassification.from_pretrained("gpt-lora").to(device)
lora_tokenizer = AutoTokenizer.from_pretrained("gpt-lora-tokenizer")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Testing the classifier

In [14]:
def classify_text(text):
    text=[text]
    inputs = lora_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    input_ids= inputs['input_ids']
    attention_mask = inputs['attention_mask']
    classification=""
    with torch.no_grad():
        outputs = lora_model(torch.tensor(input_ids, device=device), attention_mask=torch.tensor(attention_mask, device=device))
    probs = torch.softmax(outputs.logits, dim=1)
    predicted_labels = torch.argmax(probs, dim=1)
    predicted_labels
    sentiments = ["NEGATIVE", "POSITIVE"]
    for i, text in enumerate(text):
        print(f"Text: {text}")
        print(f"Predicted Sentiment: {sentiments[predicted_labels[i].item()]} (Probability: {probs[i][predicted_labels[i]].item()})")
        

In [19]:
classify_text("This vacuum cleaner has more suction power than a black hole. My carpets are spotless!")

Text: This vacuum cleaner has more suction power than a black hole. My carpets are spotless!
Predicted Sentiment: POSITIVE (Probability: 0.9776392579078674)


  outputs = lora_model(torch.tensor(input_ids, device=device), attention_mask=torch.tensor(attention_mask, device=device))


In [23]:
classify_text("The location was perfect, but the room had bedbugs and the AC was broken.")

  outputs = lora_model(torch.tensor(input_ids, device=device), attention_mask=torch.tensor(attention_mask, device=device))


Text: The location was perfect, but the room had bedbugs and the AC was broken.
Predicted Sentiment: NEGATIVE (Probability: 0.9508062601089478)
