In [1]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import Dataset

class AGNewsDataset(Dataset):
    def __init__(self, dataset_dict, tokenizer, split="train", max_length=512):
        self.dataset = dataset_dict[split]
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        # Combine title and text for better context
        text = f"{item['text']}"
        
        # length and proper padding
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_special_tokens_mask=True,
            return_tensors=None  # Don't return as tensors yet
        )
        
       
        input_ids = torch.tensor(encoding['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(encoding['attention_mask'], dtype=torch.long)
        
        # labels for language modeling
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100  
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [2]:
pip install --upgrade datasets huggingface_hub

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
dataset = load_dataset("ag_news")
print(f"Dataset loaded. Train size: {len(dataset['train'])}")

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset loaded. Train size: 120000


In [4]:
# Load model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False  # For training



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# Create custom datasets
train_dataset = AGNewsDataset(dataset, tokenizer, "train")
test_dataset = AGNewsDataset(dataset, tokenizer, "test")

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Verify dataset output format
sample_output = train_dataset[0]
print("\nSample output keys:", sample_output.keys())
print("Sample output shapes:")
for key, val in sample_output.items():
    print(f"{key}: {val.shape}")

Train dataset size: 120000
Test dataset size: 7600

Sample output keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Sample output shapes:
input_ids: torch.Size([512])
attention_mask: torch.Size([512])
labels: torch.Size([512])


In [6]:
# LoRA config
config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# LoRA to the model
lora_model = get_peft_model(model, config)
print("\nTrainable parameters:")
lora_model.print_trainable_parameters()

# cache
torch.cuda.empty_cache()




Trainable parameters:
trainable params: 405,504 || all params: 124,845,312 || trainable%: 0.32480514766946156


In [None]:
# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, #tried 16,8 
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    max_grad_norm=1.0,
    warmup_steps=500,
    fp16=True  
)

In [None]:
# Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start
trainer.train()

In [7]:
print("\nSaving model...")
lora_model.save_pretrained("ag_news_gpt2_lora")



Saving model...


In [8]:
def evaluate_model(model, dataset, tokenizer, batch_size=8):
    model.eval()
    total_loss = 0
    total_samples = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # dataset in batches
    with torch.no_grad():
        for i in range(0, len(dataset), batch_size):
            end_idx = min(i + batch_size, len(dataset))
            
            
            batch_items = [dataset[j] for j in range(i, end_idx)]
            input_ids = torch.stack([item['input_ids'] for item in batch_items]).to(device)
            attention_mask = torch.stack([item['attention_mask'] for item in batch_items]).to(device)
            labels = torch.stack([item['labels'] for item in batch_items]).to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item() * (end_idx - i)
            total_samples += (end_idx - i)
    
    return total_loss / total_samples

# Now evaluate the model
print("\nEvaluating fine-tuned model...")
try:
    finetuned_loss = evaluate_model(lora_model, test_dataset, tokenizer)
    finetuned_perplexity = torch.exp(torch.tensor(finetuned_loss))
    print(f"Fine-tuned Model Perplexity: {finetuned_perplexity:.2f}")
except Exception as e:
    print(f"Error during evaluation: {str(e)}")
    print("Let's check the dataset structure:")
    sample_item = test_dataset[0]
    print("\nSample dataset item structure:")
    for key, value in sample_item.items():
        print(f"{key}: {type(value)}, shape: {value.shape if hasattr(value, 'shape') else 'N/A'}")


Evaluating fine-tuned model...
Fine-tuned Model Perplexity: 72.18


In [9]:
from peft import AutoPeftModelForCausalLM
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("\nVerifying saved model...")
loaded_model = AutoPeftModelForCausalLM.from_pretrained("ag_news_gpt2_lora")
loaded_model = loaded_model.to(device)

# Test generation
test_text = "Wall Street on Monday as"
inputs = tokenizer(test_text, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = loaded_model.generate(input_ids=inputs["input_ids"], max_new_tokens=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nSample generation:")
print(generated_text)


Verifying saved model...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Sample generation:
Wall Street on Monday as the Federal Reserve's decision to raise interest rates on its $1.25 trillion bond-buying program was announced.

The Fed's decision to raise rates on its $1.25 trillion bond-buying program was announced Monday.



In [10]:
test_prompts = [
    "The technology  announced",
    "In sports , the team",
    "The economy  signs of",
    "Scientists a new"
]

print("\nAdditional Generation Examples:")
for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = loaded_model.generate(input_ids=inputs["input_ids"], max_new_tokens=53)
    print(f"\nPrompt: {prompt}")
    print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Additional Generation Examples:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt: The technology  announced
Generated: The technology  announced by the company is a "smart" way to detect and track the movement of a person's body.

The company is also developing a new way to track the movement of a person's body.

The company is also developing a new way to track


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt: In sports , the team
Generated: In sports, the team's goal is to win the championship.

"We're not going to be a team that's going to win the championship," said coach Mike Krzyzewski. "We're going to be a team that's going to win the championship. We're


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt: The economy  signs of
Generated: The economy  signs of recovery  and the economy  signs of recovery  and the economy  signs of recovery  and the economy  signs of recovery  and the economy  signs of recovery  and the economy  signs of recovery  and the economy  signs

Prompt: Scientists a new
Generated: Scientists a new study of the effects of the Zika virus on the brain of a young girl who was infected with the virus has found that the virus can cause a brain abnormality.

The girl, who was born with a brain abnormality, was infected with the virus in
