In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-assignment3/test_samples.csv
/kaggle/input/llm-assignment3/train_samples.csv
/kaggle/input/llm-assignment3/validation_samples.csv


In [3]:
!pip install transformers datasets peft

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [4]:
import os
import time
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
import shutil

# Set device for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load datasets with specified column names
train_dataset = load_dataset(
    'csv', 
    data_files='/kaggle/input/llm-assignment3/train_samples.csv', 
    column_names=['premise', 'hypothesis'], 
    split='train'
)

validation_dataset = load_dataset(
    'csv', 
    data_files='/kaggle/input/llm-assignment3/validation_samples.csv', 
    column_names=['premise', 'hypothesis'], 
    split='train'
)

# Verify number of examples loaded
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(validation_dataset)}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", 
    torch_dtype=torch.float16, 
    device_map="auto", 
    trust_remote_code=True
).to(device)

# Set pad token to eos token if it doesn't exist
tokenizer.pad_token = tokenizer.eos_token

# QLoRA Configuration with TaskType.CAUSAL_LM
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply QLoRA model adaptation
model = get_peft_model(model, lora_config)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples['premise'],
        examples['hypothesis'],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# Apply tokenization to training and validation datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_validation = validation_dataset.map(tokenize_function, batched=True)

# Custom dataset class to avoid redundant tensor conversions
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.input_ids[idx]),  # For language modeling
        }

# Create custom datasets
custom_train_dataset = CustomDataset(tokenized_train)
custom_validation_dataset = CustomDataset(tokenized_validation)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/phi2-finetuned",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    logging_dir='/kaggle/working/logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    load_best_model_at_end=True,
)

# DataLoader for batching
train_dataloader = DataLoader(custom_train_dataset, batch_size=training_args.per_device_train_batch_size, shuffle=True)
validation_dataloader = DataLoader(custom_validation_dataset, batch_size=training_args.per_device_eval_batch_size)

# Optimizer and mixed-precision setup
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()
accumulation_steps = 4

# Training loop for 5 epochs with timing and saving after each epoch
for epoch in range(training_args.num_train_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0

    for i, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    # Calculate and print time taken for the epoch
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch + 1}/{training_args.num_train_epochs}, Loss: {total_loss / len(train_dataloader)}, Time taken: {epoch_time:.2f} seconds")

    # Save model and tokenizer after each epoch
    epoch_output_dir = f"/kaggle/working/phi2-finetuned-epoch-{epoch + 1}"
    model.save_pretrained(epoch_output_dir)
    tokenizer.save_pretrained(epoch_output_dir)
    shutil.make_archive(epoch_output_dir, 'zip', epoch_output_dir)
    print(f"Model zipped and saved as {epoch_output_dir}.zip")


# Final save for the last model state
model.save_pretrained("/kaggle/working/phi2-finetuned-final")
tokenizer.save_pretrained("/kaggle/working/phi2-finetuned-final")


Using device: cuda
Number of training examples: 1001
Number of validation examples: 101


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

  scaler = GradScaler()
  with autocast():


Epoch 1/5, Loss: 0.09430356358562211, Time taken: 1121.98 seconds
Model zipped and saved as /kaggle/working/phi2-finetuned-epoch-1.zip
Epoch 2/5, Loss: 0.013739965788621853, Time taken: 1121.90 seconds
Model zipped and saved as /kaggle/working/phi2-finetuned-epoch-2.zip
Epoch 3/5, Loss: 0.012722672756433814, Time taken: 1121.67 seconds
Model zipped and saved as /kaggle/working/phi2-finetuned-epoch-3.zip
Epoch 4/5, Loss: 0.01225256772021522, Time taken: 1121.59 seconds
Model zipped and saved as /kaggle/working/phi2-finetuned-epoch-4.zip
Epoch 5/5, Loss: 0.011845991141967603, Time taken: 1121.67 seconds
Model zipped and saved as /kaggle/working/phi2-finetuned-epoch-5.zip


('/kaggle/working/phi2-finetuned-final/tokenizer_config.json',
 '/kaggle/working/phi2-finetuned-final/special_tokens_map.json',
 '/kaggle/working/phi2-finetuned-final/vocab.json',
 '/kaggle/working/phi2-finetuned-final/merges.txt',
 '/kaggle/working/phi2-finetuned-final/added_tokens.json',
 '/kaggle/working/phi2-finetuned-final/tokenizer.json')

In [5]:
shutil.make_archive("/kaggle/working/phi2-finetuned-final", 'zip', "/kaggle/working/phi2-finetuned-final")

'/kaggle/working/phi2-finetuned-final.zip'

Model Testing

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model once
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2").to(device)

# Set pad token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load test dataset
test_dataset = load_dataset(
    'csv', 
    data_files='/kaggle/input/llm-assignment3/test_samples.csv', 
    column_names=['premise', 'hypothesis'], 
    split='train'
)

# Tokenize the test dataset
def tokenize_function(examples):
    return tokenizer(
        examples['premise'],
        examples['hypothesis'],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.input_ids[idx]),  # For language modeling
        }

# Create test DataLoader
custom_test_dataset = CustomDataset(tokenized_test)
test_dataloader = DataLoader(custom_test_dataset, batch_size=1)

# Function to calculate token-level accuracy
def calculate_token_accuracy(predictions, labels):
    predictions = predictions.view(-1)
    labels = labels.view(-1)
    valid_indices = labels != -100  # Ignore padding tokens
    return accuracy_score(labels[valid_indices].cpu(), predictions[valid_indices].cpu())

# Evaluate each epoch's adapter model without reloading the base model
accuracies = []

for epoch in range(1, 6):
    print(f"\nEvaluating model from epoch {epoch}...")

    # Load adapter weights for the epoch
    adapter_dir = f"/kaggle/input/epoch-models/phi2-finetuned-epoch-{epoch}"
    model = PeftModel.from_pretrained(model, adapter_dir).to(device)
    
    total_token_accuracy = 0
    total_batches = 0

    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Generate predictions with `max_new_tokens`
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=50)

            # Calculate token-level accuracy
            predictions = outputs[:, :input_ids.shape[1]]  # Trim to input length
            total_token_accuracy += calculate_token_accuracy(predictions, input_ids)
            total_batches += 1

    # Calculate average token-level accuracy
    avg_token_accuracy = total_token_accuracy / total_batches
    accuracies.append(avg_token_accuracy)
    print(f"Token-level accuracy for epoch {epoch}: {avg_token_accuracy:.4f}")

# Print all accuracies
for epoch, accuracy in enumerate(accuracies, 1):
    print(f"Epoch {epoch} model token-level accuracy: {accuracy:.4f}")


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]


Evaluating model from epoch 1...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

Token-level accuracy for epoch 1: 1.0000

Evaluating model from epoch 2...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Token-level accuracy for epoch 2: 1.0000

Evaluating model from epoch 3...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Token-level accuracy for epoch 3: 1.0000

Evaluating model from epoch 4...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Token-level accuracy for epoch 4: 1.0000

Evaluating model from epoch 5...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Token-level accuracy for epoch 5: 1.0000
Epoch 1 model token-level accuracy: 1.0000
Epoch 2 model token-level accuracy: 1.0000
Epoch 3 model token-level accuracy: 1.0000
Epoch 4 model token-level accuracy: 1.0000
Epoch 5 model token-level accuracy: 1.0000


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2").to(device)

# Set pad token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load test dataset
test_dataset = load_dataset(
    'csv', 
    data_files='/kaggle/input/llm-assignment3/test_samples.csv', 
    column_names=['premise', 'hypothesis'], 
    split='train'
)

# Tokenize the test dataset
def tokenize_function(examples):
    return tokenizer(
        examples['premise'],
        examples['hypothesis'],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.input_ids[idx]),  # For language modeling
        }

# Create test DataLoader
custom_test_dataset = CustomDataset(tokenized_test)
test_dataloader = DataLoader(custom_test_dataset, batch_size=1)

# Function to calculate token-level accuracy
def calculate_token_accuracy(predictions, labels):
    predictions = predictions.view(-1)
    labels = labels.view(-1)
    valid_indices = labels != -100  # Ignore padding tokens
    return accuracy_score(labels[valid_indices].cpu(), predictions[valid_indices].cpu())

# Evaluate pre-trained model
total_token_accuracy = 0
total_batches = 0

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Get logits from the model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get the most likely token predictions
        predictions = torch.argmax(logits, dim=-1)

        # Calculate token-level accuracy
        total_token_accuracy += calculate_token_accuracy(predictions, input_ids)
        total_batches += 1

# Calculate average token-level accuracy
avg_token_accuracy = total_token_accuracy / total_batches
print(f"Token-level accuracy of the pre-trained model: {avg_token_accuracy:.4f}")



tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

KeyboardInterrupt: 