In [1]:
# Required installations
!pip install transformers datasets torch evaluate bitsandbytes peft accelerate &> /dev/null

import os
import time
import torch
import psutil
import numpy as np
from datasets import load_dataset, Dataset
from google.colab import drive
from torch.cuda import max_memory_allocated, reset_peak_memory_stats, memory_reserved
from torch.cuda.amp import autocast
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorWithPadding
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)

# Configuration
class Config:
    MODEL_NAME = "microsoft/phi-2"
    NUM_LABELS = 3
    MAX_LENGTH = 128
    SEED = 42
    TRAIN_BATCH_SIZE = 8
    EVAL_BATCH_SIZE = 8
    LEARNING_RATE = 2e-4
    NUM_EPOCHS = 5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    LORA_R = 16
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.1

    def __init__(self, output_dir):
        self.OUTPUT_DIR = output_dir
        self.MODEL_SAVE_DIR = os.path.join(output_dir, "phi2_qlora_nli")
        self.FINAL_MODEL_DIR = os.path.join(output_dir, "phi2_qlora_nli_final")
        self.OUTPUT_FILE = os.path.join(output_dir, "output.txt")
        self.ANALYSIS_FILE = os.path.join(output_dir, "analysis_results.txt")

class Logger:
    def __init__(self, output_file):
        self.output_file = output_file

    def log(self, message):
        print(message)
        with open(self.output_file, "a") as f:
            f.write(f"{message}\n")

class DataProcessor:
    @staticmethod
    def load_snli_data():
        """Load and preprocess SNLI dataset with specified sampling."""
        dataset = load_dataset("snli")

        def sample_data(examples, start, end, interval):
            return Dataset.from_dict({
                'premise': [examples[i]['premise'] for i in range(start, end, interval)][:1000],
                'hypothesis': [examples[i]['hypothesis'] for i in range(start, end, interval)][:1000],
                'label': [examples[i]['label'] if examples[i]['label'] != -1 else 0
                         for i in range(start, end, interval)][:1000]
            })

        train_data = sample_data(list(dataset["train"]), 0, 550000, 550)
        val_data = sample_data(list(dataset["validation"]), 0, 10000, 100)
        test_data = sample_data(list(dataset["test"]), 0, 10000, 100)

        return train_data, val_data, test_data

class ModelHandler:
    def __init__(self, config):
        self.config = config

    @staticmethod
    def get_model_size(model):
        """Calculate model size in MB"""
        param_size = 0
        for param in model.parameters():
            param_size += param.nelement() * param.element_size()
        buffer_size = 0
        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()
        size_all_mb = (param_size + buffer_size) / 1024**2
        return size_all_mb

    def get_base_model(self):
        """Initialize base Phi-2 model without QLoRA."""
        tokenizer = self._get_tokenizer()
        model = AutoModelForSequenceClassification.from_pretrained(
            self.config.MODEL_NAME,
            num_labels=self.config.NUM_LABELS,
            trust_remote_code=True,
            device_map="auto"
        )
        model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_qlora_model(self):
        """Initialize Phi-2 model with QLoRA configuration."""
        tokenizer = self._get_tokenizer()
        model = self._initialize_quantized_model()
        model = self._apply_lora(model)
        return model, tokenizer

    def _get_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    def _initialize_quantized_model(self):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            self.config.MODEL_NAME,
            num_labels=self.config.NUM_LABELS,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        model.config.pad_token_id = self._get_tokenizer().pad_token_id
        return prepare_model_for_kbit_training(model)

    def _apply_lora(self, model):
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"]
        )
        return get_peft_model(model, lora_config)

class ResourceMonitor:
    @staticmethod
    def get_gpu_utilization():
        """Get GPU utilization percentage"""
        if torch.cuda.is_available():
            try:
                import pynvml
                pynvml.nvmlInit()
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                info = pynvml.nvmlDeviceGetUtilizationRates(handle)
                return info.gpu
            except:
                return None
        return None

    @staticmethod
    def get_cpu_utilization():
        """Get CPU utilization percentage"""
        return psutil.cpu_percent()

    @staticmethod
    def get_ram_usage():
        """Get RAM usage in GB"""
        return psutil.Process(os.getpid()).memory_info().rss / 1024**3

    @staticmethod
    def get_gpu_memory_usage():
        """Get current GPU memory usage in GB"""
        if torch.cuda.is_available():
            return torch.cuda.memory_reserved() / 1024**3
        return 0

class TrainingUtils:
    @staticmethod
    def tokenize_function(examples, tokenizer, max_length):
        tokenized = tokenizer(
            examples["premise"],
            examples["hypothesis"],
            padding=False,
            truncation=True,
            max_length=max_length,
        )
        tokenized["labels"] = examples["label"]
        return tokenized

    @staticmethod
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {
            "accuracy": (predictions == labels).astype(np.float32).mean().item(),
            "predictions": predictions.tolist(),
            "labels": labels.tolist()
        }

    @staticmethod
    def count_parameters(model):
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return total_params, trainable_params

class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        batch = super().__call__(features)
        batch["labels"] = torch.tensor([f["labels"] for f in features])
        return batch

def main():
    # Setup
    drive.mount('/content/drive')
    config = Config('/content/drive/My Drive/Sem 7/LLM/Assignment 3/A3_Outputs')
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)
    torch.manual_seed(config.SEED)
    logger = Logger(config.OUTPUT_FILE)

    # Clear output file
    open(config.OUTPUT_FILE, 'w').close()

    logger.log("Starting script execution...")
    logger.log(f"Output directory: {config.OUTPUT_DIR}")

    # Load datasets
    logger.log("Loading datasets...")
    data_processor = DataProcessor()
    train_data, val_data, test_data = data_processor.load_snli_data()

    # Initialize models
    model_handler = ModelHandler(config)

    # Initialize resource monitor
    resource_monitor = ResourceMonitor()

    # Evaluate base model
    logger.log("\nEvaluating base model...")
    base_model, base_tokenizer = model_handler.get_base_model()
    base_model_size = model_handler.get_model_size(base_model)
    total_params, _ = TrainingUtils.count_parameters(base_model)
    logger.log(f"Base model size: {base_model_size:.2f} MB")
    logger.log(f"Base model total parameters: {total_params:,}")

    # Prepare datasets
    tokenize_func = lambda x: TrainingUtils.tokenize_function(x, base_tokenizer, config.MAX_LENGTH)
    test_tokenized_base = test_data.map(tokenize_func, batched=True,
                                      remove_columns=['premise', 'hypothesis'])

    # Evaluate base model
    base_trainer = Trainer(
        model=base_model,
        tokenizer=base_tokenizer,
        compute_metrics=TrainingUtils.compute_metrics,
        data_collator=CustomDataCollator(tokenizer=base_tokenizer),
    )

    base_results = base_trainer.evaluate(test_tokenized_base)
    base_predictions = base_trainer.predict(test_tokenized_base)

    # Initialize and train QLoRA model
    logger.log("\nInitializing QLoRA model...")
    model, tokenizer = model_handler.get_qlora_model()
    qlora_model_size = model_handler.get_model_size(model)

    # Log parameter counts
    total_params, trainable_params = TrainingUtils.count_parameters(model)
    logger.log(f"Total parameters: {total_params:,}")
    logger.log(f"Trainable parameters: {trainable_params:,}")
    logger.log(f"Percentage of parameters fine-tuned: {(trainable_params/total_params)*100:.2f}%")

    # Prepare datasets for QLoRA
    logger.log("\nTokenizing datasets...")
    tokenize_func = lambda x: TrainingUtils.tokenize_function(x, tokenizer, config.MAX_LENGTH)
    train_tokenized = train_data.map(tokenize_func, batched=True,
                                   remove_columns=['premise', 'hypothesis'])
    val_tokenized = val_data.map(tokenize_func, batched=True,
                                remove_columns=['premise', 'hypothesis'])
    test_tokenized = test_data.map(tokenize_func, batched=True,
                                  remove_columns=['premise', 'hypothesis'])

    # Log initial resource usage
    initial_cpu = resource_monitor.get_cpu_utilization()
    initial_ram = resource_monitor.get_ram_usage()
    initial_gpu_util = resource_monitor.get_gpu_utilization()
    initial_gpu_mem = resource_monitor.get_gpu_memory_usage()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=config.MODEL_SAVE_DIR,
        learning_rate=config.LEARNING_RATE,
        per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=config.EVAL_BATCH_SIZE,
        num_train_epochs=config.NUM_EPOCHS,
        weight_decay=config.WEIGHT_DECAY,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_steps=10,
        fp16=True,
        gradient_accumulation_steps=5,
        warmup_ratio=config.WARMUP_RATIO,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        compute_metrics=TrainingUtils.compute_metrics,
        data_collator=CustomDataCollator(tokenizer=tokenizer),
    )

    # Train model
    if torch.cuda.is_available():
        reset_peak_memory_stats()

    logger.log("\nStarting training...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time
    peak_memory = max_memory_allocated() / (1024**3) if torch.cuda.is_available() else 0

    # After training, get final resource usage
    final_cpu = resource_monitor.get_cpu_utilization()
    final_ram = resource_monitor.get_ram_usage()
    final_gpu_util = resource_monitor.get_gpu_utilization()
    final_gpu_mem = resource_monitor.get_gpu_memory_usage()

    # Enhanced results logging
    logger.log("\n=== Model Size Analysis ===")
    logger.log(f"Base model size: {base_model_size:.2f} MB")
    logger.log(f"QLoRA model size: {qlora_model_size:.2f} MB")
    logger.log(f"Size reduction: {((base_model_size - qlora_model_size) / base_model_size * 100):.2f}%")

    logger.log("\n=== Hardware Resource Usage ===")
    logger.log(f"Training time: {training_time:.2f} seconds")
    logger.log(f"Peak GPU memory usage: {peak_memory:.2f} GB")
    logger.log(f"Final GPU memory usage: {final_gpu_mem:.2f} GB")
    if initial_gpu_util is not None:
        logger.log(f"GPU utilization: {initial_gpu_util:.1f}% → {final_gpu_util:.1f}%")
    logger.log(f"CPU utilization: {initial_cpu:.1f}% → {final_cpu:.1f}%")
    logger.log(f"RAM usage: {initial_ram:.2f} GB → {final_ram:.2f} GB")

    # Save model
    logger.log("\nSaving final model...")
    trainer.save_model(config.FINAL_MODEL_DIR)

    # Evaluate fine-tuned model
    logger.log("\nEvaluating fine-tuned model...")
    ft_results = trainer.evaluate(test_tokenized)
    ft_predictions = trainer.predict(test_tokenized)

    # Log results
    logger.log("\n=== Results Summary ===")
    logger.log(f"Base model accuracy: {base_results['eval_accuracy']:.4f}")
    logger.log(f"Fine-tuned model accuracy: {ft_results['eval_accuracy']:.4f}")
    logger.log(f"Training time: {training_time:.2f} seconds")
    logger.log(f"Peak GPU memory usage: {peak_memory:.2f} GB")

    # Analyze results
    base_preds = base_predictions.predictions.argmax(axis=1)
    ft_preds = ft_predictions.predictions.argmax(axis=1)
    true_labels = base_predictions.label_ids

    corrected_cases = [i for i in range(len(true_labels))
                      if base_preds[i] != true_labels[i] and ft_preds[i] == true_labels[i]]
    still_wrong_cases = [i for i in range(len(true_labels))
                        if base_preds[i] != true_labels[i] and ft_preds[i] != true_labels[i]]

    logger.log(f"\nNumber of cases corrected by fine-tuning: {len(corrected_cases)}")
    logger.log(f"Number of cases still wrong after fine-tuning: {len(still_wrong_cases)}")

    # Save detailed analysis
    with open(config.ANALYSIS_FILE, "w") as f:
        f.write("=== Corrected Cases ===\n")
        for idx in corrected_cases[:5]:
            f.write(f"\nExample {idx}:\n")
            f.write(f"Premise: {test_data[idx]['premise']}\n")
            f.write(f"Hypothesis: {test_data[idx]['hypothesis']}\n")
            f.write(f"True label: {true_labels[idx]}\n")
            f.write(f"Base prediction: {base_preds[idx]}\n")
            f.write(f"Fine-tuned prediction: {ft_preds[idx]}\n")

        f.write("\n=== Still Wrong Cases ===\n")
        for idx in still_wrong_cases[:5]:
            f.write(f"\nExample {idx}:\n")
            f.write(f"Premise: {test_data[idx]['premise']}\n")
            f.write(f"Hypothesis: {test_data[idx]['hypothesis']}\n")
            f.write(f"True label: {true_labels[idx]}\n")
            f.write(f"Base prediction: {base_preds[idx]}\n")
            f.write(f"Fine-tuned prediction: {ft_preds[idx]}\n")

    logger.log(f"\nAll outputs have been saved to: {config.OUTPUT_DIR}")

if __name__ == "__main__":
    main()

Mounted at /content/drive
Starting script execution...
Output directory: /content/drive/My Drive/Sem 7/LLM/Assignment 3/A3_Outputs
Loading datasets...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]


Evaluating base model...


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model size: 10119.49 MB
Base model total parameters: 2,648,568,320


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Trainer is attempting to log a value of "[1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 0, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 2]" of type <class 'list'> for key "eval/predictions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1, 0, 1, 1, 1, 2, 1, 2, 0, 1, 0, 0, 0, 2, 1, 1, 2, 0, 0, 2, 1, 2, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 0, 2, 1, 2, 1, 2, 2, 2, 2, 1, 0, 1, 0, 1, 2, 2, 2, 0, 1, 0, 0, 2, 2, 2, 1, 0, 0, 0, 2, 0, 2, 1, 2, 0, 1, 1, 0, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 2, 2, 1]" of type <class 'list'> for key "eval/labels" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Initializing QLoRA model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 1,398,149,120
Trainable parameters: 7,872,000
Percentage of parameters fine-tuned: 0.56%

Tokenizing datasets...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Predictions,Labels
1,1.3327,1.106526,0.4,"[1, 0, 1, 1, 2, 2, 2, 1, 0, 2, 0, 2, 1, 1, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 2, 1, 0, 2, 2, 0, 0, 0, 2, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 0, 2, 1, 2, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 1, 0, 1, 0, 1, 0, 2, 0, 2, 0, 1]","[1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0]"
2,0.7829,0.741942,0.67,"[1, 2, 2, 1, 2, 2, 2, 1, 0, 2, 1, 2, 0, 0, 1, 2, 1, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 1, 0, 1, 0, 2, 0, 2, 1, 1, 0, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1, 0, 2, 1, 0, 0, 2, 0, 1, 1, 0, 1, 2, 0, 2, 2, 0, 0, 0, 2, 2, 2, 1, 0, 1, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 1, 2, 0, 1, 1, 0, 1, 0, 1, 2, 0, 2, 2, 0, 0, 0]","[1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0]"
3,0.5424,0.624722,0.74,"[1, 2, 1, 0, 2, 2, 2, 1, 1, 2, 1, 2, 0, 0, 1, 0, 1, 1, 0, 0, 2, 0, 2, 2, 2, 0, 1, 2, 0, 1, 0, 2, 0, 2, 1, 1, 0, 2, 1, 0, 1, 2, 0, 1, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1, 0, 1, 2, 0, 2, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 1, 0, 1, 0, 0, 2, 2, 0, 1, 0, 1, 2, 0, 1, 1, 0, 1, 0, 1, 2, 0, 1, 1, 0, 0, 0]","[1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0]"
4,0.3357,0.566746,0.78,"[1, 2, 1, 0, 2, 2, 2, 1, 0, 2, 1, 2, 0, 1, 1, 2, 1, 2, 0, 0, 2, 0, 2, 2, 2, 0, 1, 2, 0, 1, 0, 2, 0, 2, 1, 2, 0, 2, 1, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1, 1, 1, 2, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 1, 1, 0, 2, 1, 2, 2, 2, 1, 1, 0, 0, 0]","[1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0]"
5,0.2638,0.593602,0.81,"[1, 2, 1, 0, 2, 2, 2, 1, 0, 2, 1, 2, 0, 1, 1, 0, 1, 1, 0, 0, 2, 0, 2, 2, 1, 0, 1, 2, 0, 1, 0, 2, 0, 2, 1, 2, 0, 1, 1, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1, 0, 1, 2, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 1, 1, 0, 2, 1, 1, 2, 2, 1, 1, 0, 0, 0]","[1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0]"


Trainer is attempting to log a value of "[1, 0, 1, 1, 2, 2, 2, 1, 0, 2, 0, 2, 1, 1, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 2, 1, 0, 2, 2, 0, 0, 0, 2, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 0, 2, 1, 2, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 1, 0, 1, 0, 1, 0, 2, 0, 2, 0, 1]" of type <class 'list'> for key "eval/predictions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 2, 0, 1, 2, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 0, 2, 2, 2, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0]" of type <class 'list'> for key "eval/labels" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
 


=== Model Size Analysis ===
Base model size: 10119.49 MB
QLoRA model size: 1741.52 MB
Size reduction: 82.79%

=== Hardware Resource Usage ===
Training time: 601.44 seconds
Peak GPU memory usage: 12.32 GB
Final GPU memory usage: 12.50 GB
CPU utilization: 35.4% → 65.7%
RAM usage: 4.27 GB → 4.35 GB

Saving final model...

Evaluating fine-tuned model...


Trainer is attempting to log a value of "[2, 0, 2, 1, 0, 2, 0, 2, 0, 1, 2, 1, 0, 2, 1, 1, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 1, 0, 2, 1, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 2, 1, 2, 0, 2, 1, 2, 0, 2, 2, 2, 2, 1, 0, 1, 0, 1, 2, 2, 2, 0, 1, 0, 0, 2, 2, 2, 1, 0, 0, 0, 2, 0, 2, 2, 2, 0, 1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 0, 1, 2, 2, 2, 1]" of type <class 'list'> for key "eval/predictions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1, 0, 1, 1, 1, 2, 1, 2, 0, 1, 0, 0, 0, 2, 1, 1, 2, 0, 0, 2, 1, 2, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 0, 2, 1, 2, 1, 2, 2, 2, 2, 1, 0, 1, 0, 1, 2, 2, 2, 0, 1, 0, 0, 2, 2, 2, 1, 0, 0, 0, 2, 0, 2, 1, 2, 0, 1, 1, 0, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 2, 2, 1]" of type <class 'list'> for key "eval/labels" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.



=== Results Summary ===
Base model accuracy: 0.3800
Fine-tuned model accuracy: 0.8200
Training time: 601.44 seconds
Peak GPU memory usage: 12.32 GB

Number of cases corrected by fine-tuning: 58
Number of cases still wrong after fine-tuning: 4

All outputs have been saved to: /content/drive/My Drive/Sem 7/LLM/Assignment 3/A3_Outputs
