In [None]:
!pip install peft
!pip install accelerate
!pip install bitsandbytes  # for 8-bit optimization if needed
!pip install datasets




In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths under
# /content/drive/MyDrive read in main() depend on this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, get_scheduler
from datasets import Dataset
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType
from typing import List, Dict

# Seed the NumPy and PyTorch global random number generators with one shared value
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)




<torch._C.Generator at 0x7d087b279950>

In [None]:
def calculate_question_difficulty(text: str) -> float:
    """
    Score a question's difficulty with three additive heuristics.

    The score is the sum of:
      * 0.01 per whitespace-separated word,
      * 0.5 per "complex" keyword found anywhere in the lower-cased text,
      * 0.3 per technical term found anywhere in the lower-cased text.

    Keyword and term checks are substring matches, so "explained" counts as a
    hit for "explain". Each keyword/term contributes at most once.
    """
    lowered = text.lower()

    complex_keywords = ('analyze', 'evaluate', 'explain', 'compare', 'contrast', 'predict')
    technical_terms = ('algorithm', 'theory', 'principle', 'methodology')  # can be expanded

    # Length-based component
    length_component = len(text.split()) * 0.01

    # Count how many distinct keywords / terms occur in the text
    keyword_hits = 0
    for keyword in complex_keywords:
        if keyword in lowered:
            keyword_hits += 1

    term_hits = len([term for term in technical_terms if term in lowered])

    # Same left-to-right accumulation order as length, then keywords, then terms
    return length_component + keyword_hits * 0.5 + term_hits * 0.3

In [None]:
def prepare_data(data_path: str, tokenizer, max_length: int = 512) -> Dataset:
    """
    Load the multiple-choice CSV and tokenize it for causal language modeling.

    Args:
        data_path: Path to a CSV (read as ISO-8859-1) with the columns
            'prompt', 'A', 'B', 'C', 'D', 'E' and 'answer'.
        tokenizer: Tokenizer used to encode the formatted text. It must have a
            pad token configured because every example is padded to
            ``max_length``.
        max_length: Fixed sequence length; longer examples are truncated and
            shorter ones are padded. Defaults to 512.

    Returns:
        A Hugging Face ``Dataset`` holding the tokenizer outputs plus a
        'labels' column. Labels equal the input ids on real tokens and -100 on
        padding positions.
    """
    df = pd.read_csv(data_path, encoding="ISO-8859-1")

    # Format input text
    df["input_text"] = df.apply(
        lambda x: f"Question: {x['prompt']}\nA) {x['A']}\nB) {x['B']}\nC) {x['C']}\nD) {x['D']}\nE) {x['E']}\nAnswer: {x['answer']}\n",
        axis=1
    )

    # Tokenize the dataset
    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["input_text"],
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors=None
        )

        # Labels for causal language modeling. Padding positions are set to
        # -100 so the loss ignores them; copying input_ids verbatim would make
        # the model spend most of its loss on predicting pad tokens. The
        # attention mask is used (not the pad id) because the pad token may be
        # the same id as EOS.
        tokenized["labels"] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
        return tokenized

    # Convert to Hugging Face Dataset and tokenize
    dataset = Dataset.from_pandas(df[["input_text"]])
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    return tokenized_dataset

In [None]:
def create_curriculum_dataloaders(tokenized_dataset: "Dataset", num_stages: int = 3) -> List["Dataset"]:
    """
    Split a tokenized dataset into curriculum stages ordered by sequence length.

    Sequence length (the number of non-padding tokens, i.e. the sum of each
    attention mask) is used as the difficulty proxy: stage 0 holds the
    shortest examples, the last stage the longest.

    Args:
        tokenized_dataset: Dataset with an 'attention_mask' column that
            supports ``len()`` and ``select(indices)``.
        num_stages: Number of stages to produce; must be at least 1.

    Returns:
        A list of ``num_stages`` datasets (not DataLoaders, despite the
        function name). Every stage except the last has
        ``len(dataset) // num_stages`` rows; the last stage also takes the
        remainder. If the dataset has fewer rows than ``num_stages`` the
        leading stages are empty.

    Raises:
        ValueError: If ``num_stages`` is smaller than 1.
    """
    if num_stages < 1:
        raise ValueError(f"num_stages must be a positive integer, got {num_stages}")

    # Use sequence length as a proxy for difficulty
    sequence_lengths = [sum(attention_mask) for attention_mask in tokenized_dataset['attention_mask']]

    # Row indices ordered from shortest to longest sequence (stable for ties)
    sorted_indices = sorted(range(len(sequence_lengths)), key=sequence_lengths.__getitem__)

    # Split into stages
    stage_size = len(sorted_indices) // num_stages
    stages = []

    for i in range(num_stages):
        start_idx = i * stage_size
        # The final stage absorbs whatever is left after integer division
        end_idx = start_idx + stage_size if i < num_stages - 1 else len(sorted_indices)
        stages.append(tokenized_dataset.select(sorted_indices[start_idx:end_idx]))

    return stages

In [None]:
def fine_tune_model(
    dataset: Dataset,
    model_name: str = "EleutherAI/gpt-neo-1.3B",
    #model_name: str = "mistralai/Mistral-7B-v0.1",
    output_dir: str = "fine_tuned_model"
) -> tuple:
    """
    Fine-tune a causal language model with LoRA using length-based curriculum stages.

    The dataset is split into three stages by create_curriculum_dataloaders
    (shortest sequences first). For each stage a fresh Trainer is built on the
    same LoRA-wrapped model, trained for one epoch on 80% of the stage and
    evaluated on the remaining 20%. The adapter and tokenizer are then saved
    to ``output_dir``.

    Args:
        dataset: Tokenized dataset with 'input_ids', 'attention_mask' and
            'labels' columns.
        model_name: Hugging Face model id of the base model and tokenizer.
        output_dir: Directory used for Trainer output and for the final
            ``save_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple: the LoRA-wrapped model after training
        and the tokenizer loaded for ``model_name``.
    """
    # Kept as a function-local import, as in the original cell
    from transformers import default_data_collator

    os.environ["WANDB_DISABLED"] = "true"

    # Initialize tokenizer; fall back to EOS when the model defines no pad token
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        use_cache=False
    ).to('cuda' if torch.cuda.is_available() else 'cpu')

    # Configure LoRA on the attention query/key/value projections
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=['q_proj', 'v_proj', 'k_proj'],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False
    )

    # Get PEFT model
    model = get_peft_model(model, lora_config)

    # Training arguments. The deprecated `no_cuda=False` flag was dropped: it
    # only restated the default device selection.
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",
        eval_steps=100,
        learning_rate=2e-4,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=1,
        weight_decay=0.05,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none",
        fp16=True,
        warmup_steps=100,
        dataloader_num_workers=0,
        remove_unused_columns=True,
    )

    # Create curriculum stages
    stages = create_curriculum_dataloaders(dataset, num_stages=3)

    # Train through curriculum stages
    for stage_idx, stage_dataset in enumerate(stages):
        print(f"\nTraining on curriculum stage {stage_idx + 1}/{len(stages)}")

        # Each stage arrives sorted by sequence length. Shuffle before the
        # 80/20 split so the eval set is a random sample of the stage rather
        # than only its longest examples.
        shuffled_stage = stage_dataset.shuffle(seed=42)
        train_size = int(0.8 * len(shuffled_stage))

        # A stage too small to yield both a train and an eval row cannot be
        # used with step-based evaluation; skip it instead of crashing.
        if train_size == 0 or train_size == len(shuffled_stage):
            print(f"Skipping stage {stage_idx + 1}: only {len(shuffled_stage)} example(s), cannot split into train/eval")
            continue

        train_dataset = shuffled_stage.select(range(train_size))
        eval_dataset = shuffled_stage.select(range(train_size, len(shuffled_stage)))

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=default_data_collator,
        )

        trainer.train()

        eval_results = trainer.evaluate()
        print(f"Stage {stage_idx + 1} evaluation results:", eval_results)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer

In [None]:
def generate_answer(question: str, model, tokenizer, max_new_tokens: int = 50) -> str:
    """
    Generate the model's continuation for a question.

    Args:
        question: Prompt text fed to the model.
        model: Causal language model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            to decode the output.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        Only the newly generated text, stripped of surrounding whitespace. The
        prompt is not echoed back, so option markers that appear in the
        question cannot be mistaken for the model's answer.
    """
    # Run on whatever device the model actually lives on
    device = next(model.parameters()).device

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs when the pad token equals EOS.
    encoded = tokenizer(question, return_tensors="pt", truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            # max_new_tokens bounds only the continuation; the previous
            # max_length=200 also counted prompt tokens, leaving no room to
            # generate for long questions.
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            # Greedy decoding, which is what the earlier call effectively did:
            # temperature/top_p have no effect unless do_sample=True.
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt + continuation for causal LMs; keep the continuation
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
def extract_selected_option(generated_answer: str, options: List[str]) -> str:
    """
    Pick the option the generated answer mentions first.

    Args:
        generated_answer: Text produced by the model.
        options: Candidate option markers (for example "A)", "B)").

    Returns:
        The option whose first occurrence in ``generated_answer`` is earliest,
        or "N/A" when none of the options occurs. When two options start at
        the same position, the one listed first in ``options`` wins. Empty
        option strings are ignored because they would match any text.
    """
    best_option = "N/A"
    best_position = None

    for option in options:
        if not option:
            continue
        position = generated_answer.find(option)
        # Rank by position in the answer, not by order in `options`, so an
        # answer starting with "C)" is not reported as "A)" just because "A)"
        # is mentioned later on.
        if position != -1 and (best_position is None or position < best_position):
            best_option = option
            best_position = position

    return best_option


In [None]:
def main():
    """
    Run the whole pipeline: tokenize the knowledge-base CSV, fine-tune the
    model, answer every row of the question CSV and write answers.csv.
    """
    # Paths and configurations
    knowledge_base_csv = "/content/drive/MyDrive/data/Hackathon_KB_updated.csv"
    model_dir = "fine_tuned_model"

    # A tokenizer is needed before training in order to build the dataset
    base_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
    if base_tokenizer.pad_token is None:
        base_tokenizer.pad_token = base_tokenizer.eos_token
        base_tokenizer.pad_token_id = base_tokenizer.eos_token_id

    # Prepare dataset and fine-tune model
    print("Preparing dataset...")
    training_data = prepare_data(knowledge_base_csv, base_tokenizer)

    print("Fine-tuning model...")
    model, tokenizer = fine_tune_model(training_data, output_dir=model_dir)

    print("Processing test questions...")
    questions_df = pd.read_csv("/content/drive/MyDrive/Hackathon_Question_set (1).csv")

    answers = []
    for idx, row in questions_df.iterrows():
        question = row['Question']

        # Candidate option markers: whitespace-separated tokens ending in ")"
        options = []
        for token in question.split():
            if token.endswith(")"):
                options.append(token.strip())

        completion = generate_answer(question, model, tokenizer)
        chosen = extract_selected_option(completion, options)
        answers.append({"Number": row['Number'], "Answer": chosen})

        # Progress update every tenth row
        if (idx + 1) % 10 == 0:
            print(f"Processed {idx + 1}/{len(questions_df)} questions")

    # Save results
    pd.DataFrame(answers).to_csv("answers.csv", index=False)
    print("Results saved to answers.csv")

if __name__ == "__main__":
    main()

Preparing dataset...


Map:   0%|          | 0/11975 [00:00<?, ? examples/s]

Fine-tuning model...

Training on curriculum stage 1/3


Step,Training Loss,Validation Loss
100,0.122,0.151664
200,0.1043,0.14772
300,0.088,0.147618


Stage 1 evaluation results: {'eval_loss': 0.146981343626976, 'eval_runtime': 101.2136, 'eval_samples_per_second': 7.894, 'eval_steps_per_second': 0.988, 'epoch': 1.0}

Training on curriculum stage 2/3


Step,Training Loss,Validation Loss
100,0.2236,0.306284
200,0.2186,0.302203
300,0.2178,0.298856


Stage 2 evaluation results: {'eval_loss': 0.2976418137550354, 'eval_runtime': 101.5432, 'eval_samples_per_second': 7.869, 'eval_steps_per_second': 0.985, 'epoch': 1.0}

Training on curriculum stage 3/3


Step,Training Loss,Validation Loss
100,0.3614,0.513434


In [None]:

    # NOTE(review): this cell repeats the inference loop from main() against a
    # different question file. As written it cannot run on its own:
    #   * the statements are indented although no enclosing block is open, which
    #     is an IndentationError at the top level of a cell;
    #   * `model` and `tokenizer` are only ever bound as locals inside main(), so
    #     they are not defined in the notebook namespace.
    # TODO: confirm the intent (reuse the trained model, or reload it from the
    # "fine_tuned_model" directory) before dedenting.
    print("Processing test questions...")
    df = pd.read_csv("/content/Hackathon_Question_set_sample22.csv")

    results = []
    for idx, row in df.iterrows():
        question = row['Question']
        # Candidate option markers: whitespace-separated tokens ending in ")"
        options = [opt.strip() for opt in question.split() if opt.endswith(")")]
        generated_answer = generate_answer(question, model, tokenizer)
        selected_option = extract_selected_option(generated_answer, options)
        results.append({"Number": row['Number'], "Answer": selected_option})

        # Progress update
        if (idx + 1) % 10 == 0:
            print(f"Processed {idx + 1}/{len(df)} questions")

    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv("answers.csv", index=False)
    print("Results saved to answers.csv")

# NOTE(review): calling main() here repeats the full prepare / fine-tune /
# inference run and rewrites answers.csv from the Drive question set, replacing
# the file written by the loop above. Presumably a copy-paste leftover - confirm.
if __name__ == "__main__":
    main()