In [1]:
!pip install peft
!pip install accelerate
!pip install bitsandbytes  # for 8-bit optimization if needed
!pip install datasets
!pip install accelerate bitsandbytes
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/peft.git



Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
D

In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, get_scheduler
from datasets import Dataset
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType
from typing import List, Dict

# Ensure reproducibility
np.random.seed(42)
torch.manual_seed(42)




<torch._C.Generator at 0x7a2fd824e430>

In [3]:
def calculate_question_difficulty(text: str) -> float:
    """
    Calculate question difficulty based on various heuristics.
    """
    # Simple heuristics for difficulty scoring
    difficulty_score = 0

    # Length-based complexity
    difficulty_score += len(text.split()) * 0.01

    # Keyword-based complexity
    complex_keywords = ['analyze', 'evaluate', 'explain', 'compare', 'contrast', 'predict']
    difficulty_score += sum(word in text.lower() for word in complex_keywords) * 0.5

    # Number of technical terms (can be expanded)
    technical_terms = ['algorithm', 'theory', 'principle', 'methodology']
    difficulty_score += sum(term in text.lower() for term in technical_terms) * 0.3

    return difficulty_score

In [4]:
def prepare_data(data_path: str, tokenizer) -> Dataset:
    """
    Load and preprocess the dataset.
    """
    df = pd.read_csv(data_path)

    # Format input text
    df["input_text"] = df.apply(
        lambda x: f"Question: {x['prompt']}\nA) {x['A']}\nB) {x['B']}\nC) {x['C']}\nD) {x['D']}\nE) {x['E']}\nAnswer: {x['answer']}</s>",
        axis=1
    )

    # Convert to Hugging Face Dataset
    dataset = Dataset.from_pandas(df[["input_text"]])

    # Tokenize the dataset
    def tokenize_function(examples):
        outputs = tokenizer(
            examples["input_text"],
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors=None
        )
        outputs["labels"] = outputs["input_ids"].copy()
        return outputs

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )

    return tokenized_dataset

In [5]:
def create_curriculum_dataloaders(tokenized_dataset: Dataset, num_stages: int = 3):
    """
    Create curriculum learning stages based on sequence length instead of difficulty score.
    """
    # Use sequence length as a proxy for difficulty
    sequence_lengths = [sum(attention_mask) for attention_mask in tokenized_dataset['attention_mask']]

    # Create a new dataset with sequence lengths
    indexed_dataset = Dataset.from_dict({
        'index': range(len(tokenized_dataset)),
        'length': sequence_lengths
    })

    # Sort by sequence length
    sorted_indices = sorted(range(len(sequence_lengths)), key=lambda k: sequence_lengths[k])

    # Split into stages
    stage_size = len(sorted_indices) // num_stages
    stages = []

    for i in range(num_stages):
        start_idx = i * stage_size
        end_idx = (i + 1) * stage_size if i < num_stages - 1 else len(sorted_indices)
        stage_indices = sorted_indices[start_idx:end_idx]
        stages.append(tokenized_dataset.select(stage_indices))

    return stages

In [6]:
def setup_model(model_name="facebook/opt-1.3b"):
    # Configure training optimizations
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with optimizations
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False
    )

    # Configure LoRA
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
    )

    # Prepare model for training
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model = get_peft_model(model, lora_config)

    # Print trainable parameters
    model.print_trainable_parameters()

    return model, tokenizer

In [7]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling  # Changed from default_data_collator
)

def fine_tune_model(
    dataset: Dataset,
    output_dir: str = "fine_tuned_model"
) -> tuple:
    """
    Fine-tune the LLM using LoRA and curriculum learning.
    """
    os.environ["WANDB_DISABLED"] = "true"

    # Initialize model and tokenizer
    model, tokenizer = setup_model()

    # Initialize data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",
        eval_steps=100,
        learning_rate=2e-4,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        num_train_epochs=1,
        weight_decay=0.05,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none",
        fp16=True,
        warmup_steps=100,
        dataloader_num_workers=0,
        remove_unused_columns=False,  # Changed to False
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        ddp_find_unused_parameters=False
    )

    # Create curriculum stages
    stages = create_curriculum_dataloaders(dataset, num_stages=3)

    # Train through curriculum stages
    for stage_idx, stage_dataset in enumerate(stages):
        print(f"\nTraining on curriculum stage {stage_idx + 1}/{len(stages)}")

        # Split into train and eval
        train_size = int(0.8 * len(stage_dataset))
        train_dataset = stage_dataset.select(range(train_size))
        eval_dataset = stage_dataset.select(range(train_size, len(stage_dataset)))

        # Ensure datasets have the right format
        print("Training dataset features:", train_dataset.features)
        print("Sample training input:", train_dataset[0])

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        trainer.train()
        eval_results = trainer.evaluate()
        print(f"Stage {stage_idx + 1} evaluation results:", eval_results)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer


In [8]:
def generate_answer(question: str, model, tokenizer) -> str:
    """
    Generate an answer using the fine-tuned model with format matching your data.
    """
    # Format prompt to match your data format
    prompt = f"""Question: {question}
Answer: """

    # Prepare input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate with specific parameters
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_return_sequences=1,
            temperature=0.9,  # Increased for more randomness
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the part after "Answer:"
    answer_part = response.split("Answer:")[-1].strip()
    return answer_part

In [9]:
def extract_selected_option(generated_answer: str, options: List[str]) -> str:
    """
    Extract the selected option from the generated answer based on your data format.
    """
    # Clean and uppercase the answer
    answer_upper = generated_answer.upper().strip()

    # First check if the answer is just a letter
    if len(answer_upper) == 1 and answer_upper in ['A', 'B', 'C', 'D', 'E']:
        return answer_upper

    # Look for exact matches in your data format (e.g., "A)")
    for option in options:
        if option.upper() in answer_upper:
            return option[0]  # Return just the letter

    # Fallback: look for first occurrence of A, B, C, D, or E
    for char in answer_upper:
        if char in ['A', 'B', 'C', 'D', 'E']:
            return char

    return "N/A"



In [10]:
!pip install accelerate bitsandbytes
!pip install transformers>=4.34.0




In [11]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `MIXTRAL` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authentica

In [12]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling  # Changed from default_data_collator
)
import torch

# Define model name
model_name = "mistralai/Mistral-7B-v0.1"

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
    use_fast=True,
)
tokenizer.pad_token = tokenizer.eos_token

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config
)

# Enable memory optimizations
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [13]:
import torch
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()


In [None]:
def main():
    # Paths and configurations
    dataset_path = "/content/Hackathon_KB.csv"
    fine_tuned_dir = "fine_tuned_model"

    # Prepare dataset
    print("Preparing dataset...")
    model, tokenizer = setup_model()  # Get tokenizer for data preparation
    dataset = prepare_data(dataset_path, tokenizer)

    print("Fine-tuning model...")
    model, tokenizer = fine_tune_model(dataset, output_dir=fine_tuned_dir)

    # Process test questions
    print("Processing test questions...")
    df = pd.read_csv("/content/Hackathon_Question_set_sample.csv")
    df['Question'] = df['Question'].fillna('').astype(str)

    results = []
    for idx, row in df.iterrows():
        question = row['Question']
        if not question or question.isspace():
            results.append({
                "Number": row['Number'],
                "Answer": "N/A",
                "Generated_Text": ""
            })
            continue

        try:
            options = [opt.strip() for opt in question.split() if opt.endswith(")")]
            generated_answer = generate_answer(question, model, tokenizer)
            selected_option = extract_selected_option(generated_answer, options)

            # Print for debugging
            print(f"\nQuestion {row['Number']}:")
            print(f"Generated text: {generated_answer}")
            print(f"Selected option: {selected_option}")

            results.append({
                "Number": row['Number'],
                "Answer": selected_option,
                "Generated_Text": generated_answer
            })

        except Exception as e:
            print(f"Error processing question {row['Number']}: {str(e)}")
            results.append({
                "Number": row['Number'],
                "Answer": "Error",
                "Generated_Text": str(e)
            })

    # Save results with generated text for analysis
    results_df = pd.DataFrame(results)
    results_df.to_csv("answers.csv", index=False)
    print("\nResults saved to answers.csv")

    # Print distribution of answers
    answer_dist = results_df['Answer'].value_counts()
    print("\nDistribution of answers:")
    print(answer_dist)

if __name__ == "__main__":
    main()


Preparing dataset...


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 2,359,296 || all params: 1,318,117,376 || trainable%: 0.1790


Map:   0%|          | 0/11876 [00:00<?, ? examples/s]

Fine-tuning model...
trainable params: 2,359,296 || all params: 1,318,117,376 || trainable%: 0.1790

Training on curriculum stage 1/3
Training dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Sample training input: {'input_ids': [2, 45641, 35, 520, 21, 83, 12, 11127, 4790, 116, 50118, 250, 43, 18069, 50118, 387, 43, 17616, 50118, 347, 43, 19515, 50118, 495, 43, 13466, 50118, 717, 43, 6200, 50118, 33683, 35, 163, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

Step,Training Loss,Validation Loss
