# Setup

In [1]:
# Setup
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Install required packages
!pip install transformers
!pip install peft
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [3]:
# Load preprocessed data
BASE_PATH = "/content/drive/MyDrive/"
PROCESSED_DATA_PATH = f"{BASE_PATH}/data/processed/llama_training"

# Verify environment and data loading

In [5]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [6]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import json

In [7]:
# Clear GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [8]:
from huggingface_hub import login
login("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")

# My real token is removed prior to submitting this file for hand-in for security purpose

## Model Initialization Function

In [23]:
def initialize_model_4bit():
    print("Initializing 4-bit quantized model...")

    tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit quantization config
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # Define custom device map
    device_map = {
        'model.embed_tokens': 0,
        'model.layers': 0,
        'model.norm': 0,
        'lm_head': 0
    }

    # Initialize model
    model = LlamaForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        quantization_config=bnb_config,
        device_map=device_map,
        torch_dtype=torch.float16
    )

    # Configure LoRA with minimal parameters
    lora_config = LoraConfig(
        r=2,                     # Even smaller rank
        lora_alpha=4,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    # Prepare model for k-bit training
    from peft import prepare_model_for_kbit_training
    model = prepare_model_for_kbit_training(model)

    # Create PEFT model
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model, tokenizer

# Update how you call the initialization
model, tokenizer = initialize_model_4bit()

Initializing 4-bit quantized model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 1,048,576 || all params: 6,739,464,192 || trainable%: 0.0156


## Training Arguments

In [24]:
training_args = TrainingArguments(
    output_dir=f"{BASE_PATH}/outputs/llama_fine_tuned",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,

    # Memory optimizations
    fp16=True,
    gradient_checkpointing=True,
    max_grad_norm=0.3,

    # Reduced evaluation frequency
    evaluation_strategy="steps",
    eval_steps=100,

    # Minimal checkpointing
    save_strategy="epoch",
    save_total_limit=1,

    # Other optimizations
    optim="adamw_torch",
    logging_dir=f"{BASE_PATH}/outputs/llama_fine_tuned/logs",
    report_to="none",

    # Remove unused columns to save memory
    remove_unused_columns=False,
)



In [25]:
# Debugging: Checking GPU memory before model initialization
import torch
print("GPU Information:")
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f"Free GPU Memory: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB used")

GPU Information:
GPU Available: True
GPU Device: Tesla T4
Total GPU Memory: 14.75 GB
Free GPU Memory: 5.39 GB used


## Initialize Model

In [27]:
# Initialize model and tokenizer
model, tokenizer = initialize_model_4bit()

Initializing 4-bit quantized model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 22.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 17.06 MiB is free. Process 13393 has 14.73 GiB memory in use. Of the allocated memory 14.22 GiB is allocated by PyTorch, and 405.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Dataset Preparation

In [17]:
# Prepare datasets with memory efficiency
def prepare_training_dataset(data_path):
    with open(data_path, 'r') as f:
        data = json.load(f)
    return Dataset.from_list(data)

# Load and split dataset
print("Preparing datasets...")
full_dataset = prepare_training_dataset(f"{PROCESSED_DATA_PATH}/processed_original_dataset.json")
train_test_split = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Training set size: {len(train_dataset)}")
print(f"Evaluation set size: {len(eval_dataset)}")

Preparing datasets...
Training set size: 225
Evaluation set size: 25


## Trainer Setup

In [18]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8
    )
)

print("Training setup complete!")

Training setup complete!


## Start Training with Monitoring

In [19]:
# Start training with monitoring
print("Starting training process...")

try:
    training_output = trainer.train()

    print("\nTraining completed successfully!")
    print("\nTraining metrics:")
    print(f"Total training steps: {training_output.global_step}")
    print(f"Final training loss: {training_output.training_loss}")

    # Save the final model
    print("\nSaving model...")
    save_dir = f"{BASE_PATH}/outputs/llama_fine_tuned/final_model"
    trainer.save_model(save_dir)
    print(f"Model saved to: {save_dir}")

    # Save training states
    trainer.save_state()
    print("Training state saved!")

except Exception as e:
    print(f"\nAn error occurred during training: {str(e)}")

    # Try to save current state even if there's an error
    try:
        save_dir = f"{BASE_PATH}/outputs/llama_fine_tuned/emergency_save"
        trainer.save_model(save_dir)
        print(f"Emergency save completed to: {save_dir}")
    except Exception as save_error:
        print(f"Could not complete emergency save: {str(save_error)}")

print("\nTraining process finished!")

Starting training process...

An error occurred during training: You can't train a model that has been loaded in 8-bit or 4-bit precision with CPU or disk offload. If you want train the 8-bit or 4-bit model in CPU, please install bitsandbytes with multi-backend, see https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
Could not complete emergency save: Cannot copy out of meta tensor; no data!

Training process finished!


## Changing to a Smaller Model
- Switching to OPT-1.3B

In [28]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import json

# First, clear GPU memory
print("Clearing GPU memory...")
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU Memory after clearing: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB used")

def initialize_smaller_model():
    print("Initializing smaller model (OPT-1.3B)...")

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
    tokenizer.pad_token = tokenizer.eos_token

    # Initialize model
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-1.3b",
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Configure LoRA
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    # Create PEFT model
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model, tokenizer

# Training arguments for smaller model
training_args = TrainingArguments(
    output_dir=f"{BASE_PATH}/outputs/opt_fine_tuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_steps=50,
    logging_steps=10,

    # Memory optimizations
    fp16=True,
    gradient_checkpointing=True,

    # Evaluation and saving
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="epoch",
    save_total_limit=1,

    # Other settings
    logging_dir=f"{BASE_PATH}/outputs/opt_fine_tuned/logs",
    report_to="none",
    remove_unused_columns=False,
)

# Initialize model
try:
    print("\nInitializing model...")
    model, tokenizer = initialize_smaller_model()

    # Prepare datasets
    print("\nPreparing datasets...")
    def prepare_training_dataset(data_path):
        with open(data_path, 'r') as f:
            data = json.load(f)
        return Dataset.from_list(data)

    full_dataset = prepare_training_dataset(f"{PROCESSED_DATA_PATH}/processed_original_dataset.json")
    train_test_split = full_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']

    print(f"Training set size: {len(train_dataset)}")
    print(f"Evaluation set size: {len(eval_dataset)}")

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )
    )

    print("\nTraining setup complete!")

except Exception as e:
    print(f"\nError during setup: {str(e)}")

Clearing GPU memory...
GPU Memory after clearing: 14.22 GB used

Initializing model...
Initializing smaller model (OPT-1.3B)...




tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 1,572,864 || all params: 1,317,330,944 || trainable%: 0.1194

Preparing datasets...
Training set size: 225
Evaluation set size: 25

Training setup complete!


## Let's train!

In [31]:
!pip install transformers



In [37]:
# Prepare datasets with proper tokenization
def prepare_training_dataset(data_path):
    print("Loading and preparing dataset...")

    with open(data_path, 'r') as f:
        raw_data = json.load(f)

    # Convert data to the format expected by the model
    texts = []
    for item in raw_data:
        # Combine instruction and response in a structured format
        text = f"""### Instruction:
{item['text'].split('### Instruction:')[1].split('### Response:')[0].strip()}
### Response:
{item['text'].split('### Response:')[1].split('### End')[0].strip()}
### End"""
        texts.append({'text': text})

    # Create initial dataset
    dataset = Dataset.from_list(texts)

    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors=None
        )

    # Apply tokenization to the whole dataset
    tokenized_dataset = dataset.map(
        tokenize_function,
        remove_columns=['text'],
        desc="Tokenizing dataset",
        batch_size=len(dataset)
    )

    print(f"Dataset created with {len(tokenized_dataset)} examples")
    # Verify the format
    print("\nDataset format check:")
    print(f"Features: {tokenized_dataset.features}")
    first_example = tokenized_dataset[0]
    print(f"First example keys: {list(first_example.keys())}")

    return tokenized_dataset

print("\nPreparing datasets...")
try:
    # Prepare the datasets
    full_dataset = prepare_training_dataset(f"{PROCESSED_DATA_PATH}/processed_original_dataset.json")

    # Split into train and validation
    train_test_split = full_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']

    print(f"\nTraining set size: {len(train_dataset)}")
    print(f"Evaluation set size: {len(eval_dataset)}")

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )
    )

    print("\nTraining setup complete!")

except Exception as e:
    print(f"\nError during dataset preparation: {str(e)}")
    raise


Preparing datasets...
Loading and preparing dataset...


Tokenizing dataset:   0%|          | 0/250 [00:00<?, ? examples/s]

Dataset created with 250 examples

Dataset format check:
Features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
First example keys: ['input_ids', 'attention_mask']

Training set size: 225
Evaluation set size: 25

Training setup complete!


In [38]:
from transformers import TrainerCallback  # Corrected import

print("Starting training process...")

try:
    # Add progress monitoring
    class TrainingMonitor(TrainerCallback):
        def on_log(self, args, state, control, logs=None, **kwargs):
            if state.is_world_process_zero:
                if 'loss' in logs:
                    print(f"Step {state.global_step}: Training loss = {logs['loss']:.4f}")
                if 'eval_loss' in logs:
                    print(f"Step {state.global_step}: Evaluation loss = {logs['eval_loss']:.4f}")

                # Monitor GPU memory
                if torch.cuda.is_available():
                    allocated = torch.cuda.memory_allocated(0) / 1024**3
                    cached = torch.cuda.memory_reserved(0) / 1024**3
                    print(f"GPU Memory: {allocated:.2f} GB allocated, {cached:.2f} GB cached")

    # Add callback to trainer
    trainer.add_callback(TrainingMonitor())

    # Start training
    training_output = trainer.train()

    print("\nTraining completed successfully!")
    print("\nFinal Training Metrics:")
    print(f"Total training steps: {training_output.global_step}")
    print(f"Final training loss: {training_output.training_loss:.4f}")

    # Save the final model
    print("\nSaving model...")
    save_dir = f"{BASE_PATH}/outputs/opt_fine_tuned/final_model"
    trainer.save_model(save_dir)
    print(f"Model saved to: {save_dir}")

    # Save training state
    trainer.save_state()
    print("Training state saved!")

    # Print some final statistics
    if torch.cuda.is_available():
        print("\nFinal GPU Memory Usage:")
        print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
        print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

except Exception as e:
    print(f"\nAn error occurred during training: {str(e)}")

    # Try to save current state even if there's an error
    try:
        save_dir = f"{BASE_PATH}/outputs/opt_fine_tuned/emergency_save"
        trainer.save_model(save_dir)
        print(f"Emergency save completed to: {save_dir}")
    except Exception as save_error:
        print(f"Could not complete emergency save: {str(save_error)}")

finally:
    # Clean up
    print("\nCleaning up...")
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("Training process finished!")

Starting training process...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...



An error occurred during training: element 0 of tensors does not require grad and does not have a grad_fn
Emergency save completed to: /content/drive/MyDrive//outputs/opt_fine_tuned/emergency_save

Cleaning up...
Training process finished!
