In [2]:
!pip install -q --upgrade transformers datasets peft bitsandbytes trl
!pip install -q accelerate

In [3]:
# Create a basic accelerate configuration for this machine. The recorded cell
# output shows the call returning the path of the default config file.
from accelerate.utils import write_basic_config
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [4]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    AutoConfig
)
from transformers import EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from peft import LoraConfig, get_peft_model,PeftModel
from trl import SFTTrainer
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar

In [5]:
# Fix the global RNG seed for both PyTorch and NumPy so runs are repeatable.
seed = 7
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(seed)

In [6]:
# Report how many CUDA devices PyTorch can see in this session.
gpu_count = torch.cuda.device_count()
print(f"Number of GPUs Available:{gpu_count}")

Number of GPUs Available:2


In [7]:
# Read the Hugging Face access token from the environment instead of relying on
# a `token` variable left over from an earlier (now commented-out) assignment:
# on a fresh kernel `token` would otherwise be undefined and this cell would
# raise NameError. Keeping the secret out of the notebook also avoids leaking it.
token = os.environ.get("HF_TOKEN")
if not token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this notebook."
    )
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
# Base checkpoint to fine-tune.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# Directory under which adapters and result CSVs are written
# (an empty string makes every output path relative to the working directory).
save_dir = ""

In [9]:
def _build_lora_config():
    """Return a fresh LoRA configuration for causal-LM fine-tuning.

    Rank 256 with alpha 512 and 20% dropout, applied to the attention
    projections (q/k/v/o) and the MLP projections (gate/up/down); bias terms
    are not adapted. A new object (and a new ``target_modules`` list) is built
    on every call so the per-level configs never share state.
    """
    return LoraConfig(
        r=256,
        lora_alpha=512,
        lora_dropout=0.2,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        bias="none",
        task_type="CAUSAL_LM",
    )


# One independent config per curriculum level, keyed "lora_config1" ..
# "lora_config5". All five currently use the same hyper-parameters; edit the
# entry for a level here if it should diverge.
lora_config = {
    f"lora_config{level_idx}": _build_lora_config() for level_idx in range(1, 6)
}

In [10]:
# bitsandbytes settings: 4-bit loading with the NF4 quantization type,
# double quantization enabled, and float16 as the compute dtype.
_quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": 'nf4',
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**_quantization_kwargs)

In [11]:
# Load the tokenizer that belongs to the base checkpoint, then switch it to
# left padding (the batched generation loop later in the notebook pads prompts
# with this tokenizer).
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=token,
)
tokenizer.padding_side = "left"

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [12]:
# Fallback special tokens, used only for the ones the tokenizer does not define.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"

# Collect a fallback for every special token that is currently missing
# (checked in the order pad, eos, bos).
special_tokens_dict = {
    attr_name: fallback
    for attr_name, fallback in (
        ('pad_token', DEFAULT_PAD_TOKEN),
        ('eos_token', DEFAULT_EOS_TOKEN),
        ('bos_token', DEFAULT_BOS_TOKEN),
    )
    if getattr(tokenizer, attr_name) is None
}

# Register the missing tokens with the tokenizer, if there are any.
if len(special_tokens_dict) > 0:
    tokenizer.add_special_tokens(special_tokens_dict)

In [13]:
# Load the base causal LM with the bitsandbytes quantization settings defined
# above, letting `device_map="auto"` decide module placement.
_base_model_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "token": token,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_base_model_kwargs)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [14]:
def preprocess_function(examples):
    """Tokenize a batch of MATH examples and build prompt-masked labels.

    Each problem is wrapped in the chat-style prompt below, the reference
    solution plus the tokenizer's EOS token is appended, and the concatenated
    text is tokenized (truncated to 512 tokens, padded to the longest sequence
    in the batch).

    Args:
        examples: Batch mapping with parallel lists under ``'problem'`` and
            ``'solution'`` (the form ``Dataset.map(..., batched=True)`` passes).

    Returns:
        The tokenizer output for the concatenated texts (``input_ids`` and
        ``attention_mask`` tensors) with an added ``labels`` tensor. ``labels``
        equals ``input_ids`` on the solution tokens and is ``-100`` on padding
        and on prompt tokens, so only the solution contributes to the loss.
    """
    # Format inputs with the instruction.
    # NOTE(review): the "\\boxed{{}}" piece is a plain (non-f) string, so the
    # doubled braces reach the model literally. The evaluation prompt uses the
    # same text, so it is intentionally left unchanged here.
    inputs = [
        "<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
        "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
        f"Solve the following math problem: {problem}\n"
        "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
        "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
        for problem in examples['problem']
    ]
    # Append the solution with EOS token
    targets = [
        f"{solution}{tokenizer.eos_token}"
        for solution in examples['solution']
    ]
    # Concatenate inputs and targets
    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]
    # Tokenize the concatenated texts
    model_inputs = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding="longest",  # Pad to the longest sequence of this batch
        return_tensors="pt"
    )
    attention_mask = model_inputs["attention_mask"]

    # Create labels by cloning input IDs; padding never contributes to the loss.
    labels = model_inputs["input_ids"].clone()
    labels[attention_mask == 0] = -100

    # Measure every prompt with the same special-token handling as the full
    # texts, so the prompt length lines up with the tokens actually present in
    # `input_ids` (one batched call instead of one tokenizer call per row).
    prompt_lengths = [len(ids) for ids in tokenizer(inputs)["input_ids"]]

    # Mask the prompt tokens. The tokenizer may pad on the left, in which case
    # the prompt does not start at index 0: locate the first real token via the
    # attention mask (argmax returns the first position holding the maximum).
    for row, prompt_len in enumerate(prompt_lengths):
        start = int(attention_mask[row].argmax())
        labels[row, start:start + prompt_len] = -100
    model_inputs["labels"] = labels
    return model_inputs

In [15]:
# Collator used by the trainer: causal-LM mode (no masked-LM objective), with
# batches padded to a multiple of 8 tokens.
# NOTE(review): in causal-LM mode this collator appears to rebuild `labels`
# from `input_ids`, which would discard the prompt masking produced by
# `preprocess_function` — confirm against the installed transformers version.
_collator_kwargs = {
    "tokenizer": tokenizer,
    "mlm": False,
    "pad_to_multiple_of": 8,
}
data_collator = DataCollatorForLanguageModeling(**_collator_kwargs)

In [16]:
# Load the MATH dataset, then keep only the training rows whose difficulty
# label is not the "Level ?" placeholder (the test split is left untouched).
dataset = load_dataset("Maxwell-Jia/MATH", trust_remote_code=True)
_train_with_known_level = dataset['train'].filter(lambda x: x['level'] != "Level ?")
dataset['train'] = _train_with_known_level

MATH.py:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

data/algebra_train.jsonl:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

(…)ata/counting_and_probability_train.jsonl:   0%|          | 0.00/707k [00:00<?, ?B/s]

data/geometry_train.jsonl:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

data/intermediate_algebra_train.jsonl:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

data/number_theory_train.jsonl:   0%|          | 0.00/639k [00:00<?, ?B/s]

data/prealgebra_train.jsonl:   0%|          | 0.00/778k [00:00<?, ?B/s]

data/precalculus_train.jsonl:   0%|          | 0.00/903k [00:00<?, ?B/s]

data/algebra_test.jsonl:   0%|          | 0.00/706k [00:00<?, ?B/s]

data/counting_and_probability_test.jsonl:   0%|          | 0.00/377k [00:00<?, ?B/s]

data/geometry_test.jsonl:   0%|          | 0.00/562k [00:00<?, ?B/s]

data/intermediate_algebra_test.jsonl:   0%|          | 0.00/860k [00:00<?, ?B/s]

data/number_theory_test.jsonl:   0%|          | 0.00/376k [00:00<?, ?B/s]

data/prealgebra_test.jsonl:   0%|          | 0.00/553k [00:00<?, ?B/s]

data/precalculus_test.jsonl:   0%|          | 0.00/614k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [17]:
# For each LoRA config: resize the embeddings to the tokenizer's vocabulary,
# wrap the model with the config, and print the trainable-parameter share.
# NOTE(review): `model` is rebound on every pass, so each config is applied to
# the model already wrapped by the previous pass — confirm this is intended.
for lci, _peft_cfg in lora_config.items():
    model.resize_token_embeddings(len(tokenizer))
    model = get_peft_model(model, _peft_cfg)
    _param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
    total_params = sum(size for size, _ in _param_sizes)
    trainable_params = sum(size for size, is_trainable in _param_sizes if is_trainable)
    print(f"Trainable parameters: {trainable_params} ({100 * trainable_params / total_params:.2f}% of total), Total : {total_params}")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Trainable parameters: 180355072 (19.40% of total), Total : 929632256
Trainable parameters: 180355072 (19.40% of total), Total : 929632256
Trainable parameters: 180355072 (19.40% of total), Total : 929632256
Trainable parameters: 180355072 (19.40% of total), Total : 929632256
Trainable parameters: 180355072 (19.40% of total), Total : 929632256


In [18]:
# Curriculum fine-tuning. For each difficulty level 1..5:
#   1. prepare the model (fresh quantized base at level 1, otherwise the
#      adapter saved by the previous level) and wrap it with that level's
#      LoRA config,
#   2. train on every MATH problem whose level is <= the current level,
#   3. save the adapter + tokenizer locally and push both to the Hub.
# After level 5 the model generates solutions for the test split and the
# predictions are written to CSV.
for level in range(1, 6):
    print(f"\nStarting training for Level {level}")

    if level == 1:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            token=token,
        )
        model.resize_token_embeddings(len(tokenizer))
    else:
        # Load the adapter saved by the previous level.
        prev_level = level - 1
        prev_model_path = os.path.join(save_dir, f"rank_llama_level_{prev_level}")

        # NOTE(review): `model` here is the PEFT-wrapped model left over from
        # the previous iteration, so this wraps an already-wrapped model, and
        # `peft_config` receives the bitsandbytes config rather than a PEFT
        # config. Both look unintended but are kept as-is because changing them
        # alters what gets trained — confirm the intended design.
        model = PeftModel.from_pretrained(
            model=model,
            model_id=prev_model_path,
            peft_config=bnb_config,
            device_map="auto",
        )
        model.resize_token_embeddings(len(tokenizer))
        tokenizer.padding_side = "left"

    # Pick this level's LoRA config by key instead of a per-level if-chain.
    lc = lora_config[f"lora_config{level}"]
    model = get_peft_model(model, lc)

    # Filter the dataset by current level. Labels have the form "Level N" with
    # a single digit, so plain string comparison orders them numerically; a
    # "Level ?" label compares greater than every "Level N" and is excluded.
    level_train = dataset['train'].filter(
        lambda x: x['level'] <= f"Level {level}"
    )
    level_test = dataset['test'].filter(
        lambda x: x['level'] <= f"Level {level}"
    )

    print(f"Number of training samples: {len(level_train)}")
    print(f"Number of test samples: {len(level_test)}")

#     # Combine with previous levels' data if applicable
#     if level > 1:
#         replay_fraction = 0.1  # Adjust as needed
#         previous_levels_train = dataset['train'].filter(
#             lambda x: int(x['level'].split()[-1]) < level
#         ).shuffle(seed).select(range(int(len(level_train)*replay_fraction)))
#         level_train = concatenate_datasets([level_train, previous_levels_train])

    # Tokenize datasets
    level_train = level_train.map(preprocess_function, batched=True)
    level_test = level_test.map(preprocess_function, batched=True)

#     level_train = level_train.select(range(5))
#     level_test = level_test.select(range(2))

    # NOTE(review): `evaluation_strategy` may be deprecated in favour of
    # `eval_strategy` in recent transformers releases; kept for compatibility
    # with the version this notebook was run on — confirm.
    training_args = TrainingArguments(
        output_dir=os.path.join(save_dir, "rank_level_overall"),
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        warmup_ratio=0.1,
        num_train_epochs=5,
        learning_rate=5e-5,
        fp16=True,  # Mixed-precision training in float16
        logging_steps=100,
        optim="paged_adamw_8bit",
        evaluation_strategy="steps",
        eval_steps=150,
        save_steps=450,
        save_total_limit=2,
        report_to="none",  # Change to "wandb" if using Weights & Biases
        run_name="llama_FineTuning_Level_overall",
        load_best_model_at_end=True,  # Load the best model at the end
        metric_for_best_model="eval_loss",  # Use eval_loss to select the best model
        greater_is_better=False,
        ddp_find_unused_parameters=False,
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=2,  # Stop training if no improvement after 2 evaluations
    )

    # Initialize the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=level_train,
        eval_dataset=level_test,
        peft_config=lc,
        args=training_args,
        data_collator=data_collator,
        callbacks=[early_stopping_callback],
    )

    # Train the model
    trainer.train()

    # Save the adapter (including the resized embedding layers) and tokenizer.
    model_save_path = os.path.join(save_dir, f"rank_llama_level_{level}")
    model.save_pretrained(model_save_path, save_embedding_layers=True)
    tokenizer.save_pretrained(model_save_path)

    # `token=` replaces the deprecated `use_auth_token=` keyword and matches
    # the `from_pretrained(..., token=token)` calls used elsewhere in this file.
    model.push_to_hub(f"e4_mwp_sft_llama3.21b_level_{level}", token=token)
    tokenizer.push_to_hub(f"e4_mwp_sft_llama3.21b_level_{level}", token=token)

    print(trainer.state.log_history)

    if level == 5:
        model.eval()  # Set model to evaluation mode

        # Build the generation prompt for every test sample (same template as
        # the one used for training in `preprocess_function`).
        test_samples = []
        for idx in range(len(level_test)):
            sample = level_test[idx]
            input_text = (
                "<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
                "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
                f"Solve the following math problem: {sample['problem']}\n"
                "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
                "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
            )
            test_samples.append({
                "input_text": input_text,
                "problem": sample['problem'],
                "level": sample['level'],
                "type": sample['type'],
                "ground_truth": sample['solution']
            })

        def collate_fn(batch):
            """Tokenize a batch of prompts and move the tensors to the model's device.

            Returns the tokenized inputs together with the untouched sample
            dicts so predictions can be matched back to their metadata.
            """
            input_texts = [sample['input_text'] for sample in batch]
            model_inputs = tokenizer(
                input_texts,
                padding=True,
                truncation=True,
                max_length=1024,  # Adjust as needed
                return_tensors="pt"
            )
            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
            return model_inputs, batch

        batch_size = 32  # Adjust based on your GPU memory
        test_dataloader = DataLoader(test_samples, batch_size=batch_size, collate_fn=collate_fn)

        results_list = []

        for batch_idx, (model_inputs, batch_samples) in enumerate(tqdm(test_dataloader, desc=f"Evaluating Level {level}")):
            current_batch_size = model_inputs['input_ids'].size(0)
            # Generate predictions (greedy decoding, up to 512 new tokens)
            try:
                with torch.no_grad():
                    output_ids = model.generate(
                        input_ids=model_inputs['input_ids'],
                        attention_mask=model_inputs['attention_mask'],
                        max_new_tokens=512,
                        do_sample=False,
                        eos_token_id=tokenizer.eos_token_id,
                        pad_token_id=tokenizer.pad_token_id
                    )
                # Decode each full output sequence and store it with its metadata
                for i in range(current_batch_size):
                    predicted_text = tokenizer.decode(output_ids[i], skip_special_tokens=True)
                    results_list.append({
                        "problem": batch_samples[i]['problem'],
                        "level": batch_samples[i]['level'],
                        "type": batch_samples[i]['type'],
                        "ground_truth": batch_samples[i]['ground_truth'],
                        "predicted_solution": predicted_text
                    })
            except Exception as e:
                # Deliberate best-effort: report the failure and record empty
                # predictions so one bad batch does not abort the whole run.
                print(f"Error during generation at batch {batch_idx+1}: {e}")
                for i in range(current_batch_size):
                    results_list.append({
                        "problem": batch_samples[i]['problem'],
                        "level": batch_samples[i]['level'],
                        "type": batch_samples[i]['type'],
                        "ground_truth": batch_samples[i]['ground_truth'],
                        "predicted_solution": ""  # Empty string for predicted_text
                    })
                continue  # Proceed to the next batch

            # Save intermediate results every 100 batches
            if (batch_idx + 1) % 100 == 0:
                results_df = pd.DataFrame(results_list)
                results_save_path = os.path.join(save_dir, f"test_results_level_{level}_batch_{batch_idx+1}.csv")
                results_df.to_csv(results_save_path, index=False)
                print(f"Saved test results up to batch {batch_idx+1} to {results_save_path}")

        # Final dump of all predictions
        results_df = pd.DataFrame(results_list)
        results_save_path = os.path.join(save_dir, f"test_results_level_{level}.csv")
        results_df.to_csv(results_save_path, index=False)
        print(f"Saved test results to {results_save_path}")


Starting training for Level 1


Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 564
Number of test samples: 437


Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.0428,0.700275
300,0.4038,0.855343
450,0.2302,0.912583
600,0.1047,1.04191




README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


[{'loss': 1.0428, 'grad_norm': 2.724350690841675, 'learning_rate': 4.771293375394322e-05, 'epoch': 0.7092198581560284, 'step': 100}, {'eval_loss': 0.7002749443054199, 'eval_runtime': 49.9132, 'eval_samples_per_second': 8.755, 'eval_steps_per_second': 1.102, 'epoch': 1.0638297872340425, 'step': 150}, {'loss': 0.5541, 'grad_norm': 3.0604426860809326, 'learning_rate': 3.982649842271294e-05, 'epoch': 1.4184397163120568, 'step': 200}, {'loss': 0.4038, 'grad_norm': 3.464168071746826, 'learning_rate': 3.194006309148265e-05, 'epoch': 2.127659574468085, 'step': 300}, {'eval_loss': 0.8553428649902344, 'eval_runtime': 50.1445, 'eval_samples_per_second': 8.715, 'eval_steps_per_second': 1.097, 'epoch': 2.127659574468085, 'step': 300}, {'loss': 0.2302, 'grad_norm': 2.6433022022247314, 'learning_rate': 2.405362776025237e-05, 'epoch': 2.8368794326241136, 'step': 400}, {'eval_loss': 0.9125829935073853, 'eval_runtime': 50.1512, 'eval_samples_per_second': 8.714, 'eval_steps_per_second': 1.097, 'epoch': 3



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1912
Number of test samples: 1331


Map:   0%|          | 0/1912 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.1877,0.74216
300,0.6875,0.716146
450,0.7034,0.689027
600,0.4588,0.707869
750,0.4809,0.704314




adapter_model.safetensors:   0%|          | 0.00/721M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.1877, 'grad_norm': 2.7769358158111572, 'learning_rate': 2.092050209205021e-05, 'epoch': 0.20920502092050208, 'step': 100}, {'eval_loss': 0.7421599626541138, 'eval_runtime': 153.6457, 'eval_samples_per_second': 8.663, 'eval_steps_per_second': 1.087, 'epoch': 0.3138075313807531, 'step': 150}, {'loss': 0.7392, 'grad_norm': 2.3284552097320557, 'learning_rate': 4.184100418410042e-05, 'epoch': 0.41841004184100417, 'step': 200}, {'loss': 0.6875, 'grad_norm': 2.096743583679199, 'learning_rate': 4.8582054858205486e-05, 'epoch': 0.6276150627615062, 'step': 300}, {'eval_loss': 0.7161458730697632, 'eval_runtime': 153.0881, 'eval_samples_per_second': 8.694, 'eval_steps_per_second': 1.091, 'epoch': 0.6276150627615062, 'step': 300}, {'loss': 0.7034, 'grad_norm': 1.886854887008667, 'learning_rate': 4.6257554625755466e-05, 'epoch': 0.8368200836820083, 'step': 400}, {'eval_loss': 0.689026951789856, 'eval_runtime': 153.4481, 'eval_samples_per_second': 8.674, 'eval_steps_per_second': 1.088, 'e



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 3504
Number of test samples: 2462


Map:   0%|          | 0/3504 [00:00<?, ? examples/s]

Map:   0%|          | 0/2462 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.2773,0.781788
300,0.7358,0.738388
450,0.6935,0.728202
600,0.6725,0.716523
750,0.6912,0.700802
900,0.6275,0.714883
1050,0.4801,0.708293
1200,0.5019,0.710279
1350,0.4755,0.701655
1500,0.4948,0.698331




adapter_model.safetensors:   0%|          | 0.00/721M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.2773, 'grad_norm': 2.541588306427002, 'learning_rate': 1.1415525114155251e-05, 'epoch': 0.1141552511415525, 'step': 100}, {'eval_loss': 0.7817882299423218, 'eval_runtime': 284.445, 'eval_samples_per_second': 8.655, 'eval_steps_per_second': 1.083, 'epoch': 0.17123287671232876, 'step': 150}, {'loss': 0.7841, 'grad_norm': 2.374868869781494, 'learning_rate': 2.2831050228310503e-05, 'epoch': 0.228310502283105, 'step': 200}, {'loss': 0.7358, 'grad_norm': 2.488973617553711, 'learning_rate': 3.424657534246575e-05, 'epoch': 0.3424657534246575, 'step': 300}, {'eval_loss': 0.7383881211280823, 'eval_runtime': 284.0921, 'eval_samples_per_second': 8.666, 'eval_steps_per_second': 1.084, 'epoch': 0.3424657534246575, 'step': 300}, {'loss': 0.6935, 'grad_norm': 1.8498680591583252, 'learning_rate': 4.5662100456621006e-05, 'epoch': 0.45662100456621, 'step': 400}, {'eval_loss': 0.7282015085220337, 'eval_runtime': 284.3986, 'eval_samples_per_second': 8.657, 'eval_steps_per_second': 1.083, 'epoch



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 5194
Number of test samples: 3676


Map:   0%|          | 0/5194 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.3006,0.845569
300,0.7641,0.76337
450,0.7181,0.750135
600,0.7149,0.740708
750,0.6883,0.737063
900,0.6869,0.727639
1050,0.6697,0.714459
1200,0.6741,0.710479
1350,0.6881,0.71586
1500,0.4852,0.72141




adapter_model.safetensors:   0%|          | 0.00/721M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.3006, 'grad_norm': 3.2075486183166504, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.07698229407236336, 'step': 100}, {'eval_loss': 0.8455690741539001, 'eval_runtime': 424.7842, 'eval_samples_per_second': 8.654, 'eval_steps_per_second': 1.083, 'epoch': 0.11547344110854503, 'step': 150}, {'loss': 0.8078, 'grad_norm': 3.2484638690948486, 'learning_rate': 1.5384615384615387e-05, 'epoch': 0.15396458814472672, 'step': 200}, {'loss': 0.7641, 'grad_norm': 2.231630325317383, 'learning_rate': 2.307692307692308e-05, 'epoch': 0.23094688221709006, 'step': 300}, {'eval_loss': 0.7633698582649231, 'eval_runtime': 425.0316, 'eval_samples_per_second': 8.649, 'eval_steps_per_second': 1.082, 'epoch': 0.23094688221709006, 'step': 300}, {'loss': 0.7181, 'grad_norm': 2.4986557960510254, 'learning_rate': 3.0769230769230774e-05, 'epoch': 0.30792917628945343, 'step': 400}, {'eval_loss': 0.7501354813575745, 'eval_runtime': 424.9192, 'eval_samples_per_second': 8.651, 'eval_steps_per_second': 1.0



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 7498
Number of test samples: 5000


Map:   0%|          | 0/7498 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.4202,0.897257
300,0.807,0.789728
450,0.7778,0.774412
600,0.7404,0.763581
750,0.7324,0.758615
900,0.7621,0.755625
1050,0.7452,0.749662
1200,0.7453,0.744742
1350,0.7272,0.735543
1500,0.7439,0.731914




adapter_model.safetensors:   0%|          | 0.00/721M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.4202, 'grad_norm': 2.6316840648651123, 'learning_rate': 5.330490405117271e-06, 'epoch': 0.05333333333333334, 'step': 100}, {'eval_loss': 0.8972572088241577, 'eval_runtime': 578.7621, 'eval_samples_per_second': 8.639, 'eval_steps_per_second': 1.08, 'epoch': 0.08, 'step': 150}, {'loss': 0.8826, 'grad_norm': 2.6508371829986572, 'learning_rate': 1.0660980810234541e-05, 'epoch': 0.10666666666666667, 'step': 200}, {'loss': 0.807, 'grad_norm': 2.8448030948638916, 'learning_rate': 1.5991471215351813e-05, 'epoch': 0.16, 'step': 300}, {'eval_loss': 0.7897283434867859, 'eval_runtime': 579.0839, 'eval_samples_per_second': 8.634, 'eval_steps_per_second': 1.079, 'epoch': 0.16, 'step': 300}, {'loss': 0.7778, 'grad_norm': 1.9705071449279785, 'learning_rate': 2.1321961620469083e-05, 'epoch': 0.21333333333333335, 'step': 400}, {'eval_loss': 0.7744123935699463, 'eval_runtime': 579.1014, 'eval_samples_per_second': 8.634, 'eval_steps_per_second': 1.079, 'epoch': 0.24, 'step': 450}, {'loss': 0.7

Evaluating Level 5:  64%|██████▎   | 100/157 [1:23:01<41:40, 43.86s/it]

Saved test results up to batch 100 to /kaggle/working/test_results_level_5_batch_100.csv


Evaluating Level 5: 100%|██████████| 157/157 [2:07:19<00:00, 48.66s/it]

Saved test results to /kaggle/working/test_results_level_5.csv



