In [2]:
!pip install -q --upgrade transformers datasets peft bitsandbytes trl
!pip install -q accelerate

In [3]:
# Write accelerate's default configuration file; the call returns the path of
# the file it wrote (shown as the cell output).
from accelerate.utils import write_basic_config
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [4]:
import os
import random
from getpass import getpass

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from peft import LoraConfig, get_peft_model,PeftModel
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    AutoConfig
)
from transformers import EarlyStoppingCallback
from trl import SFTTrainer

In [5]:
# Seed every random number generator the pipeline can draw from, so that
# repeated runs start from the same state.
seed = 7
random.seed(seed)                 # Python standard-library RNG
np.random.seed(seed)              # NumPy global RNG
torch.manual_seed(seed)           # PyTorch default generator
torch.cuda.manual_seed_all(seed)  # every CUDA device; ignored when CUDA is absent

In [6]:
# Report how many CUDA devices PyTorch can see.
print("Number of GPUs Available:" + str(torch.cuda.device_count()))

Number of GPUs Available:2


In [7]:
# Obtain the Hugging Face access token without storing it in the notebook:
# use the HF_TOKEN environment variable when set, otherwise prompt for it.
# `token` is reused by later cells when downloading and pushing models.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
# Directory prefix for saved adapters and result CSVs. With an empty string,
# os.path.join() yields paths relative to the current working directory.
save_dir = ""
# Hugging Face Hub id of the base model that is fine-tuned below.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [9]:
# One LoRA configuration per curriculum level, keyed "lora_config1" ...
# "lora_config5". The rank halves from one entry to the next (256 -> 16) and
# lora_alpha is always twice the rank; every other setting is shared.
lora_config = {
    f"lora_config{position}": LoraConfig(
        r=rank,
        lora_alpha=2 * rank,
        lora_dropout=0.2,
        # Attention projections plus the MLP projections.
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        bias="none",
        task_type="CAUSAL_LM",
    )
    for position, rank in enumerate((256, 128, 64, 32, 16), start=1)
}

In [10]:
# Quantization settings used when loading the base model: 4-bit NF4 weights
# with double quantization, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16
)

In [11]:
# Load the tokenizer that belongs to `model_id`, authenticating with `token`.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
# Pad on the left, so padded batches keep the real tokens at the end of each row.
tokenizer.padding_side = 'left'

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [12]:
# Define default tokens
# Fallback special tokens, used only for the ones the tokenizer does not define.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"

# Collect a fallback for every special token that is currently unset.
special_tokens_dict = {
    attribute: fallback
    for attribute, fallback in (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
    )
    if getattr(tokenizer, attribute) is None
}

# Register the missing tokens; models loaded later are resized to len(tokenizer).
if special_tokens_dict:
    tokenizer.add_special_tokens(special_tokens_dict)

In [13]:
# Load the base model using the quantization settings in `bnb_config`;
# device_map="auto" leaves weight placement to the library.
# The training loop further down loads the base model again before level 1.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=token
)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [14]:
def preprocess_function(examples):
    """Tokenise a batch of problems into prompt + solution training rows.

    Args:
        examples: Batch mapping with parallel lists under ``'problem'`` and
            ``'solution'``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` tensor. Labels equal the input ids on the solution tokens
        and are -100 on padding and on the prompt, so only the solution
        contributes to the loss.
    """
    # Format inputs with the instruction.
    # The "\\boxed{{}}" fragment lives in a plain (non-f) string, so the doubled
    # braces are emitted literally; the text is kept unchanged because the
    # evaluation prompt later in the notebook uses the same wording.
    inputs = [
        f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
        "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
        f"Solve the following math problem: {problem}\n"
        "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
        "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
        for problem in examples['problem']
    ]
    # Append the solution with EOS token
    targets = [
        f"{solution}{tokenizer.eos_token}"
        for solution in examples['solution']
    ]
    # Concatenate inputs and targets
    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]
    # Tokenize the concatenated texts
    model_inputs = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding="longest",  # Pad to the longest row of this batch
        return_tensors="pt"
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]
    labels = input_ids.clone()

    for i, prompt in enumerate(inputs):
        # Real tokens form one contiguous run; padding may sit on either side
        # depending on tokenizer.padding_side.
        real_len = int(attention_mask[i].sum())
        first_real = int(attention_mask[i].argmax())  # index of the first 1
        row_tokens = input_ids[i, first_real:first_real + real_len].tolist()

        # Tokenise the prompt with the same special-token handling as the full
        # text, then count how many leading tokens of the row it accounts for.
        # Matching by prefix stays correct when the tokenizer adds tokens
        # around the text or when the row was truncated inside the prompt.
        prompt_tokens = tokenizer(prompt).input_ids
        prompt_len = 0
        for expected, actual in zip(prompt_tokens, row_tokens):
            if expected != actual:
                break
            prompt_len += 1

        # Mask any left padding together with the prompt.
        labels[i, :first_real + prompt_len] = -100

    # Mask every remaining padding position (e.g. right padding).
    labels[attention_mask == 0] = -100
    model_inputs["labels"] = labels
    return model_inputs

In [15]:
# Batch collator handed to the trainer: causal-LM mode (mlm=False), padding
# each batch to a multiple of 8.
# NOTE(review): with mlm=False this collator is documented to build `labels`
# from `input_ids`; confirm whether that replaces the prompt-masked `labels`
# column produced by preprocess_function.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # For causal language modeling
    pad_to_multiple_of=8  # For faster GPU performance
)

In [16]:
# Download the MATH dataset, then drop training rows whose difficulty label is
# the placeholder "Level ?".
dataset = load_dataset("Maxwell-Jia/MATH", trust_remote_code=True)
dataset['train'] = dataset['train'].filter(lambda row: row['level'] != "Level ?")

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/2.98M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [17]:
# Attach each LoRA configuration in turn and report how many parameters it
# makes trainable.
# NOTE(review): `model` is rebound to the wrapped model on every pass, so each
# configuration is applied on top of the previous wrapper — confirm this is
# intended for a pure size report.
for config_name, peft_cfg in lora_config.items():
    model.resize_token_embeddings(len(tokenizer))
    model = get_peft_model(model, peft_cfg)

    # (element count, trainable?) for every parameter tensor
    sizes = [(weights.numel(), weights.requires_grad) for weights in model.parameters()]
    total_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)

    print(f"Trainable parameters: {trainable_params} ({100 * trainable_params / total_params:.2f}% of total), Total : {total_params}")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Trainable parameters: 180355072 (19.40% of total), Total : 929632256
Trainable parameters: 90177536 (10.74% of total), Total : 839454720
Trainable parameters: 45088768 (5.68% of total), Total : 794365952
Trainable parameters: 22544384 (2.92% of total), Total : 771821568
Trainable parameters: 11272192 (1.48% of total), Total : 760549376


In [18]:
# Curriculum fine-tuning: train one LoRA adapter per MATH difficulty level
# (1 -> 5), using the progressively smaller rank stored in `lora_config`, save
# and publish the adapter after every level, and after the last level generate
# predictions for the accumulated test set and write them to CSV.
for level in range(1, 6):
    print(f"\nStarting training for Level {level}")

    # Adapter configuration for this level ("lora_config1" ... "lora_config5").
    lc = lora_config[f"lora_config{level}"]

    if level == 1:
        # Start from a freshly loaded, quantized base model.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            token=token,
        )
        model.resize_token_embeddings(len(tokenizer))
        model = get_peft_model(model, lc)

    else:
        # Load the adapter saved at the end of the previous level.
        prev_level = level - 1
        prev_model_path = os.path.join(save_dir, f"snr_shrink_llama_level_{prev_level}")

        # Only the adapter path and device map are passed: `model` was already
        # loaded with the quantization settings at level 1, and a
        # BitsAndBytesConfig is not an adapter configuration.
        model = PeftModel.from_pretrained(
            model=model,
            model_id=prev_model_path,
            device_map="auto",
        )

        model.resize_token_embeddings(len(tokenizer))
        # NOTE(review): the logged loss at the first steps of every level is
        # about 2, like level 1. Confirm that wrapping an already-wrapped model
        # keeps what the previous level learned; if the adapter is simply
        # re-initialised here, the previous adapter has to be merged first.
        model = get_peft_model(model, lc)
        tokenizer.padding_side = "left"

    # Train on the current level only; evaluate on every level seen so far.
    level_train = dataset['train'].filter(
        lambda x: x['level'] == f"Level {level}"
    )
    seen_levels = tuple(f"Level {n}" for n in range(1, level + 1))
    level_test = dataset['test'].filter(
        lambda x: x['level'] in seen_levels
    )

    print(f"Number of training samples: {len(level_train)}")
    print(f"Number of test samples: {len(level_test)}")

    level_train = level_train.map(preprocess_function, batched=True)
    level_test = level_test.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        # One checkpoint directory per level, so checkpoints left behind by an
        # earlier level cannot take part in save_total_limit rotation.
        output_dir=os.path.join(save_dir, f"snr_shrink_level_{level}"),
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        warmup_ratio=0.1,
        num_train_epochs=5,
        learning_rate=5e-5,
        fp16=True,  # fp16 mixed precision, matching bnb_4bit_compute_dtype
        logging_steps=10,
        optim="paged_adamw_8bit",
        eval_strategy="steps",  # current name; `evaluation_strategy` is deprecated
        eval_steps=100,
        save_steps=100,
        save_total_limit=2,
        report_to="none",  # Change to "wandb" if using Weights & Biases
        run_name=f"llama_FineTuning_Level_overall",
        load_best_model_at_end=True,  # Load the best model at the end
        metric_for_best_model="eval_loss",  # Use eval_loss to select the best model
        greater_is_better=False,
        ddp_find_unused_parameters=False,
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=2,  # Stop training if no improvement after 2 evaluations
    )

    # Initialize the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=level_train,
        eval_dataset=level_test,
        peft_config=lc,
        args=training_args,
        data_collator=data_collator,
        callbacks=[early_stopping_callback],
    )

    # Train the model
    trainer.train()

    # Save the adapter and tokenizer locally, then publish both to the Hub.
    model_save_path = os.path.join(save_dir, f"snr_shrink_llama_level_{level}")
    model.save_pretrained(model_save_path,save_embedding_layers=True)
    tokenizer.save_pretrained(model_save_path)

    model.push_to_hub(f"snr_shrink_mwp_sft_llama3.21b_level_{level}", token=token)
    tokenizer.push_to_hub(f"snr_shrink_mwp_sft_llama3.21b_level_{level}", token=token)

    print(trainer.state.log_history)

    if level == 5:
        model.eval()  # Set model to evaluation mode

        # Build the inference prompt for every test sample: the same template
        # as in preprocess_function, without the solution appended.
        test_samples = []
        for sample in level_test:
            input_text = (
                f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
                "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
                f"Solve the following math problem: {sample['problem']}\n"
                "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
                "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
            )
            test_samples.append({
                "input_text": input_text,
                "problem": sample['problem'],
                "level": sample['level'],
                "type": sample['type'],
                "ground_truth": sample['solution']
            })

        def collate_fn(batch):
            """Tokenise a batch of prompts, move the tensors to the model's device, and return them with the raw samples."""
            input_texts = [sample['input_text'] for sample in batch]
            model_inputs = tokenizer(
                input_texts,
                padding=True,
                truncation=True,
                max_length=1024,  # Adjust as needed
                return_tensors="pt"
            )
            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
            return model_inputs, batch

        def make_result_row(sample, predicted_text):
            """Build one CSV row from a test sample and its predicted text."""
            return {
                "problem": sample['problem'],
                "level": sample['level'],
                "type": sample['type'],
                "ground_truth": sample['ground_truth'],
                "predicted_solution": predicted_text
            }

        batch_size = 32  # Adjust based on your GPU memory
        test_dataloader = DataLoader(test_samples, batch_size=batch_size, collate_fn=collate_fn)

        results_list = []

        for batch_idx, (model_inputs, batch_samples) in enumerate(tqdm(test_dataloader, desc=f"Evaluating Level {level}")):
            current_batch_size = len(batch_samples)
            # Generate and decode the whole batch before recording anything, so
            # a failure part-way through cannot leave duplicate rows behind.
            try:
                with torch.no_grad():
                    output_ids = model.generate(
                        input_ids=model_inputs['input_ids'],
                        attention_mask=model_inputs['attention_mask'],
                        max_new_tokens=512,
                        do_sample=False,
                        eos_token_id=tokenizer.eos_token_id,
                        pad_token_id=tokenizer.pad_token_id
                    )
                # Each sequence is decoded in full (prompt tokens included).
                predictions = [
                    tokenizer.decode(output_ids[i], skip_special_tokens=True)
                    for i in range(current_batch_size)
                ]
            except Exception as e:
                print(f"Error during generation at batch {batch_idx+1}: {e}")
                predictions = None

            if predictions is None:
                # Best effort: record empty predictions for this batch and move on.
                results_list.extend(make_result_row(sample, "") for sample in batch_samples)
                continue  # Proceed to the next batch

            results_list.extend(
                make_result_row(sample, text)
                for sample, text in zip(batch_samples, predictions)
            )

            # Save intermediate results every 100 batches
            if (batch_idx + 1) % 100 == 0:
                results_df = pd.DataFrame(results_list)
                results_save_path = os.path.join(save_dir, f"test_results_level_{level}_batch_{batch_idx+1}.csv")
                results_df.to_csv(results_save_path, index=False)
                print(f"Saved test results up to batch {batch_idx+1} to {results_save_path}")

        results_df = pd.DataFrame(results_list)
        results_save_path = os.path.join(save_dir, f"test_results_level_{level}.csv")
        results_df.to_csv(results_save_path, index=False)
        print(f"Saved test results to {results_save_path}")


Starting training for Level 1


Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 564
Number of test samples: 437


Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
100,0.7695,0.694504
200,0.4409,0.708446
300,0.2322,0.842646




README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


[{'loss': 2.6101, 'grad_norm': 11.627020835876465, 'learning_rate': 7.042253521126762e-06, 'epoch': 0.07092198581560284, 'step': 10}, {'loss': 1.4065, 'grad_norm': 4.203446865081787, 'learning_rate': 1.4084507042253523e-05, 'epoch': 0.14184397163120568, 'step': 20}, {'loss': 1.0316, 'grad_norm': 3.809864044189453, 'learning_rate': 2.112676056338028e-05, 'epoch': 0.2127659574468085, 'step': 30}, {'loss': 0.8376, 'grad_norm': 2.1173551082611084, 'learning_rate': 2.8169014084507046e-05, 'epoch': 0.28368794326241137, 'step': 40}, {'loss': 0.8218, 'grad_norm': 3.108731508255005, 'learning_rate': 3.5211267605633805e-05, 'epoch': 0.3546099290780142, 'step': 50}, {'loss': 0.7737, 'grad_norm': 2.674424648284912, 'learning_rate': 4.225352112676056e-05, 'epoch': 0.425531914893617, 'step': 60}, {'loss': 0.7599, 'grad_norm': 2.8099169731140137, 'learning_rate': 4.929577464788733e-05, 'epoch': 0.49645390070921985, 'step': 70}, {'loss': 0.7994, 'grad_norm': 2.570380449295044, 'learning_rate': 4.92902



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1348
Number of test samples: 1331


Map:   0%|          | 0/1348 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
100,0.8637,0.847403
200,0.792,0.724992
300,0.6653,0.705249
400,0.5413,0.713961
500,0.4988,0.706423


adapter_model.safetensors:   0%|          | 0.00/361M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 2.2957, 'grad_norm': 9.462907791137695, 'learning_rate': 2.9585798816568047e-06, 'epoch': 0.02967359050445104, 'step': 10}, {'loss': 1.9955, 'grad_norm': 5.195075511932373, 'learning_rate': 5.917159763313609e-06, 'epoch': 0.05934718100890208, 'step': 20}, {'loss': 1.4806, 'grad_norm': 3.2818405628204346, 'learning_rate': 8.875739644970414e-06, 'epoch': 0.08902077151335312, 'step': 30}, {'loss': 1.0737, 'grad_norm': 2.83294677734375, 'learning_rate': 1.1834319526627219e-05, 'epoch': 0.11869436201780416, 'step': 40}, {'loss': 0.8866, 'grad_norm': 1.8853461742401123, 'learning_rate': 1.4792899408284025e-05, 'epoch': 0.14836795252225518, 'step': 50}, {'loss': 0.8393, 'grad_norm': 2.059513568878174, 'learning_rate': 1.7751479289940828e-05, 'epoch': 0.17804154302670624, 'step': 60}, {'loss': 0.8437, 'grad_norm': 2.0427768230438232, 'learning_rate': 2.0710059171597635e-05, 'epoch': 0.20771513353115728, 'step': 70}, {'loss': 0.8029, 'grad_norm': 2.1198623180389404, 'learning_rate': 2



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1592
Number of test samples: 2462


Map:   0%|          | 0/1592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2462 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
100,0.9162,0.893372
200,0.7812,0.759458
300,0.7249,0.737863
400,0.6562,0.725523
500,0.5309,0.729728
600,0.5255,0.724657
700,0.5512,0.724418
800,0.5531,0.718333
900,0.4881,0.760636
1000,0.4286,0.771626


adapter_model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 2.1506, 'grad_norm': 4.607706069946289, 'learning_rate': 2.512562814070352e-06, 'epoch': 0.02512562814070352, 'step': 10}, {'loss': 2.0336, 'grad_norm': 3.662555694580078, 'learning_rate': 5.025125628140704e-06, 'epoch': 0.05025125628140704, 'step': 20}, {'loss': 1.845, 'grad_norm': 3.075547933578491, 'learning_rate': 7.537688442211055e-06, 'epoch': 0.07537688442211055, 'step': 30}, {'loss': 1.4152, 'grad_norm': 2.5648984909057617, 'learning_rate': 1.0050251256281408e-05, 'epoch': 0.10050251256281408, 'step': 40}, {'loss': 1.2325, 'grad_norm': 2.2302584648132324, 'learning_rate': 1.2562814070351759e-05, 'epoch': 0.12562814070351758, 'step': 50}, {'loss': 0.9492, 'grad_norm': 1.7336435317993164, 'learning_rate': 1.507537688442211e-05, 'epoch': 0.1507537688442211, 'step': 60}, {'loss': 0.8879, 'grad_norm': 1.4364442825317383, 'learning_rate': 1.7587939698492464e-05, 'epoch': 0.17587939698492464, 'step': 70}, {'loss': 0.9245, 'grad_norm': 1.8305785655975342, 'learning_rate': 2.0



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1690
Number of test samples: 3676


Map:   0%|          | 0/1690 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
100,0.8531,0.92656
200,0.8569,0.826818
300,0.7334,0.77076
400,0.7482,0.760339
500,0.6641,0.760879
600,0.6848,0.758394
700,0.6481,0.749685
800,0.6392,0.744974
900,0.5564,0.769343
1000,0.5468,0.771648


adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 2.1343, 'grad_norm': 3.2617547512054443, 'learning_rate': 2.358490566037736e-06, 'epoch': 0.02364066193853428, 'step': 10}, {'loss': 1.9744, 'grad_norm': 4.006641387939453, 'learning_rate': 4.716981132075472e-06, 'epoch': 0.04728132387706856, 'step': 20}, {'loss': 1.7154, 'grad_norm': 1.9151713848114014, 'learning_rate': 7.0754716981132075e-06, 'epoch': 0.07092198581560284, 'step': 30}, {'loss': 1.6784, 'grad_norm': 2.571730375289917, 'learning_rate': 9.433962264150944e-06, 'epoch': 0.09456264775413711, 'step': 40}, {'loss': 1.5143, 'grad_norm': 1.5374475717544556, 'learning_rate': 1.179245283018868e-05, 'epoch': 0.1182033096926714, 'step': 50}, {'loss': 1.2331, 'grad_norm': 1.5252779722213745, 'learning_rate': 1.4150943396226415e-05, 'epoch': 0.14184397163120568, 'step': 60}, {'loss': 1.0528, 'grad_norm': 1.2077405452728271, 'learning_rate': 1.650943396226415e-05, 'epoch': 0.16548463356973994, 'step': 70}, {'loss': 0.9481, 'grad_norm': 1.5287052392959595, 'learning_rate': 1.



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 2304
Number of test samples: 5000


Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
100,1.1781,1.037697
200,0.9839,0.912277
300,0.9594,0.867288
400,0.8475,0.793726
500,0.8989,0.786315
600,0.759,0.782629
700,0.8713,0.776115
800,0.7638,0.772805
900,0.8575,0.769784
1000,0.7527,0.766829


adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.955, 'grad_norm': 2.353409767150879, 'learning_rate': 1.7361111111111112e-06, 'epoch': 0.017361111111111112, 'step': 10}, {'loss': 2.0371, 'grad_norm': 2.3009204864501953, 'learning_rate': 3.4722222222222224e-06, 'epoch': 0.034722222222222224, 'step': 20}, {'loss': 1.8873, 'grad_norm': 2.3021812438964844, 'learning_rate': 5.208333333333334e-06, 'epoch': 0.052083333333333336, 'step': 30}, {'loss': 1.7705, 'grad_norm': 1.823364019393921, 'learning_rate': 6.944444444444445e-06, 'epoch': 0.06944444444444445, 'step': 40}, {'loss': 1.6883, 'grad_norm': 1.5097103118896484, 'learning_rate': 8.680555555555556e-06, 'epoch': 0.08680555555555555, 'step': 50}, {'loss': 1.6261, 'grad_norm': 1.7267935276031494, 'learning_rate': 1.0416666666666668e-05, 'epoch': 0.10416666666666667, 'step': 60}, {'loss': 1.4343, 'grad_norm': 1.115236759185791, 'learning_rate': 1.2152777777777779e-05, 'epoch': 0.12152777777777778, 'step': 70}, {'loss': 1.3759, 'grad_norm': 1.5370744466781616, 'learning_rate'

Evaluating Level 5:  64%|██████▎   | 100/157 [1:19:27<40:50, 42.99s/it]

Saved test results up to batch 100 to /kaggle/working/test_results_level_5_batch_100.csv


Evaluating Level 5: 100%|██████████| 157/157 [2:02:36<00:00, 46.86s/it]


Saved test results to /kaggle/working/test_results_level_5.csv
