In [2]:
!pip install -q --upgrade transformers datasets peft bitsandbytes trl
!pip install -q accelerate

In [3]:
from accelerate.utils import write_basic_config
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [4]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    AutoConfig
)
from transformers import EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from peft import LoraConfig, get_peft_model,PeftModel
from trl import SFTTrainer
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar

In [5]:
seed = 7
torch.manual_seed(seed)
np.random.seed(seed)

In [6]:
print(f"Number of GPUs Available:{torch.cuda.device_count()}")

Number of GPUs Available:2


In [7]:
# token = "your_hugging_face_token"
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
save_dir = ""
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [9]:
lora_config = {"lora_config1" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config2" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config3" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config4" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config5" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
)
}

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16
)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
tokenizer.padding_side = 'left'

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [12]:
# Define default tokens
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"

special_tokens_dict = {}
if tokenizer.pad_token is None:
    special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN

if special_tokens_dict:
    tokenizer.add_special_tokens(special_tokens_dict)

In [13]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=token
)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [14]:
def preprocess_function(examples):
    # Format inputs with the instruction
    inputs = [
        f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
        "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
        f"Solve the following math problem: {problem}\n"
        "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
        "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"for problem in examples['problem']
    ]
    # Append the solution with EOS token
    targets = [
        f"{solution}{tokenizer.eos_token}"
        for solution in examples['solution']
    ]
    # Concatenate inputs and targets
    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]
    # Tokenize the concatenated texts
    model_inputs = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding="longest",  # Use dynamic padding
        return_tensors="pt"
    )
    # Create labels by cloning input IDs
    labels = model_inputs["input_ids"].clone()

    # Mask input tokens in labels
    for i in range(len(labels)):
        input_ids = tokenizer(inputs[i], add_special_tokens=False).input_ids
        input_len = len(input_ids)
        labels[i][:input_len] = -100  # Mask the input tokens
    model_inputs["labels"] = labels
    return model_inputs

In [15]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # For causal language modeling
    pad_to_multiple_of=8  # For faster GPU performance
)

In [16]:
dataset = load_dataset("Maxwell-Jia/MATH",trust_remote_code=True)
dataset['train'] = dataset['train'].filter(
        lambda x: x['level'] != f"Level ?"
    )

README.md:   0%|          | 0.00/5.32k [00:00<?, ?B/s]

competition_math.py:   0%|          | 0.00/2.57k [00:00<?, ?B/s]

MATH.zip:   0%|          | 0.00/7.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [17]:
for lci in lora_config.keys():
    model.resize_token_embeddings(len(tokenizer))
    model = get_peft_model(model, lora_config[lci])
    trainable_params = 0
    total_params = 0
    for param in model.parameters():
        num_params = param.numel()
        total_params += num_params
        if param.requires_grad:
            trainable_params += num_params
    print(f"Trainable parameters: {trainable_params} ({100 * trainable_params / total_params:.2f}% of total), Total : {total_params}")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Trainable parameters: 22544384 (2.92% of total), Total : 771821568
Trainable parameters: 22544384 (2.92% of total), Total : 771821568
Trainable parameters: 22544384 (2.92% of total), Total : 771821568
Trainable parameters: 22544384 (2.92% of total), Total : 771821568
Trainable parameters: 22544384 (2.92% of total), Total : 771821568


In [18]:
for level in range(1, 6):
    print(f"\nStarting training for Level {level}")

    if level == 1:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            token=token,
        )
        model.resize_token_embeddings(len(tokenizer))
        lc = lora_config["lora_config1"]
        model = get_peft_model(model, lc)

    else:
        # Load tokenizer and model from previous level
        prev_level = level - 1
        prev_model_path = os.path.join(save_dir, f"rank_llama_level_{prev_level}")
        
        model = PeftModel.from_pretrained(
        model=model,
        model_id = prev_model_path,
        peft_config=bnb_config,
        device_map="auto")
        
        model.resize_token_embeddings(len(tokenizer))
        if level == 2:
            lc = lora_config["lora_config2"]
            model = get_peft_model(model, lc)
        if level == 3:
            lc = lora_config["lora_config3"]
            model = get_peft_model(model, lc)
        if level == 4:
            lc = lora_config["lora_config4"]
            model = get_peft_model(model, lc)
        if level == 5:
            lc = lora_config["lora_config5"]
            model = get_peft_model(model, lc)       
        tokenizer.padding_side = "left"
    
    # Filter the dataset by current level
    level_train = dataset['train'].filter(
        lambda x: x['level'] == f"Level {level}"
    )
    level_test = dataset['test'].filter(
        lambda x: x['level'] <= f"Level {level}"
    )
    
    print(f"Number of training samples: {len(level_train)}")
    print(f"Number of test samples: {len(level_test)}")
    
    level_train = level_train.map(preprocess_function, batched=True)
    level_test = level_test.map(preprocess_function, batched=True)
    
    # level_train = level_train.select(range(5))
    # level_test = level_test.select(range(2))
    
    training_args = TrainingArguments(
      output_dir=os.path.join(save_dir, f"rank_level_overall"),
      per_device_train_batch_size=4,
      gradient_accumulation_steps=1,
      warmup_ratio=0.1,
      num_train_epochs=5,
      learning_rate=5e-5,
      fp16=True,  # Use bf16 precision
      logging_steps=100,
      optim="paged_adamw_8bit",
      evaluation_strategy="steps",
      eval_steps=500,
      save_steps=500,
      save_total_limit=2,
      report_to="none",  # Change to "wandb" if using Weights & Biases
      run_name=f"llama_FineTuning_Level_overall",
        load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Use eval_loss to select the best model
    greater_is_better=False,
        ddp_find_unused_parameters=False,
    )

    early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop training if no improvement after 2 evaluations
    )
    
    # Initialize the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=level_train,
        eval_dataset=level_test,
        peft_config=lc,
        args=training_args,
        data_collator=data_collator,
        callbacks=[early_stopping_callback],
    )
    
    # Train the model
    trainer.train()

    
    # Save the model
    model_save_path = os.path.join(save_dir, f"rank_llama_level_{level}")
    model.save_pretrained(model_save_path,save_embedding_layers=True)
    tokenizer.save_pretrained(model_save_path)

    model.push_to_hub(f"SNR_mwp_sft_llama3.21b_level_{level}", token=token)
    tokenizer.push_to_hub(f"SNR_mwp_sft_llama3.21b_level_{level}", token=token)
    
    print(trainer.state.log_history)

    if level == 5:
        model.eval()  # Set model to evaluation mode
        
        test_samples = []
        for idx in range(len(level_test)):
            sample = level_test[idx]
            input_text = (
                f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
                "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
                f"Solve the following math problem: {sample['problem']}\n"
                "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
                "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
            )
            test_samples.append({
                "input_text": input_text,
                "problem": sample['problem'],
                "level": sample['level'],
                "type": sample['type'],
                "ground_truth": sample['solution']
            })
            
        def collate_fn(batch):
            input_texts = [sample['input_text'] for sample in batch]
            model_inputs = tokenizer(
                input_texts,
                padding=True,
                truncation=True,
                max_length=1024,  # Adjust as needed
                return_tensors="pt"
            )
            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
            return model_inputs, batch
    
        batch_size = 32  # Adjust based on your GPU memory
        test_dataloader = DataLoader(test_samples, batch_size=batch_size, collate_fn=collate_fn)
    
        results_list = []
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        for batch_idx, (model_inputs, batch_samples) in enumerate(tqdm(test_dataloader, desc=f"Evaluating Level {level}")):
            current_batch_size = model_inputs['input_ids'].size(0)
            # Generate predictions
            try:
                with torch.no_grad():
                    output_ids = model.generate(
                        input_ids=model_inputs['input_ids'],
                        attention_mask=model_inputs['attention_mask'],
                        max_new_tokens=512,  
                        do_sample=False,
                        eos_token_id=tokenizer.eos_token_id,
                        pad_token_id=tokenizer.pad_token_id
                    )
                # Decode the outputs
                for i in range(current_batch_size):
                    predicted_text = tokenizer.decode(output_ids[i], skip_special_tokens=True)
                    # Store the results
                    results_list.append({
                        "problem": batch_samples[i]['problem'],
                        "level": batch_samples[i]['level'],
                        "type": batch_samples[i]['type'],
                        "ground_truth": batch_samples[i]['ground_truth'],
                        "predicted_solution": predicted_text
                    })
            except Exception as e:
                print(f"Error during generation at batch {batch_idx+1}: {e}")
                # In case of error, record empty predictions for this batch
                for i in range(current_batch_size):
                    results_list.append({
                        "problem": batch_samples[i]['problem'],
                        "level": batch_samples[i]['level'],
                        "type": batch_samples[i]['type'],
                        "ground_truth": batch_samples[i]['ground_truth'],
                        "predicted_solution": ""  # Empty string for predicted_text
                    })
                continue  # Proceed to the next batch
    
            # Optionally, save intermediate results every N batches
            if (batch_idx + 1) % 100 == 0:
                results_df = pd.DataFrame(results_list)
                results_save_path = os.path.join(save_dir, f"test_results_level_{level}_batch_{batch_idx+1}.csv")
                results_df.to_csv(results_save_path, index=False)
                print(f"Saved test results up to batch {batch_idx+1} to {results_save_path}")
        
        
        results_df = pd.DataFrame(results_list)
        results_save_path = os.path.join(save_dir, f"test_results_level_{level}.csv")
        results_df.to_csv(results_save_path, index=False)
        print(f"Saved test results to {results_save_path}")


Starting training for Level 1


Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 564
Number of test samples: 437


Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.4332,0.728881




adapter_model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.4024, 'grad_norm': 1.6967850923538208, 'learning_rate': 4.771293375394322e-05, 'epoch': 0.7092198581560284, 'step': 100}, {'loss': 0.7117, 'grad_norm': 1.442844033241272, 'learning_rate': 3.982649842271294e-05, 'epoch': 1.4184397163120568, 'step': 200}, {'loss': 0.5953, 'grad_norm': 1.3494913578033447, 'learning_rate': 3.194006309148265e-05, 'epoch': 2.127659574468085, 'step': 300}, {'loss': 0.4871, 'grad_norm': 1.8583506345748901, 'learning_rate': 2.405362776025237e-05, 'epoch': 2.8368794326241136, 'step': 400}, {'loss': 0.4332, 'grad_norm': 1.9215805530548096, 'learning_rate': 1.616719242902208e-05, 'epoch': 3.546099290780142, 'step': 500}, {'eval_loss': 0.7288809418678284, 'eval_runtime': 51.5033, 'eval_samples_per_second': 8.485, 'eval_steps_per_second': 1.068, 'epoch': 3.546099290780142, 'step': 500}, {'loss': 0.3613, 'grad_norm': 2.2193286418914795, 'learning_rate': 8.280757097791798e-06, 'epoch': 4.25531914893617, 'step': 600}, {'loss': 0.3151, 'grad_norm': 2.7289462



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1348
Number of test samples: 1331


Map:   0%|          | 0/1348 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.6195,0.711669
1000,0.5226,0.71582
1500,0.3442,0.816425


adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.4532, 'grad_norm': 1.363391399383545, 'learning_rate': 2.958579881656805e-05, 'epoch': 0.29673590504451036, 'step': 100}, {'loss': 0.8269, 'grad_norm': 1.4572700262069702, 'learning_rate': 4.8977572559366755e-05, 'epoch': 0.5934718100890207, 'step': 200}, {'loss': 0.7404, 'grad_norm': 0.9333776235580444, 'learning_rate': 4.5679419525065967e-05, 'epoch': 0.8902077151335311, 'step': 300}, {'loss': 0.6682, 'grad_norm': 1.391685962677002, 'learning_rate': 4.238126649076518e-05, 'epoch': 1.1869436201780414, 'step': 400}, {'loss': 0.6195, 'grad_norm': 1.5162688493728638, 'learning_rate': 3.908311345646438e-05, 'epoch': 1.4836795252225519, 'step': 500}, {'eval_loss': 0.7116686105728149, 'eval_runtime': 157.6922, 'eval_samples_per_second': 8.44, 'eval_steps_per_second': 1.059, 'epoch': 1.4836795252225519, 'step': 500}, {'loss': 0.6321, 'grad_norm': 1.44847571849823, 'learning_rate': 3.578496042216359e-05, 'epoch': 1.7804154302670623, 'step': 600}, {'loss': 0.5861, 'grad_norm': 1.45



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1592
Number of test samples: 2462


Map:   0%|          | 0/1592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2462 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.6365,0.733407
1000,0.5458,0.74725
1500,0.4468,0.777904


adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.4578, 'grad_norm': 1.189194917678833, 'learning_rate': 2.5125628140703518e-05, 'epoch': 0.25125628140703515, 'step': 100}, {'loss': 0.8459, 'grad_norm': 1.5010318756103516, 'learning_rate': 4.997208263539922e-05, 'epoch': 0.5025125628140703, 'step': 200}, {'loss': 0.7348, 'grad_norm': 1.2659958600997925, 'learning_rate': 4.7180346175321047e-05, 'epoch': 0.7537688442211056, 'step': 300}, {'loss': 0.7187, 'grad_norm': 0.9556128978729248, 'learning_rate': 4.438860971524288e-05, 'epoch': 1.0050251256281406, 'step': 400}, {'loss': 0.6365, 'grad_norm': 1.0096299648284912, 'learning_rate': 4.159687325516471e-05, 'epoch': 1.2562814070351758, 'step': 500}, {'eval_loss': 0.7334071397781372, 'eval_runtime': 291.8338, 'eval_samples_per_second': 8.436, 'eval_steps_per_second': 1.055, 'epoch': 1.2562814070351758, 'step': 500}, {'loss': 0.6638, 'grad_norm': 1.1305959224700928, 'learning_rate': 3.880513679508654e-05, 'epoch': 1.507537688442211, 'step': 600}, {'loss': 0.6299, 'grad_norm': 1



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1690
Number of test samples: 3676


Map:   0%|          | 0/1690 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.6684,0.760776
1000,0.5657,0.771601
1500,0.4703,0.805967


adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.4045, 'grad_norm': 1.2060216665267944, 'learning_rate': 2.358490566037736e-05, 'epoch': 0.2364066193853428, 'step': 100}, {'loss': 0.8395, 'grad_norm': 1.019321084022522, 'learning_rate': 4.716981132075472e-05, 'epoch': 0.4728132387706856, 'step': 200}, {'loss': 0.7778, 'grad_norm': 1.4540176391601562, 'learning_rate': 4.7687861271676305e-05, 'epoch': 0.7092198581560284, 'step': 300}, {'loss': 0.7419, 'grad_norm': 1.0702325105667114, 'learning_rate': 4.506043089858119e-05, 'epoch': 0.9456264775413712, 'step': 400}, {'loss': 0.6684, 'grad_norm': 1.3089544773101807, 'learning_rate': 4.243300052548608e-05, 'epoch': 1.1820330969267139, 'step': 500}, {'eval_loss': 0.760776162147522, 'eval_runtime': 435.92, 'eval_samples_per_second': 8.433, 'eval_steps_per_second': 1.055, 'epoch': 1.1820330969267139, 'step': 500}, {'loss': 0.6526, 'grad_norm': 1.2966558933258057, 'learning_rate': 3.9805570152390967e-05, 'epoch': 1.4184397163120568, 'step': 600}, {'loss': 0.6798, 'grad_norm': 0.99



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 2304
Number of test samples: 5000


Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.8572,0.779004
1000,0.747,0.763485
1500,0.6825,0.775251
2000,0.5816,0.803869


adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[{'loss': 1.4917, 'grad_norm': 1.0849896669387817, 'learning_rate': 1.736111111111111e-05, 'epoch': 0.1736111111111111, 'step': 100}, {'loss': 0.9488, 'grad_norm': 1.2567546367645264, 'learning_rate': 3.472222222222222e-05, 'epoch': 0.3472222222222222, 'step': 200}, {'loss': 0.8977, 'grad_norm': 1.2025481462478638, 'learning_rate': 4.976851851851852e-05, 'epoch': 0.5208333333333334, 'step': 300}, {'loss': 0.8561, 'grad_norm': 1.2721905708312988, 'learning_rate': 4.783950617283951e-05, 'epoch': 0.6944444444444444, 'step': 400}, {'loss': 0.8572, 'grad_norm': 0.977046549320221, 'learning_rate': 4.591049382716049e-05, 'epoch': 0.8680555555555556, 'step': 500}, {'eval_loss': 0.7790043354034424, 'eval_runtime': 593.6808, 'eval_samples_per_second': 8.422, 'eval_steps_per_second': 1.053, 'epoch': 0.8680555555555556, 'step': 500}, {'loss': 0.7925, 'grad_norm': 1.0660192966461182, 'learning_rate': 4.3981481481481486e-05, 'epoch': 1.0416666666666667, 'step': 600}, {'loss': 0.7486, 'grad_norm': 1.

Evaluating Level 5:  64%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž   | 100/157 [1:22:30<41:31, 43.70s/it]

Saved test results up to batch 100 to /kaggle/working/test_results_level_5_batch_100.csv


Evaluating Level 5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 157/157 [2:06:55<00:00, 48.51s/it]

Saved test results to /kaggle/working/test_results_level_5.csv



