In [2]:
!pip install -q --upgrade transformers datasets peft bitsandbytes trl
!pip install -q accelerate

In [3]:
from accelerate.utils import write_basic_config
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [4]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    AutoConfig
)
from transformers import EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from peft import LoraConfig, get_peft_model,PeftModel
from trl import SFTTrainer
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar

In [5]:
seed = 7
torch.manual_seed(seed)
np.random.seed(seed)

In [6]:
print(f"Number of GPUs Available:{torch.cuda.device_count()}")

Number of GPUs Available:2


In [7]:
# token = "your_hugging_face_token"
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
save_dir = ""
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [9]:
lora_config = {"lora_config1" : LoraConfig(
    r=256,
    lora_alpha=512,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config2" : LoraConfig(
    r=128,
    lora_alpha=256,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config3" : LoraConfig(
    r=64,
    lora_alpha=128,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config4" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
),
 "lora_config5" : LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
)
}

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16
)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
tokenizer.padding_side = 'left'

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [12]:
# Define default tokens
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"

special_tokens_dict = {}
if tokenizer.pad_token is None:
    special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN

if special_tokens_dict:
    tokenizer.add_special_tokens(special_tokens_dict)

In [13]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=token
)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [14]:
def preprocess_function(examples):
    # Format inputs with the instruction
    inputs = [
        f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
        "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
        f"Solve the following math problem: {problem}\n"
        "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
        "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"for problem in examples['problem']
    ]
    # Append the solution with EOS token
    targets = [
        f"{solution}{tokenizer.eos_token}"
        for solution in examples['solution']
    ]
    # Concatenate inputs and targets
    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]
    # Tokenize the concatenated texts
    model_inputs = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding="longest",  # Use dynamic padding
        return_tensors="pt"
    )
    # Create labels by cloning input IDs
    labels = model_inputs["input_ids"].clone()

    # Mask input tokens in labels
    for i in range(len(labels)):
        input_ids = tokenizer(inputs[i], add_special_tokens=False).input_ids
        input_len = len(input_ids)
        labels[i][:input_len] = -100  # Mask the input tokens
    model_inputs["labels"] = labels
    return model_inputs

In [15]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # For causal language modeling
    pad_to_multiple_of=8  # For faster GPU performance
)

In [16]:
dataset = load_dataset("Maxwell-Jia/MATH",trust_remote_code=True)
dataset['train'] = dataset['train'].filter(
        lambda x: x['level'] != f"Level ?"
    )

MATH.py:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

data/algebra_train.jsonl:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

(…)ata/counting_and_probability_train.jsonl:   0%|          | 0.00/707k [00:00<?, ?B/s]

data/geometry_train.jsonl:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

data/intermediate_algebra_train.jsonl:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

data/number_theory_train.jsonl:   0%|          | 0.00/639k [00:00<?, ?B/s]

data/prealgebra_train.jsonl:   0%|          | 0.00/778k [00:00<?, ?B/s]

data/precalculus_train.jsonl:   0%|          | 0.00/903k [00:00<?, ?B/s]

data/algebra_test.jsonl:   0%|          | 0.00/706k [00:00<?, ?B/s]

data/counting_and_probability_test.jsonl:   0%|          | 0.00/377k [00:00<?, ?B/s]

data/geometry_test.jsonl:   0%|          | 0.00/562k [00:00<?, ?B/s]

data/intermediate_algebra_test.jsonl:   0%|          | 0.00/860k [00:00<?, ?B/s]

data/number_theory_test.jsonl:   0%|          | 0.00/376k [00:00<?, ?B/s]

data/prealgebra_test.jsonl:   0%|          | 0.00/553k [00:00<?, ?B/s]

data/precalculus_test.jsonl:   0%|          | 0.00/614k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [17]:
for lci in lora_config.keys():
    model.resize_token_embeddings(len(tokenizer))
    model = get_peft_model(model, lora_config[lci])
    trainable_params = 0
    total_params = 0
    for param in model.parameters():
        num_params = param.numel()
        total_params += num_params
        if param.requires_grad:
            trainable_params += num_params
    print(f"Trainable parameters: {trainable_params} ({100 * trainable_params / total_params:.2f}% of total), Total : {total_params}")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Trainable parameters: 180355072 (19.40% of total), Total : 929632256


Trainable parameters: 90177536 (10.74% of total), Total : 839454720


Trainable parameters: 45088768 (5.68% of total), Total : 794365952


Trainable parameters: 22544384 (2.92% of total), Total : 771821568


Trainable parameters: 22544384 (2.92% of total), Total : 771821568


In [18]:
for level in range(1, 6):
    print(f"\nStarting training for Level {level}")

    if level == 1:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            token=token,
        )
        model.resize_token_embeddings(len(tokenizer))
        lc = lora_config["lora_config1"]
        model = get_peft_model(model, lc)

    else:
        # Load tokenizer and model from previous level
        prev_level = level - 1
        prev_model_path = os.path.join(save_dir, f"rank_llama_level_{prev_level}")
        
        model = PeftModel.from_pretrained(
        model=model,
        model_id = prev_model_path,
        peft_config=bnb_config,
        device_map="auto")
        
        model.resize_token_embeddings(len(tokenizer))
        if level == 2:
            lc = lora_config["lora_config2"]
            model = get_peft_model(model, lc)
        if level == 3:
            lc = lora_config["lora_config3"]
            model = get_peft_model(model, lc)
        if level == 4:
            lc = lora_config["lora_config4"]
            model = get_peft_model(model, lc)
        if level == 5:
            lc = lora_config["lora_config5"]
            model = get_peft_model(model, lc)       
        tokenizer.padding_side = "left"
    
    # Filter the dataset by current level
    level_train = dataset['train'].filter(
        lambda x: x['level'] <= f"Level {level}"
    )
    level_test = dataset['test'].filter(
        lambda x: x['level'] <= f"Level {level}"
    )
    
    print(f"Number of training samples: {len(level_train)}")
    print(f"Number of test samples: {len(level_test)}")
    
#     # Combine with previous levels' data if applicable
#     if level > 1:
#         replay_fraction = 0.1  # Adjust as needed
#         previous_levels_train = dataset['train'].filter(
#             lambda x: int(x['level'].split()[-1]) < level
#         ).shuffle(seed).select(range(int(len(level_train)*replay_fraction)))
#         level_train = concatenate_datasets([level_train, previous_levels_train])
    
    # Tokenize datasets
    level_train = level_train.map(preprocess_function, batched=True)
    level_test = level_test.map(preprocess_function, batched=True)
    
#     level_train = level_train.select(range(5))
#     level_test = level_test.select(range(2))
    
    training_args = TrainingArguments(
      output_dir=os.path.join(save_dir, f"rank_level_overall"),
      per_device_train_batch_size=4,
      gradient_accumulation_steps=1,
      warmup_ratio=0.1,
      num_train_epochs=5,
      learning_rate=5e-5,
      fp16=True,  # Use bf16 precision
      logging_steps=100,
      optim="paged_adamw_8bit",
      evaluation_strategy="steps",
      eval_steps=150,
      save_steps=450,
      save_total_limit=2,
      report_to="none",  # Change to "wandb" if using Weights & Biases
      run_name=f"llama_FineTuning_Level_overall",
        load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Use eval_loss to select the best model
    greater_is_better=False,
        ddp_find_unused_parameters=False,
    )

    early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop training if no improvement after 2 evaluations
    )
    
    # Initialize the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=level_train,
        eval_dataset=level_test,
        peft_config=lc,
        args=training_args,
        data_collator=data_collator,
        callbacks=[early_stopping_callback],
    )
    
    # Train the model
    trainer.train()

    
    # Save the model
    model_save_path = os.path.join(save_dir, f"rank_llama_level_{level}")
    model.save_pretrained(model_save_path,save_embedding_layers=True)
    tokenizer.save_pretrained(model_save_path)

    model.push_to_hub(f"e3_mwp_sft_llama3.21b_level_{level}", use_auth_token=token)
    tokenizer.push_to_hub(f"e3_mwp_sft_llama3.21b_level_{level}", use_auth_token=token)
    
    # Evaluate the model
    results = trainer.evaluate()
    print(f"Overall evaluation results: {results}")
    
    print(trainer.state.log_history)

    if level == 5:
        model.eval()  # Set model to evaluation mode
        
        test_samples = []
        for idx in range(len(level_test)):
            sample = level_test[idx]
            input_text = (
                f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
                "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
                f"Solve the following math problem: {sample['problem']}\n"
                "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
                "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
            )
            test_samples.append({
                "input_text": input_text,
                "problem": sample['problem'],
                "level": sample['level'],
                "type": sample['type'],
                "ground_truth": sample['solution']
            })
            
        def collate_fn(batch):
            input_texts = [sample['input_text'] for sample in batch]
            model_inputs = tokenizer(
                input_texts,
                padding=True,
                truncation=True,
                max_length=1024,  # Adjust as needed
                return_tensors="pt"
            )
            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
            return model_inputs, batch
    
        batch_size = 32  # Adjust based on your GPU memory
        test_dataloader = DataLoader(test_samples, batch_size=batch_size, collate_fn=collate_fn)
    
        results_list = []
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        for batch_idx, (model_inputs, batch_samples) in enumerate(tqdm(test_dataloader, desc=f"Evaluating Level {level}")):
            current_batch_size = model_inputs['input_ids'].size(0)
            # Generate predictions
            try:
                with torch.no_grad():
                    output_ids = model.generate(
                        input_ids=model_inputs['input_ids'],
                        attention_mask=model_inputs['attention_mask'],
                        max_new_tokens=512,  
                        do_sample=False,
                        eos_token_id=tokenizer.eos_token_id,
                        pad_token_id=tokenizer.pad_token_id
                    )
                # Decode the outputs
                for i in range(current_batch_size):
                    predicted_text = tokenizer.decode(output_ids[i], skip_special_tokens=True)
                    # Store the results
                    results_list.append({
                        "problem": batch_samples[i]['problem'],
                        "level": batch_samples[i]['level'],
                        "type": batch_samples[i]['type'],
                        "ground_truth": batch_samples[i]['ground_truth'],
                        "predicted_solution": predicted_text
                    })
            except Exception as e:
                print(f"Error during generation at batch {batch_idx+1}: {e}")
                # In case of error, record empty predictions for this batch
                for i in range(current_batch_size):
                    results_list.append({
                        "problem": batch_samples[i]['problem'],
                        "level": batch_samples[i]['level'],
                        "type": batch_samples[i]['type'],
                        "ground_truth": batch_samples[i]['ground_truth'],
                        "predicted_solution": ""  # Empty string for predicted_text
                    })
                continue  # Proceed to the next batch
    
            # Optionally, save intermediate results every N batches
            if (batch_idx + 1) % 100 == 0:
                results_df = pd.DataFrame(results_list)
                results_save_path = os.path.join(save_dir, f"test_results_level_{level}_batch_{batch_idx+1}.csv")
                results_df.to_csv(results_save_path, index=False)
                print(f"Saved test results up to batch {batch_idx+1} to {results_save_path}")
        
        
        results_df = pd.DataFrame(results_list)
        results_save_path = os.path.join(save_dir, f"test_results_level_{level}.csv")
        results_df.to_csv(results_save_path, index=False)
        print(f"Saved test results to {results_save_path}")


Starting training for Level 1


Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 564
Number of test samples: 437


Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.0425,0.699341
300,0.4037,0.848444
450,0.2303,0.923881
600,0.1059,1.046901








README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Overall evaluation results: {'eval_loss': 0.9238811731338501, 'eval_runtime': 52.403, 'eval_samples_per_second': 8.339, 'eval_steps_per_second': 1.05, 'epoch': 5.0}
[{'loss': 1.0425, 'grad_norm': 2.7326579093933105, 'learning_rate': 4.771293375394322e-05, 'epoch': 0.7092198581560284, 'step': 100}, {'eval_loss': 0.6993407011032104, 'eval_runtime': 51.2407, 'eval_samples_per_second': 8.528, 'eval_steps_per_second': 1.073, 'epoch': 1.0638297872340425, 'step': 150}, {'loss': 0.5545, 'grad_norm': 2.8890068531036377, 'learning_rate': 3.982649842271294e-05, 'epoch': 1.4184397163120568, 'step': 200}, {'loss': 0.4037, 'grad_norm': 3.034909963607788, 'learning_rate': 3.194006309148265e-05, 'epoch': 2.127659574468085, 'step': 300}, {'eval_loss': 0.8484441637992859, 'eval_runtime': 54.3257, 'eval_samples_per_second': 8.044, 'eval_steps_per_second': 1.012, 'epoch': 2.127659574468085, 'step': 300}, {'loss': 0.2303, 'grad_norm': 2.743929147720337, 'learning_rate': 2.405362776025237e-05, 'epoch': 2.83



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 1912
Number of test samples: 1331


Map:   0%|          | 0/1912 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.3114,0.748185
300,0.6916,0.711647
450,0.6997,0.688194
600,0.5018,0.69754
750,0.5245,0.690611




README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/361M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Overall evaluation results: {'eval_loss': 0.6881941556930542, 'eval_runtime': 162.6619, 'eval_samples_per_second': 8.183, 'eval_steps_per_second': 1.027, 'epoch': 1.5690376569037658}
[{'loss': 1.3114, 'grad_norm': 2.0684597492218018, 'learning_rate': 2.092050209205021e-05, 'epoch': 0.20920502092050208, 'step': 100}, {'eval_loss': 0.7481848001480103, 'eval_runtime': 155.7354, 'eval_samples_per_second': 8.547, 'eval_steps_per_second': 1.072, 'epoch': 0.3138075313807531, 'step': 150}, {'loss': 0.7641, 'grad_norm': 1.809200406074524, 'learning_rate': 4.184100418410042e-05, 'epoch': 0.41841004184100417, 'step': 200}, {'loss': 0.6916, 'grad_norm': 1.6641292572021484, 'learning_rate': 4.8582054858205486e-05, 'epoch': 0.6276150627615062, 'step': 300}, {'eval_loss': 0.711646556854248, 'eval_runtime': 155.9352, 'eval_samples_per_second': 8.536, 'eval_steps_per_second': 1.071, 'epoch': 0.6276150627615062, 'step': 300}, {'loss': 0.6997, 'grad_norm': 1.4930295944213867, 'learning_rate': 4.625755462



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 3504
Number of test samples: 2462


Map:   0%|          | 0/3504 [00:00<?, ? examples/s]

Map:   0%|          | 0/2462 [00:00<?, ? examples/s]



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.586,0.888161
300,0.7648,0.752683
450,0.7148,0.734378
600,0.6795,0.715873
750,0.6911,0.700728
900,0.6454,0.705863
1050,0.5612,0.695691
1200,0.5788,0.695835
1350,0.5536,0.691401
1500,0.5621,0.686418




adapter_model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Overall evaluation results: {'eval_loss': 0.6914014220237732, 'eval_runtime': 294.9568, 'eval_samples_per_second': 8.347, 'eval_steps_per_second': 1.044, 'epoch': 2.2260273972602738}
[{'loss': 1.586, 'grad_norm': 1.680282711982727, 'learning_rate': 1.1415525114155251e-05, 'epoch': 0.1141552511415525, 'step': 100}, {'eval_loss': 0.8881610035896301, 'eval_runtime': 294.084, 'eval_samples_per_second': 8.372, 'eval_steps_per_second': 1.047, 'epoch': 0.17123287671232876, 'step': 150}, {'loss': 0.869, 'grad_norm': 1.672010064125061, 'learning_rate': 2.2831050228310503e-05, 'epoch': 0.228310502283105, 'step': 200}, {'loss': 0.7648, 'grad_norm': 1.6577465534210205, 'learning_rate': 3.424657534246575e-05, 'epoch': 0.3424657534246575, 'step': 300}, {'eval_loss': 0.7526830434799194, 'eval_runtime': 294.4123, 'eval_samples_per_second': 8.362, 'eval_steps_per_second': 1.046, 'epoch': 0.3424657534246575, 'step': 300}, {'loss': 0.7148, 'grad_norm': 1.4545058012008667, 'learning_rate': 4.5662100456621



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 5194
Number of test samples: 3676


Map:   0%|          | 0/5194 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.8228,0.942038
300,0.8683,0.86591
450,0.7695,0.770244
600,0.7294,0.755498
750,0.6953,0.746751
900,0.6895,0.736536
1050,0.6821,0.725999
1200,0.6823,0.721724
1350,0.696,0.718246
1500,0.5905,0.71671




adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Overall evaluation results: {'eval_loss': 0.7001526951789856, 'eval_runtime': 404.716, 'eval_samples_per_second': 9.083, 'eval_steps_per_second': 1.137, 'epoch': 2.1939953810623556}
[{'loss': 1.8228, 'grad_norm': 2.058830738067627, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.07698229407236336, 'step': 100}, {'eval_loss': 0.9420379996299744, 'eval_runtime': 409.3385, 'eval_samples_per_second': 8.98, 'eval_steps_per_second': 1.124, 'epoch': 0.11547344110854503, 'step': 150}, {'loss': 0.923, 'grad_norm': 1.4816007614135742, 'learning_rate': 1.5384615384615387e-05, 'epoch': 0.15396458814472672, 'step': 200}, {'loss': 0.8683, 'grad_norm': 1.114583969116211, 'learning_rate': 2.307692307692308e-05, 'epoch': 0.23094688221709006, 'step': 300}, {'eval_loss': 0.8659098148345947, 'eval_runtime': 429.2293, 'eval_samples_per_second': 8.564, 'eval_steps_per_second': 1.072, 'epoch': 0.23094688221709006, 'step': 300}, {'loss': 0.7695, 'grad_norm': 1.6131666898727417, 'learning_rate': 3.076923076



Filter:   0%|          | 0/7498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Number of training samples: 7498
Number of test samples: 5000


Map:   0%|          | 0/7498 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
150,1.9293,0.987709
300,0.9088,0.901144
450,0.8644,0.802649
600,0.7708,0.784849
750,0.7557,0.775926
900,0.7749,0.766913
1050,0.7504,0.757107
1200,0.7501,0.75067
1350,0.7327,0.742923
1500,0.7503,0.738191




adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Overall evaluation results: {'eval_loss': 0.7089974880218506, 'eval_runtime': 531.1182, 'eval_samples_per_second': 9.414, 'eval_steps_per_second': 1.177, 'epoch': 2.16}
[{'loss': 1.9293, 'grad_norm': 1.916932225227356, 'learning_rate': 5.330490405117271e-06, 'epoch': 0.05333333333333334, 'step': 100}, {'eval_loss': 0.987708568572998, 'eval_runtime': 552.7718, 'eval_samples_per_second': 9.045, 'eval_steps_per_second': 1.131, 'epoch': 0.08, 'step': 150}, {'loss': 1.03, 'grad_norm': 1.2509994506835938, 'learning_rate': 1.0660980810234541e-05, 'epoch': 0.10666666666666667, 'step': 200}, {'loss': 0.9088, 'grad_norm': 1.4883673191070557, 'learning_rate': 1.5991471215351813e-05, 'epoch': 0.16, 'step': 300}, {'eval_loss': 0.9011439681053162, 'eval_runtime': 550.5185, 'eval_samples_per_second': 9.082, 'eval_steps_per_second': 1.135, 'epoch': 0.16, 'step': 300}, {'loss': 0.8644, 'grad_norm': 1.1280219554901123, 'learning_rate': 2.1321961620469083e-05, 'epoch': 0.21333333333333335, 'step': 400}, 


Evaluating Level 5:   0%|          | 0/157 [00:00<?, ?it/s]




Evaluating Level 5:   1%|          | 1/157 [00:47<2:03:45, 47.60s/it]


Evaluating Level 5:   1%|▏         | 2/157 [01:40<2:11:00, 50.71s/it]


Evaluating Level 5:   2%|▏         | 3/157 [02:27<2:06:21, 49.23s/it]


Evaluating Level 5:   3%|▎         | 4/157 [03:20<2:09:11, 50.67s/it]


Evaluating Level 5:   3%|▎         | 5/157 [03:53<1:52:22, 44.36s/it]


Evaluating Level 5:   4%|▍         | 6/157 [04:43<1:56:01, 46.10s/it]


Evaluating Level 5:   4%|▍         | 7/157 [05:26<1:52:48, 45.12s/it]


Evaluating Level 5:   5%|▌         | 8/157 [06:10<1:50:57, 44.68s/it]


Evaluating Level 5:   6%|▌         | 9/157 [06:53<1:49:10, 44.26s/it]


Evaluating Level 5:   6%|▋         | 10/157 [07:37<1:47:55, 44.05s/it]


Evaluating Level 5:   7%|▋         | 11/157 [08:20<1:46:36, 43.81s/it]


Evaluating Level 5:   8%|▊         | 12/157 [09:28<2:03:43, 51.19s/it]


Evaluating Level 5:   8%|▊         | 13/157 [09:43<1:36:26, 40.18s/it]


Evaluating Level 5:   9%|▉         | 14/157 [10:27<1:38:24, 41.29s/it]


Evaluating Level 5:  10%|▉         | 15/157 [11:32<1:54:36, 48.43s/it]


Evaluating Level 5:  10%|█         | 16/157 [12:15<1:50:23, 46.98s/it]


Evaluating Level 5:  11%|█         | 17/157 [13:01<1:48:27, 46.48s/it]


Evaluating Level 5:  11%|█▏        | 18/157 [13:44<1:45:47, 45.67s/it]


Evaluating Level 5:  12%|█▏        | 19/157 [14:52<2:00:20, 52.32s/it]


Evaluating Level 5:  13%|█▎        | 20/157 [15:36<1:53:18, 49.63s/it]


Evaluating Level 5:  13%|█▎        | 21/157 [16:08<1:40:39, 44.40s/it]


Evaluating Level 5:  14%|█▍        | 22/157 [16:57<1:43:00, 45.78s/it]


Evaluating Level 5:  15%|█▍        | 23/157 [17:49<1:46:37, 47.74s/it]


Evaluating Level 5:  15%|█▌        | 24/157 [18:33<1:43:13, 46.57s/it]


Evaluating Level 5:  16%|█▌        | 25/157 [19:24<1:45:16, 47.85s/it]


Evaluating Level 5:  17%|█▋        | 26/157 [19:57<1:34:32, 43.30s/it]


Evaluating Level 5:  17%|█▋        | 27/157 [20:40<1:34:08, 43.45s/it]


Evaluating Level 5:  18%|█▊        | 28/157 [21:23<1:32:42, 43.12s/it]


Evaluating Level 5:  18%|█▊        | 29/157 [22:06<1:31:57, 43.10s/it]


Evaluating Level 5:  19%|█▉        | 30/157 [22:23<1:14:43, 35.30s/it]


Evaluating Level 5:  20%|█▉        | 31/157 [23:07<1:19:29, 37.85s/it]


Evaluating Level 5:  20%|██        | 32/157 [23:50<1:22:23, 39.55s/it]


Evaluating Level 5:  21%|██        | 33/157 [24:35<1:24:59, 41.13s/it]


Evaluating Level 5:  22%|██▏       | 34/157 [25:21<1:27:26, 42.65s/it]


Evaluating Level 5:  22%|██▏       | 35/157 [26:05<1:27:30, 43.04s/it]


Evaluating Level 5:  23%|██▎       | 36/157 [26:48<1:26:59, 43.14s/it]


Evaluating Level 5:  24%|██▎       | 37/157 [27:39<1:30:33, 45.28s/it]


Evaluating Level 5:  24%|██▍       | 38/157 [28:09<1:21:02, 40.86s/it]


Evaluating Level 5:  25%|██▍       | 39/157 [28:59<1:25:43, 43.59s/it]


Evaluating Level 5:  25%|██▌       | 40/157 [29:51<1:29:58, 46.14s/it]


Evaluating Level 5:  26%|██▌       | 41/157 [30:24<1:21:09, 41.98s/it]


Evaluating Level 5:  27%|██▋       | 42/157 [31:29<1:33:44, 48.91s/it]


Evaluating Level 5:  27%|██▋       | 43/157 [32:13<1:30:08, 47.44s/it]


Evaluating Level 5:  28%|██▊       | 44/157 [33:09<1:34:13, 50.03s/it]


Evaluating Level 5:  29%|██▊       | 45/157 [33:52<1:29:24, 47.90s/it]


Evaluating Level 5:  29%|██▉       | 46/157 [34:45<1:31:31, 49.47s/it]


Evaluating Level 5:  30%|██▉       | 47/157 [35:33<1:29:54, 49.05s/it]


Evaluating Level 5:  31%|███       | 48/157 [36:48<1:43:29, 56.96s/it]


Evaluating Level 5:  31%|███       | 49/157 [37:37<1:38:07, 54.51s/it]


Evaluating Level 5:  32%|███▏      | 50/157 [38:26<1:34:26, 52.95s/it]


Evaluating Level 5:  32%|███▏      | 51/157 [39:12<1:29:22, 50.59s/it]


Evaluating Level 5:  33%|███▎      | 52/157 [39:59<1:26:51, 49.63s/it]


Evaluating Level 5:  34%|███▍      | 53/157 [40:57<1:30:30, 52.22s/it]


Evaluating Level 5:  34%|███▍      | 54/157 [41:48<1:28:56, 51.82s/it]


Evaluating Level 5:  35%|███▌      | 55/157 [42:35<1:25:49, 50.48s/it]


Evaluating Level 5:  36%|███▌      | 56/157 [43:26<1:24:58, 50.48s/it]