In [1]:
import os
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)

from trl import SFTTrainer

In [2]:
# Hugging Face Hub identifiers for the base model and the dataset.
model_id = "microsoft/Phi-3-mini-4k-instruct"
dataset_name = "openai/gsm8k"
# NOTE(review): 'main' is the same value that the next cell passes as the
# second argument to load_dataset(), which suggests it is a dataset
# configuration name rather than a split -- confirm and consider renaming.
dataset_split = 'main'

In [3]:
# Load the dataset from the Hub; 'main' is passed as the configuration name.
dataset_1 = load_dataset(path=dataset_name, name='main')

In [4]:
# Bare expression: display the loaded DatasetDict (splits, features, row counts).
dataset_1

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [5]:
# ---------------------------------------------------------------------------
# Model and dataset identifiers
# ---------------------------------------------------------------------------

# Pre-trained checkpoint on the Hugging Face Hub that will be fine-tuned.
# 'model_id' and 'model_name' both refer to the same checkpoint.
model_id = "microsoft/Phi-3-mini-4k-instruct"
model_name = model_id

# Dataset on the Hugging Face Hub used for fine-tuning.
dataset_name = "openai/gsm8k"

# 'dataset_split' is not reassigned in this cell (the assignment to "train" is
# disabled), so the value set earlier in the notebook stays in effect.

# Name of the fine-tuned model, and the Hugging Face repository it is meant to
# be saved to.
new_model = "new-model-name"
hf_model_repo = f"username/{new_model}"

# ---------------------------------------------------------------------------
# Device placement
# ---------------------------------------------------------------------------

# Maps model parts to devices. The empty-string key covers the entire model,
# so everything is loaded on GPU 0.
device_map = {"": 0}

# ---------------------------------------------------------------------------
# bitsandbytes quantization settings
# ---------------------------------------------------------------------------

# Load the base model in 4-bit precision.
use_4bit = True

# Data type used for computations with the 4-bit base model.
bnb_4bit_compute_dtype = "bfloat16"

# Quantization type for the 4-bit base model.
bnb_4bit_quant_type = "nf4"

# Enable nested (double) quantization for the 4-bit base model.
use_double_quant = True

# ---------------------------------------------------------------------------
# LoRA settings
# ---------------------------------------------------------------------------

# Dimension (rank) of the LoRA update matrices.
lora_r = 16

# Alpha parameter for LoRA scaling.
lora_alpha = 16

# Dropout probability for LoRA layers.
lora_dropout = 0.05

# Modules targeted by LoRA.
target_modules = [
    'k_proj',
    'q_proj',
    'v_proj',
    'o_proj',
    "gate_proj",
    "down_proj",
    "up_proj",
]

In [6]:
# Pull the two splits out of the loaded DatasetDict.
dataset_2, dataset_3 = dataset_1['train'], dataset_1['test']

# Merge both splits into a single pool and shuffle it with a fixed seed.
# NOTE(review): this places the dataset's own 'test' rows into the pool that is
# re-split for training further down -- confirm this is intended, because the
# fine-tuned model can then no longer be scored cleanly on that test split.
dataset = concatenate_datasets([dataset_2, dataset_3]).shuffle(seed=42)

In [7]:
# Bare expression: display the merged, shuffled dataset (features, row count).
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 8792
})

In [8]:
# 'tokenizer_id' is the identifier for the tokenizer that you want to load. In this case, it is set to the value of 'model_id', which means that the tokenizer associated with the pre-trained model will be loaded.

# 'AutoTokenizer' is a class from the 'transformers' library that selects and instantiates the appropriate tokenizer class for a given checkpoint; it is a factory, not a base class that other tokenizers inherit from.

# 'from_pretrained' is a method of the 'AutoTokenizer' class that loads a tokenizer from the Hugging Face Model Hub.

# 'tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)' loads the tokenizer associated with 'tokenizer_id' from the Hugging Face Model Hub and assigns it to the variable 'tokenizer'.

# 'tokenizer.padding_side' is a property of the 'tokenizer' object that determines on which side of the input sequences padding should be added. It can be set to either 'left' or 'right'.

# 'tokenizer.padding_side = 'right'' sets 'tokenizer.padding_side' to 'right', which means that padding will be added to the right side of the input sequences. This is done to prevent warnings that can occur when 'tokenizer.padding_side' is set to 'left'.
# Load the tokenizer that belongs to the base model checkpoint.
# When the cells run in order, this is the tokenizer that
# format_dataset_chatml() reads; a second tokenizer is created in In [13].
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right' # to prevent warnings

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# 'create_message_column' is a function that takes a row from a dataset and returns a dictionary with a single key-value pair. The key is 'messages' and the value is a list of dictionaries, each representing a message.

# 'row' is the input to the 'create_message_column' function. It is expected to be a dictionary with keys 'question' and 'answer'.

# 'messages' is a list that will contain the messages.

# 'user' is a dictionary that represents a user message. The 'content' key contains the question from the row, prefixed with ' Input: ', and the 'role' key is set to 'user'.

# 'messages.append(user)' adds the user message to the 'messages' list.

# 'assistant' is a dictionary that represents an assistant message. The 'content' key contains the answer from the row, and the 'role' key is set to 'assistant'.

# 'messages.append(assistant)' adds the assistant message to the 'messages' list.

# 'return {"messages": messages}' returns a dictionary with a single key-value pair. The key is 'messages' and the value is the 'messages' list.

# 'format_dataset_chatml' is a function that takes a row from a dataset and returns a dictionary with a single key-value pair. The key is 'text' and the value is the result of applying the chat template to the messages in the row.

# 'row' is the input to the 'format_dataset_chatml' function. It is expected to be a dictionary with a key 'messages'.

# 'return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}' returns a dictionary with a single key-value pair. The key is 'text' and the value is the result of applying the chat template to the messages in the row. The 'add_generation_prompt' parameter is set to False, which means that no generation prompt will be added to the end of the text. The 'tokenize' parameter is set to False, which means that the text will not be tokenized.
def create_message_column(row):
    """Turn one question/answer row into a two-turn chat transcript.

    Args:
        row: Mapping with 'question' and 'answer' entries.

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}``. The user turn's
        content is the question prefixed with ``" Input: "``; the assistant
        turn's content is the answer rendered as a string.
    """
    user_turn = {"content": f" Input: {row['question']}", "role": "user"}
    assistant_turn = {"content": f"{row['answer']}", "role": "assistant"}
    return {"messages": [user_turn, assistant_turn]}

def format_dataset_chatml(row):
    """Render a row's chat messages into a single string.

    Applies the chat template of the module-level ``tokenizer`` to
    ``row["messages"]``. No generation prompt is appended and the result is
    returned as text rather than token ids.

    Args:
        row: Mapping with a 'messages' entry.

    Returns:
        dict: ``{"text": <templated conversation string>}``.
    """
    rendered = tokenizer.apply_chat_template(
        row["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

In [10]:
# Two passes over the dataset: the first adds the 'messages' column, the second
# renders those messages into the 'text' column.
dataset_chatml = (
    dataset
    .map(create_message_column)
    .map(format_dataset_chatml)
)

Map:   0%|          | 0/8792 [00:00<?, ? examples/s]

Map:   0%|          | 0/8792 [00:00<?, ? examples/s]

In [11]:

# 'dataset_chatml' is a variable that contains the dataset that has been transformed into a format where each example is a single string of text that represents a conversation.

# 'train_test_split' is a method of the 'Dataset' class that splits the dataset into a training set and a test set.

# 'test_size=0.1' is a parameter of the 'train_test_split' method that specifies the proportion of the dataset to include in the test set. In this case, it is set to 0.1, which means that 10% of the dataset will be included in the test set.

# 'seed=1234' is a parameter of the 'train_test_split' method that specifies the seed for the random number generator. This is used to ensure that the split is reproducible.

# 'dataset_chatml = dataset_chatml.train_test_split(test_size=0.1, seed=1234)' splits 'dataset_chatml' into a training set and a test set and assigns the result to 'dataset_chatml'. The result is a dictionary with two key-value pairs. The keys are 'train' and 'test', and the values are the training set and the test set, respectively.

# 'dataset_chatml' when used alone like this in a Jupyter notebook cell, it will display the structure of the training set and the test set. This includes information such as the number of examples in each set, the names and types of the fields in the sets, and the shapes of the fields.

# This line of code is used to check the structure of the training set and the test set to ensure that the split was performed correctly.
# Hold out 10% of the rows as the 'test' split, using a fixed seed.
# NOTE(review): this rebinds 'dataset_chatml' from the flat dataset to the
# train/test result, so the cell is not safe to re-run on its own -- the
# previous cell has to be re-run first.
dataset_chatml = dataset_chatml.train_test_split(test_size=0.1, seed=1234)
# Bare expression: display the resulting splits.
dataset_chatml

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'messages', 'text'],
        num_rows: 7912
    })
    test: Dataset({
        features: ['question', 'answer', 'messages', 'text'],
        num_rows: 880
    })
})

In [13]:
# 'AutoTokenizer' is a class from the Hugging Face Transformers library that provides a tokenizer for a given pre-trained model.

# 'from_pretrained' is a method of the 'AutoTokenizer' class that loads a tokenizer from a pre-trained model.

# 'model_name' is a variable that contains the name of the pre-trained model.

# 'trust_remote_code=True' is a parameter that allows the execution of remote code when loading the tokenizer.

# 'add_eos_token=True' is a parameter that adds an end-of-sentence token to the tokenizer.

# 'use_fast=True' is a parameter that uses the fast version of the tokenizer, if available.

# 'tokenizer.pad_token = tokenizer.unk_token' sets the padding token of the tokenizer to be the same as the unknown token.

# 'tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)' sets the ID of the padding token to the vocabulary ID of the token chosen above (the unknown token).

# 'tokenizer.padding_side = 'left'' sets the side where padding will be added to be the left side.

# 'BitsAndBytesConfig' is a class that provides a configuration for quantization.

# 'bnb_config' is a variable that holds the configuration for quantization.

# 'AutoModelForCausalLM' is a class from the Hugging Face Transformers library that provides a model for causal language modeling.

# 'from_pretrained' is a method of the 'AutoModelForCausalLM' class that loads a model from a pre-trained model.

# 'torch_dtype=compute_dtype' is a parameter that sets the data type of the model to be the same as 'compute_dtype'.

# 'quantization_config=bnb_config' is a parameter that sets the configuration for quantization to be 'bnb_config'.

# 'device_map=device_map' is a parameter that sets the device map of the model to be 'device_map'.

# 'attn_implementation' is not passed in this cell: the "attn_implementation='flash_attention_2'" argument is commented out, so the model's default attention implementation is used.

# 'prepare_model_for_kbit_training' is a function that prepares a model for k-bit training.

# 'model = prepare_model_for_kbit_training(model)' prepares 'model' for k-bit training and assigns the result back to 'model'.
# Tokenizer used for training. This rebinds 'tokenizer', replacing the one
# created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    add_eos_token=True,
    use_fast=True,
)
# Reuse the unknown token as the padding token and pad on the left.
# NOTE(review): the earlier tokenizer cell sets padding_side to 'right' "to
# prevent warnings"; confirm that 'left' is the intended side for training.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'left'

# Resolve the compute dtype from the configuration cell instead of hard-coding
# it, so 'bnb_4bit_compute_dtype' is the single source of truth for both the
# quantization config and the model load below ("bfloat16" -> torch.bfloat16).
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# 4-bit quantization settings, all taken from the configuration cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_double_quant,
)

# Load the quantized base model onto the device(s) given by 'device_map'.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map=device_map,
    #attn_implementation='flash_attention_2',
)

# Prepare the quantized model for k-bit training before LoRA is applied.
model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:

# This code block is used to define the training arguments for the model.

# 'TrainingArguments' is a class that holds the arguments for training a model.
# 'output_dir' is the directory where the model and its checkpoints will be saved.
# 'evaluation_strategy' is set to "steps", meaning that evaluation will be performed after a certain number of training steps.
# 'do_eval' is set to True, meaning that evaluation will be performed.
# 'optim' is set to "adamw_torch", meaning that the AdamW optimizer from PyTorch will be used.
# 'per_device_train_batch_size' and 'per_device_eval_batch_size' are set to 8, meaning that the batch size for training and evaluation will be 8 per device.
# 'gradient_accumulation_steps' is set to 2, meaning that gradients will be accumulated over 2 steps before each optimizer update.
# 'log_level' is set to "debug", meaning that all log messages will be printed.
# 'save_strategy' is set to "epoch", meaning that the model will be saved after each epoch.
# 'logging_steps' is set to 10, meaning that log messages will be printed every 10 steps.
# 'learning_rate' is set to 1e-5, which is the learning rate for the optimizer.
# 'fp16' is set to the opposite of whether bfloat16 is supported on the current CUDA device.
# 'bf16' is set to whether bfloat16 is supported on the current CUDA device.
# 'eval_steps' is set to 100, meaning that evaluation will be performed every 100 steps.
# 'num_train_epochs' is set to 3, meaning that the model will be trained for 3 epochs.
# 'warmup_ratio' is set to 0.1, meaning that 10% of the total training steps will be used for the warmup phase.
# 'lr_scheduler_type' is set to "polynomial", meaning that a polynomial-decay learning rate scheduler will be used.
# 'report_to' is not set explicitly; in the recorded run, Weights & Biases logging was enabled automatically.
# 'seed' is not set explicitly, so the default seed of 'TrainingArguments' is used.

# LoraConfig object is created with the following parameters:
# 'r' (rank of the low-rank approximation) is set to 16,
# 'lora_alpha' (scaling factor) is set to 16,
# 'lora_dropout' dropout probability for Lora layers is set to 0.05,
# 'task_type' is set to the string 'CAUSAL_LM', indicating a causal language modeling task,
# 'target_modules' (the modules to which LoRA is applied) lists the attention and MLP projection layers; the output layer is not included.


args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="./phi-3-mini-LoRA",
    save_strategy="epoch",
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=1e-5,
    lr_scheduler_type="polynomial",
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # --- mixed precision: bf16 where the GPU supports it, fp16 otherwise ---
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # --- evaluation ---
    # NOTE(review): newer transformers releases may expect 'eval_strategy'
    # instead of 'evaluation_strategy' -- confirm against the installed version.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=8,
    # --- logging ---
    log_level="debug",
    logging_steps=10,
)

# LoRA adapter settings, all taken from the configuration cell.
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
)



In [15]:
# 'SFTTrainer' is a class that provides a trainer for fine-tuning a model.

# 'trainer' is a variable that holds the trainer.

# 'model=model' is a parameter that sets the model to be trained to be 'model'.

# 'train_dataset=dataset_chatml['train']' is a parameter that sets the training dataset to be 'dataset_chatml['train']'.

# 'eval_dataset=dataset_chatml['test']' is a parameter that sets the evaluation dataset to be 'dataset_chatml['test']'.

# 'peft_config=peft_config' is a parameter that sets the configuration for the Lora layer to be 'peft_config'.

# 'dataset_text_field="text"' is a parameter that sets the field in the dataset that contains the text to be 'text'.

# 'max_seq_length=512' is a parameter that sets the maximum sequence length for the model to be 512.

# 'tokenizer=tokenizer' is a parameter that sets the tokenizer to be 'tokenizer'.

# 'args=args' is a parameter that sets the training arguments to be 'args'.

# This line of code is used to create a trainer for fine-tuning the model with the specified parameters.
# NOTE(review): the recorded output of this cell reports deprecated arguments
# and points to SFTConfig -- presumably 'dataset_text_field' and
# 'max_seq_length'; confirm against the installed trl version.
trainer = SFTTrainer(
    # --- what to train, and how ---
    model=model,
    peft_config=peft_config,
    args=args,
    # --- data ---
    train_dataset=dataset_chatml['train'],
    eval_dataset=dataset_chatml['test'],
    dataset_text_field="text",
    # --- tokenisation ---
    tokenizer=tokenizer,
    max_seq_length=512,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/7912 [00:00<?, ? examples/s]

Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Using auto half precision backend


In [16]:
# 'trainer.train()' is a method that starts the training of the model. It uses the training dataset, model, and training arguments that were specified when the trainer was created.

# 'trainer.save_model()' is a method that saves the trained model to the output directory that was specified in the training arguments. It is not called in this cell.

# This cell only trains the model; checkpoints are still written at the end of each epoch because 'save_strategy' is set to "epoch".
# train
# Runs the fine-tuning loop with the trainer built in the previous cell.
# As the cell's last expression, the value returned by train() is displayed
# once training finishes.
trainer.train()

Currently training with a batch size of: 8
***** Running training *****
  Num examples = 7,912
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 1,482
  Number of trainable parameters = 8,912,896
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdhanishetty[0m ([33mdhanishetty-personaluse[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1482 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


{'loss': 1.0312, 'grad_norm': 0.26954472064971924, 'learning_rate': 6.711409395973155e-07, 'epoch': 0.02}
{'loss': 1.0375, 'grad_norm': 0.3037637174129486, 'learning_rate': 1.342281879194631e-06, 'epoch': 0.04}
{'loss': 1.0188, 'grad_norm': 0.2689313590526581, 'learning_rate': 2.013422818791946e-06, 'epoch': 0.06}
{'loss': 1.0688, 'grad_norm': 0.2708096206188202, 'learning_rate': 2.684563758389262e-06, 'epoch': 0.08}
{'loss': 1.0188, 'grad_norm': 0.3317864239215851, 'learning_rate': 3.3557046979865777e-06, 'epoch': 0.1}
{'loss': 1.0188, 'grad_norm': 0.27063092589378357, 'learning_rate': 4.026845637583892e-06, 'epoch': 0.12}
{'loss': 0.9812, 'grad_norm': 0.33866822719573975, 'learning_rate': 4.697986577181208e-06, 'epoch': 0.14}
{'loss': 1.0125, 'grad_norm': 0.41769060492515564, 'learning_rate': 5.369127516778524e-06, 'epoch': 0.16}
{'loss': 0.9875, 'grad_norm': 0.3837345540523529, 'learning_rate': 6.04026845637584e-06, 'epoch': 0.18}



***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.9625, 'grad_norm': 0.47730931639671326, 'learning_rate': 6.711409395973155e-06, 'epoch': 0.2}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.9498032927513123, 'eval_runtime': 1330.0919, 'eval_samples_per_second': 0.662, 'eval_steps_per_second': 0.083, 'epoch': 0.2}
{'loss': 0.9125, 'grad_norm': 0.47238677740097046, 'learning_rate': 7.382550335570471e-06, 'epoch': 0.22}
{'loss': 0.9062, 'grad_norm': 0.4599153697490692, 'learning_rate': 8.053691275167785e-06, 'epoch': 0.24}
{'loss': 0.8625, 'grad_norm': 0.6051740646362305, 'learning_rate': 8.724832214765101e-06, 'epoch': 0.26}
{'loss': 0.875, 'grad_norm': 0.6168507933616638, 'learning_rate': 9.395973154362416e-06, 'epoch': 0.28}
{'loss': 0.8438, 'grad_norm': 0.6269940137863159, 'learning_rate': 9.992573143285822e-06, 'epoch': 0.3}
{'loss': 0.7625, 'grad_norm': 0.622672438621521, 'learning_rate': 9.918304576144037e-06, 'epoch': 0.32}
{'loss': 0.7469, 'grad_norm': 0.6294471025466919, 'learning_rate': 9.844036009002252e-06, 'epoch': 0.34}
{'loss': 0.6875, 'grad_norm': 0.6378968954086304, 'learning_rate': 9.769767441860465e-06, 'epoch': 0.36}
{'loss': 0.6875, 'gra


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.625, 'grad_norm': 0.7076315879821777, 'learning_rate': 9.621230307576895e-06, 'epoch': 0.4}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.6202049851417542, 'eval_runtime': 1336.8117, 'eval_samples_per_second': 0.658, 'eval_steps_per_second': 0.082, 'epoch': 0.4}
{'loss': 0.5875, 'grad_norm': 0.7249865531921387, 'learning_rate': 9.54696174043511e-06, 'epoch': 0.42}
{'loss': 0.5875, 'grad_norm': 0.7473070621490479, 'learning_rate': 9.472693173293325e-06, 'epoch': 0.44}
{'loss': 0.55, 'grad_norm': 0.586567223072052, 'learning_rate': 9.39842460615154e-06, 'epoch': 0.47}
{'loss': 0.5406, 'grad_norm': 0.4127423167228699, 'learning_rate': 9.324156039009753e-06, 'epoch': 0.49}
{'loss': 0.5219, 'grad_norm': 0.3879745900630951, 'learning_rate': 9.249887471867967e-06, 'epoch': 0.51}
{'loss': 0.5312, 'grad_norm': 0.29754891991615295, 'learning_rate': 9.175618904726182e-06, 'epoch': 0.53}
{'loss': 0.5188, 'grad_norm': 0.28808385133743286, 'learning_rate': 9.101350337584397e-06, 'epoch': 0.55}
{'loss': 0.4875, 'grad_norm': 0.2309638112783432, 'learning_rate': 9.027081770442612e-06, 'epoch': 0.57}
{'loss': 0.4906, 'grad


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4719, 'grad_norm': 0.1953417956829071, 'learning_rate': 8.87854463615904e-06, 'epoch': 0.61}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.49614569544792175, 'eval_runtime': 1333.5755, 'eval_samples_per_second': 0.66, 'eval_steps_per_second': 0.082, 'epoch': 0.61}
{'loss': 0.475, 'grad_norm': 0.21932660043239594, 'learning_rate': 8.804276069017255e-06, 'epoch': 0.63}
{'loss': 0.4906, 'grad_norm': 0.23453430831432343, 'learning_rate': 8.73000750187547e-06, 'epoch': 0.65}
{'loss': 0.4969, 'grad_norm': 0.2580335736274719, 'learning_rate': 8.655738934733683e-06, 'epoch': 0.67}
{'loss': 0.5125, 'grad_norm': 0.2591034471988678, 'learning_rate': 8.5814703675919e-06, 'epoch': 0.69}
{'loss': 0.4875, 'grad_norm': 0.23238611221313477, 'learning_rate': 8.507201800450114e-06, 'epoch': 0.71}
{'loss': 0.4688, 'grad_norm': 0.20533055067062378, 'learning_rate': 8.432933233308328e-06, 'epoch': 0.73}
{'loss': 0.4625, 'grad_norm': 0.19051823019981384, 'learning_rate': 8.358664666166542e-06, 'epoch': 0.75}
{'loss': 0.4625, 'grad_norm': 0.20713599026203156, 'learning_rate': 8.284396099024757e-06, 'epoch': 0.77}
{'loss': 0.4875,


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4938, 'grad_norm': 0.20581768453121185, 'learning_rate': 8.135858964741187e-06, 'epoch': 0.81}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.48058387637138367, 'eval_runtime': 1135.4143, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 0.81}
{'loss': 0.4656, 'grad_norm': 0.21045073866844177, 'learning_rate': 8.0615903975994e-06, 'epoch': 0.83}
{'loss': 0.4813, 'grad_norm': 0.22445738315582275, 'learning_rate': 7.987321830457615e-06, 'epoch': 0.85}
{'loss': 0.4562, 'grad_norm': 0.21133555471897125, 'learning_rate': 7.91305326331583e-06, 'epoch': 0.87}
{'loss': 0.4656, 'grad_norm': 0.18097390234470367, 'learning_rate': 7.838784696174045e-06, 'epoch': 0.89}
{'loss': 0.4719, 'grad_norm': 0.24156509339809418, 'learning_rate': 7.764516129032258e-06, 'epoch': 0.91}
{'loss': 0.4844, 'grad_norm': 0.26140981912612915, 'learning_rate': 7.690247561890473e-06, 'epoch': 0.93}
{'loss': 0.4938, 'grad_norm': 0.23838533461093903, 'learning_rate': 7.615978994748688e-06, 'epoch': 0.95}
{'loss': 0.45, 'grad_norm': 0.21771842241287231, 'learning_rate': 7.541710427606901e-06, 'epoch': 0.97}
{'loss': 0.493

Saving model checkpoint to ./phi-3-mini-LoRA\checkpoint-494
loading configuration file config.json from cache at C:\Users\dhani\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_tok

{'loss': 0.4594, 'grad_norm': 0.2493448704481125, 'learning_rate': 7.393173293323331e-06, 'epoch': 1.01}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.4737878143787384, 'eval_runtime': 1136.3638, 'eval_samples_per_second': 0.774, 'eval_steps_per_second': 0.097, 'epoch': 1.01}
{'loss': 0.45, 'grad_norm': 0.17106139659881592, 'learning_rate': 7.3189047261815445e-06, 'epoch': 1.03}
{'loss': 0.4531, 'grad_norm': 0.20336796343326569, 'learning_rate': 7.244636159039759e-06, 'epoch': 1.05}
{'loss': 0.4656, 'grad_norm': 0.2725241482257843, 'learning_rate': 7.170367591897974e-06, 'epoch': 1.07}
{'loss': 0.4437, 'grad_norm': 0.16843701899051666, 'learning_rate': 7.09609902475619e-06, 'epoch': 1.09}
{'loss': 0.4781, 'grad_norm': 0.22322389483451843, 'learning_rate': 7.021830457614404e-06, 'epoch': 1.11}
{'loss': 0.4562, 'grad_norm': 0.19365207850933075, 'learning_rate': 6.947561890472618e-06, 'epoch': 1.13}
{'loss': 0.4781, 'grad_norm': 0.24105386435985565, 'learning_rate': 6.873293323330833e-06, 'epoch': 1.15}
{'loss': 0.4531, 'grad_norm': 0.20324228703975677, 'learning_rate': 6.799024756189048e-06, 'epoch': 1.17}
{'loss': 0.46


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4594, 'grad_norm': 0.19218124449253082, 'learning_rate': 6.650487621905476e-06, 'epoch': 1.21}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.4692101776599884, 'eval_runtime': 1135.1069, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 1.21}
{'loss': 0.4469, 'grad_norm': 0.19529859721660614, 'learning_rate': 6.576219054763691e-06, 'epoch': 1.23}
{'loss': 0.4688, 'grad_norm': 0.20329667627811432, 'learning_rate': 6.501950487621906e-06, 'epoch': 1.25}
{'loss': 0.4406, 'grad_norm': 0.20600296556949615, 'learning_rate': 6.4276819204801185e-06, 'epoch': 1.27}
{'loss': 0.4656, 'grad_norm': 0.21542847156524658, 'learning_rate': 6.353413353338334e-06, 'epoch': 1.29}
{'loss': 0.4688, 'grad_norm': 0.2248665690422058, 'learning_rate': 6.279144786196549e-06, 'epoch': 1.31}
{'loss': 0.4469, 'grad_norm': 0.20158429443836212, 'learning_rate': 6.204876219054763e-06, 'epoch': 1.33}
{'loss': 0.4594, 'grad_norm': 0.18824639916419983, 'learning_rate': 6.130607651912978e-06, 'epoch': 1.35}
{'loss': 0.4719, 'grad_norm': 0.23070336878299713, 'learning_rate': 6.056339084771194e-06, 'epoch': 1.38}
{'loss': 0


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4625, 'grad_norm': 0.20297852158546448, 'learning_rate': 5.907801950487622e-06, 'epoch': 1.42}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.465841144323349, 'eval_runtime': 1135.6889, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 1.42}
{'loss': 0.4656, 'grad_norm': 0.2178407907485962, 'learning_rate': 5.833533383345835e-06, 'epoch': 1.44}
{'loss': 0.4875, 'grad_norm': 0.4121418297290802, 'learning_rate': 5.759264816204051e-06, 'epoch': 1.46}
{'loss': 0.4688, 'grad_norm': 0.21672895550727844, 'learning_rate': 5.684996249062267e-06, 'epoch': 1.48}
{'loss': 0.4688, 'grad_norm': 0.25576478242874146, 'learning_rate': 5.61072768192048e-06, 'epoch': 1.5}
{'loss': 0.4688, 'grad_norm': 0.2130218744277954, 'learning_rate': 5.536459114778694e-06, 'epoch': 1.52}
{'loss': 0.4656, 'grad_norm': 0.22808252274990082, 'learning_rate': 5.46219054763691e-06, 'epoch': 1.54}
{'loss': 0.4531, 'grad_norm': 0.21590545773506165, 'learning_rate': 5.387921980495124e-06, 'epoch': 1.56}
{'loss': 0.4219, 'grad_norm': 0.20888525247573853, 'learning_rate': 5.313653413353338e-06, 'epoch': 1.58}
{'loss': 0.4688, 


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.45, 'grad_norm': 0.22784891724586487, 'learning_rate': 5.165116279069767e-06, 'epoch': 1.62}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.4635418951511383, 'eval_runtime': 1135.3214, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 1.62}
{'loss': 0.4531, 'grad_norm': 0.25325194001197815, 'learning_rate': 5.0908477119279835e-06, 'epoch': 1.64}
{'loss': 0.4406, 'grad_norm': 0.2111399918794632, 'learning_rate': 5.016579144786197e-06, 'epoch': 1.66}
{'loss': 0.45, 'grad_norm': 0.2343989461660385, 'learning_rate': 4.942310577644411e-06, 'epoch': 1.68}
{'loss': 0.4938, 'grad_norm': 0.2823880910873413, 'learning_rate': 4.8680420105026255e-06, 'epoch': 1.7}
{'loss': 0.4656, 'grad_norm': 0.1933375895023346, 'learning_rate': 4.79377344336084e-06, 'epoch': 1.72}
{'loss': 0.4594, 'grad_norm': 0.2351568341255188, 'learning_rate': 4.719504876219055e-06, 'epoch': 1.74}
{'loss': 0.4594, 'grad_norm': 0.22545936703681946, 'learning_rate': 4.645236309077269e-06, 'epoch': 1.76}
{'loss': 0.45, 'grad_norm': 0.19508497416973114, 'learning_rate': 4.570967741935484e-06, 'epoch': 1.78}
{'loss': 0.4531, 'g


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.5062, 'grad_norm': 0.22576721012592316, 'learning_rate': 4.422430607651914e-06, 'epoch': 1.82}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.46129822731018066, 'eval_runtime': 1135.2698, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 1.82}
{'loss': 0.475, 'grad_norm': 0.30549734830856323, 'learning_rate': 4.348162040510128e-06, 'epoch': 1.84}
{'loss': 0.4375, 'grad_norm': 0.21720968186855316, 'learning_rate': 4.273893473368343e-06, 'epoch': 1.86}
{'loss': 0.4688, 'grad_norm': 0.329385906457901, 'learning_rate': 4.199624906226557e-06, 'epoch': 1.88}
{'loss': 0.4219, 'grad_norm': 0.22188399732112885, 'learning_rate': 4.125356339084771e-06, 'epoch': 1.9}
{'loss': 0.4406, 'grad_norm': 0.22292594611644745, 'learning_rate': 4.0510877719429855e-06, 'epoch': 1.92}
{'loss': 0.4562, 'grad_norm': 0.2311718761920929, 'learning_rate': 3.9768192048011995e-06, 'epoch': 1.94}
{'loss': 0.4969, 'grad_norm': 0.2625870704650879, 'learning_rate': 3.902550637659415e-06, 'epoch': 1.96}
{'loss': 0.4344, 'grad_norm': 0.22446909546852112, 'learning_rate': 3.828282070517629e-06, 'epoch': 1.98}


Saving model checkpoint to ./phi-3-mini-LoRA\checkpoint-989
loading configuration file config.json from cache at C:\Users\dhani\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_tok

{'loss': 0.45, 'grad_norm': 0.20226581394672394, 'learning_rate': 3.754013503375844e-06, 'epoch': 2.0}



***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.425, 'grad_norm': 0.2027665674686432, 'learning_rate': 3.679744936234058e-06, 'epoch': 2.02}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.4599650204181671, 'eval_runtime': 1135.2182, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 2.02}
{'loss': 0.4562, 'grad_norm': 0.34359097480773926, 'learning_rate': 3.6054763690922734e-06, 'epoch': 2.04}
{'loss': 0.4656, 'grad_norm': 0.23517601191997528, 'learning_rate': 3.5312078019504874e-06, 'epoch': 2.06}
{'loss': 0.4688, 'grad_norm': 0.19516156613826752, 'learning_rate': 3.4569392348087023e-06, 'epoch': 2.08}
{'loss': 0.475, 'grad_norm': 0.23781506717205048, 'learning_rate': 3.3826706676669163e-06, 'epoch': 2.1}
{'loss': 0.4719, 'grad_norm': 0.24309015274047852, 'learning_rate': 3.3084021005251316e-06, 'epoch': 2.12}
{'loss': 0.4156, 'grad_norm': 0.18875692784786224, 'learning_rate': 3.2341335333833456e-06, 'epoch': 2.14}
{'loss': 0.4531, 'grad_norm': 0.21854715049266815, 'learning_rate': 3.159864966241561e-06, 'epoch': 2.16}
{'loss': 0.4625, 'grad_norm': 0.21810537576675415, 'learning_rate': 3.085596399099775e-06, 'epoch': 2.18}
{'loss


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4781, 'grad_norm': 0.23068256676197052, 'learning_rate': 2.9370592648162037e-06, 'epoch': 2.22}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.45880141854286194, 'eval_runtime': 1135.1761, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 2.22}
{'loss': 0.4531, 'grad_norm': 0.22971700131893158, 'learning_rate': 2.862790697674419e-06, 'epoch': 2.24}
{'loss': 0.4594, 'grad_norm': 0.2367987483739853, 'learning_rate': 2.788522130532633e-06, 'epoch': 2.26}
{'loss': 0.45, 'grad_norm': 0.27181336283683777, 'learning_rate': 2.7142535633908466e-06, 'epoch': 2.29}
{'loss': 0.4562, 'grad_norm': 0.24141797423362732, 'learning_rate': 2.6399849962490628e-06, 'epoch': 2.31}
{'loss': 0.4437, 'grad_norm': 0.21378837525844574, 'learning_rate': 2.5657164291072763e-06, 'epoch': 2.33}
{'loss': 0.4656, 'grad_norm': 0.25983524322509766, 'learning_rate': 2.491447861965491e-06, 'epoch': 2.35}
{'loss': 0.4625, 'grad_norm': 0.24909131228923798, 'learning_rate': 2.4171792948237052e-06, 'epoch': 2.37}
{'loss': 0.4531, 'grad_norm': 0.20975780487060547, 'learning_rate': 2.3429107276819205e-06, 'epoch': 2.39}
{'loss'


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4625, 'grad_norm': 0.24058225750923157, 'learning_rate': 2.1943735933983498e-06, 'epoch': 2.43}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.45796477794647217, 'eval_runtime': 1135.0513, 'eval_samples_per_second': 0.775, 'eval_steps_per_second': 0.097, 'epoch': 2.43}
{'loss': 0.4188, 'grad_norm': 0.2042124718427658, 'learning_rate': 2.120105026256564e-06, 'epoch': 2.45}
{'loss': 0.4719, 'grad_norm': 0.21342317759990692, 'learning_rate': 2.0458364591147787e-06, 'epoch': 2.47}
{'loss': 0.4656, 'grad_norm': 0.2069472372531891, 'learning_rate': 1.971567891972993e-06, 'epoch': 2.49}
{'loss': 0.4469, 'grad_norm': 0.21405424177646637, 'learning_rate': 1.897299324831208e-06, 'epoch': 2.51}
{'loss': 0.4313, 'grad_norm': 0.24022746086120605, 'learning_rate': 1.8230307576894217e-06, 'epoch': 2.53}
{'loss': 0.4437, 'grad_norm': 0.22467447817325592, 'learning_rate': 1.7487621905476375e-06, 'epoch': 2.55}
{'loss': 0.4437, 'grad_norm': 0.19682493805885315, 'learning_rate': 1.6744936234058513e-06, 'epoch': 2.57}
{'loss': 0.4594, 'grad_norm': 0.25713926553726196, 'learning_rate': 1.6002250562640665e-06, 'epoch': 2.59}
{'loss


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4625, 'grad_norm': 0.25819864869117737, 'learning_rate': 1.4516879219804958e-06, 'epoch': 2.63}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.4571300745010376, 'eval_runtime': 1269.9402, 'eval_samples_per_second': 0.693, 'eval_steps_per_second': 0.087, 'epoch': 2.63}
{'loss': 0.4437, 'grad_norm': 0.19933897256851196, 'learning_rate': 1.3774193548387098e-06, 'epoch': 2.65}
{'loss': 0.4437, 'grad_norm': 0.2092340737581253, 'learning_rate': 1.3031507876969247e-06, 'epoch': 2.67}
{'loss': 0.4688, 'grad_norm': 0.3511952757835388, 'learning_rate': 1.2288822205551387e-06, 'epoch': 2.69}
{'loss': 0.4531, 'grad_norm': 0.24058663845062256, 'learning_rate': 1.1546136534133527e-06, 'epoch': 2.71}
{'loss': 0.4938, 'grad_norm': 0.24044805765151978, 'learning_rate': 1.080345086271568e-06, 'epoch': 2.73}
{'loss': 0.4437, 'grad_norm': 0.2387939691543579, 'learning_rate': 1.006076519129782e-06, 'epoch': 2.75}
{'loss': 0.475, 'grad_norm': 0.183254674077034, 'learning_rate': 9.318079519879971e-07, 'epoch': 2.77}
{'loss': 0.4437, 'grad_norm': 0.25154319405555725, 'learning_rate': 8.575393848462112e-07, 'epoch': 2.79}
{'loss': 0.4


***** Running Evaluation *****
  Num examples = 880
  Batch size = 8


{'loss': 0.4594, 'grad_norm': 0.254890114068985, 'learning_rate': 7.090022505626403e-07, 'epoch': 2.83}


  0%|          | 0/110 [00:00<?, ?it/s]

{'eval_loss': 0.4567612111568451, 'eval_runtime': 1333.7426, 'eval_samples_per_second': 0.66, 'eval_steps_per_second': 0.082, 'epoch': 2.83}
{'loss': 0.4781, 'grad_norm': 0.2675899863243103, 'learning_rate': 6.347336834208555e-07, 'epoch': 2.85}
{'loss': 0.4344, 'grad_norm': 0.3255552053451538, 'learning_rate': 5.604651162790695e-07, 'epoch': 2.87}
{'loss': 0.4375, 'grad_norm': 0.21279893815517426, 'learning_rate': 4.861965491372846e-07, 'epoch': 2.89}
{'loss': 0.4344, 'grad_norm': 0.21352602541446686, 'learning_rate': 4.119279819954986e-07, 'epoch': 2.91}
{'loss': 0.4437, 'grad_norm': 0.36068883538246155, 'learning_rate': 3.3765941485371377e-07, 'epoch': 2.93}
{'loss': 0.4625, 'grad_norm': 0.2450450211763382, 'learning_rate': 2.6339084771192784e-07, 'epoch': 2.95}
{'loss': 0.4313, 'grad_norm': 0.20696929097175598, 'learning_rate': 1.8912228057014293e-07, 'epoch': 2.97}
{'loss': 0.475, 'grad_norm': 0.2487502098083496, 'learning_rate': 1.1485371342835697e-07, 'epoch': 2.99}


Saving model checkpoint to ./phi-3-mini-LoRA\checkpoint-1482
loading configuration file config.json from cache at C:\Users\dhani\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_to

{'train_runtime': 39036.4247, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.038, 'train_loss': 0.5239541160593792, 'epoch': 3.0}


TrainOutput(global_step=1482, training_loss=0.5239541160593792, metrics={'train_runtime': 39036.4247, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.038, 'total_flos': 1.7058661221040128e+17, 'train_loss': 0.5239541160593792, 'epoch': 2.9969666329625886})

In [22]:
# Authenticate this kernel session with the Hugging Face Hub before any upload cell runs.
# NOTE(review): this import lives mid-notebook; the other imports are in the first cell.
from huggingface_hub import notebook_login
# Renders an interactive login widget (the VBox shown in the cell output).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [42]:
# Save the model currently held by the trainer; the log output of this cell
# reports the destination as ./phi-3-mini-LoRA.
trainer.save_model()

Saving model checkpoint to ./phi-3-mini-LoRA
loading configuration file config.json from cache at C:\Users\dhani\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,


In [34]:
# Hub repository that receives the model. This overrides the placeholder value
# ("username/" + new_model) assigned to hf_model_repo in the configuration cell.
hf_model_repo = "dhanishetty/phi-3-mini-LoRA"
# Push trainer.model to that repository; the recorded output shows
# adapter_model.safetensors being uploaded and a CommitInfo for this repo.
trainer.model.push_to_hub(hf_model_repo)

loading configuration file config.json from cache at C:\Users\dhani\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dhanishetty/phi-3-mini-LoRA/commit/ff98a29e8a6f7c3f5d127833127984f755cdbc8f', commit_message='Upload model', commit_description='', oid='ff98a29e8a6f7c3f5d127833127984f755cdbc8f', pr_url=None, pr_revision=None, pr_num=None)

In [58]:
# Push the trainer's output (model, training args, tokenizer files) to the Hub.
#
# SECURITY: a Hugging Face access token used to be hardcoded in this call. It has
# been removed; treat the old token as compromised and revoke it in the Hub
# account settings. Authentication now relies on the credentials stored by the
# earlier notebook_login() cell (the trainer.model.push_to_hub cell already
# uploads successfully that way). For non-interactive runs, export HF_TOKEN in
# the environment instead of pasting a token into the notebook.
#
# NOTE(review): the first positional argument of Trainer.push_to_hub is the
# commit message, not a repository id -- the recorded CommitInfo shows
# commit_message='dhanishetty/phi-3-mini' on repo dhanishetty/phi-3-mini-LoRA.
# The value is kept unchanged and passed by keyword to make that explicit; the
# target repository is presumably taken from the TrainingArguments -- confirm,
# and replace the message with a descriptive one if a repo id was intended.
trainer.push_to_hub(commit_message="dhanishetty/phi-3-mini")

Saving model checkpoint to ./phi-3-mini-LoRA
loading configuration file config.json from cache at C:\Users\dhani\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\c1358f8a35e6d2af81890deffbbfa575b978c62f\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,


adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/dhanishetty/phi-3-mini-LoRA/commit/ef792f6f20ac9c0073ffc2aad380e7a1874e1257', commit_message='dhanishetty/phi-3-mini', commit_description='', oid='ef792f6f20ac9c0073ffc2aad380e7a1874e1257', pr_url=None, pr_revision=None, pr_num=None)