In [1]:
!pip install transformers datasets trl huggingface_hub

Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.15.2


In [2]:

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
from peft import PeftModel
import torch

# Load base model name
base_model_name = "Qwen/Qwen2.5-7B-Instruct"
checkpoint = "vuha2003/Qwen2.5-7B-instruct-ESITime-checkpoint-480"

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",  # Enables multi-GPU support
    torch_dtype=torch.bfloat16,  # Reduce memory usage
)

# Load fine-tuned LoRA checkpoint
model = PeftModel.from_pretrained(model, checkpoint)

model.train()
model.enable_input_require_grads()

# Only allow LoRA parameters to have gradients
for name, param in model.named_parameters():
    if "lora" in name.lower():  # Only LoRA layers
        param.requires_grad = True
    else:
        param.requires_grad = False  # Freeze the rest

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Set finetune name
finetune_name = "ESI"

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

# Generate the train_prompt:

In [3]:
train_prompt_style = """
### Instruction:
You are an expert with advanced knowledge in solving temporal reasoning problems.
Before answering, classify the task given and then create a step-by-step reasoning to ensure a logical and accurate response.
Please answer the following questions.

### Question:
{}
### Task:
{}
### Reasoning:
{}
### Final Answer:
{}"""

In [4]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    inputs = examples["question"]
    cots = examples["reasoning"]  # Có thể chứa giá trị None
    outputs = examples["reasoner_answer"]
    tasks = examples["task"]
    texts = []

    for input, task, cot_list, output in zip(inputs,tasks, cots, outputs ):
        if cot_list is None:  
            cot = "N/A"
        elif isinstance(cot_list, list):  
            cot = "\n".join(map(str, cot_list))
        else:  
            cot = str(cot_list)

        text = train_prompt_style.format(input, task, cot, output) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

# Split data:

In [5]:
from datasets import load_dataset
dataset = load_dataset("ESITime/timesi-arithmetic", split = "train",trust_remote_code=True)
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(1920, 5408))  # 5408 because range() is exclusive
# split_dataset = dataset.train_test_split(test_size=0.2)

train_dataset = dataset
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
# val_dataset = val_dataset.map(formatting_prompts_func, batched=True)
# print("val data example:")
# print(val_dataset["text"][0])
print("train data example:")
print(train_dataset["text"][0])
print(train_dataset["text"][1])



README.md:   0%|          | 0.00/477 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5408 [00:00<?, ? examples/s]

Map:   0%|          | 0/3488 [00:00<?, ? examples/s]

train data example:

### Instruction:
You are an expert with advanced knowledge in solving temporal reasoning problems.
Before answering, classify the task given and then create a step-by-step reasoning to ensure a logical and accurate response.
Please answer the following questions.

### Question:
If a contract started in August and ends in February (of the next year), how many months does it last?
### Task:
ari_month
### Reasoning:
Step 1: Identify the start month and the end month.
Step 2: Start month is August and end month is February of the next year.
Step 3: Count the number of months from August to December in the starting year.
Step 4: August, September, October, November, December are 5 months.
Step 5: Count the number of months from January to February in the next year.
Step 6: January, February are 2 months.
Step 7: Add the months from both periods to get the total duration: 5 months + 2 months = 7 months.
### Final Answer:
7 months<|im_end|>

### Instruction:
You are an ex

# Setting up models


In [6]:
from peft import LoraConfig

# TODO: Configure LoRA parameters
# r: rank dimension for LoRA update matrices (smaller = more compression)
rank_dimension = 8
# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation)
lora_alpha = 8
# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)
lora_dropout = 0.05

peft_config = LoraConfig(
    r=rank_dimension,  # Rank dimension - typically between 4-32
    lora_alpha=lora_alpha,  # LoRA scaling factor - typically 2x rank
    lora_dropout=lora_dropout,  # Dropout probability for LoRA layers
    bias="none",  # Bias type for LoRA. the corresponding biases will be updated during training.
    target_modules="all-linear",  # Which modules to apply LoRA to
    task_type="CAUSAL_LM",  # Task type for model architecture
)

In [7]:
args = SFTConfig(
    # Output settings
    output_dir=finetune_name,  # Directory to save model checkpoints
    # Training duration
    num_train_epochs=1,  # Number of training epochs
    dataset_text_field="text",# Focus on what column we train
    # Batch size settings
    # max_steps=480,
    per_device_train_batch_size=2,  # Batch size per GPU
    # per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch
    # Memory optimization
    gradient_checkpointing=True,  # Trade compute for memory savings
    # Optimizer settings
    optim="adamw_torch_fused",  # Use fused AdamW for efficiency
    learning_rate=2e-4,  # Learning rate (QLoRA paper)
    max_grad_norm=0.3,  # Gradient clipping threshold
    # Learning rate schedule
    warmup_ratio=0.03,  # Portion of steps for warmup
    lr_scheduler_type="constant",  # Keep learning rate constant after warmup
    # Logging and saving
    logging_steps=60,  # Log metrics every N steps
    save_steps=60,  # Save checkpoint every epoch
    # save_strategy="epoch", 
    # eval_strategy="steps",
    
    # Precision settings
    bf16=True,  # Use bfloat16 precision
    # Integration settings
    push_to_hub=False,  # Don't push to HuggingFace Hub
    report_to="none",  # Disable external logging
)

In [8]:
from transformers import TrainingArguments, EarlyStoppingCallback
max_seq_length = 2160  # max sequence length for model and packing of the dataset
tokenizer.model_max_length = max_seq_length  # Adjust the sequence length
# Create SFTTrainer with LoRA configuration
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    peft_config=peft_config,  # LoRA configuration
    # max_seq_length=max_seq_length,  # Maximum sequence length
    tokenizer=tokenizer,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Dừng nếu không cải thiện sau 3 epoch
    # packing=True,  # Enable input packing for efficiency
    # dataset_kwargs={
    #     "add_special_tokens": False,  # Special tokens handled by template
    #     "append_concat_token": False,  # No additional separator needed
    # },
)

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/3488 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3488 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3488 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2572 > 2160). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/3488 [00:00<?, ? examples/s]

# Train model

In [None]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model("ESI/final")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
60,0.677
120,0.6195
180,0.5846
240,0.61
300,0.5959
360,0.5875
420,0.6221
480,0.5414
540,0.5808


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]