In [2]:
import json

from data_prepare import samples
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import Trainer, TrainingArguments

from peft import get_peft_model, LoraConfig, TaskType


# Model

In [5]:
# Directory of the local base-model checkpoint; reused below for both the tokenizer and the model.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
model_name = '/Users/kaichen/workspace/data/models/deepseekr1-1.5b'

# Only the tokenizer is loaded here; the model is loaded later, in the Quantization cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Data

In [6]:
# Serialize every sample as one JSON object per line (JSON Lines).
# ensure_ascii=False keeps non-ASCII characters verbatim instead of \u-escaping them.
with open('datasets.jsonl', 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in samples
    )
print('Data saved to datasets.jsonl')


Data saved to datasets.jsonl


# Train and test set

In [7]:
# Read the JSONL file back through the `json` loader, selecting its single "train" split.
dataset = load_dataset("json", data_files={"train": "datasets.jsonl"}, split="train")

Generating train split: 50 examples [00:00, 4107.15 examples/s]


In [8]:
# Sanity check: number of examples that were loaded.
len(dataset)

50

In [9]:
# Hold out 10% of the examples for evaluation; the fixed seed keeps the split reproducible.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']


In [10]:
# Report the size of each split, one value per line (train first, then test).
print(len(train_dataset), len(test_dataset), sep='\n')

45
5


# Tokenizer

In [11]:
def tokenizer_function(examples, tokenizer, max_length=512):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Each pair is joined as "<prompt>\\n<completion>" and tokenized with padding
    and truncation to exactly ``max_length`` tokens.

    Args:
        examples: Batch mapping with parallel lists under 'prompt' and 'completion'.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True, max_length=...)``
            that returns a mapping containing 'input_ids' and, normally,
            'attention_mask'.
        max_length: Fixed sequence length to pad/truncate to (default 512).

    Returns:
        The tokenizer output with an added 'labels' entry. Labels equal the
        input ids on real tokens and -100 on padding positions, so padding is
        excluded from the loss.
    """
    text = [f"{prompt}\n{completion}" for prompt, completion in zip(examples['prompt'], examples['completion'])]
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=max_length)

    if 'attention_mask' in tokens:
        # Mask by attention_mask rather than by pad-token id: the pad token may
        # coincide with a real special token (e.g. EOS), which must stay trainable.
        # -100 is the label value the loss ignores.
        tokens['labels'] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokens['input_ids'], tokens['attention_mask'])
        ]
    else:
        # No mask available: fall back to a plain (per-row) copy of the ids.
        tokens['labels'] = [list(ids) for ids in tokens['input_ids']]

    return tokens


In [15]:
def _tokenize_batch(examples):
    """Adapter for `Dataset.map`: tokenize one batch with the notebook-level tokenizer."""
    return tokenizer_function(examples, tokenizer)


# Apply the same batched tokenization to both splits.
tokenized_train_dataset = train_dataset.map(_tokenize_batch, batched=True)
tokenized_eval_dataset = test_dataset.map(_tokenize_batch, batched=True)



Map: 100%|██████████| 45/45 [00:00<00:00, 3865.96 examples/s]


In [14]:
# Inspect one tokenized example: the original text columns plus the tokenizer outputs.
# NOTE(review): this prints the full max-length id lists; consider showing a slice instead.
print(tokenized_train_dataset[0])

{'prompt': 'Question 28: Why is it important to practice dynamics in singing?', 'completion': 'Answer 28: Dynamics add emotion and variety to your performance, making your singing more expressive and engaging.', 'input_ids': [151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643

# Quantization

In [None]:
# Request 8-bit weight quantization (bitsandbytes) for the base model.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# The BitsAndBytesConfig has to be passed as `quantization_config`. The `config`
# keyword is the slot for the model's own PretrainedConfig, so passing the
# quantization settings there does not quantize the model.
# If explicit placement is needed, `device_map` takes a string such as "auto"
# (not a set like {"auto"}).
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)




# LoRA

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Scaling factor for the low-rank update
    lora_dropout=0.1,  # Dropout rate applied in the LoRA layers
    task_type=TaskType.CAUSAL_LM,  # Task type
)

# Wrap the base model with the adapters.
# NOTE: this rebinds `model`, so re-running the cell wraps an already-wrapped
# model; reload the base model first when re-running.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable versus the total.
# (PEFT models expose print_trainable_parameters(); there is no print_config().)
model.print_trainable_parameters()

# Training arguments

In [None]:
# Hyperparameters and bookkeeping for the Trainer.
# NOTE(review): with the 45 training examples shown above, batch size 1 and
# 8 accumulation steps, one epoch is only about 5-6 optimizer steps (assuming a
# single device), so the 10-step eval/logging intervals will not fire within a
# single epoch. Lower them or train longer if intermediate metrics are wanted.
training_args = TrainingArguments(
    output_dir='./fine_tuned_models',  # output directory
    num_train_epochs=1,                # total number of training epochs
    per_device_train_batch_size=1,     # batch size per device during training
    gradient_accumulation_steps=8,     # number of micro-batches accumulated before each optimizer update
    fp16=True,                         # use 16-bit (mixed) precision training instead of 32-bit
    # Without a step-based strategy `eval_steps` is ignored, because evaluation
    # is disabled by default. (Older transformers releases spell this keyword
    # `evaluation_strategy`.)
    eval_strategy='steps',
    eval_steps=10,                     # number of update steps between two evaluations
    learning_rate=5e-5,                # learning rate used for training
    logging_dir='./logs',              # directory for storing logs
    logging_steps=10,                  # number of update steps between two log entries
    run_name='deepseek-r1-sft-distill',
)

# Trainer

In [None]:
# Bind the model, the training arguments and the two tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  # used by trainer.train()
    eval_dataset=tokenized_eval_dataset,    # held-out split passed for evaluation
)

# Train

In [None]:
# Run fine-tuning. Saving happens in the next cell, into a separate directory,
# rather than back into the base checkpoint at `model_name`.
trainer.train()

# Save trained model

In [None]:
# Write the model and the tokenizer into the same directory so they can be reloaded together.
save_path = './saved_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)