In [2]:
# A notebook that fine-tunes a pretrained model on a custom dataset

# https://www.youtube.com/watch?v=ZqoZDI0p1aI
# Fine Tune DeepSeek Model on your Custom Dataset

In [5]:
# !pip install datasets sympy wandb
# !pip install --no-cache-dir bitsandbytes

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face Hub id of the base checkpoint that is fine-tuned in this notebook.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the GPU when one is available; fall back to CPU instead of raising on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [2]:
# Display the loaded model's module tree (embedding size, decoder layers,
# attention/MLP projection shapes) as the cell output.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [3]:
# custom dataset
import json 

# Training samples for the fine-tuning demo. Each entry pairs a clinical note
# ("prompt") with its short summary ("completion"). Five samples are only enough
# to exercise the pipeline; a real fine-tune needs far more.
# Source:
# https://github.com/aws-samples/amazon-sagemaker-fine-tuning-seq2seq-model-example/blob/main/sample_data/test.csv
# NOTE(review): the first four prompts are all labelled "note 1" and the fifth
# "note 2", while the completions are numbered "summary 1" .. "summary 5" —
# confirm against the source CSV whether this labelling is intended.
samples = [
    {
        "prompt": "note 1: Patient was admitted to the hospital with acute pancreatitis. CT scan showed pancreatic necrosis, and the patient was started on antibiotics and enteral nutrition. The patient's heart rate was 110 beats per minute, and blood pressure was 150/100 mmHg. The patient's condition improved over the course of treatment, and repeat imaging showed resolution of the necrosis.",
        "completion": "summary 1: Acute pancreatitis, antibiotics, enteral nutrition, HR 110, BP 150/100."
    },
    {
        "prompt": "note 1: Patient was admitted to the hospital with acute exacerbation of heart failure. Echocardiogram showed reduced ejection fraction and severe aortic stenosis. The patient was started on diuretics and underwent aortic valve replacement surgery. The patient's heart rate was 80 beats per minute, and blood pressure was 120/80 mmHg. The patient's cardiac function improved over the course of treatment.",
        "completion": "summary 2: Heart failure exacerbation, aortic stenosis, diuretics, surgery, HR 80, BP 120/80."
    },
    {
        "prompt": "note 1: Patient has a history of type 2 diabetes and was admitted to the hospital with hyperglycemia. Blood glucose levels were elevated at 400 mg/dL on arrival, and insulin therapy was initiated. The patient's heart rate was 100 beats per minute, and blood pressure was 140/90 mmHg. The patient's blood glucose levels improved over the course of treatment.",
        "completion": "summary 3: Type 2 diabetes, hyperglycemia, insulin therapy, HR 100, BP 140/90."
    },
    {
        "prompt": "note 1: Patient was admitted to the ICU with severe traumatic brain injury. CT scan showed diffuse axonal injury, and the patient underwent intracranial pressure monitoring and was started on barbiturate therapy. The patient's heart rate was 70 beats per minute, and blood pressure was 110/70 mmHg. The patient's neurological status improved over the course of treatment, and repeat imaging showed reduction in the intracranial pressure.",
        "completion": "summary 4: Traumatic brain injury, diffuse axonal injury, ICP monitoring, barbiturate therapy, HR 70, BP 110/70."
    },
    {
        "prompt": "note 2: Patient presented with fever and productive cough. Chest X-ray showed a cavitary lesion in the right upper lobe, and the patient was diagnosed with pulmonary tuberculosis. The patient was started on antituberculosis therapy and monitored for side effects. The patient's heart rate was 90 beats per minute, and blood pressure was 130/80 mmHg. The patient's condition improved over the course of treatment.",
        "completion": "summary 5: Pulmonary tuberculosis, antituberculosis therapy, HR 90, BP 130/80."
    }
]

In [4]:
# Serialize the samples as JSON Lines: one JSON object per line, UTF-8 encoded,
# with non-ASCII characters written as-is rather than escaped.
file_name = "custom_dataset.jsonl"
with open(file_name, mode="w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in samples
    )
print(f"{file_name} is created!")

custom_dataset.jsonl is created!


In [None]:
# !pwd; ls -l custom_dataset.jsonl

In [6]:
# login wandb
import wandb
# wandb.login()   # uncomment to login once to view fine-tune metrics

In [None]:
# Start the Weights & Biases run that receives the training metrics.
# The config dict is descriptive metadata shown next to the run; it is kept in
# step with the values passed to TrainingArguments further down (learning rate
# 3e-5, 50 epochs) so the dashboard does not report hyperparameters that were
# never used.
wandb.init(
    project="fine-tune-custom-dataset",
    config={
        "learning_rate": 3e-5,
        "architecture": "DeepSeek-R1-Distill-Qwen-1.5B",
        "dataset": "custom_dataset.jsonl",
        "epochs": 50,
    },
)

In [8]:
from datasets import load_dataset
# Read the JSON Lines file back as a single `train` split and report its size.
dataset = load_dataset(
    "json",
    data_files="custom_dataset.jsonl",
    split="train",
)
print(dataset.num_rows)

Generating train split: 0 examples [00:00, ? examples/s]

5


In [9]:
# Hold out 10% of the samples for evaluation. The split shuffles the data, so a
# fixed seed is passed to make the same example land in the eval split on every
# run (Restart & Run All stays reproducible).
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [10]:
def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Args:
        examples: Batch mapping with parallel "prompt" and "completion" lists,
            as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        "labels" entry: a copy of "input_ids" in which every padded position
        is replaced by -100.
    """
    combined_texts = [
        f'{prompt}\n{completion}'
        for prompt, completion in zip(examples["prompt"], examples["completion"])
    ]
    tokenized = tokenizer(combined_texts, truncation=True, max_length=512, padding="max_length")
    # Every sequence is padded to 512 tokens, so a plain copy of input_ids would
    # make the model spend most of its loss on predicting pad tokens. -100 is the
    # label value the Hugging Face causal-LM loss ignores, so padded positions
    # (attention_mask == 0) are masked out.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize both splits in batched mode (train first, then eval).
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
# !pip install 'accelerate>=0.26.0'
# !pip install -U bitsandbytes

In [None]:
# for cpu training
# !pip install 'accelerate>=0.26.0'

#
from transformers import BitsAndBytesConfig
# Reload the base checkpoint with its weights quantized to 8-bit through
# bitsandbytes; this rebinds `model`, replacing the copy loaded earlier.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    # device_map="auto",  # alternative: let the library place the layers
)

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 16, dropout 0.05.
# The variable was previously misspelled `loar_config`, so the
# `get_peft_model(model, lora_config)` call below raised a NameError.
lora_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, task_type=TaskType.CAUSAL_LM
)
# Wrap the model with the adapter and print how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the fine-tuning run. The import and three keyword names
# were misspelled (`TrainingArgument`, `logging_step`, `evaluation_stragegy`,
# `learning_rage`) and raised ImportError / TypeError before training started.
training_args = TrainingArguments(
    output_dir="./deepseek_finetuned",
    num_train_epochs=50,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,  # gradients are accumulated over 16 batches per optimizer step
    fp16=True,
    logging_steps=10,
    save_steps=100,
    # Evaluate every `eval_steps` steps. Releases older than transformers 4.41
    # spell this argument `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=10,
    learning_rate=3e-5, # 0.00003
    logging_dir="./logs",
    report_to="wandb",
    run_name="DeepSeek_FineTuning_Experiment",
)

In [None]:
# Bundle the LoRA-wrapped model, the training arguments and the two tokenized
# splits into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the fine-tuning loop with the model, arguments and datasets held by `trainer`.
trainer.train()

In [None]:
# Persist the fine-tuned (LoRA-wrapped) model and the tokenizer side by side,
# then echo the target directory.
save_path = "./deepseek_finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(save_path)

In [None]:
# merge lora 
from peft import PeftModel

# Reload the base checkpoint without quantization and attach the LoRA adapter
# that was saved at `save_path`.
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, save_path)   # base model + lora
# Fold the adapter weights into the base weights and drop the PEFT wrappers.
model = model.merge_and_unload()  # merge model

# NOTE(review): this is the same directory the adapter was saved to
# ("./deepseek_finetuned"), so merged weights and adapter files end up side by
# side. Presumably a later `from_pretrained` on this folder could pick up the
# adapter config instead of the merged weights — confirm, and consider a
# separate directory for the merged model.
final_save_path="./deepseek_finetuned"
model.save_pretrained(final_save_path)
# Was `tokenzier`, an undefined name that raised NameError after the model was saved.
tokenizer.save_pretrained(final_save_path)
print(f'{final_save_path}')

In [None]:
# Load the saved model and tokenizer back from disk (model first, then tokenizer).
model, tokenizer = (
    AutoModelForCausalLM.from_pretrained(final_save_path),
    AutoTokenizer.from_pretrained(final_save_path),
)

In [None]:
from transformers import pipeline
# Wrap the reloaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Ask the fine-tuned model to summarize one of the clinical notes.
prompt="Summarize the text - Patient was admitted to the hospital with acute pancreatitis. CT scan showed pancreatic necrosis, and the patient was started on antibiotics and enteral nutrition. The patient's heart rate was 110 beats per minute, and blood pressure was 150/100 mmHg. The patient's condition improved over the course of treatment, and repeat imaging showed resolution of the necrosis."
# The generation argument is `num_return_sequences` (plural); the misspelled
# singular form is not a recognised generation parameter.
generated_texts = pipe(prompt, max_length=300, num_return_sequences=1)
# The pipeline returns one dict per returned sequence; take the text of the first.
generated_text = generated_texts[0]['generated_text']