In [5]:
import torch
from transformers import LlamaForCausalLM as ModelCls
from transformers import LlamaTokenizerFast as TkCls

model_name = "../models/TinyLlama-1.1B-Chat-v0.4"
model: ModelCls = ModelCls.from_pretrained(
   model_name,
   device_map="auto",
   torch_dtype=torch.bfloat16,
)
tokenizer: TkCls = TkCls.from_pretrained(model_name)

In [18]:
import gzip
import json

def dump_json_gz(data, file_path):
   with gzip.open(file_path, "wt", encoding="UTF-8") as fp:
      json.dump(data, fp)
      
def dump_json(data, file_path):
   with open(file_path, "wt", encoding="UTF-8") as fp:
      json.dump(data, fp, ensure_ascii=False, indent=2)      

In [4]:
def build_prompt(user_input, assistant_output=""):
   template_template = """<|im_start|>user
<|Analysis|>   
{user_input}
<|im_end|>
<|im_start|>assistant
{assistant_output}"""
   return template_template.format(user_input=user_input, assistant_output=assistant_output)

def convert_train_row_to_prev_train_row(tokenizer, row):
   train_prompt = build_prompt(row['input'], row['output'])
   tokens = tokenizer.encode(train_prompt) + [tokenizer.eos_token_id]
   return tokens

def convert_train_data_to_prev_train_data(tokenizer, ds_data):
   result = list()
   for row in ds_data:
      result.append(convert_train_row_to_prev_train_row(tokenizer, row))
   return result

# 對訓練資料的 Padding 與推論時 Padding 的方向並不相同，在訓練時通常將 PAD Token 放在右邊，而推論時會放在左邊。
def padding_train_dataset(tokenizer, ds_tokens):
   maxlen = max(map(len, ds_tokens))
   ds_result = list()
   for tokens in ds_tokens:
      delta = maxlen - len(tokens)
      # 將 EOS Token 當作 PAD Token 來用
      tokens += [tokenizer.eos_token_id] * delta
      ds_result.append({
         "input_ids": tokens, 
         "labels": tokens
      })
   return ds_result


In [21]:
train_data = [
   {
      "input": "Code Review Short Checklist\nFor Reviewee:\n* Engage in discussion with the reviewer regarding the code and requirements.\n* Take notes of feedback and improvement suggestions.\n* Address identified improvements before the next review iteration.\nFor Reviewer:\n* Offer feedback on strengths and weaknesses of the code.\n* Give specific suggestions for improvement and guide on addressing issues.\n* Ensure that feedback helps enhance code quality and align with standards.",
      "output": "person:[\"Reviewee\",\"Reviewer\"]"
   },
   {
      "input": "Reviewee Pre-Review Preparation:\n* Are all preconditions and prerequisites met for the review?\n* Do reviewer have access to the necessary documentation and requirements?",
      "output": "person:[\"Reviewee\"]"
   },
   {
      "input": "Reviewer Understanding Requirements:\n* Does the solution align with the defined requirements?\n* Have Reviewee identified the target audience and their specific needs?\n* Have security requirements and permission control been confirmed?",
      "output": "person:[\"Reviewer\"]"
   }
]

ds_dataset = convert_train_data_to_prev_train_data(tokenizer, train_data)
ds_dataset = padding_train_dataset(tokenizer, ds_dataset)

dump_json(ds_dataset, "./results/train.json")
dump_json_gz(ds_dataset, f"./results/train.json.gz")
dump_json_gz(ds_dataset, f"./results/dev.json.gz")

In [23]:
import datasets
import shutil

data_files = {
   "train": "./results/train.json.gz",
   "dev": "./results/dev.json.gz",
}

shutil.rmtree("./cache/")

dataset = datasets.load_dataset(
   "json",
   data_files=data_files,
   cache_dir="cache",
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

In [27]:
from transformers import TrainingArguments

# 設定訓練參數
output_dir = "./results/TinyLlama-1B-Trained"
train_args = TrainingArguments(
   output_dir=output_dir,
   per_device_train_batch_size=2,
   per_device_eval_batch_size=2,
   eval_accumulation_steps=2,
   evaluation_strategy="steps",
   save_strategy="steps",
   eval_steps=25,
   save_steps=25,
   save_total_limit=3,
   num_train_epochs=3,
   load_best_model_at_end=True,
   bf16=True,
)

In [28]:
from transformers import Trainer

# 開始訓練模型
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["dev"],
)
trainer.train()

[codecarbon INFO @ 17:17:51] [setup] RAM Tracking...
[codecarbon INFO @ 17:17:51] [setup] GPU Tracking...
[codecarbon INFO @ 17:17:51] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:17:51] [setup] CPU Tracking...
[codecarbon INFO @ 17:17:53] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12700K
[codecarbon INFO @ 17:17:53] >>> Tracker's metadata:
[codecarbon INFO @ 17:17:53]   Platform system: Linux-5.15.133.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
[codecarbon INFO @ 17:17:53]   Python version: 3.10.12
[codecarbon INFO @ 17:17:53]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 17:17:53]   Available RAM : 15.619 GB
[codecarbon INFO @ 17:17:53]   CPU count: 20
[codecarbon INFO @ 17:17:53]   CPU model: 12th Gen Intel(R) Core(TM) i7-12700K
[codecarbon INFO @ 17:17:53]   GPU count: 1
[codecarbon INFO @ 17:17:53]   GPU model: 1 x NVIDIA GeForce RTX 3060


Step,Training Loss,Validation Loss


[codecarbon INFO @ 17:18:04] Energy consumed for RAM : 0.000014 kWh. RAM Power : 5.857309341430664 W
[codecarbon INFO @ 17:18:04] Energy consumed for all GPUs : 0.000190 kWh. Total GPU Power : 77.496 W
[codecarbon INFO @ 17:18:04] Energy consumed for all CPUs : 0.000104 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 17:18:04] 0.000308 kWh of electricity used since the beginning.
  df = pd.concat([df, pd.DataFrame.from_records([dict(data.values)])])


TrainOutput(global_step=6, training_loss=2.4738148053487143, metrics={'train_runtime': 8.8139, 'train_samples_per_second': 1.021, 'train_steps_per_second': 0.681, 'total_flos': 7206456066048.0, 'train_loss': 2.4738148053487143, 'epoch': 3.0})

In [29]:
# 儲存訓練完的模型
MODEL_TUNED = "./results/TinyLlama-1B"
trainer.save_model(MODEL_TUNED)

In [3]:
from vllm import LLM, SamplingParams

MODEL_TUNED = "./results/TinyLlama-1B"
llm = LLM(MODEL_TUNED, dtype="float16")

INFO 12-31 23:13:48 llm_engine.py:73] Initializing an LLM engine with config: model='./results/TinyLlama-1B', tokenizer='./results/TinyLlama-1B', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 12-31 23:16:34 llm_engine.py:223] # GPU blocks: 21423, # CPU blocks: 11915
INFO 12-31 23:16:34 model_runner.py:394] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-31 23:16:38 model_runner.py:437] Graph capturing finished in 3 secs.


In [5]:
sampling_params = SamplingParams(
   max_tokens=512,
   temperature=0.0,
   stop=["}"],
)

In [17]:
prompts = build_prompt("What is your name?")
results = llm.generate(prompts, sampling_params)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]


In [18]:
results

[RequestOutput(request_id=2, prompt='<|im_start|>user\n<|Analysis|>   \nWhat is your name?\n<|im_end|>\n<|im_start|>assistant\n', prompt_token_ids=[1, 32001, 1404, 13, 29966, 29989, 21067, 4848, 29989, 29958, 1678, 13, 5618, 338, 596, 1024, 29973, 13, 32002, 29871, 13, 32001, 20255, 13], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='Hi, I am Open Assistant. How can I help you today?<|im_end|> \n', token_ids=[18567, 29892, 306, 626, 4673, 4007, 22137, 29889, 1128, 508, 306, 1371, 366, 9826, 29973, 32002, 29871, 13, 2], cumulative_logprob=-5.01611244052674, logprobs=None, finish_reason=stop)], finished=True)]

In [20]:
text = results[0].outputs[0].text
text

'Hi, I am Open Assistant. How can I help you today?<|im_end|> \n'