In [5]:
import torch
from transformers import LlamaForCausalLM as ModelCls
from transformers import LlamaTokenizerFast as TkCls

# Relative path of the TinyLlama chat checkpoint; the model and the tokenizer
# are both loaded from this same location.
model_name = "../models/TinyLlama-1.1B-Chat-v0.4"

# Request bfloat16 weights and let device_map="auto" decide device placement.
model: ModelCls = ModelCls.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer: TkCls = TkCls.from_pretrained(model_name)

In [18]:
import gzip
import json

def dump_json_gz(data, file_path):
    """Write ``data`` as JSON into a gzip-compressed UTF-8 text file.

    Args:
        data: Any object accepted by ``json.dump``.
        file_path: Destination path of the ``.gz`` file; it is overwritten
            if it already exists.

    The JSON is written with ``json.dump`` defaults (compact, non-ASCII
    characters escaped).
    """
    with gzip.open(file_path, mode="wt", encoding="UTF-8") as gz_out:
        json.dump(data, gz_out)
      
def dump_json(data, file_path):
    """Write ``data`` as human-readable JSON to a UTF-8 text file.

    Args:
        data: Any object accepted by ``json.dump``.
        file_path: Destination path; it is overwritten if it already exists.

    The output is indented by two spaces and non-ASCII characters are kept
    verbatim instead of being escaped.
    """
    with open(file_path, mode="wt", encoding="UTF-8") as text_out:
        json.dump(data, text_out, indent=2, ensure_ascii=False)

In [6]:
def build_train_prompt(user_input, assistant_output):
    """Render one training example in the notebook's chat-style prompt layout.

    Args:
        user_input: Text placed in the user turn, below the ``<|Analysis|>`` line.
        assistant_output: Text placed directly after the assistant header.

    Returns:
        The formatted prompt string. The assistant turn is left open (no
        closing ``<|im_end|>`` after the assistant text).
    """
    # The template is spelled out line by line so that the three trailing
    # spaces after "<|Analysis|>" stay visible; they are part of the prompt.
    prompt_lines = (
        "<|im_start|>user",
        "<|Analysis|>   ",
        "{user_input}",
        "<|im_end|>",
        "<|im_start|>assistant",
        "{assistant_output}",
    )
    prompt_template = "\n".join(prompt_lines)
    return prompt_template.format(
        user_input=user_input,
        assistant_output=assistant_output,
    )

def convert_train_row_to_prev_train_row(tokenizer, row):
    """Turn one raw example into a list of token ids ending with EOS.

    Args:
        tokenizer: Object providing ``encode(text)`` and ``eos_token_id``.
        row: Mapping with the keys ``'input'`` and ``'output'``.

    Returns:
        The ids produced by ``tokenizer.encode`` for the rendered prompt,
        followed by a single ``tokenizer.eos_token_id``.
    """
    prompt_text = build_train_prompt(row['input'], row['output'])
    prompt_ids = tokenizer.encode(prompt_text)
    # Close the sequence explicitly with the end-of-sequence id.
    return prompt_ids + [tokenizer.eos_token_id]

def convert_train_data_to_prev_train_data(tokenizer, ds_data):
    """Tokenize every example of ``ds_data``.

    Args:
        tokenizer: Passed through to ``convert_train_row_to_prev_train_row``.
        ds_data: Iterable of example rows.

    Returns:
        A list with one token-id sequence per row, in input order.
    """
    return [
        convert_train_row_to_prev_train_row(tokenizer, example)
        for example in ds_data
    ]

# Padding direction differs between training and inference: for training the
# PAD tokens are usually placed on the right, for inference on the left.
def padding_train_dataset(tokenizer, ds_tokens):
    """Right-pad every token sequence to the length of the longest one.

    The EOS token id is used as the padding id.

    Args:
        tokenizer: Object exposing ``eos_token_id``.
        ds_tokens: Iterable of token-id sequences. The sequences are not
            modified; padded copies are returned instead.

    Returns:
        A list of ``{"input_ids": [...], "labels": [...]}`` dicts, one per
        input sequence and in input order. ``labels`` is an independent copy
        of ``input_ids``. An empty input yields an empty list.

    NOTE(review): padded positions carry the EOS id in ``labels`` as well;
    whether they contribute to the loss depends on the training code, which
    is not visible here.
    """
    # Materialize once so that generators work and so the emptiness check
    # below cannot consume the input.
    sequences = [list(tokens) for tokens in ds_tokens]
    if not sequences:
        # max() would raise ValueError on an empty dataset.
        return []

    maxlen = max(map(len, sequences))
    # Use the EOS token as the PAD token.
    pad_id = tokenizer.eos_token_id

    ds_result = []
    for tokens in sequences:
        padded = tokens + [pad_id] * (maxlen - len(tokens))
        ds_result.append({
            "input_ids": padded,
            # Separate list object so that editing one field later cannot
            # silently change the other.
            "labels": list(padded),
        })
    return ds_result


In [21]:
# Hand-written examples: "input" is a code-review checklist excerpt and
# "output" is the target completion string for that excerpt.
train_data = [
    {
        "input": (
            "Code Review Short Checklist\n"
            "For Reviewee:\n"
            "* Engage in discussion with the reviewer regarding the code and requirements.\n"
            "* Take notes of feedback and improvement suggestions.\n"
            "* Address identified improvements before the next review iteration.\n"
            "For Reviewer:\n"
            "* Offer feedback on strengths and weaknesses of the code.\n"
            "* Give specific suggestions for improvement and guide on addressing issues.\n"
            "* Ensure that feedback helps enhance code quality and align with standards."
        ),
        "output": 'person:["Reviewee","Reviewer"]',
    },
    {
        "input": (
            "Reviewee Pre-Review Preparation:\n"
            "* Are all preconditions and prerequisites met for the review?\n"
            "* Do reviewer have access to the necessary documentation and requirements?"
        ),
        "output": 'person:["Reviewee"]',
    },
    {
        "input": (
            "Reviewer Understanding Requirements:\n"
            "* Does the solution align with the defined requirements?\n"
            "* Have Reviewee identified the target audience and their specific needs?\n"
            "* Have security requirements and permission control been confirmed?"
        ),
        "output": 'person:["Reviewer"]',
    },
]

# Tokenize every example, then pad all sequences to a common length.
ds_dataset = padding_train_dataset(
    tokenizer,
    convert_train_data_to_prev_train_data(tokenizer, train_data),
)

# Readable copy for inspection, plus the gzipped files used as dataset splits.
dump_json(ds_dataset, "./results/train.json")
dump_json_gz(ds_dataset, "./results/train.json.gz")
# NOTE: the dev split is written from exactly the same rows as the train split.
dump_json_gz(ds_dataset, "./results/dev.json.gz")

In [23]:
import datasets
import shutil

# Split name -> gzipped JSON file to load for that split.
data_files = {
    "train": "./results/train.json.gz",
    "dev": "./results/dev.json.gz",
}

# Remove the previous cache directory before loading, presumably so that
# load_dataset rebuilds the splits from the current files. On a fresh
# checkout the directory does not exist yet; that is fine and must not abort
# the cell, so only the "missing directory" case is tolerated.
try:
    shutil.rmtree("./cache/")
except FileNotFoundError:
    pass

dataset = datasets.load_dataset(
    "json",
    data_files=data_files,
    cache_dir="cache",
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]