# Preprocess Datasets for Korean LLM (Large Language Model) fine-tuning
---

- Alpaca 논문에서 전처리했던 방식대로 전처리 수행
- 허깅페이스 인증 정보 설정: `huggingface-cli login`
    - https://huggingface.co/join
    - https://huggingface.co/settings/tokens

In [None]:
%load_ext autoreload
%autoreload 2
# Extend the module search path with the notebook's sibling helper
# directories so the imports below (and in later cells) can be resolved.
import sys
sys.path.append('../utils')
sys.path.append('../templates')

# NOTE(review): common_lib is presumably located in one of the directories
# appended above -- confirm. check_packages() is invoked with no arguments and
# its return value, if any, is discarded; what it checks is not visible here.
from common_lib import check_packages
check_packages()

<br>

## 1. Download LLM from Hugging Face hub
---

### Load dataset
허깅페이스 허브에서 데이터 세트를 다운로드하거나 로컬의 json/jsonl 포맷 데이터 세트를 로드합니다. 데이터 세트 내 샘플은 (`instruction, input, output`)의 key-value나 (`instruction, output`)의 key-value로 구성되어야 합니다.

예시:
```
{
    "instruction":"건강을 유지하기 위한 세 가지 팁을 알려주세요.",
    "input":"",
    "output":"세 가지 팁은 아침식사를 꼭 챙기며, 충분한 수면을 취하고, 적극적으로 운동을 하는 것입니다."
}
```

In [None]:
import os
import torch
import transformers
from datasets import load_dataset
from inference_lib import Prompter
from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast

# Dataset source: either a Hugging Face hub dataset id or a path to a local
# JSON / JSON Lines file. Uncomment one of the alternatives to switch.
data_path = "nlpai-lab/kullm-v2"
#data_path = "beomi/KoAlpaca-v1.1a"
#data_path = "./data/ko_alpaca_data.json"

# Paths ending in .json/.jsonl are read through the generic "json" loader;
# any other value is handed to load_dataset unchanged.
data = (
    load_dataset("json", data_files=data_path)
    if data_path.endswith((".json", ".jsonl"))
    else load_dataset(data_path)
)

# NOTE(review): presumably selects the prompt template named "kullm" -- confirm
# against inference_lib.Prompter.
prompter = Prompter("kullm")
# Maximum sequence length; used as max_length when tokenizing.
cutoff_len = 2048
# When False, the prompt part of each sample is masked out of the labels.
train_on_inputs = True

In [None]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download

HF_MODEL_ID = "nlpai-lab/kullm-polyglot-12.8b-v2"

tokenizer = GPTNeoXTokenizerFast.from_pretrained(HF_MODEL_ID)

# File patterns fetched from the hub repository (configuration, tokenizer and
# checkpoint files); files matching none of these patterns are skipped.
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model"]

# Local directory name: last component of the model id with dots replaced by
# dashes.
model_name = HF_MODEL_ID.rsplit("/", 1)[-1].replace(".", "-")
model_tar_dir = Path(f"/home/ec2-user/SageMaker/models/{model_name}")

# An already existing directory is taken to mean the model was downloaded
# before, so the download only runs when the directory is missing.
# NOTE: a directory left behind by an interrupted download is therefore not
# re-fetched.
if not model_tar_dir.is_dir():
    model_tar_dir.mkdir(parents=True, exist_ok=True)
    # Download the model snapshot from Hugging Face into model_tar_dir
    snapshot_download(
        HF_MODEL_ID,
        local_dir=str(model_tar_dir),
        local_dir_use_symlinks=False,
        allow_patterns=allow_patterns,
        cache_dir="/home/ec2-user/SageMaker/",
    )

<br>

## 2. Tokenize
---

In [None]:
def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` and attach causal-LM labels.

    The prompt is encoded with the module-level ``tokenizer``, truncated to at
    most ``cutoff_len`` tokens and left unpadded. When ``add_eos_token`` is
    true, an EOS token (with attention-mask value 1) is appended, provided the
    sequence is shorter than ``cutoff_len`` and does not already end in EOS.

    Args:
        prompt: Text to tokenize.
        add_eos_token: Whether to append the EOS token when there is room.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry holding a
        copy of ``"input_ids"``.
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]

    missing_eos = token_ids[-1] != tokenizer.eos_token_id
    has_room = len(token_ids) < cutoff_len
    if missing_eos and has_room and add_eos_token:
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    # Labels start out identical to the inputs; callers may mask parts later.
    encoded["labels"] = token_ids.copy()
    return encoded

def generate_and_tokenize_prompt(data_point):
    """Build the training prompt for one sample and tokenize it.

    Args:
        data_point: Mapping with ``"instruction"`` and ``"output"`` keys and an
            optional ``"input"`` key.

    Returns:
        The output of ``tokenize`` for the full prompt (instruction, optional
        input and output). When the module-level ``train_on_inputs`` flag is
        false, the label positions covering the prompt without the output are
        replaced by -100 so that only the response tokens keep real labels.
    """
    full_prompt = prompter.generate_prompt(
        data_point["instruction"],
        data_point.get("input"),
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt)
    if train_on_inputs:
        return tokenized_full_prompt

    # Tokenize the prompt without the response and without a trailing EOS, so
    # its length is exactly the number of leading tokens to mask. The previous
    # code read an undefined ``add_eos_token`` name here (NameError) and then
    # subtracted one unconditionally, which was off by one whenever the prompt
    # was truncated and no EOS had been appended.
    user_prompt = prompter.generate_prompt(
        data_point["instruction"], data_point.get("input")
    )
    tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
    user_prompt_len = len(tokenized_user_prompt["input_ids"])

    # -100 marks positions excluded from the loss (the default ignore index of
    # PyTorch's cross-entropy loss).
    labels = tokenized_full_prompt["labels"]
    tokenized_full_prompt["labels"] = [-100] * user_prompt_len + labels[user_prompt_len:]
    return tokenized_full_prompt


In [None]:
# Shuffle the training split. The commented-out .select(range(100)) suffix can
# be re-enabled to work on a 100-sample subset instead.
dataset = data['train'].shuffle()#.select(range(100))
# Build and tokenize the prompt for every sample in the shuffled split.
lm_dataset = dataset.map(generate_and_tokenize_prompt)

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")

<br>

## 3. Save dataset to S3
---

In [None]:
import sagemaker
import boto3
sess = sagemaker.Session()
region = boto3.Session().region_name

# Bucket used for uploading data, models and logs. Leave it as None to use the
# session's default bucket, which SageMaker creates automatically if it does
# not exist yet.
bucket = None
if bucket is None:
    bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Recreate the session bound to the selected bucket.
sess = sagemaker.Session(default_bucket=bucket)

print(f"SageMaker role arn: {role}")
print(f"SageMaker bucket: {sess.default_bucket()}")
print(f"SageMaker session region: {sess.boto_region_name}")

In [None]:
# Key prefixes used to build the S3 locations below.
bucket_prefix = 'ko-llms/peft'
dataset_prefix = 'alpaca-train'
# S3 URI for the tokenized dataset, namespaced by model name.
s3_data_path = f"s3://{bucket}/{bucket_prefix}/{model_name}/dataset/{dataset_prefix}"
# S3 URI for the pretrained model files; only defined and printed in this cell.
s3_pretrained_model_path = f"s3://{bucket}/{bucket_prefix}/huggingface-models/{model_name}/"
print(f"S3 data path: \n {s3_data_path}")
print(f"S3 pretrained model path: \n {s3_pretrained_model_path}")

In [None]:
# Never request more debug samples than the dataset holds: Dataset.select
# raises IndexError for out-of-range indices, which would break this cell for
# small datasets (e.g. a short local JSON file).
num_debug_samples = min(50, len(lm_dataset))

# The full tokenized dataset is written to S3; the first num_debug_samples
# rows are additionally saved to a local directory named after dataset_prefix.
lm_dataset.save_to_disk(s3_data_path)
lm_dataset.select(range(num_debug_samples)).save_to_disk(dataset_prefix)
print(f"Number of samples for debugging: {num_debug_samples}")

In [None]:
%store bucket_prefix dataset_prefix s3_data_path