In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb, platform, warnings
import gradio 
from datasets import load_dataset
from trl import SFTTrainer

In [None]:
# torch is already imported in the first cell; the repeated import keeps this cell runnable on its own.
import torch
# Release cached GPU memory that PyTorch's CUDA caching allocator holds but is not currently using.
torch.cuda.empty_cache()

In [None]:
 # Report CUDA memory figures for device cuda:0, converted from bytes to GiB (1024**3).
 # NOTE(review): the printed label reads "memory used / total memory", but the second
 # figure comes from torch.cuda.memory_reserved, not from the device capacity — confirm intent.
 allocated_gb = torch.cuda.memory_allocated('cuda:0') / 1024**3
 reserved_gb = torch.cuda.memory_reserved('cuda:0') / 1024**3
 print(f'메모리 사용량/총 메모리: {allocated_gb}GB / {reserved_gb}GB')

In [None]:
def print_system_specs():
    """Print a summary of the CUDA devices and of the host environment.

    Everything is written to stdout and nothing is returned. The per-device
    details (name, compute capability, total memory in bytes) are printed only
    when ``torch.cuda.is_available()`` reports True; the CPU / OS / Python
    section is always printed.
    """
    # Check if CUDA is available
    is_cuda_available = torch.cuda.is_available()
    print("CUDA Available:", is_cuda_available)
    # Get the number of available CUDA devices
    num_cuda_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_cuda_devices)
    if is_cuda_available:
        for i in range(num_cuda_devices):
            # Query each property by device index
            print(f"--- CUDA Device {i} ---")
            print("Name:", torch.cuda.get_device_name(i))
            print("Compute Capability:", torch.cuda.get_device_capability(i))
            print("Total Memory:", torch.cuda.get_device_properties(i).total_memory, "bytes")
    # Get CPU information
    print("--- CPU Information ---")
    print("Processor:", platform.processor())
    print("System:", platform.system(), platform.release())
    print("Python Version:", platform.python_version())


print_system_specs()

In [None]:
# Alternative base checkpoint, left disabled for reference:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("lcw99/llama2-ko-chang-13b-instruct-chat")
# model = AutoModelForCausalLM.from_pretrained("lcw99/llama2-ko-chang-13b-instruct-chat")

# Hugging Face Hub id of the pre-trained base model to fine-tune
model_name = "Minirecord/Mini_synatra_7b_02" 

# Hugging Face Hub id of the training dataset (passed to load_dataset)
dataset_name = "dohun0714/chat_bot"

# Hugging Face repository the fine-tuned model is pushed to
# (create the repository on huggingface.co first, then paste its id here)
new_model = "dohun0714/care"

In [None]:
# Authenticate with the Hugging Face Hub; the notebook later pushes the model to new_model.
from huggingface_hub import interpreter_login

# Asks for the access token interactively, so no token is stored in the notebook.
interpreter_login()

In [None]:
# Load the "train" split of the dataset named in dataset_name (you can process it here)
dataset = load_dataset(dataset_name, split="train")
# dataset["text"][0]
# Print the dataset summary to check what was loaded
print(dataset)

In [None]:
# Load the base model named in model_name, and its tokenizer
# 4-bit quantization config, currently disabled:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit= True,
#     bnb_4bit_quant_type= "nf4",
#     bnb_4bit_compute_dtype= torch.float16,
#     bnb_4bit_use_double_quant= False,
# )
model = AutoModelForCausalLM.from_pretrained(
    model_name,
#     quantization_config=bnb_config,
#     device_map={"": 0}
)
# Move the model to the first GPU explicitly (device_map above is disabled)
model.to('cuda:0')
# NOTE(review): prepare_model_for_kbit_training is applied although quantization_config
# is commented out above — confirm it is still wanted for an unquantized model.
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Tokenizer tweaks, currently disabled:
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_eos_token = True
# tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# for i, data in enumerate(dataset):
#     print(data)
#     print("\n")
#     if i > 10:  # print only the first few elements (stops after index 11)
#         break

In [None]:
# Example: tokenize a sample Korean sentence to inspect the tokenizer output
example_sentence = "이것은 테스트 문장입니다."
tokenized_output = tokenizer.tokenize(example_sentence)
print(tokenized_output)

# Compare the tokenizer vocabulary size with the size of the model's embedding layer
print(len(tokenizer.vocab))  # tokenizer vocabulary size
print(model.model.embed_tokens.weight.size(0)) # size of the model's embedding layer (dimension 0)

In [None]:
# Monitoring: log in to Weights & Biases and start a run for this training job.
# The API key is read from the WANDB_API_KEY environment variable instead of
# being hardcoded in the notebook; when the variable is unset, key=None leaves
# the credential lookup / interactive prompt to wandb itself.
# Any key that was previously hardcoded in this cell should be treated as leaked and revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
run = wandb.init(project='synatra-kor-chatbot', job_type="training", anonymous="allow")

In [None]:
# LoRA adapter settings; this object is handed to SFTTrainer as peft_config.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    # Names of the modules that receive LoRA adapters
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
    ],
)

In [None]:
# Hyper-parameters for the fine-tuning run; passed to SFTTrainer as args.
training_arguments = TrainingArguments(
    # Output, checkpointing and logging
    output_dir="./results",
    save_steps=719,
    logging_steps=30,
    report_to="wandb",
    # Run length and batching
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    # Mixed precision is switched off for both half-precision formats
    fp16=False,
    bf16=False,
)

In [None]:
# Print the module tree of the loaded model (presumably to check the layer names used in target_modules).
print(model)

In [None]:
def formatting_prompts_func(example):
    """Render a batch of question/answer pairs as training texts.

    Args:
        example: Mapping whose 'Q' and 'A' entries are index-aligned sequences
            of questions and answers.

    Returns:
        A list with one formatted string per entry of ``example['Q']``, in the
        same order, each combining the question with the answer at the same index.
    """
    return [
        f"### 질문: {question}\n### 답: {example['A'][idx]}"
        for idx, question in enumerate(example['Q'])
    ]
# Build the supervised fine-tuning trainer from the model, tokenizer, LoRA config
# and training arguments; formatting_prompts_func turns each batch into text.
# NOTE(review): max_seq_length is 50 — confirm this is long enough for the formatted Q/A texts.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    # dataset_text_field="Q",  # disabled; formatting_func is used instead
    max_seq_length=50,
    packing=False,
)

In [None]:
# Train model: runs the fine-tuning configured in training_arguments
trainer.train()

In [None]:
# Save the fine-tuned model through the trainer to a local directory
trainer.save_model('/data/dhk/chat_c')
# Close the Weights & Biases run
wandb.finish()
# Re-enable use_cache, which was set to False before training
model.config.use_cache = True
# Switch the model to evaluation mode before generating
model.eval()

In [None]:
# def stream(user_prompt):
#     runtimeFlag = "cuda:0"
#     system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
#     B_INST, E_INST = "### Instruction:\n", "### Response:\n"

#     prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n\n{E_INST}"

#     inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

#     streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

#     # Despite returning the usual output, the streamer will also print the generated text to stdout.
#     _ = model.generate(**inputs, streamer=streamer, max_new_tokens=1500)

In [None]:
def stream(user_prompt):
    """Generate a reply to ``user_prompt`` with the fine-tuned model and print it.

    The prompt follows the same question/answer template that
    ``formatting_prompts_func`` uses to build the training texts, so the model
    is queried in the format it was fine-tuned on.

    Args:
        user_prompt: The user's message; surrounding whitespace is stripped.

    Returns:
        None. The decoded output sequence is printed to stdout.
    """
    runtimeFlag = "cuda:0"

    # Same template as the training texts, left open after the answer marker
    # so that the model continues with the answer itself.
    prompt = f"### 질문: {user_prompt.strip()}\n### 답:"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=1000, temperature=1.0, length_penalty=0.2, do_sample=True, num_beams=10)

    # Decode and print the generated response
    print(tokenizer.decode(outputs[0]))


# Sample prompt: "I'm happy"
stream("나 기뻐")

In [None]:
# Sample prompt: "I got scolded today"
stream("나 오늘 혼났어")

In [None]:
# Sample prompt: "Hi, who are you?"
stream("안녕 너는 누구야?")

In [None]:
# Sample prompt: "I feel sad today"
stream("오늘은 슬프네")

In [None]:
# Save the model and the tokenizer to a local directory
# NOTE(review): `model` is the object that was handed to SFTTrainer together with
# peft_config — confirm whether this writes merged weights or LoRA-wrapped modules.
model.save_pretrained("/data/dhk/ccc/",safe_serialization=True)
# NOTE(review): confirm that the tokenizer's save_pretrained actually uses safe_serialization.
tokenizer.save_pretrained("/data/dhk/ccc/",safe_serialization=True)

In [None]:
## Upload to Huggingface Hub
# Push the model and the tokenizer to the repository named in new_model.
# NOTE(review): safe_serialization is False here, while the local save_pretrained
# calls in this notebook pass True — confirm which format is intended.
model.push_to_hub(
    new_model, 
    safe_serialization=False
)
tokenizer.push_to_hub(
    new_model,
    safe_serialization=False
)

In [None]:
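# Optional: disable tokenizers parallelism. `!export` runs in a subshell and would not
# reach the kernel process, so use the IPython magic instead:
# %env TOKENIZERS_PARALLELISM=false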