In [None]:
!pip install bitsandbytes
!pip install datasets peft
!pip install transformers
!pip install transformers huggingface_hub
!pip install peft

In [None]:
import inspect
import os

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV read by a later
# cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Authenticate with the Hugging Face Hub; the training cell later calls push_to_hub
# on the model and the tokenizer.
from huggingface_hub import notebook_login
notebook_login()

# Train

In [None]:
# Load the chat CSV from the mounted Drive.
filepath='/content/drive/MyDrive/AIKU_RANCHAT_HER/chat-dataset/Chat-Satoru-all.csv'
df = pd.read_csv(filepath)

# Fail fast with a readable message: the preprocessing step indexes these two columns.
required_columns = ["Input", "Response"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{filepath} is missing required column(s): {missing_columns}")

# Empty CSV cells are read as NaN and would be formatted into prompts as the text "nan",
# so drop incomplete rows. The index is reset so the filtered index is not carried
# into the dataset as an extra column.
df = df.dropna(subset=required_columns).reset_index(drop=True)

# Convert the DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(df)
dataset

In [None]:
# Load the model and tokenizer
model_name = "upstage/SOLAR-10.7B-Instruct-v1.0"  # Replace with the actual model path
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batches are padded later on, which requires a padding token; fall back to EOS
# when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

# Standard PEFT preprocessing for training on top of a quantized base model
# (freezes the base weights and applies the stability tweaks described in the PEFT
# quantization guide). Must run before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

# Configure QLoRA: rank-8 adapters on the attention query/value projections
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)


In [None]:
from transformers import TrainerCallback
from tqdm import tqdm
import time

class ProgressCallback(TrainerCallback):
    """Trainer callback that shows a tqdm progress bar with a running ETA.

    The bar spans ``state.max_steps``. If ``state.global_step`` is already
    non-zero when training begins (e.g. a run resumed from a checkpoint), the
    bar starts from that step and throughput is measured only over the steps
    executed in this session, so the ETA is not skewed by earlier progress.
    """

    def on_train_begin(self, args, state, control, **kwargs):
        """Start the timer and open the progress bar at the current global step."""
        self.start_time = time.time()
        self.start_step = state.global_step
        self.progress_bar = tqdm(
            total=state.max_steps,
            initial=state.global_step,
            desc="Training Progress",
        )

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the bar by one step and refresh the ETA shown next to it."""
        self.progress_bar.update(1)
        elapsed_time = time.time() - self.start_time
        steps_done = state.global_step - self.start_step
        # No measurable rate yet: skip the ETA rather than divide by zero.
        if elapsed_time <= 0 or steps_done <= 0:
            return
        steps_per_sec = steps_done / elapsed_time
        remaining_steps = max(state.max_steps - state.global_step, 0)
        eta = remaining_steps / steps_per_sec
        self.progress_bar.set_postfix_str(f"ETA: {int(eta // 60)} min {int(eta % 60)} sec")

    def on_train_end(self, args, state, control, **kwargs):
        """Close the progress bar if one was opened."""
        progress_bar = getattr(self, "progress_bar", None)
        if progress_bar is not None:
            progress_bar.close()

In [None]:
# Prompt template used to build the fine-tuning text for one dataset row
def format_prompt(input_text, response_text):
    """Return the training prompt for one (input, response) pair.

    The input is placed under the ``### System:`` header, the response under
    the ``### User:`` header, and the text ends with an empty
    ``### Assistant:`` header.

    NOTE(review): this layout differs from the usual User/Assistant ordering,
    but the inference cell prompts with the same headers — keep both in sync.
    """
    system_section = f"### System:\n{input_text}\n\n"
    user_section = f"### User:\n{response_text}\n"
    assistant_header = "### Assistant:\n"
    return system_section + user_section + assistant_header

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of rows into causal-LM training features.

    Args:
        examples: Batch mapping with parallel ``'Input'`` and ``'Response'``
            lists, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (truncated to 512 tokens per row) with an added
        ``'labels'`` entry that mirrors ``'input_ids'``.

    Padding is deliberately not applied here. Padding first and then copying
    ``input_ids`` into ``labels`` would put pad token ids into the labels, so
    the loss would also be computed on padding. Rows are left at their natural
    length instead; the ``DataCollatorForSeq2Seq`` used by the trainer pads
    each batch and fills the label padding with -100, which the loss ignores.
    """
    formatted_prompts = [format_prompt(inp, resp) for inp, resp in zip(examples['Input'], examples['Response'])]
    tokenized_inputs = tokenizer(formatted_prompts, truncation=True, max_length=512)
    # Per-row copies so labels never alias the input id lists.
    tokenized_inputs['labels'] = [list(ids) for ids in tokenized_inputs['input_ids']]
    return tokenized_inputs

# Tokenize every row and drop the raw text columns so only model features remain
encoded_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# Define the data collator for padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Pick the mixed-precision mode from the hardware. The model is quantized with a
# bfloat16 compute dtype, so train in bf16 where the GPU supports it natively
# (compute capability >= 8.0) and fall back to fp16 elsewhere.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever keyword the installed version accepts.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    bf16=use_bf16,  # Use mixed precision training
    fp16=not use_bf16,
    logging_dir='./logs',  # Directory for storing logs
    save_total_limit=3,  # Limit the total amount of checkpoints. Deletes the older checkpoints.
    **{eval_strategy_kwarg: "epoch"},
)

# Define the trainer
# NOTE(review): eval_dataset is the training set itself, so the reported eval loss
# says nothing about generalization — replace with a held-out split when available.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,  # Using the same dataset for eval as a placeholder
    data_collator=data_collator,
    callbacks=[ProgressCallback()]  # Add the custom callback here
)

# Train the model
trainer.train()


# Upload the fine-tuned model and the tokenizer to the Hugging Face Hub.
# A dedicated variable is used so `model_name` keeps referring to the base checkpoint.
hub_repo_name = "gojo-finetuned-solar"  # name of the Hub repository to upload to
model.push_to_hub(hub_repo_name)
tokenizer.push_to_hub(hub_repo_name)


print("Model fine-tuning complete.")

# Inference


In [None]:
import os
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForSeq2Seq
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
# Checkpoint identifiers: the original base model and the fine-tuned LoRA adapter
model_name = "upstage/SOLAR-10.7B-Instruct-v1.0"
adapter_model_path = "std50218/gojo-finetuned-solar"

# Quantization settings (4-bit here; load_in_8bit=True is the alternative)
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # quantization type
    load_in_4bit=True,
)

# Tokenizer of the base model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base model with the quantization settings and automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Attach the adapter weights; from here on `model` is the PEFT-wrapped model
model = peft_model = PeftModel.from_pretrained(model, adapter_model_path)

In [None]:
# Inference-time variant of format_prompt
def format_prompt(input_text):
    """Return the generation prompt for ``input_text``.

    The input is placed under the ``### System:`` header and the prompt ends
    right after the ``### User:`` header, where generation continues.
    """
    system_section = f"### System:\n{input_text}\n\n"
    return system_section + "### User:\n"

# Example input text
input_text = "보고싶어"
formatted_input = format_prompt(input_text)

# Tokenize the prompt and move the tensors to the device the model runs on
inputs = tokenizer(formatted_input, return_tensors="pt").to(model.device)

# Generate the reply. `max_new_tokens` bounds only the generated part; `max_length`
# would also count the prompt tokens and leave little or no room for long prompts.
outputs = model.generate(**inputs, max_new_tokens=100)

# Decode the output
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Keep the text after the last "### User:" header and drop everything from
# "### Assistant:" onwards
cleaned_text = predicted_text.split("### User:")[-1].split("### Assistant:")[0].strip()

print(cleaned_text)
