## Import

In [None]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub from inside the notebook.
# Training below is configured with push_to_hub=True, so credentials are needed
# before the Trainer is created.
notebook_login()

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
!pip install sentence_transformers

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
!pip install -U torch tokenizers safetensors

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import  AdamW
from tqdm import tqdm

In [None]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PeftModel,
    PeftConfig,
)

# Adapter family used for fine-tuning.
peft_type = PeftType.LORA

# LoRA hyper-parameters for causal-LM fine-tuning: rank-8 adapters with
# alpha=32 and 5% dropout, attached to the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). Bias terms are not trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "up_proj",
        "o_proj",
        "k_proj",
        "down_proj",
        "gate_proj",
        "v_proj",
    ],
)


## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
# import wandb

from transformers import PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm

#os.environ["TOKENIZERS_PARALLELISM"] = "true"
#torch.backends.cuda.matmul.allow_tf32=True
#torch.set_float32_matmul_precision('medium')
#torch.backends.cudnn.benchmark = True

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Base checkpoint that will be fine-tuned.
model_id = "LDCC/LDCC-SOLAR-10.7B"

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer: force '</s>' as EOS, reuse it as the padding token, pad on the right.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    eos_token='</s>',
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized model with every module mapped to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

# Prepare the quantized model for training: gradient checkpointing first,
# then PEFT's k-bit training preparation.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import prepare_model_for_kbit_training

# Wrap the quantized base model with the LoRA adapters described by `peft_config`.
#
# The model-loading cell has already called gradient_checkpointing_enable() and
# prepare_model_for_kbit_training() on `model`, so they are not repeated here.
# The model was also loaded with device_map={"": 0}, so no extra `.to(device)`
# is applied to the quantized weights.
model = get_peft_model(model, peft_config)

# Report how many parameters the adapters make trainable, then show the module tree.
model.print_trainable_parameters()
model


In [None]:
# conversation = [ {'role': 'user', 'content': 'Hello?'} ]

# prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# outputs = model.generate(**inputs, use_cache=True, max_length=4096)
# output_text = tokenizer.decode(outputs[0])
# print(output_text)

In [None]:
import pandas as pd
from tqdm import tqdm

# Load the raw training table.
data = pd.read_csv('./data/v2train.csv')

# Build one training string per (question, answer) pair of every row:
# 2 question columns x 5 answer columns = 10 strings per row.
question_cols = ['질문_1', '질문_2']
answer_cols = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

input_texts = []
# `total` lets tqdm render a real progress bar; iterrows() is a generator with no length.
for _, row in tqdm(data.iterrows(), total=len(data)):
    for q_col in question_cols:
        for a_col in answer_cols:
            # Prompt template: question marker, question, answer marker, answer,
            # followed by the EOS token so each sample ends with an explicit </s>.
            input_text = '###질문:' + row[q_col] + '\n\n###답변:' + row[a_col] + tokenizer.eos_token
            # input_text = '<s> ### User:' + row[q_col] + '\n\n### Assistant:\n'+ row[a_col]+tokenizer.eos_token
            input_texts.append(input_text)

# Single-column frame holding the formatted training texts.
df = pd.DataFrame({'text': input_texts})
print(df.head())


In [None]:
# Persist the formatted training texts. The dataset-loading cell reads
# './data/v2_train(모든수정완료).csv' and tokenize() expects a 'text' column, so the
# single-column `df` (not the raw `data` table) must be written, with the .csv suffix.
df.to_csv('./data/v2_train(모든수정완료).csv', index=False)

In [None]:
# Character length of the longest formatted training text.
text_lengths = df['text'].str.len()
max_length = text_lengths.max()
print("최대 길이:", max_length)

In [None]:
# Longest string (in characters) found in each row of `df`.
max_row_lengths = df.apply(lambda row: max(row.str.len()), axis=1)

# Report the per-row maxima and the overall maximum.
print("각 행의 최대 문자열 길이:")
print(max_row_lengths)
print("최대 문자열 길이:", max_row_lengths.max())

# Tally how many rows share each maximum length.
length_freq = {}
for length in max_row_lengths:
    length_freq[length] = length_freq.get(length, 0) + 1

# print(length_freq)
# Order the histogram by frequency, most common length first
# (ties keep their first-seen order because the sort is stable).
sorted_length_freq = dict(
    sorted(length_freq.items(), key=lambda item: item[1], reverse=True)
)

print(sorted_length_freq)

In [None]:
from datasets import load_dataset
# Load the formatted training CSV as a Hugging Face dataset.
# tokenize() below reads element['text'], so this file must contain a 'text' column.
dataset = load_dataset("csv", data_files="./data/v2_train(모든수정완료).csv")

In [None]:
# context_length=128
def tokenize(element):
    """Tokenize a batch of training texts for `dataset.map(..., batched=True)`.

    Args:
        element: mapping with a 'text' entry holding the batch of strings.

    Returns:
        dict with a single 'input_ids' key: one token-id sequence per input
        text, each truncated to at most 1024 tokens. Any other tokenizer
        outputs are discarded.
    """
    encoded = tokenizer(
        element['text'],
        max_length=1024,
        truncation=True,
        # return_overflowing_tokens=True,
        # return_length=True,
    )
    return {"input_ids": [ids for ids in encoded['input_ids']]}


# Tokenize the whole dataset in batches. The original columns are removed so that
# only the 'input_ids' returned by tokenize() remain in the mapped dataset.
tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names)
# Display the resulting dataset summary.
tokenized_datasets


## Model Fine-tuning

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse EOS as the padding token (the tokenizer setup cell does the same).
# NOTE(review): with pad_token == eos_token, the causal-LM collator is expected to
# set labels to -100 at every pad-token position, which would include the real
# trailing </s> of each sample - verify, since the model may then never be
# trained to emit EOS.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal language modeling (labels derived from input_ids).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


In [None]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters, grouped by concern.
args = TrainingArguments(
    output_dir='v2-solar-hansol-ft',
    # --- optimisation ---
    optim="paged_adamw_8bit",
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    # warmup_steps=10,
    weight_decay=0.001,
    num_train_epochs=3,
    # --- batching / precision ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    fp16=True,
    # --- logging / checkpointing ---
    logging_steps=10,
    # evaluation_strategy="steps",
    save_steps=10,
    push_to_hub=True,
)

# Trainer over the tokenized training split; padding and labels come from the collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Turn the KV cache off for training (it is switched back on before inference).
model.config.use_cache = False
# model.config.pretraining_tp = 1

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
# Upload the training result to the Hugging Face Hub (requires the login done at the top).
trainer.push_to_hub()

# Test

## Import

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only).
drive.mount('/content/drive')

In [None]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably needed to download the fine-tuned adapter below if
# that repository is private - confirm.
notebook_login()

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
!pip install sentence_transformers

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
!pip install -U torch tokenizers safetensors

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import  AdamW
from tqdm import tqdm

In [None]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PeftModel,
    PeftConfig,
)

# Adapter family used for fine-tuning.
peft_type = PeftType.LORA

# Same LoRA settings as in training: rank 8, alpha 32, 5% dropout, no bias
# training, adapters on the attention (q/k/v/o) and MLP (gate/up/down) projections.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "up_proj",
        "o_proj",
        "k_proj",
        "down_proj",
        "gate_proj",
        "v_proj",
    ],
)


## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
# import wandb

from transformers import PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm

#os.environ["TOKENIZERS_PARALLELISM"] = "true"
#torch.backends.cuda.matmul.allow_tf32=True
#torch.set_float32_matmul_precision('medium')
#torch.backends.cudnn.benchmark = True

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Base checkpoint the fine-tuned adapter is applied to.
model_id = "LDCC/LDCC-SOLAR-10.7B"

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer: force '</s>' as EOS, reuse it as the padding token, pad on the right.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    eos_token='</s>',
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model with every module mapped to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [None]:
# Hub repository holding the fine-tuned LoRA adapter.
peft_model_id = "emaeon/v2-solar-hansol-ft"
# Attach the adapter weights to the quantized base model loaded above.
model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")

In [None]:
# Display the module tree of the adapter-wrapped model.
model

## Model Inference

In [None]:
# Load the test set; the inference loop below reads its '질문' column.
testdata = pd.read_csv('./data/test.csv')
testdata.head()

In [None]:
!nvidia-smi

In [None]:
# Show the id of the EOS token (also used as the padding token in this notebook).
tokenizer.eos_token_id

In [None]:
preds = []
# NOTE(review): none of the names imported on the next line are used in this cell,
# and `AdamW` may not be importable from recent transformers releases - confirm
# and drop the import if so.
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW

# Switch to evaluation mode and turn the KV cache back on for generation
# (it was disabled for training).
model.eval()
# tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id,trust_remote_code=True,eos_token='</s>')
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"
model.config.use_cache = True

# Generate one answer for every question in the '질문' column.
for x in tqdm(testdata['질문']):
    # Same prompt template as used for training, left open after the answer marker.
    input_text = f"###질문:{x}\n\n###답변:"
    encoded = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    # Pass the mask explicitly: pad and EOS share one id in this notebook, so
    # generate() cannot reliably infer the mask from input_ids alone.
    attention_mask = encoded['attention_mask'].to(device)
    prompt_length = input_ids.shape[-1]

    # NOTE(review): max_length counts the prompt tokens too, so a long question
    # leaves little or no room for the answer - consider max_new_tokens.
    # NOTE(review): top_k=1 keeps only the most likely token at each step, so
    # decoding is effectively greedy even though do_sample=True - confirm intent.
    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        max_length=400,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # Store the generated answer text.
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        print(full_text)
        # Keep only the newly generated tokens. Slicing off the prompt by token
        # count avoids searching the decoded string for the answer marker, which
        # mis-slices silently if the marker is not found verbatim.
        answer_tokens = generated_sequence[prompt_length:]
        answer_only = tokenizer.decode(answer_tokens, skip_special_tokens=False).strip()
        # Replace the end-of-sequence marker with a space, as before.
        answer_only = answer_only.replace('</s>', ' ')
        preds.append(answer_only)

## Submission

In [None]:
# Keep a human-readable copy of the generated answers as a one-column spreadsheet.
result = pd.DataFrame(data={'text': preds})
result.to_excel(
    './data/gen_result.xlsx',
    index=False,
)

In [None]:
# Extract 512-dimensional embedding vectors from the answers generated for every test question.
# The evaluation uses 'distiluse-base-multilingual-cased-v1' for embedding extraction, so make sure this exact model is used.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the embedding model (distiluse-base-multilingual-cased-v1).
# It is bound to its own name so the fine-tuned LLM in `model` is not overwritten
# and the inference cell can still be re-run afterwards.
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Encode every generated answer into an embedding vector.
pred_embeddings = embedding_model.encode(preds)
pred_embeddings.shape

In [None]:


submit = pd.read_csv('./data/sample_submission.csv')
# Fill the submission template (sample_submission.csv) with the embedding vectors:
# every column after the first is overwritten, the first column is left untouched.
submit.iloc[:,1:] = pred_embeddings
submit.head()

In [None]:
# Write the CSV file for the leaderboard submission (without the index column).
submit.to_csv('./data/v2_solar_submit.csv', index=False)