## Import

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm

In [2]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
class config():
    def __init__(self):
        
        self.source_len=128
        self.epochs = 10
        self.learning_rate=2e-5
        self.batch_size=16
        self.shuffle = True
        self.seed=800
        self.num_labels=10
        self.train_data_path= r'C:\Users\user\OneDrive - KookminUNIV\바탕 화면\DACON\train.csv'
        self.test_data_path = r'C:\Users\user\OneDrive - KookminUNIV\바탕 화면\DACON\test.csv'
        self.model_path = 'skt/kogpt2-base-v2'
        # self.modelsave_path = r'C:\Users\user\OneDrive - KookminUNIV\바탕 화면\추가사전학습\Fine_tuning'
        # self.loss_path = r'C:\Users\user\OneDrive - KookminUNIV\바탕 화면\추가사전학습\Fine_tuning'
cfg = config()

## Data Preprocessing

In [6]:
# 데이터 로드
data = pd.read_csv(cfg.train_data_path)

# 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', eos_token='</s>')

# 데이터 포맷팅 및 토크나이징
formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # 질문과 답변 쌍을 </s> token으로 연결
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            print(input_text)
#             input_ids = tokenizer.encode(input_text, return_tensors='pt')
#             formatted_data.append(input_ids)
# print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:00, 16100.59it/s]

면진장치가 뭐야?</s>면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 격리장치입니다.
면진장치가 뭐야?</s>면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을 줄여주는 장치입니다. 주로 지진이나 기타 지반의 진동으로 인한 피해를 방지하기 위해 사용됩니다.
면진장치가 뭐야?</s>면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여주는 장치를 말합니다. 이를 통해 건물의 안전성과 안정성을 향상시키고, 지진 등의 외부 충격으로부터 보호하는 역할을 합니다. 지진으로 인한 건물의 피해를 최소화하기 위해 주로 사용됩니다.
면진장치가 뭐야?</s>면진장치는 건물의 지반으로부터 오는 진동 에너지를 흡수하여 건물에 전달되는 진동을 최소화해 주는 진동 격리장치입니다. 이를 통해 건물 내부의 진동을 줄이고 안정성을 유지하는 데 도움을 줍니다.
면진장치가 뭐야?</s>면진장치는 건물에 오는 지반 진동의 영향을 최대한으로 흡수하여 건물에 전달되는 진동을 줄여주는 장치입니다. 지반으로부터 오는 진동 에너지의 영향을 완화시키기 위해 사용됩니다.
면진장치에 사용되는 주요 기술은 무엇인가요?</s>면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 격리장치입니다.
면진장치에 사용되는 주요 기술은 무엇인가요?</s>면진장치란 건물의 지반에서 발생하는 진동 에너지를 흡수하여 건물을 보호하고, 진동을 줄여주는 장치입니다. 주로 지진이나 기타 지반의 진동으로 인한 피해를 방지하기 위해 사용됩니다.
면진장치에 사용되는 주요 기술은 무엇인가요?</s>면진장치란 지반으로부터 발생하는 진동 에너지를 흡수하여 건물에 전달되는 진동을 줄여주는 장치를 말합니다. 이를 통해 건물의 안전성과 안정성을 향상시키고, 지진 등의 외부 충격으로부터 보호하는 역할을 합니다. 지진으로 인한 건물의 피해를 최소화하기 위해 주로 사용됩니다.
면진장치에 사용되는 주요 기술은 무엇인가요?</s>면진장




In [10]:
formatted_data

[tensor([[ 9411,  8265, 37765, 46651,  7991,   406,     1,  9411,  8265, 20725,
           7374,  9027,  7599,  9023, 14472, 15898, 14820, 33220, 36928, 10764,
           9166, 11818, 28037, 10090, 15898, 34062, 20725, 21154]]),
 tensor([[ 9411,  8265, 37765, 46651,  7991,   406,     1,  9411,  8265, 20725,
           7374, 16759,  9027,  7599,  9023, 13426, 15898, 14820, 33220, 17836,
          10146,  9307,  9166, 11818, 28037, 10090, 16668, 21154,  9584, 15351,
           9185, 10493,  9027, 16652,  9166, 12999, 11931, 14581, 22489,  9207,
           9209,  7255, 12521]]),
 tensor([[ 9411,  8265, 37765, 46651,  7991,   406,     1,  9411,  8265, 20725,
           7374,  9027,  7599, 10306, 13426, 15898, 14820, 33220, 36928, 43591,
           9166, 11818, 28037, 10090, 21547,  9135, 37194,  9558,  9430, 16759,
          11350, 10342, 44444, 15072, 15877, 15351,  9284, 13387, 21827, 10306,
          23227, 10254, 49421, 15351,  9021, 11931, 16759, 14581, 17031, 24092,
           9207, 

## Model Fine-tuning

In [5]:
# 모델 로드
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.to(device) # 모델을 GPU단으로 이동

# 모델 학습 하이퍼파라미터(Hyperparameter) 세팅
# 실제 필요에 따라 조정하세요.
CFG = {
    'LR' : 2e-5, # Learning Rate
    'EPOCHS' : 10, # 학습 Epoch
}

# 모델 학습 설정
optimizer = AdamW(model.parameters(), lr=CFG['LR'])
model.train()

# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        # 데이터를 GPU단으로 이동
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # 진행률 표시줄에 평균 손실 업데이트
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # 에폭의 평균 손실을 출력
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# 모델 저장
model.save_pretrained("./hansoldeco-kogpt2")
tokenizer.save_pretrained("./hansoldeco-kogpt2")

Downloading pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Epoch 1 - Avg Loss: 2.8460: 100%|██████████| 6440/6440 [07:05<00:00, 15.13it/s]


Epoch 1/10, Average Loss: 2.8459768351098025


Epoch 2 - Avg Loss: 1.7505: 100%|██████████| 6440/6440 [06:57<00:00, 15.41it/s]


Epoch 2/10, Average Loss: 1.7505300436816786


Epoch 3 - Avg Loss: 1.1280: 100%|██████████| 6440/6440 [06:55<00:00, 15.51it/s]


Epoch 3/10, Average Loss: 1.1280281403963888


Epoch 4 - Avg Loss: 0.7618: 100%|██████████| 6440/6440 [06:52<00:00, 15.61it/s]


Epoch 4/10, Average Loss: 0.7618466119854624


Epoch 5 - Avg Loss: 0.5380: 100%|██████████| 6440/6440 [06:53<00:00, 15.59it/s]


Epoch 5/10, Average Loss: 0.5379548681448706


Epoch 6 - Avg Loss: 0.3999: 100%|██████████| 6440/6440 [06:52<00:00, 15.60it/s]


Epoch 6/10, Average Loss: 0.3998622996229331


Epoch 7 - Avg Loss: 0.3163: 100%|██████████| 6440/6440 [06:52<00:00, 15.61it/s]


Epoch 7/10, Average Loss: 0.31627605172186535


Epoch 8 - Avg Loss: 0.2655: 100%|██████████| 6440/6440 [06:53<00:00, 15.59it/s]


Epoch 8/10, Average Loss: 0.26550994531789457


Epoch 9 - Avg Loss: 0.2299: 100%|██████████| 6440/6440 [06:53<00:00, 15.58it/s]


Epoch 9/10, Average Loss: 0.2299412859835147


Epoch 10 - Avg Loss: 0.2069: 100%|██████████| 6440/6440 [06:52<00:00, 15.61it/s]


Epoch 10/10, Average Loss: 0.20688121704766468


('./hansoldeco-kogpt2\\tokenizer_config.json',
 './hansoldeco-kogpt2\\special_tokens_map.json',
 './hansoldeco-kogpt2\\tokenizer.json')

## Model Inference

In [6]:
# 저장된 Fine-tuned 모델과 토크나이저 불러오기
model_dir = "./hansoldeco-kogpt2"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

# Inference를 위한 test.csv 파일 로드
test = pd.read_csv('./test.csv')

# test.csv의 '질문'에 대한 '답변'을 저장할 리스트
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    # 입력 텍스트를 토큰화하고 모델 입력 형태로 변환
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        # 질문과 답변의 사이를 나타내는 eos_token (</s>)를 찾아, 이후부터 출력
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|██████████| 130/130 [04:54<00:00,  2.26s/it]


## Submission

In [7]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = model.encode(preds)
pred_embeddings.shape

Downloading .gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading 2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/2.45k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

(130, 512)

In [8]:
submit = pd.read_csv('./sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.015765,0.05601,-0.011973,0.009514,0.125522,0.007938,0.03135,0.06199,0.010081,...,-0.007705,-0.013325,1e-05,-0.005996,-0.020423,0.008186,0.062777,0.031447,-0.012665,0.036256
1,TEST_001,0.027476,0.031406,-0.050489,0.009678,0.119823,-0.041251,0.014243,-0.017895,0.081403,...,-0.033637,-0.013678,-0.026174,-0.019957,-0.005777,0.023247,0.020694,-0.020786,-0.049734,-0.01127
2,TEST_002,-0.006813,-0.003179,0.008923,0.005651,0.090779,-0.008267,-0.04122,0.001288,0.029889,...,-0.036412,-0.062878,-0.003894,-0.04765,0.034637,0.044994,-0.026444,-0.049396,-0.024263,0.032262
3,TEST_003,0.012931,0.017019,0.002525,0.036831,0.078396,-0.03094,-0.059503,0.019713,0.016635,...,-0.031194,-0.006522,0.066188,-0.04211,0.00348,0.019691,-0.018142,-0.01634,-0.02883,0.026559
4,TEST_004,0.020637,0.020052,-0.005806,-0.010334,0.105124,-0.003586,0.035843,0.059211,-0.01676,...,0.009173,-0.035664,0.047057,-0.003688,0.013008,0.012201,0.001192,0.012827,0.019431,0.038264


In [9]:
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv('./baseline_submit.csv', index=False)