In [1]:
# Data  영화리뷰 데이터
import gdown

file_id = '1k-aUl9Qqg_972g6vAb1hYPQedNpp3WN9'
download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, 'test.txt', quiet=False)

file_id = '1ig2rK2OMhX1Bgz-zNNhm3aI3ZGCk3Jpl'
download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, 'train.txt', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1k-aUl9Qqg_972g6vAb1hYPQedNpp3WN9
To: c:\Github\python_ML\LLM\test.txt
100%|██████████| 4.89M/4.89M [00:00<00:00, 14.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ig2rK2OMhX1Bgz-zNNhm3aI3ZGCk3Jpl
To: c:\Github\python_ML\LLM\train.txt
100%|██████████| 14.6M/14.6M [00:00<00:00, 24.1MB/s]


'train.txt'

In [2]:
from transformers import AutoTokenizer, GPT2Model
# 모델 로드
model_name = 'skt/kogpt2-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          bos_token='</s>', eos_token='</s>', pad_token='<pad>')

In [4]:
import re
import pandas as pd
# 데이터 로드
train_data = pd.read_csv('train.txt', sep='\t')
test_data = pd.read_csv('test.txt', sep='\t')
print(f"{len(train_data), len(test_data)}")
# 전처리 중복 및 결측치, 한글만처리
train_data.drop_duplicates(subset=['document'], inplace=True)
train_data.dropna(inplace=True)
test_data.drop_duplicates(subset=['document'], inplace=True)
test_data.dropna(inplace=True)
train_data['document'] = train_data['document'].apply(lambda x: re.sub(r'[^가-힣]', '', x))
test_data['document'] = test_data['document'].apply(lambda x: re.sub(r'[^가-힣]', '', x))
print(f"{len(train_data), len(test_data)}")

(150000, 50000)
(146182, 49157)


In [5]:
# 소량의 데이터만  1000개
train_data = train_data[:3000]
test_data = test_data[:1000]
print(f"{train_data.shape, test_data.shape}")


((3000, 3), (1000, 3))


In [6]:
MAX_SEQ_LEN = 128

In [7]:
# 함수로 처리
from tqdm import tqdm
import numpy as np
def convert_example_to_feature(example,labels, max_seq_len,tokenizer):
  input_ids, data_ids , attention_masks= [],[],[]
  for example, label in  tqdm(zip(example,labels)):
    bos_token = [tokenizer.bos_token_id]
    eos_token = [tokenizer.eos_token_id]
    tokens = bos_token + tokenizer.encode(example) + eos_token
    input_id = tokens + [tokenizer.pad_token_id] * (max_seq_len - len(tokens)) if len(tokens) < max_seq_len else tokens[:max_seq_len]
    # 어텐션 마스크 생성
    attention_mask = [1] * len(tokens) + [0] * (max_seq_len - len(tokens)) if len(tokens) < max_seq_len else [1] * max_seq_len


    input_ids.append(input_id)
    data_ids.append([label])
    attention_masks.append(attention_mask)

  input_ids = np.array(input_ids, dtype=np.int32)
  data_ids = np.array(data_ids, dtype=np.int32)
  attention_masks = np.array(attention_masks, dtype=np.int32)
  return input_ids, attention_masks,  data_ids

In [8]:
train_x, train_attention_masks ,train_y = convert_example_to_feature(train_data['document'], train_data['label'], MAX_SEQ_LEN, tokenizer)
test_x, test_attention_masks, test_y = convert_example_to_feature(test_data['document'], test_data['label'], MAX_SEQ_LEN, tokenizer)

3000it [00:00, 24882.86it/s]
1000it [00:00, 27514.64it/s]


In [9]:
# GPT 출력
import torch
class GPT2Classifier(torch.nn.Module):
  def __init__(self, model_name):
    super(GPT2Classifier, self).__init__()
    self.gpt = GPT2Model.from_pretrained(model_name)
    self.dropout = torch.nn.Dropout(0.2)
    self.classifer = torch.nn.Linear(768, 1)
  def forward(self,input_ids,attention_masks = None):
    #(배치크기, 시퀀스길이, 히든레이터 크기)
    outputs = self.gpt(input_ids=input_ids, attention_mask=attention_masks)[0]
    # 마지막 출력 벡터
    cls_token = outputs[:, -1, :]
    cls_token = self.dropout(cls_token)  # (32x768 and 10x1)
    prediction = torch.sigmoid( self.classifer(cls_token) )
    return prediction

In [10]:
# 모델 초기화
model = GPT2Classifier(model_name)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.BCELoss()

In [11]:
# 데이터를 텐서로 변환
train_x_tensor = torch.tensor(train_x, dtype=torch.long)
train_attention_tensor = torch.tensor(train_attention_masks,dtype=torch.long)
train_y_tensor = torch.tensor(train_y, dtype=torch.float32).view(-1,1)
test_x_tensor = torch.tensor(test_x, dtype=torch.long)
test_attention_tensor = torch.tensor(test_attention_masks,dtype=torch.long)
test_y_tensor = torch.tensor(test_y, dtype=torch.float32).view(-1,1)
train_x_tensor.shape, train_y_tensor.shape, test_x_tensor.shape, test_y_tensor.shape

(torch.Size([3000, 128]),
 torch.Size([3000, 1]),
 torch.Size([1000, 128]),
 torch.Size([1000, 1]))

In [12]:
# 모델 학습
import torch_directml
device = torch_directml.device() if torch_directml.is_available() else torch.device()
model.to(device)
from tqdm import tqdm
for epoch in range(2):
  iterator = tqdm(range(0,len(train_x_tensor),32))
  for i in iterator: # 배치 32
    optimizer.zero_grad()
    x = train_x_tensor[i:i+32].to(device)
    attention_masks = train_attention_tensor[i:i+32].to(device)
    y = train_y_tensor[i:i+32].to(device)
    y_pred = model(x,attention_masks=attention_masks)
    loss = loss_fn(y_pred, y)
    loss.backward()
    optimizer.step()

    iterator.set_description(f"epoch: {epoch+1} / {2} loss: {loss.item()}")

  0%|          | 0/94 [00:00<?, ?it/s]


RuntimeError: The GPU device does not support Double (Float64) operations!

In [None]:
# 모델 저장 - 가중치 저장
import torch
torch.save(model.state_dict(),'gpt2_movie_model.pt')

In [None]:
# 저장된 모델 불러오기
file_id = '1TgT07VU4su3y3ad-9yNHTjXXF3PVeqqk'
download_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(download_url, 'gpt2_movie_model.pt', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1TgT07VU4su3y3ad-9yNHTjXXF3PVeqqk
From (redirected): https://drive.google.com/uc?id=1TgT07VU4su3y3ad-9yNHTjXXF3PVeqqk&confirm=t&uuid=e54f123c-51b7-4220-9c5c-edff58f6a0ea
To: c:\Github\python_ML\LLM\gpt2_movie_model.pt
100%|██████████| 501M/501M [00:10<00:00, 47.6MB/s] 


'gpt2_movie_model.pt'

In [None]:
from transformers import GPT2ForSequenceClassification
# 모델 로드
loaded_model = GPT2ForSequenceClassification.from_pretrained('skt/kogpt2-base-v2')
loaded_model.load_state_dict(torch.load('gpt2_movie_model.pt'))
loaded_model.to(device)
print("success")

Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  loaded_model.load_state_dict(torch.load('gpt2_movie_model.pt'))


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
# 평가
model.to(device)
with torch.no_grad():
  test_outputs = model(test_x_tensor[:30].to(device),
                       attention_masks=test_attention_tensor[:30].to(device)
                       )
  test_outputs = test_outputs.cpu()
  test_loss = loss_fn(test_outputs, test_y_tensor[:30])

  test_accuracy = ((test_outputs > 0.5).float() == test_y_tensor[:30]).float().mean()

print(f'test loss : {test_loss}')
print(f'test_accuracy : {test_accuracy}')