In [6]:
import urllib
import pandas as pd
import re

# 데이터 다운로드
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

# 데이터 불러오기
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

# 결측치 제거
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

# 한글만 추출 (정규식 사용)
train_data['document'] = train_data['document'].apply(lambda x: re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', ' ', str(x)))
test_data['document'] = test_data['document'].apply(lambda x: re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', ' ', str(x)))

# 중복 제거
train_data.drop_duplicates(subset=['document'], inplace=True)
test_data.drop_duplicates(subset=['document'], inplace=True)

# 인덱스 리셋
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

print(f'전처리 후 훈련 데이터 크기: {train_data.shape}')
print(f'전처리 후 테스트 데이터 크기: {test_data.shape}')

전처리 후 훈련 데이터 크기: (145140, 3)
전처리 후 테스트 데이터 크기: (48822, 3)


In [17]:
from transformers import PreTrainedTokenizerFast, GPT2ForSequenceClassification
import torch
import torch_directml

# 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')

# KoGPT2 모델 로드 (이진 분류를 위한 GPT2ForSequenceClassification 사용)
model = GPT2ForSequenceClassification.from_pretrained('skt/kogpt2-base-v2', num_labels=2)

# GPU 사용 설정 (가능한 경우)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch_directml.device()
model.to(device).float()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [18]:
def tokenize_and_encode(data, tokenizer):
    return tokenizer(
        data['document'].tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128
    )

train_encodings = tokenize_and_encode(train_data, tokenizer)
test_encodings = tokenize_and_encode(test_data, tokenizer)

train_labels = torch.tensor(train_data['label'].values)
test_labels = torch.tensor(test_data['label'].values)

In [19]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [21]:
from transformers import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습 루프 정의
for epoch in range(1):  # 에포크 수 설정
    model.train()
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1} completed.')

  0%|          | 0/4536 [00:00<?, ?it/s]


RuntimeError: The GPU device does not support Double (Float64) operations!

In [12]:
import torch_directml
dml_device = torch_directml.device()
print(dml_device)

privateuseone:0


In [None]:
from sklearn.metrics import accuracy_score

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

In [22]:
import torch
from transformers import AdamW
from tqdm import tqdm
import torch_directml
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset

# float32로 변환하는 tokenize_and_encode 함수
def tokenize_and_encode(data, tokenizer):
    encodings = tokenizer(
        data['document'].tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128
    )
    # input_ids와 attention_mask를 float32로 변환
    encodings['input_ids'] = encodings['input_ids'].to(torch.float32)
    encodings['attention_mask'] = encodings['attention_mask'].to(torch.float32)
    return encodings

# 데이터 인코딩 및 레이블 텐서 생성
train_encodings = tokenize_and_encode(train_data, tokenizer)
test_encodings = tokenize_and_encode(test_data, tokenizer)

train_labels = torch.tensor(train_data['label'].values, dtype=torch.float32)  # 레이블도 float32로 변환
test_labels = torch.tensor(test_data['label'].values, dtype=torch.float32)  # 레이블도 float32로 변환

# TensorDataset 및 DataLoader 생성
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# DirectML 장치 설정
dml_device = torch_directml.device()

# Optimizer 설정
optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습 루프 정의
for epoch in range(1):  # 에포크 수 설정
    model.train()
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = [b.to(dml_device) for b in batch]  # 모든 텐서를 dml_device로 전송
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1} completed.')

# 모델 평가
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [b.to(dml_device) for b in batch]  # 모든 텐서를 dml_device로 전송
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

  0%|          | 0/4536 [00:00<?, ?it/s]


RuntimeError: Unknown error -2147024809