In [2]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW

In [3]:
# Report the PyTorch / CUDA environment this notebook is running in.
print("PyTorch 버전:", torch.__version__)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
print("PyTorch에서 사용하는 CUDA 버전:", torch.version.cuda)

# Resolve the device once, then make it the default for newly created tensors.
# Hard-coding "cuda" as the default would disagree with the CPU fallback below
# on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_default_device(device)
# Random generator created on the same device (handed to the DataLoader for shuffling).
generator = torch.Generator(device=device)

# Semicolon-separated CSV; malformed rows are skipped rather than raising.
data = pd.read_csv("filtered_data/filtered_data5.csv", sep=';', on_bad_lines='skip')
texts = data['text'].tolist()
labels = data['type'].tolist()
# Spot-check one raw sample before splitting.
print(texts[171])
# Split into training and test sets (80% / 20%, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

PyTorch 버전: 2.5.1+cu118
CUDA 사용 가능 여부: True
PyTorch에서 사용하는 CUDA 버전: 11.8
i'll have to check when i go home


In [4]:
# BERT tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the whole training split in one call: sequences are truncated to at
# most 128 tokens, padded to a common length, and returned as PyTorch tensors.
encodings = tokenizer(
    X_train,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# From here on `labels` refers to the training labels only.
labels = y_train

# 11558, 35558, 1029, 48145
# NOTE(review): the figures above look like sample counts, but there are four of
# them and only three class weights below — confirm what they refer to.
# Per-class weights (later passed to the loss function)
class_weights = torch.tensor([0.45, 0.15, 0.4])

# 데이터셋 정의
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to an
            indexable container with one row per sample. Rows may be tensors
            (tokenizer called with ``return_tensors="pt"``) or plain lists.
        labels: Indexable sequence of integer class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``"labels"`` entry.

        Every field is an independent copy of the stored row, so modifying the
        returned tensors never alters the underlying encodings.
        """
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every access; clone().detach() is the recommended equivalent.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Hyperparameters
lr = 5e-6
batch_size = 32
num_labels = 3

# Wrap the tokenized training data and serve it in shuffled mini-batches;
# the shuffle order is drawn from the explicitly supplied generator.
dataset = CustomDataset(encodings, labels)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=generator,
)

# Load the pretrained encoder with a classification head of `num_labels` outputs
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)
optimizer = AdamW(model.parameters(), lr=lr)
# Loss function: cross-entropy with per-class weights
loss_fn = CrossEntropyLoss(weight=class_weights)

# Running batch counter and loss accumulator used by the training loop
sequence = 0
total_loss = 0

# Training
model.train()
# Move every batch to the device that holds the model's weights instead of
# relying on the process-wide default device.
model_device = next(model.parameters()).device
for epoch in range(5):  # number of passes over the training data
    total_loss = 0  # sum of the per-batch losses within this epoch
    for batch in dataloader:
        # `sequence` is not reset inside this loop, so it counts batches across
        # all epochs and the progress figure printed below is cumulative.
        sequence = sequence + 1
        input_ids = batch["input_ids"].to(model_device)
        attention_mask = batch["attention_mask"].to(model_device)
        # Loop-local name: rebinding `labels` here would replace the
        # notebook-level list of training labels with the last batch tensor.
        batch_labels = batch["labels"].to(model_device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Loss is computed outside the model so the class-weighted criterion applies.
        loss = loss_fn(logits, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if sequence % 100 == 0:
            print(f"Epoch {epoch} - {batch_size * sequence} ")
    print(f"***Epoch {epoch} Total Loss = {total_loss}***")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
***Epoch 0 Total Loss = 496.7573622018099***
Epoch 1 - 44800 
Epoch 1 - 48000 
Epoch 1 - 51200 
Epoch 1 - 54400 
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
***Epoch 1 Total Loss = 221.72697870619595***
Epoch 2 - 89600 
Epoch 2 - 92800 
Epoch 2 - 96000 
Epoch 2 - 99200 
Epoch 2 - 102400 
Epoch 2 - 105600 
Epoch 2 - 108800 
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
***Epoch 2 Total Loss = 144.7713261381723***
Epoch 3 - 134400 
Epoch 3 - 137600 
Epoch 3 - 140800 
Epoch 3 - 144000 
Epoch 3 - 147200 
Epoch 3 - 150400 
Epoch 3 - 153600 
Epoch 3 - 156800 
Epoch 3 - 160000

In [5]:
# Save only the model weights (the state dict), not the whole model object
weights_path = "final_model_weights_filt.pth"
torch.save(model.state_dict(), weights_path)


In [6]:
torch.cuda.empty_cache()  # release cached GPU memory that PyTorch is no longer using

# Evaluation on a few hand-written samples
model.eval()
test_texts = ["fuck you", "i hate you", "plz suicide", "your mom go to hell"]
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
answer = [2, 1, 1, 1]  # expected class id for each entry of test_texts

# Run inference on the device that holds the model's weights.
model_device = next(model.parameters()).device
with torch.no_grad():
    outputs = model(
        input_ids=test_encodings["input_ids"].to(model_device),
        attention_mask=test_encodings["attention_mask"].to(model_device),
    )
    logits = outputs.logits  # [batch_size, num_classes]
    # Softmax probabilities over the classes
    probabilities = torch.softmax(logits, dim=-1)  # [batch_size, num_classes]
    # Predicted class = index of the highest probability
    predictions = torch.argmax(probabilities, dim=-1)  # [batch_size]
    print(predictions)

print("Predictions: ", predictions, answer)

# Score the predictions against the expected labels.
expected = torch.tensor(answer, device=predictions.device)
correct = int((predictions == expected).sum().item())
print(f"Accuracy: {correct}/{len(answer)}")

tensor([2, 0, 1, 1], device='cuda:0')
Predictions:  tensor([2, 0, 1, 1], device='cuda:0') [2, 1, 1, 1]
