In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW

In [2]:
print("PyTorch 버전:", torch.__version__)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
print("PyTorch에서 사용하는 CUDA 버전:", torch.version.cuda)
# Resolve the target device once, falling back to the CPU when CUDA is not
# available, and make that same device the default for newly created tensors.
# Setting the default to "cuda" unconditionally would leave the default device
# and `device` out of sync on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_default_device(device)
# Random generator living on the same device as the default tensors.
generator = torch.Generator(device=device)

PyTorch 버전: 2.5.1+cu118
CUDA 사용 가능 여부: True
PyTorch에서 사용하는 CUDA 버전: 11.8


In [3]:
# Hyperparameters shared by every training/filtering round.
lr = 5e-6
batch_size = 32
num_labels = 3


class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to a tensor whose first dimension
            indexes samples (here: tokenizer output built with
            ``return_tensors="pt"``).
        labels: Sequence of class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors plus a ``"labels"`` entry."""
        # The encodings already hold tensors, so copy them with
        # clone().detach() rather than re-wrapping them in torch.tensor(),
        # which is the discouraged way to copy-construct from a tensor.
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# BERT tokenizer; identical for every round, so it is loaded once.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 11558, 35558, 1029, 48145  (figures left by the original author; their
# meaning is not documented here)
# Per-class weights for the cross-entropy loss.
class_weights = torch.tensor([0.45, 0.15, 0.4])

# Evaluation corpus. It does not change between rounds, so it is read and
# tokenized once here instead of once per round.
data = pd.read_csv("cleaned_data.csv", sep=';', on_bad_lines='skip')
texts = data['text'].tolist()
labels = data['type'].tolist()
test_texts = texts
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
total = len(test_texts)
test_dataset = CustomDataset(test_encodings, labels)
# shuffle=False keeps batches in row order, which the index arithmetic in the
# evaluation loop relies on.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for i in range(10):
    # Training set for this round: filtered_data<i>.csv. For i > 0 this is the
    # file written at the end of the previous round (rows that round's model
    # classified correctly).
    train_data = pd.read_csv("filtered_data/filtered_data" + str(i) + ".csv", sep=';', on_bad_lines='skip')
    train_texts = train_data['text'].tolist()
    train_labels = train_data['type'].tolist()
    encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

    dataset = CustomDataset(encodings, train_labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, generator=generator)

    # Start every round from the pretrained checkpoint.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    # Place the model explicitly instead of relying only on the default device.
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    # Class-weighted loss.
    loss_fn = CrossEntropyLoss(weight=class_weights)

    # Counts batches across ALL epochs of this round (it is not reset per
    # epoch), so the progress line shows a cumulative sample count.
    sequence = 0

    # Training
    model.train()
    for epoch in range(5):
        total_loss = 0
        for batch in dataloader:
            sequence = sequence + 1
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Distinct name so the batch tensor never shadows a label list.
            batch_labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, batch_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if sequence % 100 == 0:
                print(f"Epoch {epoch} - {batch_size * sequence} ")
        print(f"***Epoch {epoch} Total Loss = {total_loss}***")

    # Save only the model weights (state dict) for this round.
    torch.save(model.state_dict(), "parameter/final_model_weights" + str(i + 1) + ".pth")

    torch.cuda.empty_cache()  # hand cached, unused GPU memory back to the driver

    # Evaluation over the full cleaned corpus
    model.eval()
    correct = 0
    correct_indices = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # [batch_size, num_labels]
            # Softmax is monotonic, so the argmax of the raw logits is the
            # predicted class; no need to compute probabilities first.
            predictions = torch.argmax(logits, dim=-1)

            # Compute the correctness mask once and reuse it for the count
            # and for the indices.
            is_correct = predictions == batch_labels
            correct += is_correct.sum().item()

            # Positions inside this batch that were classified correctly ...
            batch_correct_indices = torch.where(is_correct)[0].tolist()
            # ... converted to row positions in the full evaluation set.
            global_indices = [batch_idx * batch_size + idx for idx in batch_correct_indices]
            correct_indices.extend(global_indices)

    # Keep only the correctly classified rows of the cleaned corpus.
    filtered_data = data.iloc[correct_indices]

    # Write them out as the training set for the next round.
    csv_file_path = "filtered_data/filtered_data" + str(i + 1) +".csv"
    filtered_data.to_csv(csv_file_path, sep=';', index=False)

    print("Accuracy: ", correct/total)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
Epoch 0 - 44800 
Epoch 0 - 48000 
Epoch 0 - 51200 
Epoch 0 - 54400 
***Epoch 0 Total Loss = 626.215508647263***
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
Epoch 1 - 89600 
Epoch 1 - 92800 
Epoch 1 - 96000 
Epoch 1 - 99200 
Epoch 1 - 102400 
Epoch 1 - 105600 
Epoch 1 - 108800 
***Epoch 1 Total Loss = 298.9963792655617***
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
Epoch 2 - 134400 
Epoch 2 - 137600 
Epoch 2 - 140800 
Epoch 2 - 144000 
Epoch 2 - 147200 
Epoch 2 - 150400 
Epoch 2 - 153600 
Epoch 2 - 156800 
Epoch 2 - 160000 
Epoch 2 - 163200 
***Epoch 2 Total Loss = 199

  return func(*args, **kwargs)


Accuracy:  0.9296996550091321


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
Epoch 0 - 44800 
Epoch 0 - 48000 
Epoch 0 - 51200 
Epoch 0 - 54400 
***Epoch 0 Total Loss = 588.1846195012331***
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
Epoch 1 - 89600 
Epoch 1 - 92800 
Epoch 1 - 96000 
Epoch 1 - 99200 
Epoch 1 - 102400 
Epoch 1 - 105600 
Epoch 1 - 108800 
***Epoch 1 Total Loss = 275.3694657860324***
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
Epoch 2 - 134400 
Epoch 2 - 137600 
Epoch 2 - 140800 
Epoch 2 - 144000 
Epoch 2 - 147200 
Epoch 2 - 150400 
Epoch 2 - 153600 
Epoch 2 - 156800 
Epoch 2 - 160000 
Epoch 2 - 163200 
***Epoch 2 Total Loss = 18

  return func(*args, **kwargs)


Accuracy:  0.9295981871068119


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
Epoch 0 - 44800 
Epoch 0 - 48000 
Epoch 0 - 51200 
Epoch 0 - 54400 
***Epoch 0 Total Loss = 591.0733722820878***
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
Epoch 1 - 89600 
Epoch 1 - 92800 
Epoch 1 - 96000 
Epoch 1 - 99200 
Epoch 1 - 102400 
Epoch 1 - 105600 
Epoch 1 - 108800 
***Epoch 1 Total Loss = 274.5867224931717***
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
Epoch 2 - 134400 
Epoch 2 - 137600 
Epoch 2 - 140800 
Epoch 2 - 144000 
Epoch 2 - 147200 
Epoch 2 - 150400 
Epoch 2 - 153600 
Epoch 2 - 156800 
Epoch 2 - 160000 
Epoch 2 - 163200 
***Epoch 2 Total Loss = 17

  return func(*args, **kwargs)


Accuracy:  0.929784211594399


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
Epoch 0 - 44800 
Epoch 0 - 48000 
Epoch 0 - 51200 
Epoch 0 - 54400 
***Epoch 0 Total Loss = 619.0780635774136***
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
Epoch 1 - 89600 
Epoch 1 - 92800 
Epoch 1 - 96000 
Epoch 1 - 99200 
Epoch 1 - 102400 
Epoch 1 - 105600 
Epoch 1 - 108800 
***Epoch 1 Total Loss = 294.6299608387053***
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
Epoch 2 - 134400 
Epoch 2 - 137600 
Epoch 2 - 140800 
Epoch 2 - 144000 
Epoch 2 - 147200 
Epoch 2 - 150400 
Epoch 2 - 153600 
Epoch 2 - 156800 
Epoch 2 - 160000 
Epoch 2 - 163200 
***Epoch 2 Total Loss = 19

  return func(*args, **kwargs)


Accuracy:  0.9278732327673679


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
Epoch 0 - 44800 
Epoch 0 - 48000 
Epoch 0 - 51200 
Epoch 0 - 54400 
***Epoch 0 Total Loss = 571.5130968131125***
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
Epoch 1 - 89600 
Epoch 1 - 92800 
Epoch 1 - 96000 
Epoch 1 - 99200 
Epoch 1 - 102400 
Epoch 1 - 105600 
Epoch 1 - 108800 
***Epoch 1 Total Loss = 255.6906772106886***
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
Epoch 2 - 134400 
Epoch 2 - 137600 
Epoch 2 - 140800 
Epoch 2 - 144000 
Epoch 2 - 147200 
Epoch 2 - 150400 
Epoch 2 - 153600 
Epoch 2 - 156800 
Epoch 2 - 160000 
Epoch 2 - 163200 
***Epoch 2 Total Loss = 16

  return func(*args, **kwargs)


Accuracy:  0.9251843333558818


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 
Epoch 0 - 6400 
Epoch 0 - 9600 
Epoch 0 - 12800 
Epoch 0 - 16000 
Epoch 0 - 19200 
Epoch 0 - 22400 
Epoch 0 - 25600 
Epoch 0 - 28800 
Epoch 0 - 32000 
Epoch 0 - 35200 
Epoch 0 - 38400 
Epoch 0 - 41600 
Epoch 0 - 44800 
Epoch 0 - 48000 
Epoch 0 - 51200 
Epoch 0 - 54400 
***Epoch 0 Total Loss = 603.1028490625322***
Epoch 1 - 57600 
Epoch 1 - 60800 
Epoch 1 - 64000 
Epoch 1 - 67200 
Epoch 1 - 70400 
Epoch 1 - 73600 
Epoch 1 - 76800 
Epoch 1 - 80000 
Epoch 1 - 83200 
Epoch 1 - 86400 
Epoch 1 - 89600 
Epoch 1 - 92800 
Epoch 1 - 96000 
Epoch 1 - 99200 
Epoch 1 - 102400 
Epoch 1 - 105600 
Epoch 1 - 108800 
***Epoch 1 Total Loss = 267.69333369936794***
Epoch 2 - 112000 
Epoch 2 - 115200 
Epoch 2 - 118400 
Epoch 2 - 121600 
Epoch 2 - 124800 
Epoch 2 - 128000 
Epoch 2 - 131200 
Epoch 2 - 134400 
Epoch 2 - 137600 
Epoch 2 - 140800 
Epoch 2 - 144000 
Epoch 2 - 147200 
Epoch 2 - 150400 
Epoch 2 - 153600 
Epoch 2 - 156800 
Epoch 2 - 160000 
Epoch 2 - 163200 
***Epoch 2 Total Loss = 1

  return func(*args, **kwargs)


Accuracy:  0.9265372387201515


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return func(*args, **kwargs)


Epoch 0 - 3200 


KeyboardInterrupt: 