In [None]:
import numpy as np
import pandas as pd

In [None]:
pip install transformers

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Read the CSV into a DataFrame; later cells use its 'text' and 'label' columns.
DATA_CSV_PATH = '/content/hate_data.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [None]:
# Display the rows whose label is missing.
df.loc[df['label'].isna()]

In [None]:
# Discard every row that lacks either the text or its label.
required_columns = ['text', 'label']
df = df.dropna(subset=required_columns)

In [None]:
# Cast the label column to integers.
df = df.astype({'label': int})

In [None]:
# Print a summary of df (columns, dtypes, non-null counts).
df.info()

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation. The seed is fixed so the partition
# (and therefore every metric reported below) is reproducible across kernel restarts.
SPLIT_SEED = 42
train_dataset, val_dataset = train_test_split(df, test_size=0.15, random_state=SPLIT_SEED)
print(len(train_dataset))
print(len(val_dataset))


In [None]:
# Renumber both splits from 0 so rows can be addressed by their position.
train_dataset, val_dataset = (
    frame.reset_index(drop=True) for frame in (train_dataset, val_dataset)
)

In [None]:
# Token-length cap handed to the tokenizer when each training example is encoded.
max_length = 128

In [None]:
class YoDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame providing a ``text`` column and an integer-valued
            ``label`` column. Rows are addressed by position, so the index
            does not have to be ``0..n-1``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')``. It is
            expected to return a mapping whose ``input_ids`` and
            ``attention_mask`` tensors carry a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to. When left
            as ``None`` the module-level ``max_length`` variable is used if it
            exists (so existing two-argument calls keep honouring the
            notebook setting), otherwise 128.
    """

    def __init__(self, df, tokenizer, max_length=None):
        self.df_data = df
        self.tokenizer = tokenizer
        if max_length is None:
            # Resolve the notebook-level setting once, at construction time.
            max_length = globals().get("max_length", 128)
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df_data)

    def __getitem__(self, index):
        """Return the encoded example stored at position ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch axis removed) and ``label`` (a
            scalar ``torch.long`` tensor).
        """
        # Positional access keeps the dataset correct even if the caller
        # forgot to reset the index after filtering or splitting.
        text = self.df_data['text'].iloc[index]
        label = torch.tensor(int(self.df_data['label'].iloc[index]), dtype=torch.long)

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when it happens to have length 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

In [None]:
# The tokenizer and the classification model are loaded from the same checkpoint name.
PRETRAINED_NAME = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_NAME)


In [None]:
# Wrap each split in a YoDataset that shares the one tokenizer.
yo_train, yo_val = (
    YoDataset(frame, tokenizer) for frame in (train_dataset, val_dataset)
)

In [None]:
# Only the training data needs shuffling. The validation metrics do not depend
# on batch order, so that loader iterates in a fixed order.
BATCH_SIZE = 2
train_dataloader = DataLoader(yo_train, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(yo_val, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Use the first CUDA device when one is available; otherwise stay on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model = model.to(device)

In [None]:
# Step size given to the Adam optimizer.
learning_rate = 1e-5
# Number of full passes over the training DataLoader.
epochs = 5

In [None]:
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import Adam


In [None]:
# Cross-entropy over the model's logits is the training objective;
# Adam updates every model parameter at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one full pass over ``dataloader`` and report loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``
            and ``label`` tensors.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to before the forward pass.
        optimizer: When given, the pass trains (train mode, backward pass and
            optimizer step per batch). When ``None``, the pass only evaluates
            (eval mode, gradients disabled).

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``accuracy`` is the percentage of correctly
        predicted labels.
    """
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0

    # Gradients are tracked only while training, mirroring torch.no_grad() for evaluation.
    with torch.set_grad_enabled(is_training):
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            if is_training:
                optimizer.zero_grad()

            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum / len(dataloader), 100.0 * n_correct / n_seen


# Each epoch: one training pass followed by one validation pass, both reported.
for epoch in range(epochs):
    avg_loss, accuracy = _run_epoch(model, train_dataloader, criterion, device, optimizer)
    print(f"Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.4f}")
    print(f"Train Accuracy: {accuracy:.2f}%")

    val_avg_loss, val_accuracy = _run_epoch(model, valid_dataloader, criterion, device)
    print(f"Validation Loss: {val_avg_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.2f}%")


In [None]:
import os

model_save_path = "/content/drive/MyDrive/kc_bert_yok/best_model.pth"

# torch.save does not create missing folders; make sure the target directory
# exists so the trained weights are not lost to a write error.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# NOTE(review): in the cells visible here no best-epoch selection happens, so
# this stores the weights as they are after the final epoch.
torch.save(model.state_dict(), model_save_path)

In [None]:
# Sample sentences that the following cells tokenize and run through the model.
input_data = [
    "씨발련아",
    "존시나 멋있다",
    "안녕하세요",
    "ㅇㅇㅁㄴㅇㄹ"
    ]

In [None]:
# Encode all sample sentences as one padded, truncated batch of PyTorch tensors.
yo = tokenizer(
    input_data,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [None]:
# Move the encoded tensors to the same `device` the model was placed on; a
# hard-coded 'cuda:0' would crash on a CPU-only runtime.
yo = {k: v.to(device) for k, v in yo.items()}

In [None]:
# Inference only: use eval mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    output = model(**yo)

logits = output.logits
pred = logits.argmax(dim=1)

# Iterate over (text, predicted label) pairs without rebinding `pred`; the
# previous loop overwrote the tensor with an int and failed on the second item.
for input_text, label_id in zip(input_data, pred.tolist()):
    print(f"text: {input_text} - Label: {label_id}")