## 데이터 불러오기

In [None]:
import pandas as pd

# Read the train and test splits from the local data directory.
train_df, test_df = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

## 데이터 전처리

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopword corpus, then keep the English list as a set so
# membership checks during preprocessing are constant-time.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Maps every ASCII punctuation character to a space. A translation table is
# used instead of a regex character class built from string.punctuation:
# unescaped, the backslash in that string escapes the following "]", so the
# backslash itself would never be matched (and never removed).
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess_text(text):
    """Normalize a raw text string before tokenization.

    Each ASCII punctuation character is replaced by a space, the result is
    lowercased, and every whitespace-separated word contained in the
    module-level ``stop_words`` set is dropped.

    Args:
        text: The raw input string.

    Returns:
        The remaining words joined by single spaces.
    """
    # Replace punctuation with spaces so adjoining words stay separated.
    text = text.translate(_PUNCT_TO_SPACE)

    # Convert to lowercase.
    text = text.lower()

    # Remove stopwords; split() also collapses runs of whitespace.
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text

# Add a cleaned-text column to both splits using the same preprocessing.
for frame in (train_df, test_df):
    frame["preprocessed_text"] = frame["text"].apply(preprocess_text)


## Dataset 클래스

In [None]:
!pip install transformers

In [None]:
from torch.utils.data import Dataset
from transformers import BertTokenizer

class NewsDataset(Dataset):
    """Dataset that tokenizes the ``preprocessed_text`` column on demand.

    Encodings are cached per index in ``encoded_dict`` after the first
    access, so each row is tokenized at most once per dataset object.

    Args:
        dataframe: Frame with a ``preprocessed_text`` column and, unless
            ``is_test`` is true, a ``label`` column.
        tokenizer: Object exposing ``encode_plus`` (presumably a Hugging Face
            tokenizer; only that method is used here).
        max_length: ``max_length`` passed to the tokenizer, which is asked to
            pad and truncate to it.
        is_test: When true, returned items carry no ``labels`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        # idx -> tokenizer output, filled lazily by __getitem__.
        self.encoded_dict = {}

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Rows are looked up by position (``iloc``), not by index label, so the
        dataset works for frames whose index is not 0..len-1 (e.g. after
        filtering or shuffling).

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension squeezed away) and, for non-test data, ``labels``.
        """
        if idx not in self.encoded_dict:
            text = self.dataframe["preprocessed_text"].iloc[idx]
            self.encoded_dict[idx] = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                return_attention_mask=True
            )

        encoded = self.encoded_dict[idx]
        item = {
            # return_tensors='pt' yields a leading batch dimension of 1; drop
            # it so the DataLoader can stack samples itself.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0)
        }

        if not self.is_test:
            item['labels'] = self.dataframe["label"].iloc[idx]

        return item


## Dataset, DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import BertTokenizer

# Seed torch's global RNG so runs are repeatable
random_seed = 42
torch.manual_seed(random_seed)

# Tokenizer for the 'bert-large-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Wrap the frames; the test set is built without labels
train_val_dataset = NewsDataset(train_df, tokenizer)
test_dataset = NewsDataset(test_df, tokenizer, is_test=True)

# 80/20 train/validation split, driven by a dedicated seeded generator
total_size = len(train_val_dataset)
train_size = int(0.8 * total_size)
valid_size = total_size - train_size
split_generator = torch.Generator().manual_seed(random_seed)
train_dataset, valid_dataset = random_split(
    train_val_dataset,
    [train_size, valid_size],
    generator=split_generator,
)

# Only the training loader shuffles; validation and test keep dataset order
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)


## Model

In [None]:
from transformers import BertForSequenceClassification

# Load BertForSequenceClassification from the 'bert-large-uncased' checkpoint,
# configured for 8 labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels=8,
)


## criterion, optimizer, metric

In [None]:
from transformers import AdamW
from torch import nn
from sklearn.metrics import f1_score

# Loss function applied to the model logits in the training loop
criterion = nn.CrossEntropyLoss()

# AdamW over all model parameters
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=2e-2,
)

# Metric
def macro_f1_score(true_labels, predicted_labels):
    """Return the F1 score computed by ``f1_score`` with ``average='macro'``.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, aligned with ``true_labels``.
    """
    score = f1_score(y_true=true_labels, y_pred=predicted_labels, average='macro')
    return score


## Train loop

In [None]:
import tqdm
import wandb

# Number of passes over the training set; used for both the logged config and
# the loop below so the two cannot drift apart.
num_epochs = 3

# Initialize wandb
wandb.init(project='Dacon_GPT', name='bert_large_uncased')

# Send model and hyperparameters to wandb. Values are read back from the
# optimizer and DataLoader so the logged config matches what actually runs.
wandb.watch(model, log="all", log_freq=10)
wandb.config.epochs = num_epochs
wandb.config.lr = optimizer.defaults['lr']
wandb.config.weight_decay = optimizer.defaults['weight_decay']
wandb.config.batch_size = train_dataloader.batch_size

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Train and validation loop
for epoch in range(num_epochs):
    # ---- training ----
    model.train()
    train_loss = 0.0
    # Keep the bar instance: set_postfix is an instance method and raises
    # TypeError when called on the tqdm class itself.
    train_bar = tqdm.tqdm(train_dataloader)
    for step, batch in enumerate(train_bar, start=1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        wandb.log({'train_loss': loss.item()})
        # Running mean over the batches seen so far in this epoch.
        train_bar.set_postfix(train_loss=train_loss / step)

    # ---- validation ----
    model.eval()
    valid_loss = 0.0
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        valid_bar = tqdm.tqdm(valid_dataloader)
        for step, batch in enumerate(valid_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            valid_loss += loss.item()
            wandb.log({'valid_loss': loss.item()})
            valid_bar.set_postfix(valid_loss=valid_loss / step)

            # Collect labels and argmax predictions for the epoch-level F1.
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

    valid_f1 = macro_f1_score(true_labels, predicted_labels)
    wandb.log({'valid_f1': valid_f1})

    print(f"Epoch {epoch + 1}, Train Loss: {train_loss / len(train_dataloader)}, Valid Loss: {valid_loss / len(valid_dataloader)}, Valid Macro F1: {valid_f1}")


## validation 예측 결과 저장

In [None]:
import csv

# Write the (true, predicted) label pairs to validation_results.csv,
# preceded by a header row.
with open('validation_results.csv', 'w', newline='') as results_file:
    writer = csv.writer(results_file)
    writer.writerow(['true_label', 'predicted_label'])
    writer.writerows(zip(true_labels, predicted_labels))
