## 구글 드라이브 파일 다운로드

In [None]:
!pip install gdown

In [None]:
!gdown --id 14Wpnt6DPX93rtcH0RsvFZ3eFgtoA1K8K
!gdown --id 1AMiOTLuAqRgtZ624TfTHwW7fSSS5GbCh
!gdown --id 1CeRbGPXPm9RCsFe13eyoBe0N-wKw2DLg

## 파일 불러오기

In [None]:
import pandas as pd

# Load train.csv into a DataFrame
train_df = pd.read_csv('/kaggle/working/train.csv')

# Load test.csv into a DataFrame
test_df = pd.read_csv('/kaggle/working/test.csv')

## 전처리

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus that `stopwords.words` reads on the next line
nltk.download('stopwords')
# Keep the English stop words in a set; preprocess_text tests membership per word
stop_words = set(stopwords.words('english'))

# Translation table that maps every ASCII punctuation character to a space.
# Built once and applied with str.translate, so no punctuation is ever
# interpolated unescaped into a regex character class (where `\`, `]`, `^`
# and `-` are special and the pattern only works by accident).
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess_text(text, stopword_set=None):
    """Normalize raw text: strip punctuation, lowercase, drop stop words.

    Args:
        text: The raw text. ``None`` and NaN (what pandas uses for a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str()``.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The remaining words joined by single spaces (possibly ``""``).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stopword_set is None:
        stopword_set = stop_words

    # Replace punctuation with spaces first so "a,b" splits into two words,
    # then lowercase so the comparison against lowercase stop words matches.
    normalized = str(text).translate(_PUNCT_TO_SPACE).lower()

    # split() with no argument also collapses the runs of spaces left behind.
    return " ".join(word for word in normalized.split() if word not in stopword_set)

# Add a cleaned copy of the "text" column to both splits; the raw column is kept as-is
for _frame in (train_df, test_df):
    _frame["preprocessed_text"] = _frame["text"].apply(preprocess_text)

## Dataset 클래스

In [None]:
!pip install transformers

In [None]:
from torch.utils.data import Dataset
from transformers import XLNetTokenizer

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes rows lazily and caches the encodings.

    Rows are addressed by position (0 .. len-1), which is what DataLoader
    samplers produce, so the frame does not need a freshly reset index.

    Args:
        dataframe: Frame with ``preprocessed_text`` and ``text`` columns, plus
            a ``label`` column when ``is_test`` is false.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors.
        max_length: Length every sequence is padded/truncated to.
        is_test: When true, items carry no ``labels`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        # position -> tokenizer output; filled on first access of each row so
        # later epochs do not tokenize the same text again
        self.encoded_dict = {}

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the model inputs (and label, unless ``is_test``) for row ``idx``.

        Returns:
            dict with flattened ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors, the original ``input_text``, the row
            position under ``index``, and ``labels`` for non-test data.
        """
        encoding = self.encoded_dict.get(idx)
        if encoding is None:
            text = self.dataframe["preprocessed_text"].iloc[idx]
            # No `lowercase=` argument here: it is not an encode_plus option
            # (slow tokenizers only log "Keyword arguments ... not recognized"
            # for every sample). Casing is fixed by the tokenizer itself.
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                return_attention_mask=True,
                return_token_type_ids=True,
            )
            self.encoded_dict[idx] = encoding

        # return_tensors='pt' yields a leading batch dimension; view(-1)
        # flattens it so the DataLoader can stack items itself.
        item = {
            'input_ids': encoding['input_ids'].view(-1),
            'attention_mask': encoding['attention_mask'].view(-1),
            'token_type_ids': encoding['token_type_ids'].view(-1),
            'input_text': self.dataframe["text"].iloc[idx],  # original, unprocessed text
            'index': idx,  # row position, for tracing predictions back to rows
        }

        if not self.is_test:
            item['labels'] = self.dataframe["label"].iloc[idx]

        return item

## Dataset, DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import XLNetTokenizer
from sklearn.model_selection import train_test_split

import random
import numpy as np

# Seed Python, NumPy and PyTorch with one shared value so runs are repeatable
random_seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(random_seed)
# The CUDA generators are seeded only when a GPU is actually present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Tokenizer matching the pretrained checkpoint
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Stratified 80/20 train/validation split; each part gets a fresh 0..n-1 index
train_data, val_data = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_df,
        test_size=0.2,
        stratify=train_df['label'],
        random_state=random_seed,
    )
)

# Number of samples per batch, shared by all three loaders
batch_size = 32

# Labelled datasets for training/validation, unlabelled one for the test set
train_dataset, valid_dataset = (
    NewsDataset(frame, tokenizer, max_length=512) for frame in (train_data, val_data)
)
test_dataset = NewsDataset(test_df, tokenizer, max_length=512, is_test=True)

# Only the training loader shuffles; evaluation loaders keep row order
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (valid_dataset, test_dataset)
)

## Model

In [None]:
from transformers import XLNetConfig, XLNetForSequenceClassification

# Start from the pretrained XLNet configuration and set the number of labels to 8
_pretrained_name = 'xlnet-base-cased'
config = XLNetConfig.from_pretrained(_pretrained_name)
config.num_labels = 8

# Build the sequence-classification model from the pretrained checkpoint and that config
model = XLNetForSequenceClassification.from_pretrained(_pretrained_name, config=config)

## criterion, optimizer, metric

In [None]:
# AdamW comes from PyTorch: the copy that used to live in `transformers` was
# deprecated in favour of torch.optim.AdamW and is no longer importable from
# recent releases, which the unpinned `pip install transformers` would pull in.
from torch.optim import AdamW
from torch import nn
from sklearn.metrics import f1_score

# Criterion: cross-entropy over the model's class logits
criterion = nn.CrossEntropyLoss()

# Optimizer
learning_rate = 2e-5
weight_decay = 0.01
# eps=1e-6 reproduces the default of the former transformers.AdamW
# (torch.optim.AdamW defaults to 1e-8), keeping the update rule comparable.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay, eps=1e-6)

# Metric
def macro_f1_score(true_labels, predicted_labels):
    """Return the macro-averaged F1 score of the predictions.

    Args:
        true_labels: Ground-truth class labels.
        predicted_labels: Predicted class labels, in the same order.

    Returns:
        The value computed by ``f1_score`` with ``average='macro'``.
    """
    score = f1_score(y_true=true_labels, y_pred=predicted_labels, average='macro')
    return score

## Train loop

In [None]:
import csv
import wandb
from tqdm import tqdm

In [None]:
import wandb
import io
import os

from kaggle_secrets import UserSecretsClient

# Wandb login: the API key is read from Kaggle secrets under the name "wandb_key"
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value_0)
os.environ["WANDB_SILENT"] = "true"

# Initialize Wandb
wandb.init(project='Dacon_GPT', name='xlnet-base-cased')

# Send model and hyperparameters to wandb.
# The values are taken from the variables the optimizer and DataLoaders were
# actually built with, so the logged config cannot drift from the real run.
wandb.watch(model, log="all")
wandb.config.epochs = 4
wandb.config.lr = learning_rate
wandb.config.weight_decay = weight_decay
wandb.config.batch_size = batch_size

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# Fine-tune the model, validating after every epoch and checkpointing the
# weights whenever the validation macro-F1 improves.
best_valid_loss = float('inf')
best_valid_f1 = float('-inf')
patience = 0
early_stopping_patience = 5
# NOTE(review): with the 4 epochs set in wandb.config.epochs and a patience of
# 5, early stopping cannot trigger; lower the patience or raise the epochs.
num_epochs = wandb.config.epochs

for epoch in range(num_epochs):
    # ---- Train ----
    model.train()
    train_loss = 0.0

    with tqdm(train_dataloader, desc=f"Epoch {epoch+1}, Train", leave=False) as train_progress:
        for batch_idx, batch in enumerate(train_progress, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            # Running mean over the batches processed so far
            train_progress.set_postfix({'loss': train_loss / batch_idx})

    # ---- Validate ----
    model.eval()
    valid_loss = 0.0
    # (text, index, predicted_label, true_label) for every validation sample.
    # NOTE(review): after the loop this holds the LAST epoch's predictions,
    # which may differ from those of the checkpointed best epoch.
    validation_results = []

    with torch.no_grad(), tqdm(valid_dataloader, desc=f"Epoch {epoch+1}, Valid", leave=False) as progress_bar:
        for batch_idx, batch in enumerate(progress_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            valid_loss += loss.item()

            # Divide by batches seen, not by the total, so the bar shows a
            # true running mean (same convention as the training bar).
            progress_bar.set_postfix({'loss': valid_loss / batch_idx})

            predicted_labels = torch.argmax(outputs.logits, dim=1)

            # .tolist() turns tensors into plain ints; iterating the collated
            # index tensor directly would store 0-d tensors ("tensor(5)").
            validation_results.extend(
                zip(
                    batch['input_text'],
                    batch['index'].tolist(),
                    predicted_labels.tolist(),
                    labels.tolist(),
                )
            )

    valid_loss /= len(valid_dataloader)
    valid_f1 = macro_f1_score(
        [result[3] for result in validation_results],
        [result[2] for result in validation_results],
    )

    # One wandb step per epoch carrying all metrics together
    wandb.log({
        'epoch': epoch + 1,
        'train_loss': train_loss / len(train_dataloader),
        'valid_loss': valid_loss,
        'valid_f1': valid_f1,
    })
    print(f"Epoch {epoch + 1}, Valid Loss: {valid_loss}, Valid Macro F1: {valid_f1}")

    # Model selection: macro-F1 is the primary criterion, validation loss only
    # breaks ties. Requiring BOTH to improve would skip an epoch whose F1 is
    # better but whose loss is marginally higher.
    improved = valid_f1 > best_valid_f1 or (
        valid_f1 == best_valid_f1 and valid_loss < best_valid_loss
    )
    if improved:
        best_valid_loss = valid_loss
        best_valid_f1 = valid_f1
        torch.save(model.state_dict(), 'best_model_state.bin')
        patience = 0
    else:
        patience += 1
        if patience >= early_stopping_patience:
            break

# Delete cache
torch.cuda.empty_cache()

### validation.csv

In [None]:
# Write the collected validation predictions to a CSV named after the wandb run.
csv_filename = f"validation_results_{wandb.run.name}.csv"
# Explicit UTF-8: the text column is arbitrary news text and must not depend
# on the platform's default encoding.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['text', 'index', 'predicted_label', 'true_label'])
    # int() normalizes tensor / NumPy scalars to plain integers so the file
    # contains "5" rather than a repr such as "tensor(5)".
    csv_writer.writerows(
        (text, int(idx), int(pred_label), int(true_label))
        for text, idx, pred_label, true_label in validation_results
    )

# Log the CSV file to wandb
artifact = wandb.Artifact('validation_results', type='dataset')
artifact.add_file(csv_filename)
wandb.log_artifact(artifact)

## 추론

In [None]:
# Restore the best checkpoint saved during training.
# map_location=device lets a checkpoint written from a GPU model load on a
# CPU-only session too; without it torch.load tries to deserialize onto CUDA.
model.load_state_dict(
    torch.load('/kaggle/working/best_model_state.bin', map_location=device)
)

# Test loop: collect the arg-max class for every test sample, in loader order
model.eval()
test_results = []
with torch.no_grad(), tqdm(test_dataloader, desc="Test", leave=False) as progress_bar:
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        test_results.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Load the sample submission file
submission_df = pd.read_csv('/kaggle/working/sample_submission.csv')

# Predictions are assigned by position, so the counts must match exactly.
# NOTE(review): this also assumes sample_submission.csv lists rows in the
# same order as test.csv — confirm.
if len(test_results) != len(submission_df):
    raise ValueError(
        f"Got {len(test_results)} predictions for {len(submission_df)} submission rows"
    )

# Overwrite the label column with the test_results
submission_df['label'] = test_results

# Save the updated submission file to a CSV file
submission_df.to_csv('submission.csv', index=False)