## 구글 드라이브 파일 다운로드

In [None]:
!pip install gdown

In [None]:
!gdown --id 14Wpnt6DPX93rtcH0RsvFZ3eFgtoA1K8K
!gdown --id 1AMiOTLuAqRgtZ624TfTHwW7fSSS5GbCh
!gdown --id 1CeRbGPXPm9RCsFe13eyoBe0N-wKw2DLg

## 파일 불러오기

In [None]:
import pandas as pd

# Load train.csv and test.csv (in that order) from the Kaggle working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/working/train.csv', '/kaggle/working/test.csv')
)

## 전처리

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus so that stopwords.words() can be read below.
nltk.download('stopwords')
# English stopwords held in a set; preprocess_text tests membership against it per word.
stop_words = set(stopwords.words('english'))

# Matches any single ASCII punctuation character. re.escape is required because
# string.punctuation contains regex metacharacters; without it the "\]" pair is
# parsed as an escaped bracket and backslashes are silently left in the text.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def preprocess_text(text, stopword_set=None):
    """Normalise one document: strip punctuation, lowercase, drop stopwords.

    Args:
        text: The raw document. ``None`` and NaN (missing cells) are treated
            as an empty document; any other non-string value is converted
            with ``str()``.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words

    if not isinstance(text, str):
        # `text != text` is only true for NaN, which is how pandas marks missing cells.
        text = "" if text is None or text != text else str(text)

    # Replace punctuation with spaces (not the empty string) so that words
    # joined by punctuation, e.g. "a-b", are split into separate tokens.
    text = _PUNCTUATION_RE.sub(" ", text).lower()

    # split() with no argument also collapses the runs of whitespace left behind.
    return " ".join(word for word in text.split() if word not in stopword_set)

# Add a cleaned copy of the raw "text" column to both the train and test frames.
for frame in (train_df, test_df):
    frame["preprocessed_text"] = frame["text"].apply(preprocess_text)

## Dataset 클래스

In [None]:
!pip install transformers

In [None]:
from torch.utils.data import Dataset
from transformers import BertTokenizer

class NewsDataset(Dataset):
    """Map-style dataset that lazily tokenizes rows of a news DataFrame.

    The frame must provide a ``preprocessed_text`` column, plus a ``label``
    column unless ``is_test`` is true. A ``text`` column with the original
    article is optional; when it is absent, ``preprocessed_text`` is returned
    as ``input_text`` instead.

    Rows are addressed by position (0 .. len-1), so the frame's index labels
    do not need to be a fresh ``RangeIndex``.

    Args:
        dataframe: Source pandas DataFrame.
        tokenizer: Object exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        max_length: Sequences are padded/truncated to this many tokens.
        is_test: When true, no ``labels`` entry is added to the items.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        # Tokenizer outputs keyed by row position, filled on first access so
        # each row is tokenized at most once per dataset instance.
        self.encoded_dict = {}

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def _cell(self, idx, column):
        """Return the value of ``column`` at row position ``idx``."""
        return self.dataframe[column].iloc[idx]

    def __getitem__(self, idx):
        """Return the model inputs (and label, unless ``is_test``) for row ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors),
            ``input_text`` (str), ``index`` (int) and, for non-test data,
            ``labels``.
        """
        idx = int(idx)

        if idx not in self.encoded_dict:
            self.encoded_dict[idx] = self.tokenizer.encode_plus(
                self._cell(idx, 'preprocessed_text'),
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                return_attention_mask=True,
                return_token_type_ids=False,  # Remove token_type_ids
            )
        encoded = self.encoded_dict[idx]

        # Frames built from resampled data may only carry the preprocessed
        # text; fall back to it rather than raising KeyError on 'text'.
        if 'text' in self.dataframe.columns:
            text_column = 'text'
        else:
            text_column = 'preprocessed_text'

        item = {
            # return_tensors='pt' adds a leading batch axis; flatten it away
            # so the DataLoader can stack items into (batch, seq) tensors.
            'input_ids': encoded['input_ids'].view(-1),
            'attention_mask': encoded['attention_mask'].view(-1),
            'input_text': self._cell(idx, text_column),
            'index': idx,
        }

        if not self.is_test:
            item['labels'] = self._cell(idx, 'label')

        return item

## Dataset, DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline, Pipeline

import random
import numpy as np

# Set random seeds for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Hold out 20% of the training data for validation, preserving class proportions.
train_data, val_data = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed)

# Reset the index of the new dataframes so row labels equal row positions.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

X_train_text = train_data['text']
X_train_preprocessed = train_data['preprocessed_text']

# TF-IDF features of the preprocessed text. They are NOT fed to BERT (which
# needs real text); they are kept only so these names stay defined.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_preprocessed)
X_val = vectorizer.transform(val_data['preprocessed_text'])

y_train = train_data['label']
y_val = val_data['label']

# Mapping from a train_data row label to its original text.
index_to_text = pd.Series(X_train_text.values, index=train_data.index)

# Combined over- and under-sampling. This pipeline was previously referenced
# without ever being defined, which raised NameError.
resampler = Pipeline([
    ('over', RandomOverSampler(sampling_strategy='auto', random_state=random_seed)),
    ('under', RandomUnderSampler(sampling_strategy='auto', random_state=random_seed)),
])

# Resample row positions instead of feature vectors. The random samplers only
# look at the labels, and working with positions lets every column
# ('text', 'preprocessed_text', 'label') be recovered for each sampled row.
row_positions = np.arange(len(train_data)).reshape(-1, 1)
resampled_positions, y_train_resampled = resampler.fit_resample(row_positions, y_train)
resampled_positions = resampled_positions.ravel()

# TF-IDF rows matching the resampled training set (sparse; never densified).
X_train_resampled = X_train[resampled_positions]

# Resampled training frame with text and labels still aligned row by row.
train_data_resampled = train_data.iloc[resampled_positions].reset_index(drop=True)

batch_size = 16  # Decrease batch size to prevent OOM

train_dataset = NewsDataset(train_data_resampled, tokenizer, max_length=512)
valid_dataset = NewsDataset(val_data, tokenizer, max_length=512)
test_dataset = NewsDataset(test_df, tokenizer, max_length=512, is_test=True)

# Create DataLoaders (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Model

In [None]:
from transformers import BertConfig, BertForSequenceClassification

# Pretrained BERT configuration with the classification head sized for 8 labels.
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=8)

# Pretrained encoder weights plus a sequence-classification head built from `config`.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

## criterion, optimizer, metric

In [None]:
from transformers import AdamW
from torch import nn
from sklearn.metrics import f1_score

# Optimizer hyperparameters
learning_rate = 1e-5
weight_decay = 2e-2

# Criterion: cross-entropy over the class logits
criterion = nn.CrossEntropyLoss()

# Optimizer: AdamW over every model parameter
optimizer = AdamW(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Metric
def macro_f1_score(true_labels, predicted_labels):
    """Return the macro-averaged F1 score of the predictions against the true labels."""
    score = f1_score(y_true=true_labels, y_pred=predicted_labels, average='macro')
    return score

## Train loop

In [None]:
import csv
import wandb
from tqdm import tqdm

In [None]:
import wandb
import io
import os

from kaggle_secrets import UserSecretsClient

# Log in to Weights & Biases with the API key stored as the Kaggle secret "wandb_key".
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value_0)
os.environ["WANDB_SILENT"] = "true"

# Start the run.
wandb.init(project='Dacon_GPT', name='bert-base')

# Track the model and record the hyperparameters on the run config.
wandb.watch(model, log="all")
wandb.config.update({
    'epochs': 3,
    'lr': 1e-5,
    'weight_decay': 2e-2,
    'batch_size': 16,
})

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

def _train_one_epoch(model, dataloader, epoch):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss."""
    model.train()
    running_loss = 0.0
    optimizer.zero_grad()  # Start the epoch with clean gradients

    with tqdm(dataloader, desc=f"Epoch {epoch+1}, Train", leave=False) as progress:
        for batch_idx, batch in enumerate(progress):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            running_loss += loss.item()
            progress.set_postfix({'loss': running_loss / (batch_idx + 1)})

    return running_loss / max(len(dataloader), 1)


def _evaluate(model, dataloader, epoch):
    """Score ``model`` on ``dataloader``; return (mean loss, macro F1, per-sample results)."""
    model.eval()
    total_loss = 0.0
    results = []

    with torch.no_grad(), tqdm(dataloader, desc=f"Epoch {epoch+1}, Valid", leave=False) as progress:
        for batch_idx, batch in enumerate(progress):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            total_loss += criterion(logits, labels).item()

            # Running mean over the batches seen so far (not over the full
            # loader length, which under-reports the loss until the last batch).
            progress.set_postfix({'loss': total_loss / (batch_idx + 1)})

            # Convert everything to plain Python values so the results can be
            # written to CSV without "tensor(...)" wrappers.
            predicted = torch.argmax(logits, dim=1).tolist()
            truths = labels.tolist()
            indices = torch.as_tensor(batch['index']).tolist()

            for text, idx, pred_label, true_label in zip(batch['input_text'], indices, predicted, truths):
                results.append({"text": text, "index": idx, "predicted_label": pred_label, "true_label": true_label})

    mean_loss = total_loss / max(len(dataloader), 1)
    f1 = macro_f1_score(
        [row['true_label'] for row in results],
        [row['predicted_label'] for row in results],
    )
    return mean_loss, f1, results


def cross_val_train(model, train_dataloader, valid_dataloader, epochs=3, early_stopping_patience=5):
    """Fine-tune ``model`` and validate it after every epoch.

    Relies on the module-level ``optimizer``, ``criterion`` and ``device``;
    the caller must create the optimizer for this ``model`` before calling.
    Train loss, validation loss and macro F1 are logged to wandb once per
    epoch.

    Training stops early once ``early_stopping_patience`` epochs in a row fail
    to improve BOTH the validation loss and the macro F1 over the best values
    seen so far.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        train_dataloader: Batches with ``input_ids``, ``attention_mask``, ``labels``.
        valid_dataloader: Same keys plus ``input_text`` and ``index``.
        epochs: Maximum number of epochs; must be at least 1.
        early_stopping_patience: Non-improving epochs tolerated before stopping.

    Returns:
        Tuple ``(valid_f1, validation_results, model)`` for the LAST epoch that
        ran: its macro F1, a list of dicts (``text``, ``index``,
        ``predicted_label``, ``true_label``) and the model, left in training
        mode with the weights of that last epoch.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    best_valid_loss = float('inf')
    best_valid_f1 = float('-inf')
    patience = 0

    for epoch in range(epochs):
        train_loss = _train_one_epoch(model, train_dataloader, epoch)
        valid_loss, valid_f1, validation_results = _evaluate(model, valid_dataloader, epoch)

        # One log call per epoch keeps all metrics on the same wandb step.
        wandb.log({'train_loss': train_loss, 'valid_loss': valid_loss, 'valid_f1': valid_f1, 'epoch': epoch+1})
        print(f"Epoch {epoch + 1}, Valid Loss: {valid_loss}, Valid Macro F1: {valid_f1}")

        if valid_loss < best_valid_loss and valid_f1 > best_valid_f1:
            best_valid_loss = valid_loss
            best_valid_f1 = valid_f1
            patience = 0
        else:
            patience += 1
            if patience >= early_stopping_patience:
                break

    # Ensure the model is back in training mode
    model.train()

    # Release cached GPU blocks that are no longer referenced
    torch.cuda.empty_cache()

    return valid_f1, validation_results, model

In [None]:
# Stratified K-Fold Cross Validation
n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
valid_f1_scores = []

trained_models = []  # One fine-tuned model per fold

for fold_idx, (train_index, valid_index) in enumerate(skf.split(train_df["preprocessed_text"], train_df["label"])):
    train_data = train_df.iloc[train_index].reset_index(drop=True)
    valid_data = train_df.iloc[valid_index].reset_index(drop=True)

    # Apply combined over and under sampling to the training data
    over_sampler = RandomOverSampler(sampling_strategy='auto', random_state=random_seed)
    under_sampler = RandomUnderSampler(sampling_strategy='auto', random_state=random_seed)
    pipeline = Pipeline([('over', over_sampler), ('under', under_sampler)])

    # Resample row positions rather than the text itself, so the resampled
    # frame keeps every column (NewsDataset also reads the original 'text').
    row_positions = np.arange(len(train_data)).reshape(-1, 1)
    resampled_positions, _ = pipeline.fit_resample(row_positions, train_data["label"])
    train_resampled_data = train_data.iloc[resampled_positions.ravel()].reset_index(drop=True)

    train_dataset = NewsDataset(train_resampled_data, tokenizer, max_length=512)
    valid_dataset = NewsDataset(valid_data, tokenizer, max_length=512)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # Park the previous fold's model on the CPU before allocating a new one,
    # so GPU memory does not grow with the number of folds. The most recent
    # model stays on `device` after the loop ends.
    if trained_models:
        trained_models[-1].to('cpu')
        torch.cuda.empty_cache()

    # Reset model, criterion, and optimizer for each fold
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=wandb.config.lr, weight_decay=wandb.config.weight_decay)

    valid_f1, fold_validation_results, trained_model = cross_val_train(model, train_dataloader, valid_dataloader)
    trained_models.append(trained_model)

    # Save fold validation results to a CSV file. The encoding is explicit
    # because the article text may contain non-ASCII characters.
    csv_filename = f"validation_results_{wandb.run.name}_fold_{fold_idx+1}.csv"
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['text', 'index', 'predicted_label', 'true_label'])
        # int() unwraps 0-d tensors / numpy scalars so plain numbers are written.
        csv_writer.writerows(
            [result['text'], int(result['index']), int(result['predicted_label']), int(result['true_label'])]
            for result in fold_validation_results
        )

    # Log the CSV file to wandb
    artifact = wandb.Artifact(f'validation_results_fold_{fold_idx+1}', type='dataset')
    artifact.add_file(csv_filename)
    wandb.log_artifact(artifact)

    valid_f1_scores.append(valid_f1)

    # Weights as they are when cross_val_train returns for this fold.
    torch.save(trained_model.state_dict(), f'best_model_state_fold_{fold_idx+1}.bin')

print("Validation F1 scores:", valid_f1_scores)
print("Mean validation F1 score:", np.mean(valid_f1_scores))

## 추론

In [None]:
import numpy as np
from collections import Counter

n_folds = 4  # Change this to the number of folds used in cross-validation
model_weights = [f"/kaggle/working/best_model_state_fold_{i+1}.bin" for i in range(n_folds)]
model_weight_values = [0.2, 0.3, 0.4, 0.1]  # Assign weights for each model (ensure they sum to 1)

if len(model_weight_values) != n_folds:
    raise ValueError("model_weight_values must contain exactly one weight per fold")

# Test loop: collect each fold's class probabilities for the whole test set.
ensemble_predictions = []
model.to(device)
for weights in model_weights:
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine too.
    model.load_state_dict(torch.load(weights, map_location=device))
    model.eval()

    fold_test_results = []
    with torch.no_grad(), tqdm(test_dataloader, desc="Test", leave=False) as progress_bar:
        for batch in progress_bar:
            input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            fold_test_results.append(probabilities)

    # Join the per-batch arrays into one (num_samples, num_classes) array.
    # Stacking the batches directly would fail on a shorter final batch and
    # would leave a batch axis for the argmax calls below to reduce over.
    ensemble_predictions.append(np.concatenate(fold_test_results, axis=0))

# Shape: (num_samples, num_classes, n_folds)
ensemble_probabilities = np.stack(ensemble_predictions, axis=-1)

# Soft ensemble: average the probabilities over folds, then pick the best class.
soft_ensemble_probabilities = np.mean(ensemble_probabilities, axis=-1)
soft_ensemble_labels = np.argmax(soft_ensemble_probabilities, axis=1).tolist()

# Hard ensemble: every fold votes for its own top class; the majority wins.
hard_ensemble_labels = []
for sample_probs in ensemble_probabilities:
    # sample_probs is (num_classes, n_folds); reduce over classes (axis 0)
    # to get one predicted class per fold.
    sample_votes = np.argmax(sample_probs, axis=0).tolist()
    # Counter keeps first-seen order for equal counts, so ties go to the
    # class voted for by the earliest fold.
    most_common_label = Counter(sample_votes).most_common(1)[0][0]
    hard_ensemble_labels.append(most_common_label)

# Weighted ensemble: weighted sum of the probabilities over the fold axis.
weighted_ensemble_probabilities = np.tensordot(ensemble_probabilities, np.asarray(model_weight_values), axes=([2], [0]))
weighted_ensemble_labels = np.argmax(weighted_ensemble_probabilities, axis=1).tolist()

# Load the sample submission file
submission_df = pd.read_csv('/kaggle/working/sample_submission.csv')

# Save the soft ensemble results
submission_df['label'] = soft_ensemble_labels
submission_df.to_csv('submission_soft_ensemble.csv', index=False)

# Save the hard ensemble results
submission_df['label'] = hard_ensemble_labels
submission_df.to_csv('submission_hard_ensemble.csv', index=False)

# Save the weighted ensemble results
submission_df['label'] = weighted_ensemble_labels
submission_df.to_csv('submission_weighted_ensemble.csv', index=False)