## 구글 드라이브 파일 다운로드

In [None]:
!pip install gdown

In [None]:
!gdown --id 14Wpnt6DPX93rtcH0RsvFZ3eFgtoA1K8K
!gdown --id 1AMiOTLuAqRgtZ624TfTHwW7fSSS5GbCh
!gdown --id 1CeRbGPXPm9RCsFe13eyoBe0N-wKw2DLg

## 파일 불러오기

In [None]:
import pandas as pd

# Read the train and test splits (in that order) from the Kaggle working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/working/train.csv', '/kaggle/working/test.csv')
)

## 전처리

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus and keep the English list as a set,
# which the preprocessing function uses for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Cloze-style prompt templates; "{mask}" marks the slot to be predicted.
patterns = [
    "This article is about {mask} news.",
    "The topic discussed in this article is {mask}.",
    "The main subject of the article is {mask}.",
    "The article primarily focuses on {mask} news.",
]

# Class id -> label word; ids follow the position of each word in the list.
verbalizers = dict(enumerate([
    "Sci/Tech",
    "Sports",
    "Business",
    "World",
    "Politics",
    "ESG",
    "Health",
    "Entertainment",
]))

def preprocess_text(text, pattern, stopwords_to_remove=None):
    """Clean a raw article and prepend a cloze prompt containing ``[MASK]``.

    Every ASCII punctuation character in ``text`` is replaced by a space, the
    text is lower-cased, and stop words are dropped. The prompt is built from
    ``pattern`` by substituting ``[MASK]`` for the ``{mask}`` placeholder and
    is placed in front of the cleaned text, separated by a single space.

    Args:
        text: Raw article text.
        pattern: Prompt template containing the ``{mask}`` placeholder.
        stopwords_to_remove: Optional collection of lower-case words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The prompt followed by the cleaned text.
    """
    if stopwords_to_remove is None:
        stopwords_to_remove = stop_words

    # re.escape is required here: left unescaped, the "\]" inside
    # string.punctuation is parsed as an escaped bracket, so backslashes
    # were silently left in the text.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)

    text = text.lower()

    kept_words = [word for word in text.split() if word not in stopwords_to_remove]

    prompt = pattern.replace("{mask}", "[MASK]")
    return prompt + " " + " ".join(kept_words)

# Add one prompt-prefixed text column per pattern to both splits.
# The column name embeds the full pattern string.
for pattern in patterns:
    column_name = f"preprocessed_text_{pattern}"
    for frame in (train_df, test_df):
        frame[column_name] = frame["text"].apply(preprocess_text, args=(pattern,))

## Dataset 클래스

In [None]:
!pip install transformers

In [None]:
from torch.utils.data import Dataset
from transformers import BertTokenizer

class NewsDataset(Dataset):
    """Map-style dataset that tokenises the prompt-prefixed text of one pattern.

    Rows are addressed by *position* (0 .. len-1), so the dataset works no
    matter what index the DataFrame carries.

    Args:
        dataframe: Frame with a ``text`` column, a
            ``preprocessed_text_<pattern>`` column and, unless ``is_test`` is
            true, a ``label`` column.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        pattern: Pattern string selecting which preprocessed column to read.
        max_length: Length every sequence is padded or truncated to.
        is_test: When true, no ``labels`` entry is added to the items.
    """

    def __init__(self, dataframe, tokenizer, pattern, max_length=512, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.pattern = pattern
        self.max_length = max_length
        self.is_test = is_test
        # Tokenizer output memoised per row position, so each row is
        # tokenised only once across epochs.
        self.encoded_dict = {}

    def __len__(self):
        return len(self.dataframe)

    def _cell(self, idx, column):
        """Return the value of ``column`` at row position ``idx``."""
        # .iloc is positional; .loc would look ``idx`` up as an index label
        # and fail on frames whose index is not 0..n-1.
        return self.dataframe[column].iloc[idx]

    def __getitem__(self, idx):
        """Return the encoded inputs, original text and position of row ``idx``."""
        if idx not in self.encoded_dict:
            self.encoded_dict[idx] = self.tokenizer.encode_plus(
                self._cell(idx, f"preprocessed_text_{self.pattern}"),
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                return_attention_mask=True,
                return_token_type_ids=False,  # Remove token_type_ids
            )
        encoding = self.encoded_dict[idx]

        item = {
            # Flatten the tokenizer output to 1-D tensors.
            'input_ids': encoding['input_ids'].view(-1),
            'attention_mask': encoding['attention_mask'].view(-1),
            'input_text': self._cell(idx, 'text'),  # original, unprocessed text
            'index': idx,  # row position, used to trace predictions back
        }

        if not self.is_test:
            item['labels'] = self._cell(idx, 'label')

        return item

## Dataset, DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

import random
import numpy as np

# Seed Python, NumPy and PyTorch (CPU and, when present, every GPU).
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Tokenizer shared by every dataset below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Stratified 80/20 train/validation split; each part gets a fresh 0..n-1 index.
train_data, val_data = (
    split.reset_index(drop=True)
    for split in train_test_split(
        train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed
    )
)

batch_size = 16  # Decrease batch size to prevent OOM

# One dataloader per pattern and per split, keyed by the pattern string.
train_dataloaders, valid_dataloaders, test_dataloaders = {}, {}, {}
for pattern in patterns:
    train_dataset = NewsDataset(train_data, tokenizer, pattern, max_length=512)
    valid_dataset = NewsDataset(val_data, tokenizer, pattern, max_length=512)
    test_dataset = NewsDataset(test_df, tokenizer, pattern, max_length=512, is_test=True)

    # Only the training loader shuffles.
    train_dataloaders[pattern] = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloaders[pattern] = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    test_dataloaders[pattern] = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Model

In [None]:
from transformers import BertConfig, BertForMaskedLM
from transformers import AdamW
from torch import nn
from sklearn.metrics import f1_score

config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = 8
config.mask_token_id = tokenizer.mask_token_id  # read by the model's forward() to locate [MASK]

# Criterion
criterion = nn.CrossEntropyLoss()

# Optimizer hyperparameters
learning_rate = 1e-5
weight_decay = 2e-2
# The optimizer itself is intentionally not built here: `model` is only
# instantiated after the class definition further down, so calling
# AdamW(model.parameters(), ...) at this point raised NameError. The training
# cell creates a fresh AdamW per pattern from the two values above.

# Metric
def macro_f1_score(true_labels, predicted_labels):
    """Return the macro-averaged F1 of ``predicted_labels`` against ``true_labels``."""
    score = f1_score(y_true=true_labels, y_pred=predicted_labels, average='macro')
    return score

class BertForMaskedLMMultiClassification(BertForMaskedLM):
    """Prompt-based classifier that scores label words at the ``[MASK]`` slot.

    The encoder output at the ``[MASK]`` position is passed through the
    masked-LM head, and the resulting vocabulary logits are reduced to one
    score per class using the verbalizer.

    Args:
        config: BERT configuration. ``config.mask_token_id`` and
            ``config.num_labels`` are read in ``forward``.
        verbalizer: Mapping ``class id -> label``. A label is either a
            vocabulary id (``int``) or a label word (``str``). Output columns
            follow the mapping's iteration order.
        tokenizer: Tokenizer used to turn string labels into vocabulary ids.
            Only needed for string labels; when omitted, a ``BertTokenizer``
            is loaded from ``config.name_or_path``.
    """

    def __init__(self, config, verbalizer, tokenizer=None):
        super().__init__(config)
        self.verbalizer = verbalizer
        # One list of vocabulary ids per class. Kept as plain Python lists;
        # forward() indexes the logits with them directly.
        self.verbalizer_token_ids = self._resolve_verbalizer(verbalizer, tokenizer, config)

    @staticmethod
    def _resolve_verbalizer(verbalizer, tokenizer, config):
        """Convert every verbalizer label into a list of vocabulary ids."""
        resolved = []
        for label in verbalizer.values():
            if isinstance(label, int):
                resolved.append([label])
                continue

            if tokenizer is None:
                # Local import: the enclosing cell only imports the config
                # and model classes from transformers.
                from transformers import BertTokenizer

                checkpoint = getattr(config, "name_or_path", "")
                if not checkpoint:
                    raise ValueError(
                        "verbalizer contains label words; pass `tokenizer` so "
                        "they can be converted to vocabulary ids"
                    )
                tokenizer = BertTokenizer.from_pretrained(checkpoint)

            token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(label))
            if not token_ids:
                raise ValueError(f"verbalizer label {label!r} produced no tokens")
            resolved.append(token_ids)
        return resolved

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Score every class at the first ``[MASK]`` position of each sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional attention mask forwarded to the encoder.
            labels: Optional class ids; when given, a cross-entropy loss is
                computed as well.

        Returns:
            ``verbalizer_logits`` with one column per verbalizer entry when
            ``labels`` is None, otherwise the tuple
            ``(loss, verbalizer_logits)``.

        Raises:
            ValueError: If any sequence contains no ``[MASK]`` token.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Locate exactly one [MASK] per row. Collecting every match with
        # nonzero() misaligns rows as soon as a sequence has zero or several
        # masks, so the first occurrence per row is taken instead.
        is_mask = input_ids == self.config.mask_token_id
        if not bool(is_mask.any(dim=1).all()):
            raise ValueError("every input sequence must contain a [MASK] token")
        mask_position = is_mask.int().argmax(dim=1)

        # Extract the hidden state at the [MASK] position of each row.
        batch_rows = torch.arange(input_ids.shape[0], device=input_ids.device)
        mask_embeddings = last_hidden_state[batch_rows, mask_position]

        # Vocabulary logits from the masked-LM head.
        logits = self.cls(mask_embeddings)

        # One score per class: the mean logit over that label's token ids
        # (a single-token label contributes just its own logit).
        verbalizer_logits = torch.stack(
            [logits[:, token_ids].mean(dim=1) for token_ids in self.verbalizer_token_ids],
            dim=1,
        )

        if labels is None:
            return verbalizer_logits

        # Computed locally so the model does not depend on a notebook-level
        # `criterion` variable; the defaults match nn.CrossEntropyLoss().
        loss = nn.functional.cross_entropy(
            verbalizer_logits.view(-1, self.config.num_labels), labels.view(-1)
        )
        return loss, verbalizer_logits

# Build the prompt classifier from the pretrained BERT checkpoint; the extra
# `verbalizer` keyword is handed to the model class's constructor.
model = BertForMaskedLMMultiClassification.from_pretrained(
    'bert-base-uncased', config=config, verbalizer=verbalizers
)

## Train loop

In [None]:
import csv
import wandb
from tqdm import tqdm

In [None]:
import wandb
import io
import os

from kaggle_secrets import UserSecretsClient

# Log in to Weights & Biases with the API key stored in Kaggle secrets.
wandb.login(key=UserSecretsClient().get_secret("wandb_key"))
os.environ["WANDB_SILENT"] = "true"

# Start the run that all subsequent wandb.log calls report to.
wandb.init(project='Dacon_GPT', name='bert-base')

# Track the model and record the hyperparameters on the run.
wandb.watch(model, log="all")
wandb.config.epochs = 3
wandb.config.lr = 1e-5
wandb.config.weight_decay = 2e-2
wandb.config.batch_size = 16

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
def train_and_validate(train_dataloader, valid_dataloader, model, criterion, optimizer, pattern, epochs=3):
    """Fine-tune ``model`` on one prompt pattern, validating after every epoch.

    Batches are moved to the notebook-level ``device``. Losses, the macro F1
    and the pattern are logged to the active wandb run.

    Args:
        train_dataloader: Yields dicts with ``input_ids``, ``attention_mask``
            and ``labels``.
        valid_dataloader: Same as above, plus ``input_text`` and ``index``.
        model: Called as ``model(input_ids, attention_mask=...)``; may return
            either a logits tensor or an object exposing ``.logits``.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        pattern: Pattern string, used only for logging and result rows.
        epochs: Number of passes over the training data.

    Returns:
        ``(valid_f1, validation_results)`` of the final epoch, where
        ``validation_results`` is a list of dicts with the keys ``text``,
        ``index``, ``predicted_label``, ``true_label`` and ``pattern``.
        ``(None, [])`` when ``epochs`` is 0.
    """

    def _logits_of(outputs):
        # The model in this notebook returns a bare tensor when no labels are
        # passed; Hugging Face output objects carry the tensor in `.logits`.
        return getattr(outputs, 'logits', outputs)

    wandb.log({'pattern': pattern})  # Log the pattern

    # Defined up front so the return statement is valid even with epochs == 0.
    valid_f1, validation_results = None, []

    for epoch in range(epochs):
        # ---- Train ----
        model.train()
        train_loss = 0.0
        optimizer.zero_grad()

        with tqdm(train_dataloader, desc=f"Epoch {epoch+1}, Train", leave=False) as train_progress:
            for batch_idx, batch in enumerate(train_progress):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                logits = _logits_of(model(input_ids, attention_mask=attention_mask))
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                train_loss += loss.item()
                train_progress.set_postfix({'loss': train_loss / (batch_idx + 1)})

        wandb.log({'train_loss': train_loss / max(len(train_dataloader), 1), 'pattern': pattern})

        # ---- Validate ----
        model.eval()
        valid_loss = 0.0
        validation_results = []  # only the latest epoch's rows are kept

        with torch.no_grad(), tqdm(valid_dataloader, desc=f"Epoch {epoch+1}, Valid", leave=False) as valid_progress:
            for batch_idx, batch in enumerate(valid_progress):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                logits = _logits_of(model(input_ids, attention_mask=attention_mask))
                valid_loss += criterion(logits, labels).item()

                # Running mean over the batches seen so far.
                valid_progress.set_postfix({'loss': valid_loss / (batch_idx + 1)})

                predicted_labels = torch.argmax(logits, dim=1).cpu().tolist()
                true_labels = labels.cpu().tolist()

                for text, idx, pred_label, true_label in zip(
                    batch['input_text'], batch['index'], predicted_labels, true_labels
                ):
                    validation_results.append({
                        "text": text,
                        # int() unwraps the 0-d tensors produced by batching.
                        "index": int(idx),
                        "predicted_label": pred_label,
                        "true_label": true_label,
                        "pattern": pattern,
                    })

        valid_loss /= max(len(valid_dataloader), 1)
        valid_f1 = macro_f1_score(
            [row['true_label'] for row in validation_results],
            [row['predicted_label'] for row in validation_results],
        )
        wandb.log({'valid_loss': valid_loss, 'valid_f1': valid_f1, 'epoch': epoch+1, 'pattern': pattern})

        print(f"Epoch {epoch + 1}, Valid Loss: {valid_loss}, Valid Macro F1: {valid_f1}")

    # Delete cache
    torch.cuda.empty_cache()

    return valid_f1, validation_results

In [None]:
# Prompt patterns to train on. They must be the exact strings used when the
# dataloaders were built, because the dataloader dicts are keyed by pattern.
patterns = [
    "This article is about {mask} news.",
    "The topic discussed in this article is {mask}.",
    "The main subject of the article is {mask}.",
    "The article primarily focuses on {mask} news."
]

# Train one independent model per pattern and collect its validation rows.
all_pattern_results = []
for pattern in patterns:
    # Fresh model for each pattern
    model = BertForMaskedLMMultiClassification.from_pretrained(
        'bert-base-uncased',
        config=config,
        verbalizer=verbalizers)
    model.to(device)

    # Fresh optimizer bound to the new model's parameters
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    train_dataloader = train_dataloaders[pattern]
    valid_dataloader = valid_dataloaders[pattern]

    valid_f1, pattern_validation_results = train_and_validate(train_dataloader, valid_dataloader, model, criterion, optimizer, pattern)
    all_pattern_results.extend(pattern_validation_results)

    # Drop the references before the next model is created
    del model
    del optimizer
    torch.cuda.empty_cache()

# Save the validation rows of every pattern to one CSV file.
csv_filename = f"validation_results_{wandb.run.name}.csv"
csv_columns = ['text', 'index', 'predicted_label', 'true_label', 'pattern']
# Explicit UTF-8 so article text is written the same way on every platform.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=csv_columns, extrasaction='ignore')
    csv_writer.writeheader()
    csv_writer.writerows(
        # int() unwraps batched tensor indices, which would otherwise be
        # written as "tensor(n)".
        {**result, 'index': int(result['index'])}
        for result in all_pattern_results
    )

# Log the CSV file to wandb
artifact = wandb.Artifact('validation_results', type='dataset')
artifact.add_file(csv_filename)
wandb.log_artifact(artifact)

## 추론

In [None]:
import numpy as np
from collections import Counter
from tqdm import tqdm

# BertForSequenceClassification is not imported by any visible cell, so it
# is imported here to keep this cell runnable.
from transformers import BertForSequenceClassification

n_models = 4  # Change this to the number of models used
# NOTE(review): no visible cell writes these checkpoint files — confirm where
# they are produced before running inference.
model_weights = [f"best_model_state_weight_combination_{i+1}.bin" for i in range(n_models)]
model_weight_values = [0.25, 0.35, 0.25, 0.15]  # Weights for the weighted ensemble, one per model

if len(model_weight_values) != n_models:
    raise ValueError("model_weight_values must contain exactly one weight per model")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config).to(device)

def generate_predictions(test_dataloader):
    """Return class probabilities on the test set, one array per checkpoint.

    Each file listed in the notebook-level ``model_weights`` is loaded into
    the notebook-level ``model``, which is then run over ``test_dataloader``.

    Args:
        test_dataloader: Yields dicts with ``input_ids`` and
            ``attention_mask``.

    Returns:
        A list with one array per checkpoint, each holding the softmax
        probabilities of all batches concatenated along axis 0.
    """
    model_predictions = []

    for i, model_weight in enumerate(tqdm(model_weights, desc="Loading model weights")):
        # map_location lets a checkpoint saved on GPU load on the current
        # device (e.g. a CPU-only session).
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        state_dict = torch.load(model_weight, map_location=device)
        model.load_state_dict(state_dict)
        model.eval()

        batch_probabilities = []
        with torch.no_grad():
            for batch in tqdm(test_dataloader, desc=f"Predicting with model {i+1}"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                # outputs[0] holds the logits because no labels are passed.
                probabilities = torch.softmax(outputs[0], dim=1).cpu().numpy()
                batch_probabilities.append(probabilities)

        model_predictions.append(np.concatenate(batch_probabilities, axis=0))

    return model_predictions

# `test_dataloader` was never defined; only the per-pattern `test_dataloaders`
# dict exists, so one of its loaders has to be chosen.
# NOTE(review): the first pattern's loader is used — confirm which input
# format the sequence-classification checkpoints were trained on.
test_dataloader = test_dataloaders[patterns[0]]

# Generate predictions: one probability array per checkpoint
model_predictions = generate_predictions(test_dataloader)

# Soft ensemble: unweighted mean of the probabilities
soft_ensemble_probabilities = np.mean(model_predictions, axis=0)
soft_ensemble_labels = np.argmax(soft_ensemble_probabilities, axis=1).tolist()

# Hard ensemble: majority vote over each model's argmax. On a tie, the label
# voted for by the earliest model wins.
votes_per_sample = np.stack([probs.argmax(axis=1) for probs in model_predictions], axis=1)
hard_ensemble_labels = [
    Counter(sample_votes.tolist()).most_common(1)[0][0]
    for sample_votes in votes_per_sample
]

# Weighted ensemble: mean of the probabilities weighted by model_weight_values
weighted_ensemble_probabilities = np.average(model_predictions, axis=0, weights=model_weight_values)
weighted_ensemble_labels = np.argmax(weighted_ensemble_probabilities, axis=1).tolist()

# Load the sample submission file
submission_df = pd.read_csv('/kaggle/working/sample_submission.csv')

# Write one submission file per ensembling strategy.
for ensemble_name, ensemble_labels in (
    ('soft', soft_ensemble_labels),
    ('hard', hard_ensemble_labels),
    ('weighted', weighted_ensemble_labels),
):
    submission_df['label'] = np.array(ensemble_labels)
    submission_df.to_csv(f'submission_{ensemble_name}_ensemble.csv', index=False)