## 구글 드라이브 파일 다운로드

In [None]:
# Install gdown, the command-line tool the next cell invokes.
!pip install gdown

In [None]:
# Fetch two Google Drive files by their file IDs.
# Presumably these are the train.csv / test.csv read in the next section — confirm.
!gdown --id 14Wpnt6DPX93rtcH0RsvFZ3eFgtoA1K8K
!gdown --id 1AMiOTLuAqRgtZ624TfTHwW7fSSS5GbCh

## 파일 불러오기

In [None]:
import pandas as pd

# Read the training and test CSV files into DataFrames.
train_csv_path = '/kaggle/working/train.csv'
test_csv_path = '/kaggle/working/test.csv'
train_df, test_df = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

## 전처리

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set so the membership test in preprocess_text is a hash lookup.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Matches any single ASCII punctuation character. The characters are escaped
# because inside a regex character class an unescaped "\]" is read as an
# escaped "]", which silently leaves the backslash itself out of the class.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def preprocess_text(text):
    """Normalize *text* for tokenization.

    Steps, in order:
      1. replace every ASCII punctuation character (including the
         backslash) with a space,
      2. lower-case the result,
      3. split on whitespace and drop every word found in the
         module-level ``stop_words`` set.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing is left).
    """
    # Remove special characters
    text = _PUNCTUATION_RE.sub(" ", text)

    # Convert to lower case
    text = text.lower()

    # Remove stopwords; split() with no argument also collapses runs of whitespace
    return " ".join(word for word in text.split() if word not in stop_words)

# Add a cleaned copy of the "text" column to both splits.
for frame in (train_df, test_df):
    frame["preprocessed_text"] = frame["text"].apply(preprocess_text)


## Dataset 클래스

In [None]:
# Install the transformers package that the following cells import from.
!pip install transformers

In [None]:
from torch.utils.data import Dataset
from transformers import BertTokenizer

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Items are addressed by row *position* (``0 .. len(dataframe) - 1``), so
    the dataset works regardless of the DataFrame's index labels (for
    example after filtering rows without resetting the index).

    The tokenizer output for each position is cached in ``encoded_dict`` the
    first time it is requested, so later epochs skip tokenization.

    Args:
        dataframe: Source frame. Must contain the columns
            ``"preprocessed_text"`` and ``"text"``, plus ``"label"`` unless
            ``is_test`` is true.
        tokenizer: Object exposing ``encode_plus``; it is called with
            ``return_tensors='pt'`` and the result is indexed by
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
        is_test: When true, no ``'labels'`` entry is added to the items.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, is_test=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        # position -> cached tokenizer output
        self.encoded_dict = {}

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the item at row position *idx*.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (leading
            batch dimension squeezed away), ``'input_text'`` (the original
            ``"text"`` value), ``'index'`` (``idx`` as passed in) and, unless
            ``is_test`` is set, ``'labels'``.
        """
        if idx not in self.encoded_dict:
            # Positional lookup: label-based .loc would break (or silently
            # pick another row) when the index is not exactly 0..n-1.
            text = self.dataframe["preprocessed_text"].iloc[idx]
            self.encoded_dict[idx] = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                return_attention_mask=True
            )

        encoded = self.encoded_dict[idx]
        item = {
            # squeeze(0) drops the batch dimension added by return_tensors='pt'
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'input_text': self.dataframe["text"].iloc[idx],  # the original, uncleaned text
            'index': idx  # row position of this item
        }

        if not self.is_test:
            item['labels'] = self.dataframe["label"].iloc[idx]

        return item


## Dataset, DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import BertTokenizer

# Seed PyTorch's global RNG so runs are repeatable.
random_seed = 42
torch.manual_seed(random_seed)

# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Wrap both DataFrames; the test split carries no labels.
train_val_dataset = NewsDataset(train_df, tokenizer)
test_dataset = NewsDataset(test_df, tokenizer, is_test=True)

# 80 % of the labelled rows go to training, the remainder to validation.
# The split uses its own seeded generator, independent of the global RNG.
num_labelled = len(train_val_dataset)
train_size = int(0.8 * num_labelled)
valid_size = num_labelled - train_size
split_generator = torch.Generator().manual_seed(random_seed)
train_dataset, valid_dataset = random_split(
    train_val_dataset,
    [train_size, valid_size],
    generator=split_generator,
)

# All loaders use the same per-step batch size; only training data is shuffled.
loader_batch_size = 4
train_dataloader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=loader_batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)


## Model

In [None]:
from transformers import BertForSequenceClassification

# Load the pretrained 'bert-large-uncased' checkpoint with a sequence
# classification head configured for 8 labels.
num_classes = 8
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels=num_classes,
)


## criterion, optimizer, metric

In [None]:
from transformers import AdamW
from torch import nn
from sklearn.metrics import f1_score

# Loss: cross-entropy, applied to the model's logits in the training and validation loops.
criterion = nn.CrossEntropyLoss()

# Optimizer: AdamW over every model parameter.
# These two values are also recorded in wandb.config in the training cell;
# keep them in sync when changing either.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=2e-2,
)

# Metric
def macro_f1_score(true_labels, predicted_labels):
    """Return the macro-averaged F1 score of *predicted_labels* against *true_labels*."""
    score = f1_score(true_labels, predicted_labels, average='macro')
    return score


## Train loop

In [None]:
import csv
import wandb

In [None]:
import tqdm
import torch
import csv
from kaggle_secrets import UserSecretsClient
import io
import os

# Log in to Weights & Biases with the key stored under "wandb_key" in the
# Kaggle secrets, then start a run.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value_0)
os.environ["WANDB_SILENT"] = "true"
wandb.init(project='Dacon_GPT', name='bert_large_uncased_kaggle')

# Register the model with wandb and record the run's hyperparameters.
wandb.watch(model, log="all", log_freq=10)
run_config = wandb.config
run_config.epochs = 3
run_config.lr = 1e-5
run_config.weight_decay = 2e-2
# 16 = DataLoader batch size (4) x gradient accumulation steps (4);
# presumably meant as the effective batch size — confirm.
run_config.batch_size = 16

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Number of batches whose gradients are accumulated before each optimizer update.
gradient_accumulation_steps = 4

# Train and validation loop
#
# Each epoch runs one pass over train_dataloader with gradient accumulation
# (one optimizer update per `gradient_accumulation_steps` batches, plus one
# for a trailing partial group), followed by one pass over valid_dataloader
# that collects per-sample predictions and computes macro F1.
for epoch in range(3):
    # Train loop
    model.train()
    train_loss = 0.0
    num_train_batches = len(train_dataloader)
    # Start the epoch with clean gradients. Gradients are cleared only after
    # an optimizer update; clearing them on every batch would discard what
    # has been accumulated so far.
    optimizer.zero_grad()
    with tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1}, Train", leave=False) as progress_bar:
        for step, batch in enumerate(progress_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            # Scale before backward so the gradient summed over a full
            # accumulation group is the group's mean, not its sum.
            (loss / gradient_accumulation_steps).backward()

            # Update once per full group, and once more at the end of the
            # epoch so a trailing partial group is not dropped.
            if step % gradient_accumulation_steps == 0 or step == num_train_batches:
                optimizer.step()
                optimizer.zero_grad()

            # Log the unscaled per-batch loss.
            train_loss += loss.item()
            wandb.log({'train_loss': loss.item()})
            # Running mean over the batches seen so far.
            progress_bar.set_postfix({'loss': train_loss / step})

    # Validation loop
    model.eval()
    valid_loss = 0.0
    validation_results = []  # (text, index, predicted_label, true_label) per sample
    num_valid_batches = len(valid_dataloader)
    with torch.no_grad(), tqdm.tqdm(valid_dataloader, desc=f"Epoch {epoch+1}, Valid", leave=False) as progress_bar:
        for step, batch in enumerate(progress_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            valid_loss += loss.item()
            wandb.log({'valid_loss': loss.item()})
            progress_bar.set_postfix({'loss': valid_loss / step})

            # Convert everything to plain Python values so the results can be
            # written to CSV as numbers rather than as "tensor(...)" strings.
            true_labels = labels.cpu().tolist()
            predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().tolist()
            sample_indices = [int(i) for i in batch['index']]

            # Store the validation results (text, index, predicted_labels, true_labels)
            validation_results.extend(
                zip(batch['input_text'], sample_indices, predicted_labels, true_labels)
            )

    valid_f1 = macro_f1_score([result[3] for result in validation_results], [result[2] for result in validation_results])
    wandb.log({'valid_f1': valid_f1})

    # Per-batch means; max(..., 1) avoids a ZeroDivisionError on an empty loader.
    mean_train_loss = train_loss / max(num_train_batches, 1)
    mean_valid_loss = valid_loss / max(num_valid_batches, 1)
    print(f"Epoch {epoch + 1}, Train Loss: {mean_train_loss}, Valid Loss: {mean_valid_loss}, Valid Macro F1: {valid_f1}")

# Write the validation predictions collected in the final epoch to CSV:
# one header row followed by one row per sample.
with open('validation_results.csv', 'w', newline='') as csvfile:
    results_writer = csv.writer(csvfile)
    header_row = ['text', 'index', 'predicted_label', 'true_label']
    results_writer.writerows([header_row, *validation_results])

## 추론

In [None]:
# Test loop: predict a label for every test sample without tracking gradients.
model.eval()
test_results = []  # (index, predicted_label) per sample
with torch.no_grad(), tqdm.tqdm(test_dataloader, desc="Test", leave=False) as progress_bar:
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)

        predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().tolist()
        # Convert the batched indices to plain ints; csv.writer stringifies
        # tensor elements as "tensor(n)" otherwise.
        sample_indices = [int(i) for i in batch['index']]

        # Store the test results (index, predicted_labels)
        test_results.extend(zip(sample_indices, predicted_labels))

# Save the test results to a CSV file
# NOTE(review): the 'id' column holds the dataset row position, not a column
# of test_df — confirm this is what the submission format expects.
with open('submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['id', 'label'])
    csv_writer.writerows(test_results)
