## Check .csv files

In [2]:
import pandas as pd
import os

# Folder that holds the CSV files to inspect.
data_dir = '.'


def preview_csv_files(data_dir, nrows=5):
    """Print the first ``nrows`` rows of every CSV file directly inside ``data_dir``.

    Files are visited in sorted name order because ``os.listdir`` returns
    entries in arbitrary order, which would make the printed output differ
    between runs and machines.

    Args:
        data_dir: Directory to scan (not recursive).
        nrows: Number of data rows to read from each file.

    Returns:
        list[str]: Names of the files that were previewed, in print order.
    """
    previewed = []
    for filename in sorted(os.listdir(data_dir)):
        filepath = os.path.join(data_dir, filename)
        # Match the extension case-insensitively and skip anything that is not
        # a regular file (e.g. a directory that happens to end in ".csv").
        if not filename.lower().endswith('.csv') or not os.path.isfile(filepath):
            continue
        print(f"📄 檔案名稱: {filename}")
        df = pd.read_csv(filepath, nrows=nrows)  # read only the first rows
        print(df)
        print("\n" + "="*80 + "\n")
        previewed.append(filename)
    return previewed


previewed_files = preview_csv_files(data_dir)

📄 檔案名稱: sample_submission.csv
                 id  toxic  severe_toxic  obscene  threat  insult  \
0  00001cee341fdb12    0.5           0.5      0.5     0.5     0.5   
1  0000247867823ef7    0.5           0.5      0.5     0.5     0.5   
2  00013b17ad220c46    0.5           0.5      0.5     0.5     0.5   
3  00017563c3f7919a    0.5           0.5      0.5     0.5     0.5   
4  00017695ad8997eb    0.5           0.5      0.5     0.5     0.5   

   identity_hate  
0            0.5  
1            0.5  
2            0.5  
3            0.5  
4            0.5  


📄 檔案名稱: test.csv
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...
2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3  00017563c3f7919a  :If you have a look back at the source, the in...
4  00017695ad8997eb          I don't anonymously edit articles at a

## Install required packages

In [4]:
!pip install transformers scikit-learn pandas

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   ----- ---------------------------------- 1.6/10.7 MB 10.5 MB/s eta 0:00:01
   ------------- -------------------------- 3.7/10.7 MB 10.9 MB/s eta 0:00:01
   ---------------------------- ----------- 7.6/10.7 MB 13.8 MB/s eta 0:00:01
   -------------------------------------- - 10.2/10.7 MB 13.3 MB/s eta 0:00:01
   ---------------------------------------- 10.7/10.7 MB 13.1 MB/s eta 0:00:00
Downloading joblib-1.5.1-py3-n


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Prepare PyTorch Dataset

In [13]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class ToxicCommentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per access and pairs it with its labels.

    Args:
        texts: The comments. Either a pandas Series (read positionally via
            ``.iloc``, so a shuffled index is fine) or any sequence that
            supports integer indexing, such as a list of strings.
        labels: Per-comment label rows, read positionally with ``labels[idx]``
            (e.g. a NumPy array or a list of lists). Must have the same
            length as ``texts``.
        tokenizer_name: Name given to ``BertTokenizer.from_pretrained`` when
            no ``tokenizer`` is supplied.
        max_len: Length every encoded comment is padded or truncated to.
        tokenizer: Optional ready-made tokenizer. Passing one lets several
            datasets share a single tokenizer instead of loading it once per
            dataset.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer_name='bert-base-uncased', max_len=128, tokenizer=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized comment and its labels at position ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` with the leading
            dimension of the tokenizer output squeezed away, and ``'labels'``
            as a float tensor built from ``labels[idx]``.
        """
        texts = self.texts
        # pandas objects need .iloc for positional access (their index may be
        # shuffled after a train/validation split); plain sequences use [].
        raw_text = texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]
        text = str(raw_text)
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # squeeze(0) drops the leading dimension so that the DataLoader
            # can stack individual samples into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Float dtype because the labels are later fed to BCEWithLogitsLoss.
            'labels': torch.tensor(label, dtype=torch.float)
        }

## Create Classifier

In [14]:
import torch
import torch.nn as nn
from transformers import BertModel

class ToxicCommentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer with six outputs.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
    """

    def __init__(self, dropout=0.3):
        super().__init__()
        # Pretrained encoder; its hidden size sets the classifier input width.
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        # One output per label (six labels in total).
        self.out = nn.Linear(hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        """Return raw logits for a batch; no sigmoid is applied here.

        The logits are meant to be fed to ``BCEWithLogitsLoss`` during
        training, so callers that need probabilities apply sigmoid themselves.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is one vector per sample: [batch_size, hidden_size]
        regularised = self.dropout(encoded.pooler_output)
        logits = self.out(regularised)
        return logits

## Data Split

In [15]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the labelled training data. A plain relative path is used because the
# Windows-only ".\\train.csv" spelling is not resolved on Linux or macOS.
df = pd.read_csv("train.csv")
X = df['comment_text'].fillna("")  # missing comments become empty strings
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

print(type(X))
print(type(y))

# Hold out 10% for validation. The fixed seed makes the split (and therefore
# the reported validation loss) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Build the Dataset / DataLoader pairs; only the training data is shuffled.
train_dataset = ToxicCommentDataset(X_train, y_train)
val_dataset = ToxicCommentDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


## Define Training Flow

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np

# 1) Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 2) Build the classifier and move it to that device.
model = ToxicCommentClassifier()
model = model.to(device)

# 3) Loss and optimizer. BCEWithLogitsLoss takes the raw logits the model returns.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)

# 4) Training function
def train_epoch(model, dataloader, optimizer=None, criterion=None, device=None):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. It does not need
            to support ``len()``.
        optimizer: Optimizer to step. Defaults to the module-level ``optimizer``.
        criterion: Loss callable ``criterion(outputs, labels)``. Defaults to
            the module-level ``criterion``.
        device: Device the batch tensors are moved to. Defaults to the
            module-level ``device``.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Fall back to the notebook-level objects so the original two-argument
    # call keeps working. Passing them explicitly avoids stepping a stale
    # optimizer after ``model`` has been rebuilt.
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if criterion is None:
        criterion = globals()["criterion"]
    if device is None:
        device = globals()["device"]

    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches instead of calling len(dataloader) so that an empty
    # loader produces a clear error rather than a bare ZeroDivisionError.
    if num_batches == 0:
        raise ValueError("train_epoch received a dataloader with no batches")
    return total_loss / num_batches

# 5) Validation function
def eval_epoch(model, dataloader, criterion=None, device=None):
    """Evaluate ``model`` over ``dataloader`` without gradients and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. It does not need
            to support ``len()``.
        criterion: Loss callable ``criterion(outputs, labels)``. Defaults to
            the module-level ``criterion``.
        device: Device the batch tensors are moved to. Defaults to the
            module-level ``device``.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    # Fall back to the notebook-level objects so the original two-argument
    # call keeps working.
    if criterion is None:
        criterion = globals()["criterion"]
    if device is None:
        device = globals()["device"]

    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            num_batches += 1

    # Count batches instead of calling len(dataloader) so that an empty
    # loader produces a clear error rather than a bare ZeroDivisionError.
    if num_batches == 0:
        raise ValueError("eval_epoch received a dataloader with no batches")
    return total_loss / num_batches

## Train the model

In [17]:
EPOCHS = 3  # number of passes over the training data; raise it to train longer

# Lowest validation loss seen so far; starts at +inf so the first epoch always wins.
best_val_loss = float('inf')

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    # Train first, then score the updated weights on the held-out split.
    train_loss, val_loss = train_epoch(model, train_loader), eval_epoch(model, val_loader)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val   Loss: {val_loss:.4f}")

    # Checkpoint only when the validation loss improves.
    if not val_loss < best_val_loss:
        continue
    best_val_loss = val_loss
    torch.save(model.state_dict(), "best_model.pt")
    print("✅ Saved best model.")


Epoch 1/3


                                                                                                                                                           

Train Loss: 0.0478
Val   Loss: 0.0395
✅ Saved best model.

Epoch 2/3


                                                                                                                                                           

Train Loss: 0.0341
Val   Loss: 0.0393
✅ Saved best model.

Epoch 3/3


                                                                                                                                                           

Train Loss: 0.0270
Val   Loss: 0.0425




## Predict submission.csv

In [18]:
import pandas as pd

# Load the unlabelled test comments. A plain relative path is used because
# the Windows-only '.\\test.csv' spelling is not resolved on Linux or macOS.
test_df = pd.read_csv('test.csv')
test_texts = test_df['comment_text'].fillna("").reset_index(drop=True)

# Build the dataset and dataloader. The dataset requires labels, so all-zero
# placeholder rows are supplied; the prediction loop never reads them.
# shuffle=False keeps predictions in the same row order as test.csv.
test_dataset = ToxicCommentDataset(test_texts, labels=[[0]*6]*len(test_texts))  # dummy label
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [19]:
import numpy as np

# Rebuild the architecture and restore the best checkpoint saved during training.
model = ToxicCommentClassifier().to(device)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load("best_model.pt", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

all_predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask)  # logits, one column per label
        preds = torch.sigmoid(outputs)              # convert logits to probabilities in [0, 1]
        all_predictions.append(preds.cpu().numpy())

# Stack the per-batch arrays into one array with one row per test sample.
final_preds = np.concatenate(all_predictions, axis=0)

Predicting: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 9573/9573 [16:37<00:00,  9.59it/s]


In [20]:
# Use the sample submission as the template. A plain relative path is used
# because the Windows-only ".\\sample_submission.csv" spelling is not resolved
# on Linux or macOS.
submission_df = pd.read_csv("sample_submission.csv")

# Predictions are in test.csv row order, so the template must list the same
# ids in the same order; otherwise scores would be attached to the wrong rows.
if len(final_preds) != len(submission_df):
    raise ValueError(
        f"got {len(final_preds)} predictions for {len(submission_df)} submission rows"
    )
if not (submission_df['id'].values == test_df['id'].values).all():
    raise ValueError("sample_submission.csv ids do not match the order of test.csv")

# Overwrite every label column with the predicted probabilities.
submission_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = final_preds

# Write the result.
submission_df.to_csv("submission.csv", index=False)
print("✅ submission.csv 已產出，可上傳到 Kaggle！")

✅ submission.csv 已產出，可上傳到 Kaggle！
