In [None]:
# %% [markdown]
# # Notebook Identifikasi Penulis Horor
#
# Notebook ini menggunakan model BERT untuk melakukan klasifikasi kutipan horor ke dalam tiga kelas:
# - **EAP**: Edgar Allan Poe
# - **HPL**: H.P. Lovecraft
# - **MWS**: Mary Shelley
#
# Langkah-langkah utama yang dilakukan:
# 1. Pemuatan dan eksplorasi data
# 2. Pra-pemrosesan dengan tokenizer BERT
# 3. Fine-tuning model pre-trained BERT dengan Trainer API
# 4. Evaluasi model pada hold-out validation set (train/validation split 90:10, bukan validasi silang k-fold)
# 5. Prediksi pada data test dan pembuatan file submission

# %%
# Import libraries yang diperlukan
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Menggunakan device:", device)

# %%
# Load the data
# The CSV files are read from sub-folders of the working directory:
# ./train/train.csv, ./test/test.csv and ./sample_submission/sample_submission.csv
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./train/train.csv",
        "./test/test.csv",
        "./sample_submission/sample_submission.csv",
    )
)

# Quick sanity check: row counts and a preview of the training frame
print("Jumlah data train:", len(train_df))
print("Jumlah data test:", len(test_df))
print("Contoh data train:")
print(train_df.head())

# %%
# Encode the author names as integer class ids: EAP -> 0, HPL -> 1, MWS -> 2
label2id = {author: class_id for class_id, author in enumerate(("EAP", "HPL", "MWS"))}
# Inverse lookup, from class id back to author name
id2label = {class_id: author for author, class_id in label2id.items()}
# Add the numeric target column next to the original "author" column
train_df["label"] = train_df["author"].map(label2id)

# %%
# Hold out 10% of the training rows for validation (90:10 split),
# stratified on the numeric label and seeded for reproducibility
train_data, val_data = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df["label"],
    random_state=42,
)
print("Train split:", len(train_data))
print("Validation split:", len(val_data))

# %%
# Convert each pandas frame to a Hugging Face Dataset, resetting the index first
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_data, val_data, test_df)
)

# %%
# Use a pre-trained model and its matching tokenizer
model_name = "bert-base-uncased"  # or another checkpoint such as roberta-base
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Truncation is enabled and padding is disabled here; padding is applied
    later, per batch.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding=False, truncation=True)

# Tokenize all three datasets in batched mode (train, then validation, then test)
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# %%
# Prepare the datasets for the Trainer: drop the raw author column and have the
# datasets return torch tensors.
# Only "author" is dropped; "label" must stay because it is the training target.
columns_to_remove = ["author"]
train_dataset = train_dataset.remove_columns(columns_to_remove).with_format("torch")
val_dataset = val_dataset.remove_columns(columns_to_remove).with_format("torch")
# The test data has no author/label columns, so only the output format changes
test_dataset = test_dataset.with_format("torch")

# %%
# Build the sequence classifier with three output labels and the
# author <-> id mappings stored in its config
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=3,
)
# Move the model to the selected device
model.to(device)

# %%
# Data collator for dynamic (per-batch) padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# %%
# Evaluation function: computes log loss and accuracy
def compute_metrics(eval_pred):
    """Compute multi-class log loss and accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The logits are turned into
            class probabilities with a softmax along axis 1.

    Returns:
        dict with the keys ``"log_loss"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    probabilities = torch.softmax(torch.tensor(logits), dim=1).numpy()
    # The hard prediction is the class with the highest probability
    hard_predictions = probabilities.argmax(axis=1)
    return {
        "log_loss": log_loss(labels, probabilities, labels=[0, 1, 2]),
        "accuracy": accuracy_score(labels, hard_predictions),
    }

# %%
# Define the TrainingArguments.
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy`; newer transformers releases reject the old name with a
# TypeError, so use whichever name the installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    eval_steps=500,
    save_steps=500,  # checkpoints are written on the same 500-step cadence as evaluation
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # Reload the checkpoint with the lowest validation log loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="log_loss",
    greater_is_better=False,
    seed=42,
    **{_eval_strategy_kwarg: "steps"},
)

# %%
# Build the Trainer object.
# Newer transformers releases take the tokenizer through `processing_class`
# instead of `tokenizer`; use whichever keyword the installed Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# %%
# Start the fine-tuning run
trainer.train()

# %%
# Evaluate the model on the validation set and print the resulting metrics
eval_results = trainer.evaluate()
print("Hasil evaluasi:", eval_results)

# %%
# Run prediction on the test data.
# Note that the test dataset only carries the "id" and "text" columns.
predictions = trainer.predict(test_dataset)
# Raw class scores returned by the model for each test row
logits = predictions.predictions
# Turn the logits into per-class probabilities with a softmax along axis 1
logits_tensor = torch.tensor(logits)
probs = torch.softmax(logits_tensor, dim=1).numpy()

# %%
# Build the submission file: one row per test id, one probability column per author
submission_path = "./sample_submission/submission_v11.csv"
submission_df = pd.DataFrame({
    "id": test_df["id"],
    "EAP": probs[:, label2id["EAP"]],
    "HPL": probs[:, label2id["HPL"]],
    "MWS": probs[:, label2id["MWS"]]
})
submission_df.to_csv(submission_path, index=False)
# Report the path that was actually written
print(f"File submission telah tersimpan sebagai '{submission_path}'")

  from .autonotebook import tqdm as notebook_tqdm


Menggunakan device: cpu
Jumlah data train: 19579
Jumlah data test: 8392
Contoh data train:
        id                                               text author
0  id26305  This process, however, afforded me no means of...    EAP
1  id17569  It never once occurred to me that the fumbling...    HPL
2  id11008  In his left hand was a gold snuff box, from wh...    EAP
3  id27763  How lovely is spring As we looked from Windsor...    MWS
4  id12958  Finding nothing else, not even gold, the Super...    HPL
Train split: 17621
Validation split: 1958


Map: 100%|██████████| 17621/17621 [00:00<00:00, 20171.75 examples/s]
Map: 100%|██████████| 1958/1958 [00:00<00:00, 23611.06 examples/s]
Map: 100%|██████████| 8392/8392 [00:00<00:00, 25175.93 examples/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

: 