In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that later
# cells can read the datasets/checkpoints and write the fine-tuned models there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -U transformers

In [None]:
# Print the transformers version that is actually importable in this kernel,
# as a record of the release the notebook was run with.
import transformers
print(transformers.__version__)

# IndoBERT

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    default_data_collator
)
import torch

# Disable Weights & Biases so the Trainer does not ask for an API key
os.environ["WANDB_DISABLED"] = "true"

# Configuration
# Paths are absolute (Drive is mounted at /content/drive) so they resolve
# regardless of the current working directory, matching the save cell below.
model_path = "/content/drive/MyDrive/indobert"  # folder containing config.json & pytorch_model.bin
max_seq_len = 512
batch_size = 32
lr = 3e-6
epochs = 5
random_state = 1
test_size = 0.2

# Text label -> class id, plus the inverse map stored in the model config
label2id = {
    "positive": 0,
    "neutral": 1,
    "negative": 2
}
id2label = {idx: name for name, idx in label2id.items()}

# Load model and tokenizer.
# The head size is set explicitly: without num_labels the classifier falls back
# to whatever the checkpoint's config says (2 by default), which cannot
# represent the three class ids used here.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    # Re-initialise the classification head instead of failing if the
    # checkpoint's head has a different number of outputs.
    ignore_mismatched_sizes=True,
)
# The tokenizer comes from the Hub id; the note on model_path lists only config
# and weights in the local folder.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Load the dataset and map text labels to class ids
csv_path = "/content/drive/MyDrive/Data/dataset.csv"
df = pd.read_csv(csv_path)
# Keep only rows with a known label; .copy() makes the filtered frame
# independent so the column assignment below does not act on a slice.
df = df[df["sentiment"].isin(label2id.keys())].copy()
df["label"] = df["sentiment"].map(label2id)

# Train/test split
train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

# Tokenize and attach the encodings to the DataFrame
def tokenize_and_add_to_df(dataframe):
    """Return a copy of ``dataframe`` with the tokenizer outputs added as columns.

    Uses the module-level ``tokenizer`` and ``max_seq_len``. Every text in the
    ``"text"`` column is truncated/padded to ``max_seq_len`` tokens.

    Args:
        dataframe: DataFrame with a ``"text"`` column.

    Returns:
        A new DataFrame with the original columns plus ``input_ids``,
        ``token_type_ids`` and ``attention_mask``. The input frame is left
        untouched.
    """
    # Work on a copy: the frames passed in are slices produced by
    # train_test_split, and adding columns to a slice in place can emit
    # SettingWithCopyWarning and modifies the caller's object.
    dataframe = dataframe.copy()
    texts = dataframe["text"].tolist()
    encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=max_seq_len)

    token_type_ids = encodings.get("token_type_ids")
    if token_type_ids is None:
        # Tokenizer produced no segment ids: fall back to all zeros, one
        # independent list per row (built only when actually needed).
        token_type_ids = [[0] * max_seq_len for _ in texts]

    dataframe["input_ids"] = encodings["input_ids"]
    dataframe["token_type_ids"] = token_type_ids
    dataframe["attention_mask"] = encodings["attention_mask"]
    return dataframe

# Tokenize both splits (train first, then test)
train_df, test_df = (tokenize_and_add_to_df(split) for split in (train_df, test_df))

# Convert to Hugging Face datasets, keeping only the columns the model consumes
selected_cols = ["input_ids", "token_type_ids", "attention_mask", "label"]
train_dataset = Dataset.from_pandas(train_df).select_columns(selected_cols)
test_dataset = Dataset.from_pandas(test_df).select_columns(selected_cols)

# Training arguments
training_args = TrainingArguments(
    output_dir="results/",
    logging_dir="logs/",
    report_to="none",
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# Trainer wiring: model, arguments, both splits and the default collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator,
)

# Fine-tune
trainer.train()

In [None]:
# Destination folder on Google Drive for the fine-tuned IndoBERT model
# (any folder name can be used)
drive_path = "/content/drive/MyDrive/model_indobert/"

# Save the model and the tokenizer to that folder; the prediction cell further
# down loads both from this location.
trainer.save_model(drive_path)
tokenizer.save_pretrained(drive_path)

# XLNet

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    XLNetForSequenceClassification,
    XLNetTokenizer,
    TrainingArguments,
    Trainer,
    default_data_collator
)
import torch

# Disable Weights & Biases so the Trainer does not ask for an API key
os.environ["WANDB_DISABLED"] = "true"

# Configuration
# Paths are absolute (Drive is mounted at /content/drive) so they resolve
# regardless of the current working directory, matching the save cell below.
model_path = "/content/drive/MyDrive/xlnet"  # folder must contain config.json + spiece.model + pytorch_model.bin
max_seq_len = 512
batch_size = 4
lr = 3e-6
epochs = 5
random_state = 1
test_size = 0.4

# Text label -> class id, plus the inverse map stored in the model config
label2id = {
    "positive": 0,
    "neutral": 1,
    "negative": 2
}
id2label = {idx: name for name, idx in label2id.items()}

# Load model and tokenizer
model = XLNetForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    # Re-initialise the classification head instead of failing if the
    # checkpoint's head has a different number of outputs.
    ignore_mismatched_sizes=True,
)

tokenizer = XLNetTokenizer.from_pretrained(model_path)

# Load and prepare the data
csv_path = "/content/drive/MyDrive/Data/dataset.csv"
df = pd.read_csv(csv_path)
# Keep only rows with a known label; .copy() makes the filtered frame
# independent so the column assignment below does not act on a slice.
df = df[df["sentiment"].isin(label2id.keys())].copy()
df["label"] = df["sentiment"].map(label2id)

# Train/test split
train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

# Tokenization
def tokenize_and_add_to_df(dataframe):
    """Return a copy of ``dataframe`` with the tokenizer outputs added as columns.

    Uses the module-level ``tokenizer`` and ``max_seq_len``. Every text in the
    ``"text"`` column is truncated/padded to ``max_seq_len`` tokens.

    Args:
        dataframe: DataFrame with a ``"text"`` column.

    Returns:
        A new DataFrame with the original columns plus ``input_ids``,
        ``token_type_ids`` and ``attention_mask``. The input frame is left
        untouched.
    """
    # Work on a copy: the frames passed in are slices produced by
    # train_test_split, and adding columns to a slice in place can emit
    # SettingWithCopyWarning and modifies the caller's object.
    dataframe = dataframe.copy()
    texts = dataframe["text"].tolist()
    encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=max_seq_len)

    token_type_ids = encodings.get("token_type_ids")
    if token_type_ids is None:
        # Tokenizer produced no segment ids: fall back to all zeros, one
        # independent list per row (built only when actually needed).
        token_type_ids = [[0] * max_seq_len for _ in texts]

    dataframe["input_ids"] = encodings["input_ids"]
    dataframe["token_type_ids"] = token_type_ids
    dataframe["attention_mask"] = encodings["attention_mask"]
    return dataframe

# Tokenize both splits (train first, then test)
train_df, test_df = (tokenize_and_add_to_df(split) for split in (train_df, test_df))

# Convert to Hugging Face datasets, keeping only the model input columns
selected_cols = ["input_ids", "token_type_ids", "attention_mask", "label"]
train_dataset = Dataset.from_pandas(train_df).select_columns(selected_cols)
test_dataset = Dataset.from_pandas(test_df).select_columns(selected_cols)

# Training arguments
training_args = TrainingArguments(
    output_dir="results_xlnet/",
    logging_dir="logs_xlnet/",
    report_to="none",
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# Trainer wiring: model, arguments, both splits and the default collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator,
)

# Start training
trainer.train()

In [None]:
# Destination folder on Google Drive for the fine-tuned XLNet model
# (any folder name can be used)
drive_path = "/content/drive/MyDrive/model_xlnet/"

# Save the model and the tokenizer to that folder
trainer.save_model(drive_path)
tokenizer.save_pretrained(drive_path)

In [None]:
# Sanity check of the classification head size.
# NOTE(review): reads `logits_proj`, so this assumes `model` is still the XLNet
# model from the training cell above — run it before the prediction cell
# rebinds `model`.
print(model.logits_proj.weight.shape)  # should be (3, 768)
print(model.config.num_labels)        # should be 3

# Sentiment prediction (CSV)

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# --- PATH AND DEVICE SETUP ---
# Absolute paths (Drive is mounted at /content/drive), identical to the
# locations the save cells write to.
model_path = "/content/drive/MyDrive/model_indobert"  # change to your model folder
# model_path = "/content/drive/MyDrive/model_xlnet"  # alternative: the fine-tuned XLNet model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- LOAD MODEL & TOKENIZER ---
# Loads the tokenizer and classifier previously saved under model_path,
# moves the model to the selected device and switches it to eval mode.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

# --- LABEL MAP (MUST MATCH TRAINING) ---
# The training cells encode the labels as positive=0, neutral=1, negative=2,
# so the model's output index is mapped back to a label name in that order.
label_map_to_string = {
    0: 'Positive',
    1: 'Neutral',
    2: 'Negative'
}


# --- LABEL PREDICTION FUNCTION ---
def predict_sentiment(text, max_length=128):
    """Predict the sentiment label of a single text.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``label_map_to_string``.

    Args:
        text: The text to classify.
        max_length: Maximum number of tokens kept after truncation. Defaults to
            128, the value that used to be hard-coded. The training cells
            tokenize with 512, so pass ``max_length=512`` to classify long
            texts with the same window the model was trained on.

    Returns:
        The label name from ``label_map_to_string`` for the highest-scoring
        class, or ``'Unknown'`` if that index is not in the map.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the arg-max of the raw logits is the predicted
    # class; no need to normalise first.
    pred_index = logits.argmax(dim=1).item()
    return label_map_to_string.get(pred_index, 'Unknown')


# --- LOAD THE INPUT CSV FILE ---
# Absolute path (Drive is mounted at /content/drive) so the read does not
# depend on the current working directory.
df = pd.read_csv("/content/drive/MyDrive/Data/data_baru_prediksi.csv")  # change to your input CSV file

# --- RUN PREDICTION & ADD THE SENTIMENT COLUMN ---
def safe_predict_sentiment(text):
    """Best-effort wrapper around ``predict_sentiment``.

    Returns ``None`` for missing input (NaN/None). Any exception raised while
    predicting is reported on stdout, together with the first 50 characters of
    the text, and also yields ``None`` so one bad row cannot abort the run.
    """
    if pd.isna(text):
        return None

    try:
        # Coerce to str so non-string cells (e.g. numbers) can be tokenized
        prediction = predict_sentiment(str(text))
    except Exception as e:
        print(f"Error predicting sentiment for text: {str(text)[:50]}... Error: {e}")
        prediction = None
    return prediction

# Label every row of the "text" column; rows whose text is missing or whose
# prediction raised an error receive None.
df["sentiment"] = df["text"].apply(safe_predict_sentiment)

# --- SAVE TO A NEW FILE ---
# Written relative to the current working directory, without the index column.
df.to_csv("hasil_sentimen_label.csv", index=False)

print("✅ Pelabelan selesai! File disimpan sebagai 'hasil_sentimen_label.csv'.")

In [None]:
# Distribution of the predicted labels. dropna=False keeps the rows whose
# prediction is None (missing text or a failed prediction) in the tally;
# by default value_counts() silently leaves them out.
print(df["sentiment"].value_counts(dropna=False))
print(df["sentiment"].unique())