In [None]:
# Install PyTorch (with torchvision/torchaudio) from the PyTorch wheel index for CUDA 11.8 (cu118).
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Install the Hugging Face, scikit-learn and data-handling packages imported by the cells below.
%pip install transformers datasets scikit-learn pandas matplotlib notebook

In [None]:
import torch
# Report whether a CUDA device is visible to PyTorch.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# get_device_name(0) raises when no CUDA device exists, so only query it
# after the availability check has succeeded.
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected.")

In [None]:
import pandas as pd

path_train = './dataset/clean_train.csv'
path_test = './dataset/clean_test.csv'

# Integer class ids assigned to the string values of the `review_class` column.
label_map = {'neg': 0, 'neu': 1, 'pos': 2}


def _load_split(path, n_rows, seed=42):
    """Read one CSV split, subsample it and attach an integer ``label`` column.

    Args:
        path: Location of the CSV file; it must contain the ``clean_text``
            and ``review_class`` columns.
        n_rows: Number of rows to sample. Capped at the number of usable rows.
        seed: ``random_state`` forwarded to ``DataFrame.sample``.

    Returns:
        The sampled DataFrame with an extra int64 ``label`` column.

    Raises:
        ValueError: If ``review_class`` holds a value that is not a key of
            ``label_map``.
    """
    frame = pd.read_csv(path)
    # Rows without text cannot be tokenized later, so drop them before sampling.
    frame = frame.dropna(subset=['clean_text'])
    # DataFrame.sample raises when asked for more rows than exist; cap the request.
    frame = frame.sample(n=min(n_rows, len(frame)), random_state=seed)

    labels = frame['review_class'].map(label_map)
    if labels.isna().any():
        # An unmapped class would otherwise become NaN and silently turn the
        # label column into floats; fail here with the offending values instead.
        unknown = sorted(frame.loc[labels.isna(), 'review_class'].astype(str).unique())
        raise ValueError(f"{path}: review_class values not in label_map: {unknown}")
    return frame.assign(label=labels.astype('int64'))


train_df = _load_split(path_train, n_rows=8000)
test_df = _load_split(path_test, n_rows=2000)

print(train_df[['clean_text', 'review_class', 'label']].head())
print(test_df[['clean_text', 'review_class', 'label']].head())

In [None]:
from datasets import Dataset

# Convert the text/label columns into Hugging Face datasets, renaming
# `clean_text` to `text`.
# The frames were produced by DataFrame.sample, so their index is no longer a
# plain range; preserve_index=False keeps that index from being stored as an
# extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(
    train_df[['clean_text', 'label']].rename(columns={"clean_text": "text"}),
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    test_df[['clean_text', 'label']].rename(columns={"clean_text": "text"}),
    preserve_index=False,
)

In [None]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint; reused below when the classification model is loaded.
checkpoint = "indobenchmark/indobert-base-p1"
# Tokenizer loaded from the same checkpoint so that it matches the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(example, max_length=512):
    """Tokenize the ``text`` field of a batch, truncating long inputs.

    Args:
        example: Mapping with a ``"text"`` entry (a batch of strings when used
            with ``Dataset.map(..., batched=True)``).
        max_length: Upper bound on the number of tokens per sequence. It is
            passed explicitly so truncation does not depend on the tokenizer
            configuration declaring a maximum length. The default of 512
            assumes a BERT-base style position limit for this checkpoint.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(example["text"], truncation=True, max_length=max_length)

def _tokenize_split(split):
    """Apply ``tokenize_fn`` to every batch of ``split`` and return the mapped dataset."""
    return split.map(tokenize_fn, batched=True)


train_tokenized = _tokenize_split(train_dataset)
test_tokenized = _tokenize_split(test_dataset)

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed the random number generators before the model is built: the 3-class
# classification head added on top of the pretrained encoder is expected to be
# freshly initialised, and without a prior seed that initialisation differs
# from run to run.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Evaluate and checkpoint once per epoch; load_best_model_at_end requires the
# two strategies to match so the best checkpoint can be restored after training.
training_args = TrainingArguments(
    output_dir="./result",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
    seed=SEED,
)

# Pads every batch to its longest sequence at collation time; the tokenization
# step itself does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class for each
            row is the argmax of the logits over the last axis.

    Returns:
        Dict with ``"accuracy"`` and the macro-averaged ``"f1_macro"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1_macro"] = f1_score(gold, predicted, average="macro")
    return metrics

# training_args sets load_best_model_at_end=True, so the eval dataset decides
# which checkpoint is kept. Using the test set for that would leak it into
# model selection, so hold out 10% of the training data for validation and
# leave test_tokenized untouched for the final report.
train_val_split = train_tokenized.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_split["train"],
    eval_dataset=train_val_split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model using the datasets and arguments the Trainer was built with.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset; the returned metrics dict is shown as the cell output.
trainer.evaluate()

In [None]:
from sklearn.metrics import classification_report

# Run the model over the test split and turn the logits into class ids.
predictions = trainer.predict(test_tokenized)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
# Weighted average here, whereas compute_metrics reports the macro average.
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("F1-score:", f1)

# Class names in label-id order (0, 1, 2).
class_names = ["neg", "neu", "pos"]

print("Classification Report:")
# Pass `labels` explicitly: without it, classification_report raises when a
# class is absent from both the true and the predicted labels, because the
# number of classes it finds no longer matches len(target_names).
print(classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(len(class_names))),
    target_names=class_names,
))


In [None]:
# Directory that receives the fine-tuned model and the tokenizer files.
save_path = "./model/indobert_sentiment_model"

# Save the model and the tokenizer into the same directory.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): this looks like a Hugging Face Hub repo id, not the local
# save_path written by the previous cell — confirm it holds the intended weights.
load_path = "Fathualr/sentiment-analysis-dataset-product-review"

# Rebinds `model` and `tokenizer`: from here on they refer to the checkpoint at
# load_path, not to the objects that were used for training above.
model = AutoModelForSequenceClassification.from_pretrained(load_path)
tokenizer = AutoTokenizer.from_pretrained(load_path)

In [None]:
from transformers import pipeline

# Wrap the loaded model and tokenizer in a text-classification pipeline.
predictor = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Hand-written Indonesian review sentences used as a quick smoke test.
texts = [
    "produk jelek dan bodoh",
    "Saya sangat senang dengan produk ini!",
    "Aku suka produk desainnya, tapi ini jelek kualitasnya",
    "Produk ini luar biasa, saya akan membelinya lagi!",
    "Pengiriman sangat cepat dan sesuai harapan.",
    "Pelayanan pelanggan sangat membantu dan ramah.",
    "Saya puas dengan kualitas dan harganya.",
    "Desainnya elegan dan sangat nyaman dipakai.",

    "Pelayanannya mengecewakan",
    "banyak fitur bermasalah dan menghambat",
    "Barang datang rusak dan tidak sesuai deskripsi.",
    "Sangat kecewa, tidak akan beli lagi di sini.",
    "Aplikasi sering crash dan membuat frustasi.",
    "Kualitasnya buruk, terasa murahan.",
    "Pengalaman belanja yang sangat buruk.",

    "Produk sesuai deskripsi.",
    "Masih perlu dicoba beberapa hari ke depan.",
    "Barang diterima. Belum diuji.",
    "Warnanya beda sedikit dari foto.",
    "Tidak ada masalah berarti sejauh ini."
]

# One result dict per input text, each with a 'label' and a 'score' entry.
results = predictor(texts)

# Display names for the generic LABEL_<id> strings; the ids follow the
# neg=0 / neu=1 / pos=2 encoding used for training.
label_id_to_str = {
    "LABEL_0": 'negative',
    "LABEL_1": 'neutral',
    "LABEL_2": 'positive'
}

for text, result in zip(texts, results):
    raw_label = result['label']
    print(f"Teks: {text}")
    print(f"Raw Label: {raw_label}")
    # If the loaded checkpoint's config already defines readable label names,
    # the pipeline will not emit LABEL_<id>; fall back to the raw label rather
    # than failing with a KeyError.
    print(f"Label: {label_id_to_str.get(raw_label, raw_label)}")
    print(f"Confidence: {result['score']:.2f}\n")
