In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import random

  from .autonotebook import tqdm as notebook_tqdm





In [10]:
versi_model=1

In [2]:
# Data awal
sentiments = [
    'Saya sangat puas dengan layanan ini',
    'Layanan ini cukup baik',
    'Saya tidak puas dengan layanan ini',
    'Pelayanan sangat memuaskan',
    'Cukup bagus, tidak ada keluhan',
    'Saya kecewa dengan layanan yang diberikan',
    'Sangat puas, terima kasih!',
    'Netral, tidak ada yang spesial',
    'Layanan sangat buruk',
    'Pelayanan baik, terima kasih',
    'Tidak buruk, tapi bisa lebih baik',
    'Sangat tidak puas dengan layanan ini',
    'Bagus, saya suka',
    'Tidak begitu bagus, tapi bisa diterima',
    'Saya tidak akan menggunakan layanan ini lagi',
    'Puas dengan pelayanan',
    'Biasa saja, tidak istimewa',
    'Layanan sangat lambat dan mengecewakan',
    'Luar biasa, sangat memuaskan',
    'Pelayanan cepat dan ramah',
]

labels = [1, 0, -1, 1, 0, -1, 1, 0, -1, 1, 0, -1, 1, 0, -1, 1, 0, -1, 1, 1]

# Buat dataset 100 entri
random.seed(42)
dataset = {'sentimen': [], 'label': []}

for _ in range(100):
    idx = random.randint(0, len(sentiments) - 1)
    dataset['sentimen'].append(sentiments[idx])
    dataset['label'].append(labels[idx])

df = pd.DataFrame(dataset)
df['label'] = df['label'].map({-1: "tidak puas", 0: "netral", 1: "puas"})


In [3]:
# Ubah label -1 menjadi 2
df['label'] = df['label'].map({"puas": 2, "netral": 1, "tidak puas": 0})

# Validasi label
assert df['label'].isin([0, 1, 2]).all(), "Ada label yang tidak valid"
print(df['label'].unique())  # Output harus [0, 1, 2]


[2 0 1]


In [4]:
# Tokenisasi Data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

df['tokenized'] = df['sentimen'].apply(lambda x: tokenize_function(x))


In [5]:
# Split Dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(df['sentimen'], df['label'], test_size=0.2, random_state=42)


In [6]:
# Buat Dataset PyTorch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)

train_dataset = SentimentDataset(train_encodings, train_labels.tolist())
val_dataset = SentimentDataset(val_encodings, val_labels.tolist())


In [7]:
# Ubah BERT dan Trainer ke mode GPU jika tersedia
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [8]:

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to('cpu')

training_args = TrainingArguments(
    output_dir='../results'+ 'v_' + str(versi_model),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 73%|███████▎  | 11/15 [00:02<00:00,  7.22it/s]

{'loss': 1.2185, 'grad_norm': 4.915512561798096, 'learning_rate': 1.0000000000000002e-06, 'epoch': 2.0}


100%|██████████| 15/15 [00:03<00:00,  4.88it/s]

{'train_runtime': 3.0659, 'train_samples_per_second': 78.28, 'train_steps_per_second': 4.893, 'train_loss': 1.2108866055806478, 'epoch': 3.0}





TrainOutput(global_step=15, training_loss=1.2108866055806478, metrics={'train_runtime': 3.0659, 'train_samples_per_second': 78.28, 'train_steps_per_second': 4.893, 'train_loss': 1.2108866055806478, 'epoch': 3.0})

In [9]:
# Evaluasi model setelah pelatihan
eval_result = trainer.evaluate()
eval_result

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 1/1 [00:00<00:00, 140.90it/s]


{'eval_loss': 1.258344054222107,
 'eval_accuracy': 0.3,
 'eval_f1': 0.15384615384615385,
 'eval_precision': 0.09999999999999999,
 'eval_recall': 0.3333333333333333,
 'eval_runtime': 0.0275,
 'eval_samples_per_second': 728.039,
 'eval_steps_per_second': 36.402,
 'epoch': 3.0}

In [13]:

# Save model
model.save_pretrained("../results/" + "v_" + str(versi_model))
tokenizer.save_pretrained("../results/" + "v_" + str(versi_model))

('../results/v_1\\tokenizer_config.json',
 '../results/v_1\\special_tokens_map.json',
 '../results/v_1\\vocab.txt',
 '../results/v_1\\added_tokens.json')