In [32]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import random

In [33]:
versi_model=3

In [34]:
df = pd.read_csv('../Dataset/dataset_1.csv')
df
# df = pd.DataFrame(dataset)
# df['label'] = df['label'].map({-1: "tidak puas", 0: "netral", 1: "puas"})


Unnamed: 0,Ulasan,Label
0,barang,Neutral
1,standard sepatu harga segitu,Neutral
2,sayang ukur standard internasional standard in...,Neutral
3,miring kak,Neutral
4,ekspektasi,Neutral
...,...,...
715,sepatu belah kiri tekuk utk tumit,Tidak Puas
716,kecewa sih packingnya dibungkus plastik transp...,Tidak Puas
717,jelek barangnya,Tidak Puas
718,noda kayak sepatu bekas,Tidak Puas


In [35]:
# Ubah label -1 menjadi 2
df['Label'] = df['Label'].map({"Puas": 2, "Neutral": 1, "Tidak Puas": 0})

# Validasi Label
assert df['Label'].isin([0, 1, 2]).all(), "Ada Label yang tidak valid"
print(df['Label'].unique())  # Output harus [0, 1, 2]


[1 2 0]


In [36]:
# Tokenisasi Data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

df['tokenized'] = df['Ulasan'].apply(lambda x: tokenize_function(x))


In [37]:
# Split Dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Ulasan'], df['Label'], test_size=0.2, random_state=42)


In [38]:
# Buat Dataset PyTorch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)

train_dataset = SentimentDataset(train_encodings, train_labels.tolist())
val_dataset = SentimentDataset(val_encodings, val_labels.tolist())


In [39]:
# Ubah BERT dan Trainer ke mode GPU jika tersedia
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [40]:

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to('cuda')
training_args = TrainingArguments(
    output_dir='../results/v_' + str(versi_model),
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=200,
    save_total_limit=2,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 11%|█         | 20/180 [00:17<02:22,  1.13it/s]

{'loss': 1.1047, 'grad_norm': 3.272338390350342, 'learning_rate': 5e-06, 'epoch': 0.56}


 22%|██▏       | 40/180 [00:34<02:01,  1.15it/s]

{'loss': 1.1023, 'grad_norm': 3.424889087677002, 'learning_rate': 1e-05, 'epoch': 1.11}


 33%|███▎      | 60/180 [00:52<01:47,  1.12it/s]

{'loss': 1.0482, 'grad_norm': 10.82409954071045, 'learning_rate': 1.5e-05, 'epoch': 1.67}


 44%|████▍     | 80/180 [01:10<01:29,  1.12it/s]

{'loss': 0.9455, 'grad_norm': 11.643259048461914, 'learning_rate': 2e-05, 'epoch': 2.22}


 56%|█████▌    | 100/180 [01:28<01:11,  1.13it/s]

{'loss': 0.8968, 'grad_norm': 10.007346153259277, 'learning_rate': 2.5e-05, 'epoch': 2.78}


                                                 
 56%|█████▌    | 100/180 [01:28<01:11,  1.13it/s]

{'eval_loss': 0.8136744499206543, 'eval_accuracy': 0.6319444444444444, 'eval_f1': 0.5885059363737811, 'eval_precision': 0.6359720483938256, 'eval_recall': 0.5975314889788573, 'eval_runtime': 0.4493, 'eval_samples_per_second': 320.463, 'eval_steps_per_second': 6.676, 'epoch': 2.78}


 56%|█████▌    | 101/180 [01:29<01:21,  1.03s/it]

In [None]:
# Evaluasi model setelah pelatihan
eval_result = trainer.evaluate()
eval_result

100%|██████████| 3/3 [00:00<00:00, 12.37it/s]


{'eval_loss': 0.7747302055358887,
 'eval_accuracy': 0.6736111111111112,
 'eval_f1': 0.6724791001671987,
 'eval_precision': 0.6781968344726055,
 'eval_recall': 0.6812022042285201,
 'eval_runtime': 0.4307,
 'eval_samples_per_second': 334.37,
 'eval_steps_per_second': 6.966,
 'epoch': 3.0}

In [None]:

# Save model
print("Model berhasil disimpan di direktori ../results/" + "v_" + str(versi_model))
model.save_pretrained("../results/" + "v_" + str(versi_model))
tokenizer.save_pretrained("../results/" + "v_" + str(versi_model))

SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })