In [11]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [12]:
versi_model=6
df = pd.read_csv('../Dataset/dataset_2.csv')
df


Unnamed: 0,Ulasan,Label
0,not bad,Neutral
1,beli bintang bicara,Neutral
2,alas warna kuning luntur kaos kaki putih,Neutral
3,harga segini worth sih kesalahan cuman salah m...,Neutral
4,ukurannya sih kayanya pake sepatu adidas new b...,Neutral
...,...,...
1149,sepatu belah kiri tekuk utk tumit,Tidak Puas
1150,layan ratih listya buruk,Tidak Puas
1151,jelek barangnya,Tidak Puas
1152,kirim lamaa bangetttt,Tidak Puas


In [13]:
# Ubah label -1 menjadi 2
df['Label'] = df['Label'].map({"Puas": 2, "Neutral": 1, "Tidak Puas": 0})

# Validasi Label
assert df['Label'].isin([0, 1, 2]).all(), "Ada Label yang tidak valid"
print(df['Label'].unique())  # Output harus [0, 1, 2]


[1 2 0]


In [14]:
# Tokenisasi Data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

df['tokenized'] = df['Ulasan'].apply(lambda x: tokenize_function(x))


In [15]:
# Split Dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Ulasan'], df['Label'], test_size=0.2, random_state=42)


In [16]:
# Buat Dataset PyTorch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)

train_dataset = SentimentDataset(train_encodings, train_labels.tolist())
val_dataset = SentimentDataset(val_encodings, val_labels.tolist())


In [17]:
# Ubah BERT dan Trainer ke mode GPU jika tersedia
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to('cuda')
training_args = TrainingArguments(
    output_dir='../results/v_' + str(versi_model),
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=400,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=200,
    save_total_limit=2,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  3%|▎         | 20/580 [00:08<03:32,  2.64it/s]

{'loss': 1.1225, 'grad_norm': 5.3195576667785645, 'learning_rate': 2.5e-06, 'epoch': 0.34}


  7%|▋         | 40/580 [00:16<03:25,  2.62it/s]

{'loss': 1.0944, 'grad_norm': 5.547304630279541, 'learning_rate': 5e-06, 'epoch': 0.69}


 10%|█         | 60/580 [00:23<03:11,  2.71it/s]

{'loss': 1.0688, 'grad_norm': 6.412815093994141, 'learning_rate': 7.5e-06, 'epoch': 1.03}


 14%|█▍        | 80/580 [00:31<03:11,  2.61it/s]

{'loss': 1.0183, 'grad_norm': 5.810873031616211, 'learning_rate': 1e-05, 'epoch': 1.38}


 17%|█▋        | 100/580 [00:39<03:03,  2.62it/s]

{'loss': 1.0088, 'grad_norm': 11.551931381225586, 'learning_rate': 1.25e-05, 'epoch': 1.72}



 17%|█▋        | 100/580 [00:40<03:03,  2.62it/s]

{'eval_loss': 0.9278714656829834, 'eval_accuracy': 0.6406926406926406, 'eval_f1': 0.6232344555421276, 'eval_precision': 0.6805786016766052, 'eval_recall': 0.6228366729272476, 'eval_runtime': 1.0796, 'eval_samples_per_second': 213.968, 'eval_steps_per_second': 7.41, 'epoch': 1.72}


 21%|██        | 120/580 [00:48<02:52,  2.66it/s]

{'loss': 0.8807, 'grad_norm': 6.152594089508057, 'learning_rate': 1.5e-05, 'epoch': 2.07}


 24%|██▍       | 140/580 [00:55<02:49,  2.60it/s]

{'loss': 0.7183, 'grad_norm': 6.993831634521484, 'learning_rate': 1.75e-05, 'epoch': 2.41}


 28%|██▊       | 160/580 [01:03<02:41,  2.60it/s]

{'loss': 0.6735, 'grad_norm': 10.683965682983398, 'learning_rate': 2e-05, 'epoch': 2.76}


 31%|███       | 180/580 [01:10<02:32,  2.63it/s]

{'loss': 0.5992, 'grad_norm': 7.2453293800354, 'learning_rate': 2.25e-05, 'epoch': 3.1}


 34%|███▍      | 200/580 [01:18<02:26,  2.60it/s]

{'loss': 0.4196, 'grad_norm': 12.922057151794434, 'learning_rate': 2.5e-05, 'epoch': 3.45}



 34%|███▍      | 200/580 [01:19<02:26,  2.60it/s]

{'eval_loss': 0.6673805117607117, 'eval_accuracy': 0.7445887445887446, 'eval_f1': 0.7456038940392306, 'eval_precision': 0.7570625281151596, 'eval_recall': 0.7512415584030703, 'eval_runtime': 1.0766, 'eval_samples_per_second': 214.569, 'eval_steps_per_second': 7.431, 'epoch': 3.45}


 38%|███▊      | 220/580 [01:29<02:18,  2.61it/s]

{'loss': 0.4992, 'grad_norm': 13.463290214538574, 'learning_rate': 2.7500000000000004e-05, 'epoch': 3.79}


 41%|████▏     | 240/580 [01:37<02:10,  2.60it/s]

{'loss': 0.5406, 'grad_norm': 13.402976036071777, 'learning_rate': 3e-05, 'epoch': 4.14}


 45%|████▍     | 260/580 [01:45<02:03,  2.60it/s]

{'loss': 0.3427, 'grad_norm': 6.909241676330566, 'learning_rate': 3.2500000000000004e-05, 'epoch': 4.48}


 48%|████▊     | 280/580 [01:52<01:55,  2.60it/s]

{'loss': 0.3501, 'grad_norm': 26.75240707397461, 'learning_rate': 3.5e-05, 'epoch': 4.83}


 52%|█████▏    | 300/580 [02:00<01:47,  2.60it/s]

{'loss': 0.3661, 'grad_norm': 25.719985961914062, 'learning_rate': 3.7500000000000003e-05, 'epoch': 5.17}



 52%|█████▏    | 300/580 [02:01<01:47,  2.60it/s]

{'eval_loss': 0.518571138381958, 'eval_accuracy': 0.8354978354978355, 'eval_f1': 0.8362196482310648, 'eval_precision': 0.8455788010616626, 'eval_recall': 0.8382443701077635, 'eval_runtime': 1.089, 'eval_samples_per_second': 212.114, 'eval_steps_per_second': 7.346, 'epoch': 5.17}


 55%|█████▌    | 320/580 [02:09<01:40,  2.59it/s]

{'loss': 0.2734, 'grad_norm': 23.803829193115234, 'learning_rate': 4e-05, 'epoch': 5.52}


 59%|█████▊    | 340/580 [02:17<01:32,  2.60it/s]

{'loss': 0.2562, 'grad_norm': 13.496171951293945, 'learning_rate': 4.25e-05, 'epoch': 5.86}


 62%|██████▏   | 360/580 [02:24<01:24,  2.60it/s]

{'loss': 0.2348, 'grad_norm': 8.092042922973633, 'learning_rate': 4.5e-05, 'epoch': 6.21}


 66%|██████▌   | 380/580 [02:32<01:15,  2.65it/s]

{'loss': 0.2228, 'grad_norm': 6.2275710105896, 'learning_rate': 4.75e-05, 'epoch': 6.55}


 69%|██████▉   | 400/580 [02:39<01:07,  2.65it/s]

{'loss': 0.2365, 'grad_norm': 9.188411712646484, 'learning_rate': 5e-05, 'epoch': 6.9}



 69%|██████▉   | 400/580 [02:40<01:07,  2.65it/s]

{'eval_loss': 0.8729096055030823, 'eval_accuracy': 0.8008658008658008, 'eval_f1': 0.8008028164710949, 'eval_precision': 0.8186232425034216, 'eval_recall': 0.8029776656553421, 'eval_runtime': 1.0642, 'eval_samples_per_second': 217.075, 'eval_steps_per_second': 7.518, 'epoch': 6.9}


 72%|███████▏  | 420/580 [02:50<01:02,  2.57it/s]

{'loss': 0.1476, 'grad_norm': 2.0730905532836914, 'learning_rate': 4.4444444444444447e-05, 'epoch': 7.24}


 76%|███████▌  | 440/580 [02:58<00:53,  2.59it/s]

{'loss': 0.1712, 'grad_norm': 5.377874374389648, 'learning_rate': 3.888888888888889e-05, 'epoch': 7.59}


 79%|███████▉  | 460/580 [03:06<00:46,  2.60it/s]

{'loss': 0.1675, 'grad_norm': 3.308448076248169, 'learning_rate': 3.3333333333333335e-05, 'epoch': 7.93}


 83%|████████▎ | 480/580 [03:13<00:38,  2.60it/s]

{'loss': 0.0922, 'grad_norm': 0.28214210271835327, 'learning_rate': 2.777777777777778e-05, 'epoch': 8.28}


 86%|████████▌ | 500/580 [03:21<00:30,  2.60it/s]

{'loss': 0.0885, 'grad_norm': 0.12413609772920609, 'learning_rate': 2.2222222222222223e-05, 'epoch': 8.62}



 86%|████████▌ | 500/580 [03:22<00:30,  2.60it/s]

{'eval_loss': 0.9734193682670593, 'eval_accuracy': 0.8225108225108225, 'eval_f1': 0.8221740126991209, 'eval_precision': 0.824028000713394, 'eval_recall': 0.8233189969734352, 'eval_runtime': 1.09, 'eval_samples_per_second': 211.936, 'eval_steps_per_second': 7.34, 'epoch': 8.62}


 90%|████████▉ | 520/580 [03:30<00:23,  2.58it/s]

{'loss': 0.0765, 'grad_norm': 10.553447723388672, 'learning_rate': 1.6666666666666667e-05, 'epoch': 8.97}


 93%|█████████▎| 540/580 [03:38<00:15,  2.59it/s]

{'loss': 0.0407, 'grad_norm': 0.19714026153087616, 'learning_rate': 1.1111111111111112e-05, 'epoch': 9.31}


 97%|█████████▋| 560/580 [03:45<00:07,  2.60it/s]

{'loss': 0.0479, 'grad_norm': 0.08613096922636032, 'learning_rate': 5.555555555555556e-06, 'epoch': 9.66}


100%|██████████| 580/580 [03:53<00:00,  2.48it/s]

{'loss': 0.0402, 'grad_norm': 0.04435143247246742, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 233.5398, 'train_samples_per_second': 39.522, 'train_steps_per_second': 2.484, 'train_loss': 0.4413453757762909, 'epoch': 10.0}





TrainOutput(global_step=580, training_loss=0.4413453757762909, metrics={'train_runtime': 233.5398, 'train_samples_per_second': 39.522, 'train_steps_per_second': 2.484, 'train_loss': 0.4413453757762909, 'epoch': 10.0})

In [19]:
# Evaluasi model setelah pelatihan
eval_result = trainer.evaluate()
eval_result

100%|██████████| 8/8 [00:00<00:00,  8.49it/s]


{'eval_loss': 0.9896822571754456,
 'eval_accuracy': 0.8181818181818182,
 'eval_f1': 0.8181887078438802,
 'eval_precision': 0.8270822230797973,
 'eval_recall': 0.8162367610940088,
 'eval_runtime': 1.0868,
 'eval_samples_per_second': 212.55,
 'eval_steps_per_second': 7.361,
 'epoch': 10.0}

In [20]:

# Save model
print("Model berhasil disimpan di direktori ../results/" + "v_" + str(versi_model))
model.save_pretrained("../results/" + "v_" + str(versi_model))
tokenizer.save_pretrained("../results/" + "v_" + str(versi_model))

Model berhasil disimpan di direktori ../results/v_6


('../results/v_6\\tokenizer_config.json',
 '../results/v_6\\special_tokens_map.json',
 '../results/v_6\\vocab.txt',
 '../results/v_6\\added_tokens.json')