In [2]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = 'True'

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import logging

In [5]:
# Menyesuaikan level logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)

In [6]:
# Input Dataset
df = pd.read_csv('Dataset_Manual_stemming.csv', encoding='latin1', delimiter=';')
df.head()

Unnamed: 0,No,Type,Mentions,Date,Media,Sentiment,Author,Followers,Retweeted,Favourited,Mentions1,Sentiment1,stemmed_text
0,1,rt,RT @LANGKAHANIES: Jangan ada intervensi politi...,31/05/2023 23:59,Twitter,Positive,@YTrigusmintara (ManusiaBebas),19.0,64.0,0.0,jangan ada intervensi politik penjegalan pilpr...,negatif,jangan ada intervensi politik jegal pilpres 20...
1,2,rt,RT @triwul82: Sejumlah perwakilan Koalisi Peru...,31/05/2023 23:59,Twitter,Negative,@INA_NKRI (100% Indonesia ÃÂÃÂÃÂÃÂ°ÃÂ...,1250.0,83.0,0.0,sejumlah perwakilan koalisi perubahan yang men...,positif,jumlah wakil koalisi ubah yang usung anies bag...
2,3,rt,RT @ajengcute16__: Merupakan Open Legal Policy...,31/05/2023 23:59,Twitter,Positive,@sri08054 (Sri anies),356.0,50.0,0.0,merupakan open legal policy perludem sangat be...,negatif,rupa open legal policy perludem sangat bahaya ...
3,4,rt,RT @Jatayu_45: JOKOWI HARUS MUNDUR DARI JABATA...,31/05/2023 23:59,Twitter,Neutral,@wongedan1708 (BAGong Modern),16.0,108.0,0.0,jokowi harus mundur dari jabatan presiden kala...,negatif,jokowi harus mundur dari jabat presiden kalau ...
4,5,mention,"Langkahi Presiden dan DPR, Demokrat: Bukan Wew...",31/05/2023 23:59,Twitter,Negative,@Simanjunta9Nico (Nico Simanjuntak),650.0,0.0,0.0,langkahi presiden dan dpr demokrat bukan wewen...,negatif,langkah presiden dan dpr demokrat bukan wewena...


In [7]:
# Pastikan data teks berupa string
df['stemmed_text'] = df['stemmed_text'].fillna('').astype(str)

In [8]:
# Mengkodekan label
label_encoder = LabelEncoder()
df['Sentiment1'] = label_encoder.fit_transform(df['Sentiment1'])

In [9]:
# Siapkan tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
# Konversikan data ke BERT InputExamples
def convert_data_to_examples(data, data_column, label_column):
    return data.apply(lambda x: InputExample(guid=None,
                                             text_a=x[data_column],
                                             text_b=None,
                                             label=x[label_column]), axis=1)


In [11]:
# Ubah InputExamples menjadi InputFeatures
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = []
    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True
        )
        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
                                                     input_dict["token_type_ids"],
                                                     input_dict["attention_mask"])

        features.append(
            InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label)
        )

    def gen():
        for f in features:
            yield (
                {
                    'input_ids': f.input_ids,
                    'attention_mask': f.attention_mask,
                    'token_type_ids': f.token_type_ids
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32}, tf.int64),
        ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None]), 'token_type_ids': tf.TensorShape([None])}, tf.TensorShape([])),
    )

In [15]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list = []

for fold, (train_index, test_index) in enumerate(kf.split(df), 1):
    train = df.iloc[train_index]
    test = df.iloc[test_index]
    
    # Convert to BERT input format
    train_InputExamples = convert_data_to_examples(train, 'stemmed_text', 'Sentiment1')
    test_InputExamples = convert_data_to_examples(test, 'stemmed_text', 'Sentiment1')
    
    train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    train_data = train_data.shuffle(100).batch(32).repeat()  # Menambahkan .repeat()
    
    test_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer)
    test_data = test_data.batch(32)
    
    # Load BERT model
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
    
    # Compile the model with run_eagerly=True
    from tensorflow.keras.optimizers.legacy import Adam  
    optimizer = Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric], run_eagerly=True)
    
    # Determine the number of steps per epoch and validation steps
    train_steps_per_epoch = len(train) // 32
    validation_steps = len(test) // 32
    
    # Train the model
    model.fit(train_data, epochs=20, steps_per_epoch=train_steps_per_epoch, validation_data=test_data, validation_steps=validation_steps)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
# Evaluate the model on the validation set
loss, accuracy = model.evaluate(test_data, steps=validation_steps)



In [23]:
# Predict on the validation set
predictions = model.predict(test_data, steps=validation_steps)
y_pred = np.argmax(predictions.logits, axis=1)
y_true = test['Sentiment1'][:len(y_pred)]



In [24]:
# Calculate average accuracy across all folds
average_accuracy = np.mean(accuracy_list)
print(f'Average Accuracy: {average_accuracy * 100:.2f}%\n')
report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(f'Fold {fold} Classification Report:\n{report}')

Average Accuracy: 90.02%

Fold 5 Classification Report:
              precision    recall  f1-score   support

     negatif       0.93      0.90      0.91       306
      netral       0.93      0.86      0.89       291
     positif       0.86      0.93      0.90       395

    accuracy                           0.90       992
   macro avg       0.91      0.90      0.90       992
weighted avg       0.90      0.90      0.90       992

