## Loading Processed Data

### Setting up environment

In [1]:
DATA_BASE_PATH = "../Datasets/processed/"

### Loading metadata

In [2]:
import json

metadata = {}

with open(DATA_BASE_PATH+'metadata.json', 'r') as file:
    metadata = json.load(file)

print(metadata)


{'name': 'optimalisasi-model-machine-learning-untuk-klasifikasi-ujaran-kebencian-pada-x-twitter', 'author': 'doddy-s', 'created_at': '2024-11-23T22:16:28.616924', 'meta': {'dataset': {'longest_text': 45, 'count': 11659, 'file_name': 'indonesian-hate-speech-processed-dataset', 'file_format': 'csv'}, 'dataset_ready_to_train': {'longest_text': 45, 'count': 11659, 'file_name': 'indonesian-hate-speech-processed-dataset-ready-to-train', 'file_format': 'parquet'}, 'dataset_ready_to_train_json': {'longest_text': 45, 'count': 11659, 'file_name': 'indonesian-hate-speech-processed-dataset-ready-to-train-json', 'file_format': 'json'}, 'word_index': {'count': 9905, 'file_name': 'indonesian-hate-speech-processed-word-index', 'file_format': 'json'}}}


### Loading dataset tokenized

In [3]:
import pandas as pd

dataset_file = metadata['meta']['dataset_ready_to_train']['file_name']+'.'+metadata['meta']['dataset_ready_to_train']['file_format']

df = pd.read_parquet(DATA_BASE_PATH+dataset_file)

In [4]:
df.head()

Unnamed: 0,text,label
0,"[1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[6, 7, 7, 8, 9, 10, 11, 2, 12, 6, 6, 6, 6, 13,...",0
2,"[17, 18, 19, 6, 3, 20, 21, 22, 23, 24, 25, 26,...",0
3,"[32, 2, 6, 33, 34, 6, 2, 35, 3, 36, 37, 38, 39...",0
4,"[42, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0


### Loading word index

In [5]:
import json

word_index_file = metadata['meta']['word_index']['file_name']+'.'+metadata['meta']['word_index']['file_format']
word_index = {}

with open(DATA_BASE_PATH+word_index_file, 'r') as file:
    word_index = json.load(file)

print(word_index)

{'teliti': 1, 'orang': 2, 'kristen': 3, 'cina': 4, 'hehe': 5, 'agama': 6, 'pakai': 7, 'ketat': 8, 'lekuk': 9, 'tubuh': 10, 'sama': 11, 'telanjang': 12, 'bom': 13, 'bunuh': 14, 'kena': 15, 'pere': 16, 'tomohon': 17, 'pusat': 18, 'ajar': 19, 'katolik': 20, 'khusus': 21, 'doa': 22, 'meditasi': 23, 'bukit': 24, 'kasih': 25, 'suci': 26, 'indah': 27, 'tenang': 28, 'international': 29, 'flower': 30, 'festival': 31, 'mandang': 32, 'kayak': 33, 'peduli': 34, 'tulis': 35, 'jidat': 36, 'langsung': 37, 'nyes': 38, 'pikir': 39, 'harus': 40, 'pindah': 41, 'guntur': 42, 'romli': 43, 'jungyeon': 44, 'menang': 45, 'acara': 46, 'gede': 47, 'pelihara': 48, 'mati': 49, 'henti': 50, 'bilang': 51, 'kg': 52, 'profesional': 53, 'kucing': 54, 'mencret': 55, 'bencana': 56, 'mega': 57, 'rona': 58, 'utus': 59, 'malaysia': 60, 'april': 61, 'politik': 62, 'beli': 63, 'langgan': 64, 'islam': 65, 'muslim': 66, 'alquran': 67, 'keluarga': 68, 'wanita': 69, 'sehat': 70, 'hibur': 71, 'a': 72, 'is': 73, 'budaya': 74, 'sai

## Splitting Data

Splitting train, validation, and test data

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([arr.astype(np.float32) for arr in df['text'].values])

X_train_, X_test, y_train_, y_test = train_test_split(
    X,
    df['label'].values,
    test_size=0.2,
    random_state=69,
    stratify=df['label'].values,
)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train_,
    y_train_,
    test_size=0.25,
    random_state=69,
    stratify=y_train_
)

In [8]:
import tensorflow as tf

X_train_tensor = tf.convert_to_tensor(X_train)
X_val_tensor = tf.convert_to_tensor(X_val)
X_test_tensor = tf.convert_to_tensor(X_test)


## Creating Model

In [9]:
import tensorflow as tf
from tensorflow.keras import regularizers

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index)+1, output_dim=64, mask_zero=True, name='embedding'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2, recurrent_activation='sigmoid'), name='bilstm'),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.02), name='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid', name='sigmoid')
])


model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss = 'binary_crossentropy',
              metrics=['accuracy'])

In [10]:
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss = 'binary_crossentropy',
              metrics=['accuracy'])

In [11]:
model.summary()

## Training Model

In [12]:
import tensorflow as tf

# Check if TensorFlow is built with GPU support
print("Is built with GPU support: ", tf.test.is_built_with_gpu_support())

# List available GPUs
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs: ", gpus)

# Check if TensorFlow is using the GPU
if gpus:
    print("Using GPU for training")
else:
    print("Using CPU for training")

Is built with GPU support:  False
Available GPUs:  []
Using CPU for training


In [13]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=2,
                               restore_best_weights=True)

model_history = model.fit(X_train_tensor, y_train,
                          validation_data=(X_val_tensor, y_val),
                          batch_size=32,
                          epochs=8,
                          callbacks=[early_stopping])

Epoch 1/8
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.6819 - loss: 1.7760 - val_accuracy: 0.8834 - val_loss: 0.3369
Epoch 2/8
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.9306 - loss: 0.2377 - val_accuracy: 0.9022 - val_loss: 0.2871
Epoch 3/8
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.9588 - loss: 0.1431 - val_accuracy: 0.9009 - val_loss: 0.3050
Epoch 4/8
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.9768 - loss: 0.0964 - val_accuracy: 0.8971 - val_loss: 0.2876


In [14]:
model.summary()

## Summary

In [16]:
y_pred = model.predict(X_test_tensor)
y_pred = [0 if pred < 0.5 else 1 for pred in y_pred]

[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1162
           1       0.87      0.92      0.89      1170

    accuracy                           0.89      2332
   macro avg       0.89      0.89      0.89      2332
weighted avg       0.89      0.89      0.89      2332

