## Loading Processed Data

### Setting up environment

In [2]:
DATA_BASE_PATH = "../Datasets/processed-en/"

### Loading metadata

In [3]:
import json

metadata = {}

with open(DATA_BASE_PATH+'metadata.json', 'r') as file:
    metadata = json.load(file)

print(metadata)


{'name': 'optimalisasi-model-machine-learning-untuk-klasifikasi-ujaran-kebencian-pada-x-twitter', 'author': 'doddy-s', 'created_at': '2024-11-30T18:07:36.391282', 'meta': {'dataset': {'longest_text': 264, 'count': 31977, 'file_name': 'english-hate-speech-processed-dataset', 'file_format': 'csv'}, 'dataset_ready_to_train': {'longest_text': 264, 'count': 31977, 'file_name': 'english-hate-speech-processed-dataset-ready-to-train', 'file_format': 'parquet'}, 'dataset_ready_to_train_json': {'longest_text': 264, 'count': 31977, 'file_name': 'english-hate-speech-processed-dataset-ready-to-train-json', 'file_format': 'json'}, 'word_index': {'count': 28224, 'file_name': 'english-hate-speech-processed-word-index', 'file_format': 'json'}}}


### Loading dataset tokenized

In [4]:
import pandas as pd

dataset_file = metadata['meta']['dataset_ready_to_train']['file_name']+'.'+metadata['meta']['dataset_ready_to_train']['file_format']

df = pd.read_parquet(DATA_BASE_PATH+dataset_file)

In [5]:
df.head()

Unnamed: 0,text,label
0,"[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 0, 0,...",0
2,"[14, 15, 16, 17, 18, 19, 4, 20, 21, 22, 23, 24...",0
3,"[42, 43, 44, 45, 46, 47, 48, 49, 50, 43, 44, 5...",0
4,"[14, 85, 86, 87, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0


### Loading word index

In [6]:
import json

word_index_file = metadata['meta']['word_index']['file_name']+'.'+metadata['meta']['word_index']['file_format']
word_index = {}

with open(DATA_BASE_PATH+word_index_file, 'r') as file:
    word_index = json.load(file)

print(word_index)

{'translat': 1, 'target': 2, 'awar': 3, 'thi': 4, 'cocktail': 5, 'certainli': 6, 'drink': 7, 'deserv': 8, 'recip': 9, 'consid': 10, 'older': 11, 'side': 12, 'thing': 13, 'well': 14, 'perhap': 15, 'could': 16, 'explain': 17, 'exactli': 18, 'think': 19, 'articl': 20, 'suppos': 21, 'becaus': 22, 'frankli': 23, 'baffl': 24, 'total': 25, 'obscur': 26, 'huge': 27, 'vijay': 28, 'prasad': 29, 'quot': 30, 'seriou': 31, 'weight': 32, 'problem': 33, 'indic': 34, 'pleas': 35, 'let': 36, 'know': 37, 'move': 38, 'vehicl': 39, 'pov': 40, 'thank': 41, 'index': 42, 'saddam': 43, 'hussein': 44, 'recent': 45, 'editor': 46, 'ha': 47, 'chang': 48, 'default': 49, 'sort': 50, 'rather': 51, 'edit': 52, 'summari': 53, 'base': 54, 'comment': 55, 'way': 56, 'follow': 57, 'behind': 58, 'event': 59, 'seem': 60, 'odd': 61, 'cours': 62, 'also': 63, 'unbeknownst': 64, 'standard': 65, 'arab': 66, 'name': 67, 'doubt': 68, 'howev': 69, 'see': 70, 'hi': 71, 'son': 72, 'inde': 73, 'befor': 74, 'ani': 75, 'revert': 76, 'ev

## Splitting Data

Splitting train, validation, and test data

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np

X = np.array([arr.astype(np.float32) for arr in df['text'].values])

X_train_, X_test, y_train_, y_test = train_test_split(
    X,
    df['label'].values,
    test_size=0.2,
    random_state=69,
    stratify=df['label'].values,
)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train_,
    y_train_,
    test_size=0.25,
    random_state=69,
    stratify=y_train_
)

In [9]:
import tensorflow as tf

X_train_tensor = tf.convert_to_tensor(X_train)
X_val_tensor = tf.convert_to_tensor(X_val)
X_test_tensor = tf.convert_to_tensor(X_test)


## Creating Model

In [10]:
import tensorflow as tf
from tensorflow.keras import regularizers

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index)+1, output_dim=64, mask_zero=True, name='embedding'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2, recurrent_activation='sigmoid'), name='bilstm'),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.02), name='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid', name='sigmoid')
])


model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss = 'binary_crossentropy',
              metrics=['accuracy'])

In [11]:
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss = 'binary_crossentropy',
              metrics=['accuracy'])

In [12]:
model.summary()

## Training Model

In [13]:
import tensorflow as tf

# Check if TensorFlow is built with GPU support
print("Is built with GPU support: ", tf.test.is_built_with_gpu_support())

# List available GPUs
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs: ", gpus)

# Check if TensorFlow is using the GPU
if gpus:
    print("Using GPU for training")
else:
    print("Using CPU for training")

Is built with GPU support:  False
Available GPUs:  []
Using CPU for training


In [14]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=2,
                               restore_best_weights=True)

model_history = model.fit(X_train_tensor, y_train,
                          validation_data=(X_val_tensor, y_val),
                          batch_size=32,
                          epochs=8,
                          callbacks=[early_stopping])

Epoch 1/8
[1m281/600[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m41s[0m 129ms/step - accuracy: 0.6384 - loss: 1.6245

KeyboardInterrupt: 

In [14]:
model.summary()

## Summary

In [16]:
y_pred = model.predict(X_test_tensor)
y_pred = [0 if pred < 0.5 else 1 for pred in y_pred]

[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1162
           1       0.87      0.92      0.89      1170

    accuracy                           0.89      2332
   macro avg       0.89      0.89      0.89      2332
weighted avg       0.89      0.89      0.89      2332

