# CODE

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
from keras import layers, models

2024-05-25 02:17:25.373629: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 02:17:25.409958: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = models.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

Dataset file from: J. Wang, K. Fu, C.T. Lu, “SOSNet: A Graph Convolutional Network Approach to Fine-Grained Cyberbullying Detection,” Proceedings of the 2020 IEEE International Conference on Big Data (IEEE BigData 2020), December 10-13, 2020.

In [3]:
# load cyberbullying dataset
tweets_data = pd.read_csv('data/cyberbullying_tweets.csv')
print(tweets_data.head())

# encode labels
tweets_data['label'] = tweets_data['cyberbullying_type'].factorize()[0]
print(f'\nAfter encoding labels:\n{tweets_data.head()}')

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying

After encoding labels:
                                          tweet_text cyberbullying_type  label
0  In other words #katandandre, your food was cra...  not_cyberbullying      0
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying      0
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying      0
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying      0
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying      0


In [4]:
# split data
x_train, x_val, y_train, y_val = train_test_split(
    tweets_data['tweet_text'], 
    tweets_data['label'], 
    test_size=0.2, random_state=42
)

In [5]:
print('{0}, {1}'.format(x_train[0], y_train[0]))

In other words #katandandre, your food was crapilicious! #mkr, 0


In [6]:
maxlen = 100
vocab_size = 20000

# tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train) # sets up the internal vocabulary using the training data
x_train = tokenizer.texts_to_sequences(x_train) # convert text to sequence, i.e. each word is represented by a number
x_val = tokenizer.texts_to_sequences(x_val)

x_train = pad_sequences(x_train, maxlen=maxlen)
x_val = pad_sequences(x_val, maxlen=maxlen)

In [7]:
x_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,    40, 14912,   529,   143,
         277,    27,    95,   113,  1144,    42,   628,  1346,     8,
          87,   775, 14913,   290,  5413,  1125,   190, 14914,    85,
         193], dtype=int32)

In [8]:
# model construction
embed_dim = 100 # embedding size for each token
num_heads = 3 # number of attention heads
ff_dim = 128 # hidden layer size in feed forward network inside transformer

num_classes = len(tweets_data['cyberbullying_type'].unique())
inputs = layers.Input(shape=(embed_dim,))
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(embedding_layer)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(50, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = models.Model(inputs=inputs, outputs=outputs)

2024-05-25 02:17:27.814033: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2024-05-25 02:17:27.814058: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:134] retrieving CUDA diagnostic information for host: vito-msi
2024-05-25 02:17:27.814064: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:141] hostname: vito-msi
2024-05-25 02:17:27.814181: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:165] libcuda reported version is: 550.54.15
2024-05-25 02:17:27.814195: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:169] kernel reported version is: 550.54.15
2024-05-25 02:17:27.814198: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:248] kernel version seems to match DSO: 550.54.15


In [9]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001), 
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
)

history = model.fit(
    x_train, y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val)
)


Epoch 1/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 29ms/step - accuracy: 0.3901 - loss: 1.4546 - val_accuracy: 0.7667 - val_loss: 0.4877
Epoch 2/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 49ms/step - accuracy: 0.7850 - loss: 0.4720 - val_accuracy: 0.8165 - val_loss: 0.4441
Epoch 3/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 81ms/step - accuracy: 0.8407 - loss: 0.3811 - val_accuracy: 0.8374 - val_loss: 0.3957
Epoch 4/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 126ms/step - accuracy: 0.8755 - loss: 0.3104 - val_accuracy: 0.8236 - val_loss: 0.4212
Epoch 5/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 57ms/step - accuracy: 0.8980 - loss: 0.2654 - val_accuracy: 0.8430 - val_loss: 0.4276
Epoch 6/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 48ms/step - accuracy: 0.9114 - loss: 0.2245 - val_accuracy: 0.8382 - val_loss: 0.4679
Ep