# Import Packages and Dataset

## Packages

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Input

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,hamming_loss

import matplotlib.pyplot as plt
import seaborn as sns

## Dataset

In [2]:
df = pd.read_csv('data_after_cleansing_normalized.csv')

In [3]:
df.head()

Unnamed: 0,Tweet,HS,normalized_tweet,clean_tweet
0,- disaat semua cowok berusaha melacak perhatia...,1,- di saat semua pria berusaha melacak perhatia...,di saat semua pria berusaha melacak perhatian...
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,RT USER: USER siapa telat memberi tau elu?eda...,rt user user siapa telat memberi tau eluedan s...
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,"41. Kadang aku berfikir, kenapa aku tetap perc...",kadang aku berfikir kenapa aku tetap percaya ...
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,user user aku itu akunnku tau matamu sipit tap...
4,USER USER Kaum cebong kapir udah keliatan dong...,1,USER USER Kaum kecebong kafir sudah kelihatan ...,user user kaum kecebong kafir sudah kelihatan ...


In [4]:
df.shape

(18396, 4)

In [5]:
df['HS'].value_counts()

HS
0    10947
1     7449
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
input_df=df['clean_tweet']
output_df=df['HS']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(input_df, output_df, test_size = 0.2, random_state = 42)

In [9]:
y_test.shape

(3680,)

In [10]:
x_train.shape

(14716,)

In [11]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

In [12]:
x_val.shape

(1472,)

In [13]:
x_train.head()

3927     katauntukhariini ahoker goblok tuntut lengserk...
7130     user user user egp kalau individualnya kiraan ...
17884    shameonyoumalaysia shameofyoumalaysia anything...
15132    we must disband those are threatening our home...
15415    the place is a warehouse that has been filled ...
Name: clean_tweet, dtype: object

In [14]:
y_train.head()

3927     1
7130     0
17884    1
15132    1
15415    0
Name: HS, dtype: int64

# Size of Vocabulary

In [15]:
vocab_size = 5000

In [16]:
vect = Tokenizer(num_words = vocab_size)
vect.fit_on_texts(x_train)
# vocab_size = len(vect.word_index)+1

print(vocab_size)

5000


# Modelling Using LSTMs

## Padding and preparing input sequences

In [17]:
encoded_docs_train = vect.texts_to_sequences(x_train)
padded_docs_train = sequence.pad_sequences(encoded_docs_train,maxlen=100,padding='post')
print(padded_docs_train)

[[2798  160  226 ...    0    0    0]
 [   1    1    1 ...    0    0    0]
 [  58 4358  650 ...    0    0    0]
 ...
 [   9  366   15 ...    0    0    0]
 [2340  652 1058 ...    0    0    0]
 [ 281  861  314 ...    0    0    0]]


In [18]:
encoded_docs_test = vect.texts_to_sequences(x_test)
padded_docs_test = sequence.pad_sequences(encoded_docs_test,maxlen=100,padding='post')
print(padded_docs_test)

[[  25    1 2683 ...    0    0    0]
 [   1    1   54 ...    0    0    0]
 [3991  223 1343 ...    0    0    0]
 ...
 [   1   91  153 ...    0    0    0]
 [   1    1  157 ...    0    0    0]
 [ 439  399   17 ...    0    0    0]]


In [19]:
encoded_docs_val = vect.texts_to_sequences(x_val)
padded_docs_val = sequence.pad_sequences(encoded_docs_val,maxlen=100,padding='post')
print(padded_docs_val)

[[  26   82    0 ...    0    0    0]
 [1414   21 2701 ...    0    0    0]
 [ 738  131 2565 ...    0    0    0]
 ...
 [   1   67   46 ...    0    0    0]
 [   1 1318  678 ...    0    0    0]
 [   1    1   53 ...    0    0    0]]


## Defining Model

**Penjelasan:**
- vocab_size adalah ukuran kosakata (jumlah kata yang berbeda dalam korpus teks)
- output_dim adalah dimensi dari vektor ruang kata yang dihasilkan. Dalam kasus ini, dimensi vektor adalah 64.
- Lapisan LSTM dalam kode ini memiliki 64 unit LSTM. Bidirectional menandakan bahwa LSTM ini akan dipelajari baik dari data masa lalu ke masa depan maupun sebaliknya, meningkatkan kapabilitasnya dalam memahami konteks dalam urutan.
- Menambahkan lapisan Dense (fully connected) dengan 64 unit dan fungsi aktivasi ReLU (Rectified Linear Activation). Lapisan Dense ini berfungsi sebagai lapisan tersembunyi, yang bertanggung jawab untuk mempelajari representasi fitur yang lebih tingkat dari data.
- fungsi aktivasi sigmoid digunakan untuk menghasilkan probabilitas untuk masing-masing kelas (dsini terdapat 1 kelas).

In [20]:
# # Inisialisasi model
# model = Sequential()
# model.add(Embedding(vocab_size, output_dim=64))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dropout(0.5))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))

model = Sequential()
# Configuring the parameters
# model.add(Input((5000,)))
model.add(Embedding(vocab_size, output_dim=64))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 394,369
Trainable params: 394,369
Non-trainable params: 0
_________________________________________________________________


In [28]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png')

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Training using adam optimizer and binary cross entropy

In [25]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

history = model.fit(padded_docs_train, y_train.values, 
                    validation_data=(padded_docs_val, y_val.values),
                    epochs=15, batch_size=256, 
                    verbose=2)

Epoch 1/15
52/52 - 21s - loss: 0.6322 - accuracy: 0.6323 - val_loss: 0.5080 - val_accuracy: 0.7588 - 21s/epoch - 399ms/step
Epoch 2/15
52/52 - 21s - loss: 0.4115 - accuracy: 0.8139 - val_loss: 0.4236 - val_accuracy: 0.8077 - 21s/epoch - 405ms/step
Epoch 3/15
52/52 - 22s - loss: 0.3101 - accuracy: 0.8680 - val_loss: 0.4494 - val_accuracy: 0.8125 - 22s/epoch - 422ms/step
Epoch 4/15
52/52 - 22s - loss: 0.2655 - accuracy: 0.8894 - val_loss: 0.4681 - val_accuracy: 0.8003 - 22s/epoch - 416ms/step
Epoch 5/15
52/52 - 22s - loss: 0.2244 - accuracy: 0.9126 - val_loss: 0.5425 - val_accuracy: 0.7948 - 22s/epoch - 417ms/step
Epoch 6/15
52/52 - 23s - loss: 0.1975 - accuracy: 0.9207 - val_loss: 0.5853 - val_accuracy: 0.7942 - 23s/epoch - 434ms/step
Epoch 7/15
52/52 - 22s - loss: 0.1641 - accuracy: 0.9385 - val_loss: 0.6596 - val_accuracy: 0.7846 - 22s/epoch - 429ms/step
Epoch 8/15
52/52 - 22s - loss: 0.1355 - accuracy: 0.9524 - val_loss: 0.7948 - val_accuracy: 0.7819 - 22s/epoch - 430ms/step
Epoch 9/

In [26]:
# Final evaluation of the model
scores = model.evaluate(padded_docs_test, y_test.values)

print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 77.61%


# Predict and Result

## Predict Data Test

In [27]:
predict = model.predict(padded_docs_test)
predict_train = model.predict(padded_docs_train)
predict_val = model.predict(padded_docs_val)
thresholds=0.5



In [28]:
def labelSetAccuracy(y_true, y_pred):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
#         print('\nset_true: {0}'.format(set_true))
#         print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [29]:
pred_test = predict.copy()
pred_test[pred_test>=0.5] = 1
pred_test[pred_test<0.5] = 0
acc = accuracy_score(y_test.values,pred_test)
haml_loss = hamming_loss(y_test.values,pred_test)
label_acc = labelSetAccuracy(y_test.values,pred_test)
print("threshold = ",thresholds)
print("exact accuracy = ", acc)
print("hamming loss = ",haml_loss)
print("label based accuracy = ",label_acc)
print("==============================")

threshold =  0.5
exact accuracy =  0.7760869565217391
hamming loss =  0.22391304347826088
label based accuracy =  0.7760869565217391


In [30]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, pred_test, target_names=['0','1']))


Classification Report

              precision    recall  f1-score   support

           0       0.81      0.81      0.81      2167
           1       0.73      0.73      0.73      1513

    accuracy                           0.78      3680
   macro avg       0.77      0.77      0.77      3680
weighted avg       0.78      0.78      0.78      3680

