# Import Packages and Dataset

## Packages

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Input

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,hamming_loss

import matplotlib.pyplot as plt
import seaborn as sns

## Dataset

In [3]:
# Mengatur pandas agar menampilkan semua teks tanpa terpotong
pd.set_option('display.max_colwidth', None)

In [9]:
# Mengatur pandas agar menampilkan hingga 100 baris
pd.set_option('display.max_rows', None)

In [2]:
df_test = pd.read_csv('data_test_clean.csv')
df_train = pd.read_csv('data_train_clean.csv')

In [10]:
df_train.head(100)

Unnamed: 0,Tweet_Parsed,HS
0,kadang will be kind enough to show you how to look berkelas tidak mau punya teman kampungan drinks only the finest wine orang no drinking when he being modest will accompany you to watch movies does not listen to you that much but will know when kamu not ok because he actualy cares about kamu,0
1,ternyata komunis juga bisa menangis,0
2,user user kenapa harus bom seperti benar menunjukkan kita ini siapa dengan embel bom masih buat takut orang orang sebenarnya apalagi keluar menjadi korban,0
3,user sumpah kaya kalau habis iya dilanjut olimpiade sama pagelaran budaya tidak,0
4,rt user user user user user fungsi media sosial di indonesia untuk sebar fitnah di as untuk kecerdasan url,1
5,calon wakil presiden terduga korupsi e kartu tanda penduduk calon gubernur jawa tengahnya terduga korupsi e kartu tanda penduduk tapi masih belum ditindak ganti presiden presiden baru,1
6,israel unforgiven you are the real terrorist laknat allah menantimu zionis pray for gaza pray for palestine,1
7,how do we know that they just said a bullshit rt user tetap pergi ke yunani anggota buku dewan perwakilan rakyat jamin tidak pelesiran,1
8,happy anniversary failed bulan dengan pria berengsek,0
9,user anjing benar benar ayam kampus kamu,1


In [27]:
# Mengambil 100 baris secara acak
random_sample = df_train.sample(n=100)

In [28]:
random_sample

Unnamed: 0,Tweet_Parsed,HS
1844,di arab koruptor di potong tangan di cina koruptor di potong kepala di indonesia koruptor di potong masa tahanan,1
6789,angry disappointed emptiness like no hope at all tahun kasus novel jokowi gagal ganti presiden,0
10916,user anjing tai goblok idiot bangsat monyet babi fuck kont kangen goblok iya tau kasar banget maaf,1
2252,user fitnah am intel yahudi belum dihukum,0
10683,pemerintah diminta pergi peraturan presiden tenaga kerja asing,0
4634,apa the fuck apa girl friend bangsat anjing memang,1
8401,kamu kaya jablay grand theft auto maunya dijemput pakai mobil,1
11789,usir cina dari negara kesatuan republik indonesia warga negara indonesia keturunan cina diskrining dulu,1
7951,kesepakatan bersama dodi giri sumatera selatan user user user user user,0
8492,user merong merong bukan karena gendut atau kurus bukan karena kurang ngewe tetapi karena banyak uit senang saja tidak perlu merong merong tinggal nih nih,0


In [19]:
df_val = pd.read_csv('data_val_nostemstop_preprocessed.csv',sep=";")

In [20]:
kolom_yang_diinginkan = ['Tweet_Parsed', 'HS']

In [21]:
df_val = df_val[kolom_yang_diinginkan]

In [22]:
df_val

Unnamed: 0,Tweet_Parsed,HS
0,so much berengsek people in the world,1
1,so how can i get my rekening berengsek rekenin...,0
2,rt user dasar bajingan url,0
3,user karena kecebong tidak akan mengakui pria ...,1
4,rt user tidak ada urusannya monyet aku broken ...,1
...,...,...
1467,apa dengan this shame on you malaysia apa happen,0
1468,kemana paduka ketika ulama jabar diserang bahk...,0
1469,user cakap cakap bolot,1
1470,user user if you heard from our government an...,1


In [23]:
df_val.to_csv('data_val_clean.csv', index=False)

In [4]:
df_train.head()

Unnamed: 0,Tweet_Parsed,HS
0,kadang will be kind enough to show you how to...,0
1,ternyata komunis juga bisa menangis,0
2,user user kenapa harus bom seperti benar men...,0
3,user sumpah kaya kalau habis iya dilanjut olim...,0
4,rt user user user user user fungsi media sosia...,1


In [5]:
df_train['HS'].value_counts()

HS
0    7884
1    5360
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

In [24]:
x_train = df_train['Tweet_Parsed']
x_test = df_test['Tweet_Parsed']
x_val = df_val['Tweet_Parsed']

In [25]:
y_train = df_train['HS']
y_test = df_test['HS']
y_val = df_val['HS']

In [10]:
x_train.head()

0     kadang will be kind enough to show you how to...
1                 ternyata komunis juga bisa menangis 
2    user user kenapa harus bom  seperti  benar men...
3    user sumpah kaya kalau habis iya dilanjut olim...
4    rt user user user user user fungsi media sosia...
Name: Tweet_Parsed, dtype: object

In [11]:
y_train.head()

0    0
1    0
2    0
3    0
4    1
Name: HS, dtype: int64

In [12]:
y_test.shape

(3680,)

# Size of Vocabulary

In [26]:
vocab_size = 5000

In [27]:
vect = Tokenizer(num_words = vocab_size)
vect.fit_on_texts(x_train)
# vocab_size = len(vect.word_index)+1

print(vocab_size)

5000


# Modelling Using LSTMs

## Padding and preparing input sequences

In [28]:
encoded_docs_train = vect.texts_to_sequences(x_train)
padded_docs_train = sequence.pad_sequences(encoded_docs_train,maxlen=100,padding='post')
print(padded_docs_train)

[[935 157  80 ...   0   0   0]
 [344  59  40 ...   0   0   0]
 [  1   1  83 ...   0   0   0]
 ...
 [779 635 295 ...   0   0   0]
 [126   8 492 ...   0   0   0]
 [  9  94  75 ...   0   0   0]]


In [29]:
encoded_docs_test = vect.texts_to_sequences(x_test)
padded_docs_test = sequence.pad_sequences(encoded_docs_test,maxlen=100,padding='post')
print(padded_docs_test)

[[1010   62  160 ...    0    0    0]
 [ 513 1204  254 ...    0    0    0]
 [ 372  209   39 ...    0    0    0]
 ...
 [ 840 2775  843 ...    0    0    0]
 [   1    1 1099 ...    0    0    0]
 [   1 1729   66 ...    0    0    0]]


In [30]:
encoded_docs_val = vect.texts_to_sequences(x_val)
padded_docs_val = sequence.pad_sequences(encoded_docs_val,maxlen=100,padding='post')
print(padded_docs_val)

[[  77  414  160 ...    0    0    0]
 [  77  195  109 ...    0    0    0]
 [  37    1  161 ...    0    0    0]
 ...
 [   1 1209 1209 ...    0    0    0]
 [   1    1  126 ...    0    0    0]
 [   1    1    1 ...    0    0    0]]


## Defining Model

**Penjelasan:**
- vocab_size adalah ukuran kosakata (jumlah kata yang berbeda dalam korpus teks)
- output_dim adalah dimensi dari vektor ruang kata yang dihasilkan. Dalam kasus ini, dimensi vektor adalah 64.
- Lapisan LSTM dalam kode ini memiliki 64 unit LSTM. Bidirectional menandakan bahwa LSTM ini akan dipelajari baik dari data masa lalu ke masa depan maupun sebaliknya, meningkatkan kapabilitasnya dalam memahami konteks dalam urutan.
- Menambahkan lapisan Dense (fully connected) dengan 64 unit dan fungsi aktivasi ReLU (Rectified Linear Activation). Lapisan Dense ini berfungsi sebagai lapisan tersembunyi, yang bertanggung jawab untuk mempelajari representasi fitur yang lebih tingkat dari data.
- fungsi aktivasi sigmoid digunakan untuk menghasilkan probabilitas untuk masing-masing kelas (dsini terdapat 1 kelas).

In [31]:
# # Inisialisasi model
# model = Sequential()
# model.add(Embedding(vocab_size, output_dim=64))
# model.add(Bidirectional(LSTM(64)))
# model.add(Dropout(0.5))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))

model = Sequential()
# Configuring the parameters
# model.add(Input((5000,)))
model.add(Embedding(vocab_size, output_dim=64))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 394,369
Trainable params: 394,369
Non-trainable params: 0
_________________________________________________________________


In [28]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png')

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Training using adam optimizer and binary cross entropy

In [32]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

history = model.fit(padded_docs_train, y_train.values, 
                    validation_data=(padded_docs_val, y_val.values),
                    epochs=15, batch_size=256, 
                    verbose=2)

Epoch 1/15
52/52 - 21s - loss: 0.6394 - accuracy: 0.6265 - val_loss: 0.5235 - val_accuracy: 0.7643 - 21s/epoch - 400ms/step
Epoch 2/15
52/52 - 23s - loss: 0.4267 - accuracy: 0.8077 - val_loss: 0.4278 - val_accuracy: 0.7982 - 23s/epoch - 436ms/step
Epoch 3/15
52/52 - 23s - loss: 0.3298 - accuracy: 0.8578 - val_loss: 0.4583 - val_accuracy: 0.7894 - 23s/epoch - 435ms/step
Epoch 4/15
52/52 - 22s - loss: 0.2790 - accuracy: 0.8797 - val_loss: 0.5127 - val_accuracy: 0.7908 - 22s/epoch - 429ms/step
Epoch 5/15
52/52 - 23s - loss: 0.2434 - accuracy: 0.9034 - val_loss: 0.5583 - val_accuracy: 0.7846 - 23s/epoch - 435ms/step
Epoch 6/15
52/52 - 23s - loss: 0.2145 - accuracy: 0.9152 - val_loss: 0.5973 - val_accuracy: 0.7806 - 23s/epoch - 440ms/step
Epoch 7/15
52/52 - 22s - loss: 0.1866 - accuracy: 0.9305 - val_loss: 0.6130 - val_accuracy: 0.7717 - 22s/epoch - 417ms/step
Epoch 8/15
52/52 - 22s - loss: 0.1623 - accuracy: 0.9400 - val_loss: 0.7482 - val_accuracy: 0.7765 - 22s/epoch - 429ms/step
Epoch 9/

In [33]:
# Final evaluation of the model
scores = model.evaluate(padded_docs_test, y_test.values)

print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 78.67%


# Predict and Result

## Predict Data Test

In [34]:
predict = model.predict(padded_docs_test)
predict_train = model.predict(padded_docs_train)
predict_val = model.predict(padded_docs_val)
thresholds=0.5



In [35]:
def labelSetAccuracy(y_true, y_pred):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
#         print('\nset_true: {0}'.format(set_true))
#         print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [36]:
pred_test = predict.copy()
pred_test[pred_test>=0.5] = 1
pred_test[pred_test<0.5] = 0
acc = accuracy_score(y_test.values,pred_test)
haml_loss = hamming_loss(y_test.values,pred_test)
label_acc = labelSetAccuracy(y_test.values,pred_test)
print("threshold = ",thresholds)
print("exact accuracy = ", acc)
print("hamming loss = ",haml_loss)
print("label based accuracy = ",label_acc)
print("==============================")

threshold =  0.5
exact accuracy =  0.7866847826086957
hamming loss =  0.21331521739130435
label based accuracy =  0.7866847826086957


In [37]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, pred_test, target_names=['0','1']))


Classification Report

              precision    recall  f1-score   support

           0       0.81      0.84      0.83      2204
           1       0.75      0.71      0.73      1476

    accuracy                           0.79      3680
   macro avg       0.78      0.77      0.78      3680
weighted avg       0.79      0.79      0.79      3680

