In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data_after_cleansing_normalized.csv')

In [3]:
df.head()

Unnamed: 0,Tweet,HS,normalized_tweet,clean_tweet
0,- disaat semua cowok berusaha melacak perhatia...,1,- di saat semua pria berusaha melacak perhatia...,di saat semua pria berusaha melacak perhatian...
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,RT USER: USER siapa telat memberi tau elu?eda...,rt user user siapa telat memberi tau eluedan s...
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,"41. Kadang aku berfikir, kenapa aku tetap perc...",kadang aku berfikir kenapa aku tetap percaya ...
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,user user aku itu akunnku tau matamu sipit tap...
4,USER USER Kaum cebong kapir udah keliatan dong...,1,USER USER Kaum kecebong kafir sudah kelihatan ...,user user kaum kecebong kafir sudah kelihatan ...


In [4]:
df.shape

(18396, 4)

In [5]:
df['HS'].value_counts()

HS
0    10947
1     7449
Name: count, dtype: int64

In [6]:
#check maximum length of word in sentence 
max_sen = df['clean_tweet'].str.split().str.len().max()

In [7]:
max_sen

62

**Split the Data**

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
x_train, x_test, y_train, y_test = train_test_split(df['clean_tweet'], df['HS'], test_size = 0.2, random_state = 42,stratify=df['HS'])

In [10]:
len(x_train)

14716

In [11]:
 x_train.str.split().str.len().max()

62

**Remove Stopword and Tokenization**

In [12]:
#Define Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('indonesian')
# Get a list of stop words in the English language
stop_words = set(stopwords.words('indonesian'))

# Display the top 20 stop words
list(stop_words)[:20]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gilan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gilan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading indonesian: Package 'indonesian' not found
[nltk_data]     in index


['teringat-ingat',
 'sedangkan',
 'mendatang',
 'terhadapnya',
 'jelas',
 'makin',
 'menyatakan',
 'diperlukan',
 'tidak',
 'nyatanya',
 'apa',
 'secara',
 'kenapa',
 'seharusnya',
 'demikianlah',
 'sedemikian',
 'sendiri',
 'bagian',
 'diucapkan',
 'memang']

In [13]:
#tokenization
word_token=[word_tokenize(i) for i in x_train]

In [14]:
word_token

[['some',
  'berpikir',
  'doesnt',
  'add',
  'up',
  'from',
  'the',
  'present',
  'racial',
  'situation',
  'in',
  'melayu',
  'islam',
  'cina',
  'bukan',
  'islam',
  'kafir',
  'biadap',
  'dapigs',
  'menterbalikkan',
  'jalur',
  'gemilang',
  'komunis',
  'since',
  'when',
  'race',
  'and',
  'religion',
  'became',
  'the',
  'sama',
  'thing',
  'make',
  'up',
  'your',
  'mind',
  'which',
  'card',
  'are',
  'you',
  'playing',
  'race',
  'orang',
  'religion',
  'pictwittercomftbmnagsh'],
 ['because',
  'he',
  'cant',
  'actualy',
  'provide',
  'a',
  'satisfactory',
  'answer',
  'debatcapres',
  'prabowo',
  'tolak',
  'jawab',
  'soal',
  'ham'],
 ['user',
  'ini',
  'aku',
  'salut',
  'dari',
  'rejim',
  'skrg',
  'konsisten',
  'kenceng',
  'di',
  'media',
  'massa',
  'kicep',
  'di',
  'dunia',
  'nyata',
  'ntab'],
 ['user',
  'user',
  'user',
  'celengnya',
  'teriak',
  'teriak',
  'kecebong',
  'tolol',
  'padahal',
  'celeng',
  'lebih',
  'dun

In [15]:
len(word_token)

14716

In [16]:
# Remove stopwords from each sublist in word_token
filtered_tokens_train = [[word for word in sublist if word not in stop_words] for sublist in word_token]

# Display the first sublist of filtered tokens after removing stopwords
print(filtered_tokens_train[0])

['some', 'berpikir', 'doesnt', 'add', 'up', 'from', 'the', 'present', 'racial', 'situation', 'in', 'melayu', 'islam', 'cina', 'islam', 'kafir', 'biadap', 'dapigs', 'menterbalikkan', 'jalur', 'gemilang', 'komunis', 'since', 'when', 'race', 'and', 'religion', 'became', 'the', 'thing', 'make', 'up', 'your', 'mind', 'which', 'card', 'are', 'you', 'playing', 'race', 'orang', 'religion', 'pictwittercomftbmnagsh']


In [17]:
print(filtered_tokens_train[1])

['because', 'he', 'cant', 'actualy', 'provide', 'a', 'satisfactory', 'answer', 'debatcapres', 'prabowo', 'tolak', 'ham']


**Vectorization Word2Vec - Skipgram**

In [18]:
import gensim
from gensim.models import Word2Vec 
model_skipgram = gensim.models.Word2Vec(filtered_tokens_train, min_count = 1,vector_size = 64, window = 3, sg=1) 

In [19]:
vocabulary_skipgram = model_skipgram.wv.index_to_key
word_vec_dict={}
for word in vocabulary_skipgram:
    word_vec_dict[word]=model_skipgram.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict)) # should come equal to vocab size

The no of key-value pairs :  32815


In [20]:
word_vec_dict

{'user': array([ 0.56944907, -0.35083717,  0.6027212 ,  0.4549855 , -0.31903425,
        -0.45887846,  0.6518304 , -0.23057774, -0.8117003 , -0.3435235 ,
         0.48262367, -0.4624106 , -0.18731427,  0.11348974, -0.19761647,
         0.51539636, -0.6018836 , -0.2124719 , -0.52464837,  0.47709563,
         0.4714616 ,  1.12566   ,  1.0667888 , -0.5725639 , -0.13207665,
         0.56391567, -0.6942477 ,  0.10319134, -0.16421665, -0.5395064 ,
        -0.29234603, -0.06625048, -0.42586473, -0.63329995, -0.33021492,
        -0.13823389,  0.44768164,  0.10850125,  0.78654844,  0.14443341,
         0.12643114,  0.47285652, -0.27146548, -0.5676319 ,  0.07741218,
        -0.28702694, -0.10087931,  0.1614685 ,  0.29429808,  0.551974  ,
         0.32733777, -0.09789289, -0.07580118,  0.7856838 ,  0.6009493 ,
         0.05723035,  0.32894334, -0.6754641 , -0.3926221 ,  0.8260333 ,
        -0.5736606 , -0.361503  , -0.27014145, -0.0194324 ], dtype=float32),
 'the': array([ 0.14609577, -0.25512627

In [66]:
max_sen_len= max_sen # max lenght of word in a sentence
vocab_size =50000  #ideally it should be len(tok.word_index) + 1  or total no of words in data in this case = 4750, but to handle number of data which not appear in train, for example in test, make the size higher
embed_dim=64 # embedding dimension as choosen in word2vec constructor

In [67]:
from keras.preprocessing.text import one_hot,Tokenizer
tok = Tokenizer()
tok.fit_on_texts(filtered_tokens_train)
vocab_size = vocab_size
encd_rev = tok.texts_to_sequences(filtered_tokens_train)

In [68]:
# now creating the embedding matrix
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
    embed_vector=word_vec_dict.get(word) #mapping the vector to word in our skipgram dictionary
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i]=embed_vector
  # if word is not found then embed_vector corressponding to that vector will stay zero.

**Preparing the Data for Embedding Layer**

In [69]:
encd_rev

[[253,
  348,
  642,
  6358,
  105,
  66,
  2,
  4324,
  6359,
  2418,
  12,
  787,
  20,
  29,
  20,
  43,
  6360,
  12564,
  12565,
  3292,
  12566,
  50,
  2661,
  90,
  1513,
  5,
  678,
  2944,
  2,
  307,
  207,
  105,
  36,
  812,
  379,
  3293,
  26,
  7,
  5135,
  1513,
  9,
  678,
  12567],
 [96, 70, 193, 490, 3734, 8, 12568, 2419, 8326, 58, 1064, 660],
 [1, 2662, 465, 1250, 1920, 12569, 315, 1921, 12570, 212, 693, 8327],
 [1,
  1,
  1,
  12571,
  308,
  308,
  75,
  202,
  1790,
  137,
  52,
  2945,
  2420,
  248,
  16,
  12572,
  328,
  6361,
  12573,
  2080,
  283,
  5136,
  56,
  328,
  8328,
  3294],
 [12574, 8329, 4325, 8330, 903, 72, 5137, 20, 8331, 752, 5138],
 [2946,
  3735,
  2241,
  2663,
  4326,
  4327,
  4328,
  4329,
  15,
  1,
  2947,
  87,
  941,
  27,
  1115,
  1298,
  1422,
  1922,
  1923,
  4330],
 [22, 39, 2, 13, 12575, 12576],
 [3736,
  832,
  56,
  2948,
  120,
  1065,
  537,
  5,
  5139,
  36,
  2421,
  49,
  128,
  39,
  208,
  443,
  201,
  45,
  19,


In [70]:
vocab_size

50000

In [71]:
from keras.utils import pad_sequences
# now padding to have a maximum length of 64
pad_rev= pad_sequences(encd_rev, maxlen=max_sen_len, padding='post')
pad_rev.shape   # note that we had 4749 data and we have padded each review to have  a lenght of 64 words.

(14716, 62)

In [72]:
pad_rev

array([[ 253,  348,  642, ...,    0,    0,    0],
       [  96,   70,  193, ...,    0,    0,    0],
       [   1, 2662,  465, ...,    0,    0,    0],
       ...,
       [3594,    2, 3595, ...,    0,    0,    0],
       [   1,    1,    1, ...,    0,    0,    0],
       [ 106, 8135,    6, ...,    0,    0,    0]])

**Classification**

In [73]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten

In [116]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Input

model = keras.Sequential()
# Configuring the parameters
# model.add(Input((5000,)))
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_sen_len,embeddings_initializer=Constant(embed_matrix)))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(64, activation="relu"))
model.add(Dense(2, activation="sigmoid"))
model.summary()

# model = keras.Sequential()
# model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_sen_len,embeddings_initializer=Constant(embed_matrix)))
# model.add(layers.LSTM(64))
# model.add(layers.BatchNormalization())
# model.add(layers.Dense(2))
# print(model.summary())

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 62, 64)            3200000   
                                                                 
 bidirectional_10 (Bidirecti  (None, 256)              197632    
 onal)                                                           
                                                                 
 dense_21 (Dense)            (None, 64)                16448     
                                                                 
 dense_22 (Dense)            (None, 2)                 130       
                                                                 
Total params: 3,414,210
Trainable params: 3,414,210
Non-trainable params: 0
_________________________________________________________________


In [117]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="sgd",
    metrics=["accuracy"],
)

In [118]:
model.fit(pad_rev, y_train, batch_size=2, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1e400400bd0>

**Testing**

**Preparing  Test Data for Embedding Layer**

In [119]:
# Tokenize the words in the testing text data
word_token_test =[word_tokenize(i) for i in x_test]

In [120]:
word_token_test[0]

['di',
 'amerika',
 'diketahui',
 'sekitar',
 'orang',
 'setiap',
 'tahunnya',
 'pindah',
 'dari',
 'agama',
 'kristen',
 'ke',
 'agama',
 'islam']

In [121]:
# Remove stopwords from each sublist in word_token_test
filtered_tokens_test = [[word for word in sublist if word not in stop_words] for sublist in word_token_test]
# Display the first sublist of filtered tokens after removing stopwords
print(filtered_tokens_test[0])

['amerika', 'orang', 'tahunnya', 'pindah', 'agama', 'kristen', 'agama', 'islam']


In [122]:
tok.fit_on_texts(filtered_tokens_test)
encd_rev_test = tok.texts_to_sequences(filtered_tokens_test)

In [123]:
encd_rev_test

[[1148, 8, 9553, 816, 25, 70, 25, 20],
 [1, 1, 809, 1916, 85, 381, 10275, 38, 5421, 861, 10276],
 [812, 186, 2711, 2124, 22],
 [10277, 7, 24, 119, 330, 218, 38, 64],
 [1, 1307, 408, 2185, 3984, 136, 2243],
 [1, 221, 257, 10278, 1612, 8, 2344, 10279, 418, 54],
 [1, 1, 1, 10280, 3750, 582, 3446, 190, 4125, 936, 10281, 10282],
 [16, 3404, 2867, 722, 388],
 [118, 555],
 [9517, 2, 32, 10, 73, 77, 206, 1052, 130, 138, 9381],
 [55,
  126,
  42,
  1689,
  40,
  28,
  853,
  987,
  258,
  695,
  539,
  4,
  2735,
  33,
  305,
  2069,
  24,
  98],
 [31,
  58,
  2122,
  657,
  522,
  637,
  8,
  213,
  1,
  81,
  846,
  1820,
  26,
  6,
  3141,
  19,
  1359,
  32,
  98],
 [1, 1, 675, 10283, 7991, 178, 675, 323, 7969, 800, 3250],
 [117, 200, 511, 133, 9106, 1655, 581],
 [1, 5279, 10284, 472, 2240],
 [7301,
  611,
  605,
  8034,
  949,
  10285,
  10286,
  302,
  10287,
  2297,
  10288,
  206,
  25,
  364,
  10289,
  1548,
  11,
  143,
  4800],
 [1, 1, 1, 1, 7386, 304, 4698, 8134, 4698],
 [23, 966, 

In [124]:
from keras.utils import pad_sequences
# now padding to have a maximum length of 64
pad_rev_test= pad_sequences(encd_rev_test, maxlen=max_sen_len, padding='post')
pad_rev_test.shape   # note that we had 10 testing data and we have padded each review to have  a lenght of 64 words.

# from keras.utils import pad_sequences
# # now padding to have a maximum length of 64
# pad_rev= pad_sequences(encd_rev, maxlen=max_sen_len, padding='post')
# pad_rev.shape   # note that we had 4749 data and we have padded each review to have  a lenght of 64 words.

(3680, 62)

In [125]:
pad_rev_test

array([[1148,    8, 9553, ...,    0,    0,    0],
       [   1,    1,  809, ...,    0,    0,    0],
       [ 812,  186, 2711, ...,    0,    0,    0],
       ...,
       [ 980, 5402,  844, ...,    0,    0,    0],
       [3674, 6339, 4046, ...,    0,    0,    0],
       [1439, 4050, 6975, ...,    0,    0,    0]])

In [126]:
test_predict=model.predict(pad_rev_test)
classe_test=np.argmax(test_predict,axis=1)



In [127]:
classe_test

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [128]:
print('\nClassification Report\n')
print(classification_report(y_test, classe_test))


Classification Report

              precision    recall  f1-score   support

           0       0.64      0.85      0.73      2190
           1       0.57      0.29      0.39      1490

    accuracy                           0.62      3680
   macro avg       0.60      0.57      0.56      3680
weighted avg       0.61      0.62      0.59      3680

