In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

import gensim
from gensim.models import Word2Vec,KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer

import re
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet 
from nltk.tokenize import word_tokenize

**Input Data and Split Data**

In [2]:
# Read the training and test CSV files (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train_clean.csv', 'data_test_clean.csv')
)

In [3]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,Tweet_Parsed,HS
0,kadang will be kind enough to show you how to...,0
1,ternyata komunis juga bisa menangis,0
2,user user kenapa harus bom seperti benar men...,0
3,user sumpah kaya kalau habis iya dilanjut olim...,0
4,rt user user user user user fungsi media sosia...,1


In [4]:
# 'Tweet_Parsed' is the text fed to the vectorizer; 'HS' is the target passed to the classifier.
x_train, y_train = df_train['Tweet_Parsed'], df_train['HS']
x_test, y_test = df_test['Tweet_Parsed'], df_test['HS']

**Model Word2Vec**

In [5]:
# Tokenize every training tweet; the result is one list of tokens per tweet.
tokenized_corpus = list(map(word_tokenize, df_train['Tweet_Parsed']))

In [6]:
# Create the skip-gram (sg=1) model WITHOUT passing a corpus. Supplying
# `sentences=` to the constructor already builds the vocabulary and runs a full
# training pass (default 5 epochs), so a later explicit train() call would train
# the same model a second time with a restarted learning-rate schedule.
model = Word2Vec(vector_size=100, window=5, sg=1, min_count=1)

# Scan the corpus once to build the vocabulary (min_count=1 keeps every token).
model.build_vocab(tokenized_corpus)

# Train exactly once, for 100 epochs. `corpus_count` is the number of sentences
# seen by build_vocab(), i.e. len(tokenized_corpus).
model.train(tokenized_corpus, total_examples=model.corpus_count, epochs=100)

(21812301, 24895600)

In [21]:
# Sanity check of the trained embeddings: the 20 vocabulary entries most similar to "saya".
model.wv.most_similar("saya",topn=20)

[('analogi', 0.5685824751853943),
 ('menikmatinya', 0.5651764273643494),
 ('nisan', 0.5462781190872192),
 ('kerudungan', 0.543755829334259),
 ('nurdiana', 0.5394580960273743),
 ('mbokne', 0.5393359065055847),
 ('menuliskan', 0.5378443002700806),
 ('cate', 0.5333769917488098),
 ('andin', 0.5326834321022034),
 ('kaiju', 0.5308528542518616),
 ('souvenir', 0.529785692691803),
 ('melantunkan', 0.5269097685813904),
 ('marahi', 0.5231413245201111),
 ('owa', 0.5228658318519592),
 ('nyeletuk', 0.5213983654975891),
 ('rangkuman', 0.520933210849762),
 ('jinan', 0.5201530456542969),
 ('er', 0.518860399723053),
 ('sepang', 0.5185209512710571),
 ('sudin', 0.5182080268859863)]

**TF-IDF**

In [9]:
# Fit the TF-IDF vocabulary and weights on the training text only, and encode it
# in the same pass; the test text is encoded later with this fitted vectorizer.
vectorizer = TfidfVectorizer()
train_tfidf= vectorizer.fit_transform(x_train)

In [10]:
# Convert the sparse TF-IDF matrix to a dense DataFrame with one column per
# vocabulary term, then preview it.
TFIDF_train = pd.DataFrame(
    data=train_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_train.head()

Unnamed: 0,aaaaaaah,aaaamiiiiiiinnnn,aaid,aamiin,aamiinkan,aarman,aaron,aarze,ab,aba,...,zona,zone,zoom,zorn,zouk,zuck,zul,zulkifli,zumi,zzed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Encode the test text with the vectorizer fitted on the training text
# (transform only, no re-fitting), then build the matching dense DataFrame.
test_tfidf = vectorizer.transform(x_test)
TFIDF_test = pd.DataFrame(
    data=test_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_test.head()

Unnamed: 0,aaaaaaah,aaaamiiiiiiinnnn,aaid,aamiin,aamiinkan,aarman,aaron,aarze,ab,aba,...,zona,zone,zoom,zorn,zouk,zuck,zul,zulkifli,zumi,zzed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Feature Expansion**

In [12]:
def feature_expansion(df, feature, n, w2v_model=None):
    """Fill zero-valued feature cells using the values of similar terms.

    For every column ``col`` of ``df`` the ``n`` most similar words are looked
    up in the Word2Vec model. For each similar word that is itself a feature,
    every row where ``col`` is 0 and the similar word's column is non-zero gets
    the similar word's value copied into ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Document-term matrix with one column per feature. It is modified
        IN PLACE and also returned.
    feature : iterable of str
        Feature names that may be used as expansion sources. Every name that a
        similarity lookup can return from this collection must also be a
        column of ``df``.
    n : int
        Number of most-similar words requested per column (``topn``).
    w2v_model : optional
        Object exposing ``.wv.most_similar(key, topn=...)``. When omitted, the
        notebook-level ``model`` variable is used.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``.

    Notes
    -----
    Because ``df`` is updated while it is being scanned, columns processed
    later read values that may already have been filled in by earlier
    iterations, so the result depends on column order.
    """
    if w2v_model is None:
        # Fall back to the notebook-level Word2Vec model.
        w2v_model = model

    # Set membership is O(1); `term in <numpy array>` scans the whole array.
    known_features = set(feature)

    for col in tqdm(df.columns):  # one iteration per feature column
        try:
            # Find the words most similar to this feature.
            sim_word = w2v_model.wv.most_similar(col, topn=n)
        except KeyError:
            # The feature is not in the Word2Vec vocabulary: nothing to expand.
            # Only KeyError is caught so that Ctrl-C and real errors still surface.
            continue

        for term, _similarity in sim_word:  # loop over each similar word
            if term not in known_features:
                continue
            # Rows where this feature is 0 but the similar term is non-zero.
            mask = (df[col] == 0) & (df[term] != 0)
            if not mask.any():
                continue
            # Single-step .loc assignment writes into `df` itself; chained
            # indexing (df[col][mask] = ...) may write to a temporary copy.
            df.loc[mask, col] = df.loc[mask, term].to_numpy()
    return df

In [13]:
#Get Features Name
feature_tfidf = vectorizer.get_feature_names_out()

#implement Feature Expansion for TF-IDF Data
df_fe_tfidf_train = feature_expansion(TFIDF_train, feature_tfidf, 10)
df_fe_tfidf_test = feature_expansion(TFIDF_test, feature_tfidf, 10)

100%|████████████████████████████████████████████████████████████████████████████| 22459/22459 [11:33<00:00, 32.41it/s]
100%|████████████████████████████████████████████████████████████████████████████| 22459/22459 [06:40<00:00, 56.02it/s]


In [14]:
# Display the expanded training frame to inspect which zero cells were filled.
df_fe_tfidf_train

Unnamed: 0,aaaaaaah,aaaamiiiiiiinnnn,aaid,aamiin,aamiinkan,aarman,aaron,aarze,ab,aba,...,zona,zone,zoom,zorn,zouk,zuck,zul,zulkifli,zumi,zzed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.195937,0.187829,0.000000,0.187829,0.187829,0.000000,0.000000,0.0,0.000000,0.195937
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.375147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.131352,0.000000,0.287272,0.000000,0.243751,0.000000,0.000000,0.0,0.303436,0.303436
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.084048,0.000000,0.439753,0.000000,0.439753,0.000000,0.439753,0.0,0.439753,0.439753
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.368140,0.000000,0.364282,0.345005,0.000000,0.000000,0.000000,0.0,0.000000,0.364282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.222391,0.222391,0.222391,0.222391,0.222391,0.222391,0.222391,0.0,0.222391,0.222391
13240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.520821,0.000000,0.710663,0.710663,0.710663,0.000000,0.710663,0.0,0.710663,0.710663
13241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.191958,0.000000,0.191958,0.000000,0.200245,0.000000,0.191958,0.0,0.177793,0.150331
13242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.175702,0.000000,0.000000,0.000000,0.175702,0.0,0.142084,0.175702


**Modeling**

In [16]:
from sklearn.ensemble import RandomForestClassifier
import warnings
# NOTE(review): this silences every warning for the rest of the session; consider
# narrowing the filter to the specific warning category that is expected here.
warnings.filterwarnings('ignore')

# Random forests are stochastic (bootstrap samples and random feature subsets).
# Fixing the seed makes the fitted model, and therefore the reported metrics,
# reproducible across kernel restarts.
RF_class = RandomForestClassifier(random_state=42)
RF_class.fit(df_fe_tfidf_train, y_train)

**Testing**

In [18]:
# Predict labels for the expanded test features with the fitted random forest.
test_RF=RF_class.predict(df_fe_tfidf_test)

In [19]:
# Print the heading followed by the per-class precision/recall/F1 table
# comparing the true test labels with the random-forest predictions.
print(
    '\nClassification Report\n',
    classification_report(y_test, test_RF, target_names=['0','1']),
    sep='\n',
)


Classification Report

              precision    recall  f1-score   support

           0       0.73      0.84      0.78      2204
           1       0.69      0.54      0.60      1476

    accuracy                           0.72      3680
   macro avg       0.71      0.69      0.69      3680
weighted avg       0.72      0.72      0.71      3680

