In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet 
from nltk.tokenize import word_tokenize


In [2]:
# Load the pre-cleaned train and test splits from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data_train_clean.csv", "data_test_clean.csv")
)

In [3]:
# Preview the first rows of the training split (columns: Tweet_Parsed text, HS label).
df_train.head()

Unnamed: 0,Tweet_Parsed,HS
0,kadang will be kind enough to show you how to...,0
1,ternyata komunis juga bisa menangis,0
2,user user kenapa harus bom seperti benar men...,0
3,user sumpah kaya kalau habis iya dilanjut olim...,0
4,rt user user user user user fungsi media sosia...,1


In [4]:
# Preview the first rows of the test split (same columns as the training split).
df_test.head()

Unnamed: 0,Tweet_Parsed,HS
0,ganteng tapi berengsek buat apa who do you thi...,1
1,sakit jiwa kali,0
2,does insulting one for being cina kafir make y...,0
3,user user karena partaimu partai demokrasi ind...,1
4,user user jangan salah itu janji jokowi di tu...,1


In [5]:
# Separate the tweet text (model input) from the HS label (target) for each split.
X_train, y_train = df_train['Tweet_Parsed'], df_train['HS']
X_test, y_test = df_test['Tweet_Parsed'], df_test['HS']

In [6]:
# Tokenise every training tweet; the resulting list of token lists feeds Word2Vec.
tokenized_corpus = list(map(word_tokenize, df_train['Tweet_Parsed']))

In [7]:
import gensim
from gensim.models import Word2Vec,KeyedVectors

In [8]:
# Build a Word2Vec model from the tokenised training tweets using the library defaults.
model = Word2Vec(sentences=tokenized_corpus)
# Alternative configuration (currently disabled):
#model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=8, sg=0, min_count=3)
# Run 100 further training epochs over the same corpus; the returned tuple is shown as cell output.
# NOTE(review): gensim's constructor presumably already trains when `sentences` is supplied, so
# this is additional training on top of that pass — confirm that is intended.
# NOTE(review): no seed is passed, so the embeddings (and the feature expansion built on them)
# may differ between runs.
model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=100)

(18774315, 24895600)

In [9]:
# Sanity check: list the 20 vocabulary words closest to "kamu" in the learned embedding space.
model.wv.most_similar("kamu",topn=20)

[('aku', 0.5912880897521973),
 ('apa', 0.5436927080154419),
 ('seseorang', 0.46027082204818726),
 ('tai', 0.4477623999118805),
 ('anjing', 0.42978543043136597),
 ('dia', 0.424623966217041),
 ('orang', 0.4179839789867401),
 ('bangkai', 0.40880513191223145),
 ('kakak', 0.4064479172229767),
 ('bangsat', 0.4048042595386505),
 ('kalian', 0.39630207419395447),
 ('bodoh', 0.39595937728881836),
 ('sama', 0.38899552822113037),
 ('mu', 0.3720608651638031),
 ('burik', 0.3714412450790405),
 ('kasar', 0.3667048215866089),
 ('babi', 0.3649771213531494),
 ('kebalik', 0.3608976900577545),
 ('bajingan', 0.3605107069015503),
 ('tidak', 0.3530711829662323)]

In [18]:
# Fit TF-IDF on the training tweets only, using unigrams and bigrams and capping the
# vocabulary at 20,000 features. The fitted vectorizer is reused for the test split.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
train_tfidf= vectorizer.fit_transform(X_train)

In [19]:
# Densify the sparse training TF-IDF matrix into a DataFrame with one named column per feature.
TFIDF_train = pd.DataFrame(
    data=train_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_train.head()

Unnamed: 0,aamiin,aamiin rabbal,ab,abad,abadi,abal,abang,abdi,abdul,abdullah,...,zat,zen,zero,zina,zinnirah,zionis,zionis laknat,zon,zul,zumi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Project the test tweets onto the vocabulary learned from the training split
# (transform only, no refit), then densify into a DataFrame with the same columns.
test_tfidf = vectorizer.transform(X_test)
TFIDF_test = pd.DataFrame(
    data=test_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_test.head()

Unnamed: 0,aamiin,aamiin rabbal,ab,abad,abadi,abal,abang,abdi,abdul,abdullah,...,zat,zen,zero,zina,zinnirah,zionis,zionis laknat,zon,zul,zumi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def feature_expansion(df, feature, n, wv=None):
    """Fill zero-valued features with the weight of a semantically similar feature.

    For every column of ``df`` the ``n`` most similar words are looked up in the
    word-embedding model. For each similar word that is itself a feature, rows
    where the column is 0 but the similar word's column is non-zero receive the
    similar word's value.

    Parameters
    ----------
    df : pandas.DataFrame
        Dense feature matrix, one column per feature name. It is modified
        IN PLACE and also returned, so the caller's frame is changed.
    feature : iterable of str
        Names that are valid expansion sources (normally the vectorizer's
        feature names). Every name that is used must also be a column of ``df``.
    n : int
        Number of nearest neighbours requested per column.
    wv : object, optional
        Anything exposing ``most_similar(word, topn=...)`` that raises
        ``KeyError`` for unknown words. Defaults to the notebook-level
        ``model.wv`` so existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    Notes
    -----
    Because ``df`` is updated while it is being iterated, a value copied into
    one column can later be copied onward when another column borrows from it.
    """
    vectors = model.wv if wv is None else wv
    # Set membership is O(1); `term in ndarray` rescans the whole array each time.
    known_features = set(feature)

    for col in tqdm(df.columns):  # one pass per feature column
        try:
            neighbours = vectors.most_similar(col, topn=n)
        except KeyError:
            # Column name (e.g. a bigram or a rare word) is not in the embedding
            # vocabulary, so there is nothing to expand with.
            continue

        for term, _similarity in neighbours:
            if term not in known_features:
                continue
            # Rows where this feature is absent but the similar feature is present.
            fill_mask = ((df[col] == 0) & (df[term] != 0)).to_numpy()
            if not fill_mask.any():
                continue
            # Single .loc assignment: chained `df[col][mask] = ...` is a silent
            # no-op under pandas Copy-on-Write and warns on older versions.
            df.loc[fill_mask, col] = df.loc[fill_mask, term].to_numpy()
    return df

In [22]:
# Feature names of the fitted TF-IDF vocabulary; used as the set of allowed expansion sources.
feature_tfidf = vectorizer.get_feature_names_out()

# Apply feature expansion (10 nearest Word2Vec neighbours per feature) to both splits.
# feature_expansion returns the very frame it was given and updates it in place, so
# df_fe_tfidf_train / df_fe_tfidf_test refer to the same objects as TFIDF_train / TFIDF_test;
# the un-expanded dense frames are no longer available after this cell.
df_fe_tfidf_train = feature_expansion(TFIDF_train, feature_tfidf, 10)
df_fe_tfidf_test = feature_expansion(TFIDF_test, feature_tfidf, 10)

100%|████████████████████████████████████████████████████████████████████████████| 20000/20000 [03:23<00:00, 98.05it/s]
100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [02:00<00:00, 166.42it/s]


In [15]:
# Display the expanded training features.
# NOTE(review): this cell's execution count (15) is lower than that of the cells above it
# (18-22) and its rendered columns differ from the TF-IDF preview, so the saved output
# likely comes from an earlier run — re-run after Restart & Run All.
df_fe_tfidf_train

Unnamed: 0,aamiin,ab,abad,abadi,abang,abdul,abdullah,ability,able,able to,...,zaman pak,zaman sekarang,zaman susilo,zen,zero,zina,zionis,zionis laknat,zon,zul
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.157118,0.0,...,0.0,0.0,0.0,0.051628,0.078212,0.260216,0.260216,0.0,0.108967,0.120503
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.406389,0.282108,0.282108,0.282108,0.0,0.282108,0.282108
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.147631,0.110797,0.110797,0.110797,0.0,0.142974,0.110797
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.147441,0.147441,0.147441,0.147441,0.0,0.306087,0.147441
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.220134,0.220134,0.220134,0.220134,0.0,0.220134,0.220134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13239,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.1026,0.151687,0.0,...,0.0,0.0,0.0,0.095771,0.172734,0.071838,0.144666,0.0,0.144666,0.079623
13240,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.508442,0.508442,0.508442,0.508442,0.0,0.508442,0.508442
13241,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.199286,0.120584,0.120584,0.120584,0.0,0.199286,0.260184
13242,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.0,...,0.0,0.0,0.0,0.303748,0.074116,0.141681,0.141681,0.0,0.300683,0.300683


In [23]:
# Train a linear SVM on the expanded TF-IDF features; random_state is fixed at 42.
svm_class = svm.LinearSVC(random_state=42)
svm_class.fit(df_fe_tfidf_train, y_train)



In [24]:
# Predict HS labels for the expanded test features.
test_svm_class=svm_class.predict(df_fe_tfidf_test)

In [25]:
# Per-class precision / recall / F1 of the SVM predictions against the test labels.
print('\nClassification Report\n')
report_text = classification_report(
    y_test,
    test_svm_class,
    target_names=['0','1'],
)
print(report_text)


Classification Report

              precision    recall  f1-score   support

           0       0.81      0.82      0.81      2204
           1       0.72      0.71      0.72      1476

    accuracy                           0.77      3680
   macro avg       0.77      0.76      0.76      3680
weighted avg       0.77      0.77      0.77      3680



In [26]:
# Feature names of the fitted TF-IDF vocabulary.
feature_tfidf = vectorizer.get_feature_names_out()

# Total weight over all feature columns of TFIDF_train.
# NOTE(review): the original label here was "before", but TFIDF_train was already passed to
# feature_expansion above, which updates the frame in place — so this sum presumably reflects
# the expanded values, not the pre-expansion TF-IDF. Confirm, and compute the "before" figure
# prior to calling feature_expansion if a comparison is wanted.
TFIDF_train[feature_tfidf].sum().sum()

6193669.950693104