In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet 
from nltk.tokenize import word_tokenize


### Input Dataset

In [2]:
# Read the train and test CSV files (train first, then test) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ("data_train_clean.csv", "data_test_clean.csv")
)

In [3]:
df_train.head()

Unnamed: 0,Tweet_Parsed,HS
0,kadang will be kind enough to show you how to...,0
1,ternyata komunis juga bisa menangis,0
2,user user kenapa harus bom seperti benar men...,0
3,user sumpah kaya kalau habis iya dilanjut olim...,0
4,rt user user user user user fungsi media sosia...,1


In [4]:
df_test.head()

Unnamed: 0,Tweet_Parsed,HS
0,ganteng tapi berengsek buat apa who do you thi...,1
1,sakit jiwa kali,0
2,does insulting one for being cina kafir make y...,0
3,user user karena partaimu partai demokrasi ind...,1
4,user user jangan salah itu janji jokowi di tu...,1


### Split Data

In [5]:
# Model input is the 'Tweet_Parsed' text column; the target is the 'HS' column.
X_train, y_train = df_train['Tweet_Parsed'], df_train['HS']
X_test, y_test = df_test['Tweet_Parsed'], df_test['HS']

### Model FastText

In [6]:
# Tokenise every training tweet; the result is a list with one token list per tweet.
tokenized_corpus = list(map(word_tokenize, df_train['Tweet_Parsed']))

In [7]:
import gensim
from gensim.models import FastText

In [8]:
# Build the skip-gram (sg=1) FastText model WITHOUT handing the corpus to the
# constructor. gensim builds the vocabulary and trains immediately when
# `sentences` is supplied, so the previous version trained once with the
# default epoch count and then a second time for 100 epochs. Constructing an
# empty model and building the vocabulary explicitly gives exactly one
# training run with a single learning-rate schedule.
model = FastText(vector_size=100, window=5, sg=1, min_count=1)
model.build_vocab(corpus_iterable=tokenized_corpus)

# Training the FastText model: one run of 100 epochs over the tokenised tweets.
# The value returned by train() is left as the last expression so the cell
# still displays it.
model.train(
    corpus_iterable=tokenized_corpus,
    total_examples=len(tokenized_corpus),
    epochs=100,
)

(21813484, 24895600)

In [10]:
# Inspect the 20 nearest neighbours of "anda" in the trained embedding space.
model.wv.most_similar("anda",topn=20)

[('wanda', 0.747553825378418),
 ('randa', 0.7253077626228333),
 ('janda', 0.7069796323776245),
 ('banda', 0.7031187415122986),
 ('rwanda', 0.6743406057357788),
 ('andalan', 0.6741406917572021),
 ('ganda', 0.6739581227302551),
 ('pertanda', 0.6634045243263245),
 ('tanda', 0.6497642993927002),
 ('andalannya', 0.6495592594146729),
 ('vanda', 0.6423444151878357),
 ('andaikan', 0.6386123895645142),
 ('tetanda', 0.6297962665557861),
 ('dinda', 0.6292146444320679),
 ('kalap', 0.6242552995681763),
 ('ronda', 0.6234015226364136),
 ('belgia', 0.6100491881370544),
 ('baginda', 0.606590986251831),
 ('andalkan', 0.6052596569061279),
 ('dibiatlah', 0.6020923256874084)]

### Feature Extraction (TF-IDF)

In [11]:
# Fit the TF-IDF vectorizer (default settings) on the training text only and
# transform that same text in one step.
vectorizer = TfidfVectorizer()
train_tfidf= vectorizer.fit_transform(X_train)

In [12]:
# Convert the training TF-IDF matrix to a dense DataFrame with one column per
# vocabulary term, so columns can later be addressed by term name.
TFIDF_train=pd.DataFrame(train_tfidf.toarray(),columns=vectorizer.get_feature_names_out())
# Preview the first rows only.
TFIDF_train.head()

Unnamed: 0,aaaaaaah,aaaamiiiiiiinnnn,aaid,aamiin,aamiinkan,aarman,aaron,aarze,ab,aba,...,zona,zone,zoom,zorn,zouk,zuck,zul,zulkifli,zumi,zzed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Transform the test text with the vectorizer already fitted on the training
# data (transform only, no refit), so train and test share the same columns.
test_tfidf = vectorizer.transform(X_test)
# Dense DataFrame with the same term-named columns as the training frame.
TFIDF_test=pd.DataFrame(test_tfidf.toarray(),columns=vectorizer.get_feature_names_out())
# Preview the first rows only.
TFIDF_test.head()

Unnamed: 0,aaaaaaah,aaaamiiiiiiinnnn,aaid,aamiin,aamiinkan,aarman,aaron,aarze,ab,aba,...,zona,zone,zoom,zorn,zouk,zuck,zul,zulkifli,zumi,zzed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature Expansion (TF-IDF + FastText)

In [14]:
def feature_expansion(df, feature, n, wv=None):
    """Fill zero-valued cells of each column from its most similar terms.

    For every column ``col`` of ``df`` the ``n`` most similar words are
    requested from the embedding. For each similar word ``term`` that is a
    known feature, every row where ``df[col] == 0`` and ``df[term] != 0``
    gets ``df[col]`` replaced by that row's ``df[term]`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix whose column labels are the terms to expand.
        It is modified IN PLACE.
    feature : iterable of str
        Terms allowed as expansion sources. Terms that are not also columns
        of ``df`` are ignored.
    n : int
        Number of nearest neighbours requested per column (``topn``).
    wv : object, optional
        Any object exposing ``most_similar(word, topn=...)`` that returns
        ``(word, score)`` pairs. Defaults to the notebook-level ``model.wv``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after expansion.

    Notes
    -----
    Columns are processed in order and written in place, so a value filled
    into an earlier column can itself be copied again when that column is a
    neighbour of a later one.
    """
    if wv is None:
        wv = model.wv  # fall back to the FastText model trained in the notebook

    # Build the lookup once: set membership instead of scanning `feature` for
    # every neighbour, restricted to terms that really exist as columns so
    # df[term] below cannot raise KeyError.
    usable_terms = set(feature).intersection(df.columns)

    for col in tqdm(df.columns):  # loop over every column / feature
        try:
            neighbours = wv.most_similar(col, topn=n)  # similar words for this feature
        except Exception:
            # Best-effort: a column with no usable vector is skipped. Unlike a
            # bare `except:`, this no longer swallows KeyboardInterrupt, so the
            # long-running loop can be interrupted.
            continue

        for term, _score in neighbours:  # loop over each similar word
            if term not in usable_terms:
                continue
            # Rows where this feature is 0 but the similar term is non-zero
            # take over the similar term's value.
            fill_mask = (df[col] == 0) & (df[term] != 0)
            if fill_mask.any():
                # Single .loc assignment instead of chained indexing, which
                # may write to a temporary copy and is a no-op under pandas
                # Copy-on-Write.
                df.loc[fill_mask, col] = df.loc[fill_mask, term].to_numpy()
    return df

#### Combine on TF-IDF Data

In [15]:
# Get the feature (term) names of the fitted TF-IDF vectorizer.
feature_tfidf = vectorizer.get_feature_names_out()

# Apply feature expansion to the TF-IDF train and test frames (10 neighbours
# per term). feature_expansion writes into the frame it receives and returns
# that same frame, so TFIDF_train / TFIDF_test are modified by these calls.
df_fe_tfidf_train = feature_expansion(TFIDF_train, feature_tfidf, 10)
df_fe_tfidf_test = feature_expansion(TFIDF_test, feature_tfidf, 10)

100%|████████████████████████████████████████████████████████████████████████████| 22459/22459 [10:07<00:00, 36.94it/s]
100%|████████████████████████████████████████████████████████████████████████████| 22459/22459 [07:35<00:00, 49.33it/s]


In [24]:
# Display the expanded training feature frame.
df_fe_tfidf_train

Unnamed: 0,aaaaaaah,aaaamiiiiiiinnnn,aaid,aamiin,aamiinkan,aarman,aaron,aarze,ab,aba,...,zona,zone,zoom,zorn,zouk,zuck,zul,zulkifli,zumi,zzed
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.195937,0.000000,0.195937,0.195937,0.195937,0.097845,0.0,0.097845,0.195937
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.131352,0.000000,0.000000,0.0,0.303436,0.000000
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.439753,0.000000,0.084048,0.000000,0.439753,0.0,0.439753,0.000000
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.368140,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13239,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.222391,0.222391,0.000000,0.222391,0.222391,0.222391,0.222391,0.0,0.222391,0.222391
13240,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.710663,0.710663,0.520821,0.000000,0.710663,0.0,0.710663,0.000000
13241,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.191958,0.000000,0.125904,0.000000,0.135726,0.000000,0.000000,0.0,0.191958,0.000000
13242,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


### Classification (Linear SVC)

In [16]:
# Linear SVM classifier with a fixed random_state (42), trained on the
# expanded training features against the HS labels.
svm_class = svm.LinearSVC(random_state=42)
svm_class.fit(df_fe_tfidf_train, y_train)



#### Testing

In [17]:
# Predict labels for the expanded test features with the fitted classifier.
test_svm_class=svm_class.predict(df_fe_tfidf_test)

In [None]:
# Print scikit-learn's classification report comparing the true test labels
# with the predictions; the two classes are labelled '0' and '1'.
print('\nClassification Report\n')
print(classification_report(y_test, test_svm_class, target_names=['0','1']))