In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

import gensim
from gensim.models import FastText

from tqdm import tqdm

In [2]:
# Read the pre-cleaned train and test splits from their CSV files.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data_train_clean.csv", "data_test_clean.csv")
)

In [3]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,Tweet_Parsed,HS
0,kadang will be kind enough to show you how to...,0
1,ternyata komunis juga bisa menangis,0
2,user user kenapa harus bom seperti benar men...,0
3,user sumpah kaya kalau habis iya dilanjut olim...,0
4,rt user user user user user fungsi media sosia...,1


In [4]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,Tweet_Parsed,HS
0,ganteng tapi berengsek buat apa who do you thi...,1
1,sakit jiwa kali,0
2,does insulting one for being cina kafir make y...,0
3,user user karena partaimu partai demokrasi ind...,1
4,user user jangan salah itu janji jokowi di tu...,1


In [5]:
# Separate each split into its text column (model input) and its HS label column (target).
X_train, y_train = df_train['Tweet_Parsed'], df_train['HS']
X_test, y_test = df_test['Tweet_Parsed'], df_test['HS']

In [6]:
# TF-IDF over unigrams and bigrams, capped at 5000 features.
# The vocabulary is fitted on the training texts only.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
train_tfidf = vectorizer.fit_transform(X_train)

In [7]:
# Densify the sparse training matrix into a DataFrame with one column per TF-IDF feature.
TFIDF_train = pd.DataFrame(
    data=train_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_train.head(n=5)

Unnamed: 0,aamiin,ab,abadi,abang,able,able to,about,about it,about that,about the,...,youtube,yudhoyono,yudhoyono tidak,zalim,zaman,zaman now,zaman sekarang,zaman susilo,zionis,zon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.107702,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Project the test texts onto the vocabulary fitted on the training set (transform only, no refit),
# then densify into a DataFrame with the same feature columns.
test_tfidf = vectorizer.transform(X_test)
TFIDF_test = pd.DataFrame(
    data=test_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_test.head(n=5)

Unnamed: 0,aamiin,ab,abadi,abang,able,able to,about,about it,about that,about the,...,youtube,yudhoyono,yudhoyono tidak,zalim,zaman,zaman now,zaman sekarang,zaman susilo,zionis,zon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Load the pre-trained FastText binary (Facebook's native .bin format).
# `FastText.load_fasttext_format` is deprecated and no longer exists in gensim 4.x;
# `load_facebook_model` is the documented replacement and returns the same kind of model.
model_ft = gensim.models.fasttext.load_facebook_model('fasttext/cc.id.300.bin')
# Only the word vectors are needed for the similarity lookups done later in the notebook.
ft = model_ft.wv

  model_ft = FastText.load_fasttext_format('fasttext/cc.id.300.bin')


In [10]:
def feature_expansion(df, feature, n):
    """Fill zero-valued cells of each feature column from its most similar features.

    For every column ``col`` of ``df``, the ``n`` most similar words are requested
    from the module-level word vectors ``ft``. For each returned word that is also
    listed in ``feature``, every row where ``col`` is 0 and that word's column is
    non-zero receives the word's value. Columns are processed in order and the
    frame is updated as the loop runs, so later columns see earlier fills.

    Args:
        df: DataFrame with one column per feature. It is modified in place.
        feature: Iterable of feature names allowed as fill sources; any name that
            is used must also be a column of ``df``.
        n: Number of similar words requested per column (passed as ``topn``).

    Returns:
        The same ``df`` object, after expansion.
    """
    # Resolve the global once so a missing model fails loudly here instead of
    # being swallowed once per column inside the loop.
    word_vectors = ft
    # Set membership is O(1); `term in ndarray` rescans the whole vocabulary each time.
    known_features = set(feature)

    for col in tqdm(df.columns):  # one pass per feature column
        try:
            # Look up the words most similar to this feature.
            neighbours = word_vectors.similar_by_word(col, topn=n)
        except Exception:
            # Best effort: a feature the embedding cannot handle is simply not expanded.
            # `Exception` (rather than a bare `except`) keeps Ctrl-C working in this long loop.
            continue

        for neighbour in neighbours:
            term = neighbour[0]  # entries are (word, similarity) pairs
            if term not in known_features:
                continue
            # Rows where this feature is 0 but the similar feature has a weight.
            fill_mask = (df[col] == 0) & (df[term] != 0)
            if fill_mask.any():
                # Single .loc assignment instead of chained indexing, which is unreliable
                # (SettingWithCopyWarning, and a no-op under copy-on-write).
                df.loc[fill_mask, col] = df.loc[fill_mask, term].to_numpy()
    return df

In [11]:
# Names of the TF-IDF features; only these are eligible as expansion sources.
feature_tfidf = vectorizer.get_feature_names_out()

# Run feature expansion on both TF-IDF frames, using the 20 most similar words per feature.
# Note: feature_expansion writes into the frame it is given and returns that same frame.
df_fe_tfidf_train = feature_expansion(df=TFIDF_train, feature=feature_tfidf, n=20)
df_fe_tfidf_test = feature_expansion(df=TFIDF_test, feature=feature_tfidf, n=20)

100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [14:40<00:00,  5.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [13:35<00:00,  6.13it/s]


In [12]:
# Linear SVM trained on the expanded TF-IDF features; the seed is fixed for reproducibility.
svm_class = svm.LinearSVC(random_state=42)
svm_class.fit(X=df_fe_tfidf_train, y=y_train)



In [13]:
# Predict labels for the expanded test features.
test_svm_class = svm_class.predict(X=df_fe_tfidf_test)

In [14]:
# Print a heading followed by per-class precision/recall/F1 on the test set.
print(
    '\nClassification Report\n',
    classification_report(y_true=y_test, y_pred=test_svm_class, target_names=['0','1']),
    sep='\n',
)


Classification Report

              precision    recall  f1-score   support

           0       0.82      0.86      0.84      2204
           1       0.78      0.72      0.75      1476

    accuracy                           0.81      3680
   macro avg       0.80      0.79      0.80      3680
weighted avg       0.81      0.81      0.81      3680

