In [1]:
import re
import warnings

import gensim
import numpy as np
import pandas as pd
from gensim.models import FastText
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from tqdm import tqdm

In [2]:
# Read the train and test CSVs (file names suggest they were cleaned upstream)
# from the notebook's working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data_train_clean.csv", "data_test_clean.csv")
)

In [3]:
# Model input is the 'Tweet_Parsed' text column; the target is the 'HS' column
# (presumably a binary hate-speech flag -- TODO confirm against the dataset docs).
X_train, y_train = df_train['Tweet_Parsed'], df_train['HS']
X_test, y_test = df_test['Tweet_Parsed'], df_test['HS']

In [4]:
# Unigram + bigram TF-IDF, capped at 10,000 features.
# The vocabulary is learned from the training tweets only.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
)
train_tfidf = vectorizer.fit_transform(X_train)

In [5]:
# Densify the TF-IDF matrix into a DataFrame with one column per term, so that
# columns can be addressed by term name later on.
TFIDF_train = pd.DataFrame(
    data=train_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_train.head()

Unnamed: 0,aamiin,ab,abad,abadi,abang,abdul,abdullah,ability,able,able to,...,zaman pak,zaman sekarang,zaman susilo,zen,zero,zina,zionis,zionis laknat,zon,zul
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Project the test tweets onto the vocabulary fitted on the training set
# (transform only -- the vectorizer is not refitted), then densify the result
# into a DataFrame with one column per term.
test_tfidf = vectorizer.transform(X_test)
TFIDF_test = pd.DataFrame(
    data=test_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
TFIDF_test.head()

Unnamed: 0,aamiin,ab,abad,abadi,abang,abdul,abdullah,ability,able,able to,...,zaman pak,zaman sekarang,zaman susilo,zen,zero,zina,zionis,zionis laknat,zon,zul
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Load the pre-trained fastText binary (presumably the Indonesian Common Crawl
# vectors, judging by the file name -- TODO confirm).
# `FastText.load_fasttext_format` is deprecated; `load_facebook_model` is its
# documented replacement and returns the same kind of full model object.
model_ft = gensim.models.fasttext.load_facebook_model('fasttext/cc.id.300.bin')

# Only the word vectors are needed for similarity look-ups.
ft = model_ft.wv

  model_ft = FastText.load_fasttext_format('fasttext/cc.id.300.bin')


In [8]:
def feature_expansion(df, feature, n, model=None):
    """Fill zero-valued term columns with weights borrowed from similar words.

    For every column of ``df`` (each column name is treated as a term), the
    ``n`` most similar words are requested from the word-vector model. For each
    similar word that is also listed in ``feature``, rows where the column is 0
    but the similar word's column is non-zero receive the similar word's value.

    Columns are processed in order and ``df`` is updated as the loop runs, so a
    value filled into an earlier column may be borrowed again by a later one.

    Parameters
    ----------
    df : pandas.DataFrame
        Document-term matrix with one column per term. **Modified in place.**
    feature : iterable of str
        Terms allowed as donors; every one of them must be a column of ``df``.
    n : int
        Number of nearest neighbours requested per column (passed as ``topn``).
    model : object, optional
        Any object exposing ``similar_by_word(word, topn=...)`` that returns a
        sequence of ``(word, score)`` pairs. Defaults to the notebook-level
        ``ft`` word vectors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after expansion.
    """
    # Resolve the model before the loop so that a missing `ft` fails loudly
    # here instead of being silently swallowed for every single column.
    vectors = ft if model is None else model

    # Set membership is O(1); scanning the raw feature array for each
    # neighbour would be O(len(feature)).
    donor_terms = set(feature)

    for col in tqdm(df.columns):  # one pass per column / feature
        try:
            neighbours = vectors.similar_by_word(col, topn=n)
        except Exception:
            # Best effort: terms the model cannot handle are skipped.
            # Unlike a bare `except:`, this still lets Ctrl-C interrupt the
            # (long-running) loop.
            continue

        for term in (pair[0] for pair in neighbours):
            if term not in donor_terms:
                continue
            # Rows where this column is empty but the similar term is present.
            fill_mask = (df[col] == 0) & (df[term] != 0)
            if fill_mask.any():
                # Single .loc assignment: chained `df[col][mask] = ...` may
                # write to a temporary and is a no-op under Copy-on-Write.
                df.loc[fill_mask, col] = df.loc[fill_mask, term]
    return df

In [9]:
# Vocabulary learned by the TF-IDF vectorizer; only these terms may act as
# donors during feature expansion.
feature_tfidf = vectorizer.get_feature_names_out()

# Apply feature expansion (7 nearest neighbours per term) to the TF-IDF data.
# feature_expansion() modifies the frame it receives, so pass copies: the
# original TFIDF_train / TFIDF_test stay intact and re-running this cell gives
# the same result. Note this temporarily doubles the memory used by each frame.
df_fe_tfidf_train = feature_expansion(TFIDF_train.copy(), feature_tfidf, 7)
df_fe_tfidf_test = feature_expansion(TFIDF_test.copy(), feature_tfidf, 7)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [22:47<00:00,  7.31it/s]
100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [24:49<00:00,  6.71it/s]


In [10]:
# Silence all warnings from here on (applies to every later cell as well).
warnings.filterwarnings('ignore')

# SVC with default hyper-parameters, wrapped in a one-step pipeline.
# probability=True makes scikit-learn fit an extra, randomised internal
# cross-validation for probability estimates; random_state pins that shuffle
# so the estimates are reproducible across runs.
test_SVM_class = make_pipeline(SVC(probability=True, random_state=42))
test_SVM_class.fit(df_fe_tfidf_train, y_train)

In [11]:
# Hard class predictions for the expanded test matrix.
SVM_class = test_SVM_class.predict(X=df_fe_tfidf_test)

In [12]:
# Per-class precision / recall / F1 of the SVM predictions on the test split.
svm_report = classification_report(
    y_test,
    SVM_class,
    target_names=['0', '1'],
)
print('\nClassification Report\n')
print(svm_report)


Classification Report

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      2204
           1       0.84      0.67      0.75      1476

    accuracy                           0.82      3680
   macro avg       0.82      0.79      0.80      3680
weighted avg       0.82      0.82      0.81      3680

