## Data


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive at /content/drive; the CSV paths read below live under
# that mount point. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the two preprocessed hadith datasets from the mounted drive.
# The file names suggest two preprocessing orders ("stemakhir" vs "stemdulu");
# presumably stemming applied last vs. first -- TODO confirm.
stemakhir_csv = "/content/drive/MyDrive/PROPOSAL dan TA/code/hadis_preprocessed_stemakhir.csv"
stemdulu_csv = "/content/drive/MyDrive/PROPOSAL dan TA/code/hadis_preprocessed_stemdulu.csv"

df1 = pd.read_csv(stemakhir_csv)
df2 = pd.read_csv(stemdulu_csv)

In [4]:
# Preview the first rows of the stemakhir frame to check the loaded columns.
df1.head()

Unnamed: 0,no_stopword,bersih,Class
0,"['iman', 'memiliki', 'enam', 'puluh', 'cabang'...",iman milik enam puluh cabang malu iman,1
1,"['muslim', 'orang', 'kaum', 'muslimin', 'selam...",muslim orang kaum muslimin selamat lisan tanga...,2
2,"['makan', 'salam', 'orang', 'kenal', 'kenal']",makan salam orang kenal kenal,1
3,"['beriman', 'dicintainya', 'orang', 'tuanya', ...",iman cinta orang tua anak,1
4,"['tanda', 'iman', 'mencintai', 'kaum', 'anshar...",tanda iman cinta kaum anshar tanda nifaq benci...,1


In [5]:
# Preview the first rows of the stemdulu frame to check the loaded columns.
df2.head()

Unnamed: 0,no_stopword,bersih,Class
0,"['iman', 'milik', 'enam', 'puluh', 'cabang', '...",iman milik enam puluh cabang malu iman,1
1,"['orang', 'muslim', 'orang', 'kaum', 'muslimin...",orang muslim orang kaum muslimin selamat lisan...,2
2,"['makan', 'salam', 'orang', 'kenal', 'kenal']",makan salam orang kenal kenal,1
3,"['iman', 'orang', 'cinta', 'orang', 'tua', 'an...",iman orang cinta orang tua anak,1
4,"['tanda', 'iman', 'cinta', 'kaum', 'anshar', '...",tanda iman cinta kaum anshar tanda nifaq benci...,1


## Feature Extraction (TF-IDF)

In [6]:
# TF-IDF features for the stemakhir corpus.
# Every cell of 'bersih' is coerced with np.str_ first, so any non-string
# value is vectorised as its string form instead of raising.
texts_stemakhir = df1['bersih'].apply(np.str_)

tfidf_vect_ = TfidfVectorizer()
df_stemakhir_tfidf = tfidf_vect_.fit_transform(texts_stemakhir)

# (n_documents, vocabulary_size)
df_stemakhir_tfidf.shape

(1626, 3429)

In [7]:
# TF-IDF features for the stemdulu corpus.
# Every cell of 'bersih' is coerced with np.str_ first, so any non-string
# value is vectorised as its string form instead of raising.
texts_stemdulu = df2['bersih'].apply(np.str_)

# NOTE: this rebinds `tfidf_vect_`; the vectorizer fitted on df1 in the
# previous cell is no longer reachable under that name after this cell runs.
tfidf_vect_ = TfidfVectorizer()
df_stemdulu_tfidf = tfidf_vect_.fit_transform(texts_stemdulu)

# (n_documents, vocabulary_size)
df_stemdulu_tfidf.shape

(1626, 3304)

## Feature Selection (Chi-Square)

In [8]:
def chi_square_stemakhir(x, y, k=280):
  """Keep the ``k`` columns of ``x`` that score highest on the chi-square test against ``y``.

  Args:
    x: Feature matrix handed to ``SelectKBest.fit_transform`` (this notebook
      passes the stemakhir TF-IDF matrix).
    y: Target labels, one per row of ``x``.
    k: Number of features to keep. Defaults to 280, the value that used to be
      hard-coded, so existing two-argument calls behave exactly as before.

  Returns:
    Whatever ``SelectKBest.fit_transform`` returns: ``x`` reduced to the
    selected columns.

  Note:
    The fitted selector is local to this function and is discarded, so the
    same column selection cannot be re-applied to other data afterwards.
  """
  chi2_selector_stemakhir = SelectKBest(chi2, k=k)
  return chi2_selector_stemakhir.fit_transform(x, y)

In [9]:
def chi_square_stemdulu(x, y, k=280):
  """Keep the ``k`` columns of ``x`` that score highest on the chi-square test against ``y``.

  Args:
    x: Feature matrix handed to ``SelectKBest.fit_transform`` (this notebook
      passes the stemdulu TF-IDF matrix).
    y: Target labels, one per row of ``x``.
    k: Number of features to keep. Defaults to 280, the value that used to be
      hard-coded, so existing two-argument calls behave exactly as before.

  Returns:
    Whatever ``SelectKBest.fit_transform`` returns: ``x`` reduced to the
    selected columns.

  Note:
    The fitted selector is local to this function and is discarded, so the
    same column selection cannot be re-applied to other data afterwards.
  """
  chi2_selector_stemdulu = SelectKBest(chi2, k=k)
  return chi2_selector_stemdulu.fit_transform(x, y)

In [10]:
# Targets: the 'Class' column of each frame.
y_stemakhir, y_stemdulu = df1['Class'], df2['Class']

# Reduce each TF-IDF matrix with chi-square feature selection.
# NOTE(review): selection is fitted here on the full matrices, i.e. before the
# train/test split performed in later cells, so the chosen features have seen
# the test labels. Consider splitting first and fitting on the train part only.
x_stemakhir = chi_square_stemakhir(df_stemakhir_tfidf, y_stemakhir)
x_stemdulu = chi_square_stemdulu(df_stemdulu_tfidf, y_stemdulu)

In [11]:
# Feature counts before and after chi-square selection (stemakhir).
print("Untuk Stem Akhir")
n_before_stemakhir = df_stemakhir_tfidf.shape[1]
n_after_stemakhir = x_stemakhir.shape[1]
print('Original feature number:', n_before_stemakhir)
print('Reduced feature number:', n_after_stemakhir)

Untuk Stem Akhir
Original feature number: 3429
Reduced feature number: 280


In [12]:
# Feature counts before and after chi-square selection (stemdulu).
print("Untuk Stem Dahulu")
n_before_stemdulu = df_stemdulu_tfidf.shape[1]
n_after_stemdulu = x_stemdulu.shape[1]
print('Original feature number:', n_before_stemdulu)
print('Reduced feature number:', n_after_stemdulu)

Untuk Stem Dahulu
Original feature number: 3304
Reduced feature number: 280


In [13]:
# Split the stemakhir features/labels into train and test parts
# (20% held out, fixed random_state so the split is repeatable).
(
    X_train_stemakhir,
    X_test_stemakhir,
    y_train_stemakhir,
    y_test_stemakhir,
) = train_test_split(x_stemakhir, y_stemakhir, test_size=0.2, random_state=0)

# Print the shape of every partition as a quick sanity check.
for split_label, split_part in (
    ("X Train : ", X_train_stemakhir),
    ("y Train : ", y_train_stemakhir),
    ("X Test : ", X_test_stemakhir),
    ("y Test : ", y_test_stemakhir),
):
    print(split_label, split_part.shape)

X Train :  (1300, 280)
y Train :  (1300,)
X Test :  (326, 280)
y Test :  (326,)


In [14]:
# Split the stemdulu features/labels into train and test parts
# (20% held out, fixed random_state so the split is repeatable).
(
    X_train_stemdulu,
    X_test_stemdulu,
    y_train_stemdulu,
    y_test_stemdulu,
) = train_test_split(x_stemdulu, y_stemdulu, test_size=0.2, random_state=0)

# Print the shape of every partition as a quick sanity check.
for split_label, split_part in (
    ("X Train : ", X_train_stemdulu),
    ("y Train : ", y_train_stemdulu),
    ("X Test : ", X_test_stemdulu),
    ("y Test : ", y_test_stemdulu),
):
    print(split_label, split_part.shape)

X Train :  (1300, 280)
y Train :  (1300,)
X Test :  (326, 280)
y Test :  (326,)


## Modeling

In [15]:
# stem dulu -- XGBoost with default hyperparameters

# The class labels start at 1 (see the 'Class' column / classification report),
# but recent XGBoost releases only accept labels 0..n_classes-1 and raise
# "Invalid classes inferred from unique values of `y`" otherwise.
# Train on the encoded indices and map the predictions back, so the metrics
# below are still reported against the original labels.
classes_stemdulu, y_train_stemdulu_encoded = np.unique(y_train_stemdulu, return_inverse=True)

xg = XGBClassifier()

xg.fit(X_train_stemdulu, y_train_stemdulu_encoded)
predicted_idx_stemdulu = np.asarray(xg.predict(X_test_stemdulu)).astype(int)
predictions_stemdulu = classes_stemdulu[predicted_idx_stemdulu]

print(accuracy_score(y_test_stemdulu, predictions_stemdulu))
print(confusion_matrix(y_test_stemdulu, predictions_stemdulu))
print(classification_report(y_test_stemdulu, predictions_stemdulu))

0.7239263803680982
[[90  1  7]
 [24 74 10]
 [40  8 72]]
              precision    recall  f1-score   support

           1       0.58      0.92      0.71        98
           2       0.89      0.69      0.77       108
           3       0.81      0.60      0.69       120

    accuracy                           0.72       326
   macro avg       0.76      0.73      0.73       326
weighted avg       0.77      0.72      0.73       326



In [None]:
# stem dulu with parameter (max_depth=44)
# NOTE(review): the notebook records no justification for max_depth=44. If it
# was picked by comparing scores on this same test split, the accuracy printed
# below is optimistic -- confirm, or tune on a validation split instead.

# The class labels start at 1 (see the 'Class' column / classification report),
# but recent XGBoost releases only accept labels 0..n_classes-1 and raise
# "Invalid classes inferred from unique values of `y`" otherwise.
# Train on the encoded indices and map the predictions back, so the metrics
# below are still reported against the original labels.
classes_stemdulu, y_train_stemdulu_encoded = np.unique(y_train_stemdulu, return_inverse=True)

xg = XGBClassifier(max_depth=44)

xg.fit(X_train_stemdulu, y_train_stemdulu_encoded)
predicted_idx_stemdulu = np.asarray(xg.predict(X_test_stemdulu)).astype(int)
predictions_stemdulu = classes_stemdulu[predicted_idx_stemdulu]

print(accuracy_score(y_test_stemdulu, predictions_stemdulu))
print(confusion_matrix(y_test_stemdulu, predictions_stemdulu))
print(classification_report(y_test_stemdulu, predictions_stemdulu))

0.7423312883435583
[[76  8 14]
 [10 84 14]
 [28 10 82]]
              precision    recall  f1-score   support

           1       0.67      0.78      0.72        98
           2       0.82      0.78      0.80       108
           3       0.75      0.68      0.71       120

    accuracy                           0.74       326
   macro avg       0.75      0.75      0.74       326
weighted avg       0.75      0.74      0.74       326



In [16]:
# stem akhir -- XGBoost with default hyperparameters

# The class labels start at 1 (see the 'Class' column / classification report),
# but recent XGBoost releases only accept labels 0..n_classes-1 and raise
# "Invalid classes inferred from unique values of `y`" otherwise.
# Train on the encoded indices and map the predictions back, so the metrics
# below are still reported against the original labels.
classes_stemakhir, y_train_stemakhir_encoded = np.unique(y_train_stemakhir, return_inverse=True)

xg = XGBClassifier()

xg.fit(X_train_stemakhir, y_train_stemakhir_encoded)
predicted_idx_stemakhir = np.asarray(xg.predict(X_test_stemakhir)).astype(int)
predictions_stemakhir = classes_stemakhir[predicted_idx_stemakhir]

print(accuracy_score(y_test_stemakhir, predictions_stemakhir))
print(confusion_matrix(y_test_stemakhir, predictions_stemakhir))
print(classification_report(y_test_stemakhir, predictions_stemakhir))

0.6993865030674846
[[89  1  8]
 [22 74 12]
 [46  9 65]]
              precision    recall  f1-score   support

           1       0.57      0.91      0.70        98
           2       0.88      0.69      0.77       108
           3       0.76      0.54      0.63       120

    accuracy                           0.70       326
   macro avg       0.74      0.71      0.70       326
weighted avg       0.74      0.70      0.70       326



In [35]:
# stem akhir with parameter (max_depth=15)
# NOTE(review): the notebook records no justification for max_depth=15. If it
# was picked by comparing scores on this same test split, the accuracy printed
# below is optimistic -- confirm, or tune on a validation split instead.

# The class labels start at 1 (see the 'Class' column / classification report),
# but recent XGBoost releases only accept labels 0..n_classes-1 and raise
# "Invalid classes inferred from unique values of `y`" otherwise.
# Train on the encoded indices and map the predictions back, so the metrics
# below are still reported against the original labels.
classes_stemakhir, y_train_stemakhir_encoded = np.unique(y_train_stemakhir, return_inverse=True)

xg = XGBClassifier(max_depth=15)

xg.fit(X_train_stemakhir, y_train_stemakhir_encoded)
predicted_idx_stemakhir = np.asarray(xg.predict(X_test_stemakhir)).astype(int)
predictions_stemakhir = classes_stemakhir[predicted_idx_stemakhir]

print(accuracy_score(y_test_stemakhir, predictions_stemakhir))
print(confusion_matrix(y_test_stemakhir, predictions_stemakhir))
print(classification_report(y_test_stemakhir, predictions_stemakhir))

0.745398773006135
[[83  4 11]
 [13 86  9]
 [35 11 74]]
              precision    recall  f1-score   support

           1       0.63      0.85      0.72        98
           2       0.85      0.80      0.82       108
           3       0.79      0.62      0.69       120

    accuracy                           0.75       326
   macro avg       0.76      0.75      0.75       326
weighted avg       0.76      0.75      0.75       326

