## Data


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# for the rest of the session is hidden (including deprecation warnings from the
# libraries imported above). Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive at /content/drive; the CSV paths read below live under it.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell ties the notebook to that environment — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# read data
# Both preprocessed CSVs sit in the same Drive folder. Keeping that folder in a
# single constant means relocating the data only requires editing one line.
DATA_DIR = "/content/drive/MyDrive/PROPOSAL dan TA/code"

# df1 <- hadis_preprocessed_stemakhir.csv, df2 <- hadis_preprocessed_stemdulu.csv
df1 = pd.read_csv(f"{DATA_DIR}/hadis_preprocessed_stemakhir.csv")
df2 = pd.read_csv(f"{DATA_DIR}/hadis_preprocessed_stemdulu.csv")

In [4]:
# Preview the first rows of the "stemakhir" frame.
# The 'Unnamed: 0' column is left in place; the cells shown below only read the
# 'bersih' and 'Class' columns. (A drop of that column was once sketched here,
# but it referenced a `df` that is not defined in the cells shown.)
df1.head()

Unnamed: 0,no_stopword,bersih,Class
0,"['iman', 'memiliki', 'enam', 'puluh', 'cabang'...",iman milik enam puluh cabang malu iman,1
1,"['muslim', 'orang', 'kaum', 'muslimin', 'selam...",muslim orang kaum muslimin selamat lisan tanga...,2
2,"['makan', 'salam', 'orang', 'kenal', 'kenal']",makan salam orang kenal kenal,1
3,"['beriman', 'dicintainya', 'orang', 'tuanya', ...",iman cinta orang tua anak,1
4,"['tanda', 'iman', 'mencintai', 'kaum', 'anshar...",tanda iman cinta kaum anshar tanda nifaq benci...,1


In [5]:
# Preview the first rows of the "stemdulu" frame for comparison with df1.
df2.head()

Unnamed: 0,no_stopword,bersih,Class
0,"['iman', 'milik', 'enam', 'puluh', 'cabang', '...",iman milik enam puluh cabang malu iman,1
1,"['orang', 'muslim', 'orang', 'kaum', 'muslimin...",orang muslim orang kaum muslimin selamat lisan...,2
2,"['makan', 'salam', 'orang', 'kenal', 'kenal']",makan salam orang kenal kenal,1
3,"['iman', 'orang', 'cinta', 'orang', 'tua', 'an...",iman orang cinta orang tua anak,1
4,"['tanda', 'iman', 'cinta', 'kaum', 'anshar', '...",tanda iman cinta kaum anshar tanda nifaq benci...,1


## Feature extraction and split data

In [6]:
# tf-idf
# Each corpus gets its own vectorizer. Re-fitting one shared instance on the
# second corpus would overwrite the vocabulary/IDF that produced the first
# matrix, leaving no fitted object that can map df_stemakhir_tfidf's columns
# back to terms or transform new "stemakhir" text consistently.
#
# NOTE(review): both vectorizers are fitted on the full corpus before the
# train/test split below, so the vocabulary and IDF weights include test
# documents. Consider fitting on the training split only.
#
# str() is applied so that any non-string cell is turned into text before
# vectorizing (a missing value would become the literal token "nan").
tfidf_vect_stemakhir = TfidfVectorizer()
df_stemakhir_tfidf = tfidf_vect_stemakhir.fit_transform(df1['bersih'].apply(str))

tfidf_vect_stemdulu = TfidfVectorizer()
df_stemdulu_tfidf = tfidf_vect_stemdulu.fit_transform(df2['bersih'].apply(str))

# Backward-compatible alias: the original shared name ended up fitted on the
# "stemdulu" corpus, so keep it pointing at that vectorizer.
tfidf_vect_ = tfidf_vect_stemdulu

In [7]:
# split into train and test sets (stemakhir)
X_stemakhir, y_stemakhir = df_stemakhir_tfidf, df1['Class']

# 80/20 split with a fixed seed so the partition is repeatable.
(
    X_train_stemakhir,
    X_test_stemakhir,
    y_train_stemakhir,
    y_test_stemakhir,
) = train_test_split(X_stemakhir, y_stemakhir, test_size=0.2, random_state=0)

# Report the shape of each partition as a quick sanity check.
for split_name, split_data in (
    ("X Train : ", X_train_stemakhir),
    ("y Train : ", y_train_stemakhir),
    ("X Test : ", X_test_stemakhir),
    ("y Test : ", y_test_stemakhir),
):
    print(split_name, split_data.shape)

X Train :  (1300, 3429)
y Train :  (1300,)
X Test :  (326, 3429)
y Test :  (326,)


In [8]:
# split into train and test sets (stemdulu)
X_stemdulu, y_stemdulu = df_stemdulu_tfidf, df2['Class']

# 80/20 split with a fixed seed so the partition is repeatable.
(
    X_train_stemdulu,
    X_test_stemdulu,
    y_train_stemdulu,
    y_test_stemdulu,
) = train_test_split(X_stemdulu, y_stemdulu, test_size=0.2, random_state=0)

# Report the shape of each partition as a quick sanity check.
for split_name, split_data in (
    ("X Train : ", X_train_stemdulu),
    ("y Train : ", y_train_stemdulu),
    ("X Test : ", X_test_stemdulu),
    ("y Test : ", y_test_stemdulu),
):
    print(split_name, split_data.shape)

X Train :  (1300, 3304)
y Train :  (1300,)
X Test :  (326, 3304)
y Test :  (326,)


## Modeling

In [None]:
# stem akhir
# Baseline XGBoost with default hyper-parameters on the "stemakhir" features.
#
# Recent xgboost releases require class labels to be 0..K-1 and raise a
# ValueError for labels such as 1..3. Encode the labels to consecutive integers
# for training and map predictions back, so the metrics below are still
# reported against the original class ids.
classes_stemakhir, y_train_stemakhir_encoded = np.unique(
    y_train_stemakhir, return_inverse=True
)

xg = XGBClassifier()

xg.fit(X_train_stemakhir, y_train_stemakhir_encoded)
# predict() yields encoded class indices; index into the sorted class array to
# recover the original labels.
predictions_stemakhir = classes_stemakhir[xg.predict(X_test_stemakhir).astype(int)]
print(accuracy_score(y_test_stemakhir, predictions_stemakhir))
print(confusion_matrix(y_test_stemakhir, predictions_stemakhir))
print(classification_report(y_test_stemakhir, predictions_stemakhir))

0.7177914110429447
[[87  0 11]
 [21 74 13]
 [41  6 73]]
              precision    recall  f1-score   support

           1       0.58      0.89      0.70        98
           2       0.93      0.69      0.79       108
           3       0.75      0.61      0.67       120

    accuracy                           0.72       326
   macro avg       0.75      0.73      0.72       326
weighted avg       0.76      0.72      0.72       326



In [None]:
# stem dulu
# Baseline XGBoost with default hyper-parameters on the "stemdulu" features.
#
# Recent xgboost releases require class labels to be 0..K-1 and raise a
# ValueError for labels such as 1..3. Encode the labels to consecutive integers
# for training and map predictions back, so the metrics below are still
# reported against the original class ids.
classes_stemdulu, y_train_stemdulu_encoded = np.unique(
    y_train_stemdulu, return_inverse=True
)

xg = XGBClassifier()

xg.fit(X_train_stemdulu, y_train_stemdulu_encoded)
# predict() yields encoded class indices; index into the sorted class array to
# recover the original labels.
predictions_stemdulu = classes_stemdulu[xg.predict(X_test_stemdulu).astype(int)]
print(accuracy_score(y_test_stemdulu, predictions_stemdulu))
print(confusion_matrix(y_test_stemdulu, predictions_stemdulu))
print(classification_report(y_test_stemdulu, predictions_stemdulu))

0.7085889570552147
[[88  0 10]
 [23 72 13]
 [41  8 71]]
              precision    recall  f1-score   support

           1       0.58      0.90      0.70        98
           2       0.90      0.67      0.77       108
           3       0.76      0.59      0.66       120

    accuracy                           0.71       326
   macro avg       0.74      0.72      0.71       326
weighted avg       0.75      0.71      0.71       326



In [9]:
# stem akhir with parameter
# XGBoost with a deeper tree limit on the "stemakhir" features.
# NOTE(review): max_depth=15 is hard-coded; if it was picked by comparing scores
# on this same test split, the reported accuracy is optimistic — confirm how it
# was selected (a validation split or cross-validation would be safer).
#
# Recent xgboost releases require class labels to be 0..K-1 and raise a
# ValueError for labels such as 1..3. Encode the labels to consecutive integers
# for training and map predictions back, so the metrics below are still
# reported against the original class ids.
classes_stemakhir, y_train_stemakhir_encoded = np.unique(
    y_train_stemakhir, return_inverse=True
)

xg = XGBClassifier(max_depth=15)

xg.fit(X_train_stemakhir, y_train_stemakhir_encoded)
# predict() yields encoded class indices; index into the sorted class array to
# recover the original labels.
predictions_stemakhir = classes_stemakhir[xg.predict(X_test_stemakhir).astype(int)]
print(accuracy_score(y_test_stemakhir, predictions_stemakhir))
print(confusion_matrix(y_test_stemakhir, predictions_stemakhir))
print(classification_report(y_test_stemakhir, predictions_stemakhir))

0.745398773006135
[[81  3 14]
 [15 81 12]
 [29 10 81]]
              precision    recall  f1-score   support

           1       0.65      0.83      0.73        98
           2       0.86      0.75      0.80       108
           3       0.76      0.68      0.71       120

    accuracy                           0.75       326
   macro avg       0.76      0.75      0.75       326
weighted avg       0.76      0.75      0.75       326



In [10]:
# stem dulu with parameter
# XGBoost with a deeper tree limit on the "stemdulu" features.
# NOTE(review): max_depth=44 is hard-coded; if it was picked by comparing scores
# on this same test split, the reported accuracy is optimistic — confirm how it
# was selected (a validation split or cross-validation would be safer).
#
# Recent xgboost releases require class labels to be 0..K-1 and raise a
# ValueError for labels such as 1..3. Encode the labels to consecutive integers
# for training and map predictions back, so the metrics below are still
# reported against the original class ids.
classes_stemdulu, y_train_stemdulu_encoded = np.unique(
    y_train_stemdulu, return_inverse=True
)

xg = XGBClassifier(max_depth=44)

xg.fit(X_train_stemdulu, y_train_stemdulu_encoded)
# predict() yields encoded class indices; index into the sorted class array to
# recover the original labels.
predictions_stemdulu = classes_stemdulu[xg.predict(X_test_stemdulu).astype(int)]
print(accuracy_score(y_test_stemdulu, predictions_stemdulu))
print(confusion_matrix(y_test_stemdulu, predictions_stemdulu))
print(classification_report(y_test_stemdulu, predictions_stemdulu))

0.7668711656441718
[[79  4 15]
 [13 84 11]
 [22 11 87]]
              precision    recall  f1-score   support

           1       0.69      0.81      0.75        98
           2       0.85      0.78      0.81       108
           3       0.77      0.72      0.75       120

    accuracy                           0.77       326
   macro avg       0.77      0.77      0.77       326
weighted avg       0.77      0.77      0.77       326

