In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.combine import SMOTETomek  # Menggunakan SMOTE-Tomek (Hybrid)
from sklearn.preprocessing import RobustScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
df['Gender_Encoded'] = df['Gender'].map({'Female': 0, 'Male': 1})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             566 non-null    int64  
 1   Gender          566 non-null    object 
 2   TB              566 non-null    float64
 3   DB              566 non-null    float64
 4   Alkphos         566 non-null    int64  
 5   Sgpt            566 non-null    int64  
 6   Sgot            566 non-null    int64  
 7   TP              566 non-null    float64
 8   ALB             566 non-null    float64
 9   A/G Ratio       566 non-null    float64
 10  Selector        566 non-null    int64  
 11  Gender_Encoded  566 non-null    int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 53.2+ KB


In [5]:
df = df.drop(columns=['Gender'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             566 non-null    int64  
 1   TB              566 non-null    float64
 2   DB              566 non-null    float64
 3   Alkphos         566 non-null    int64  
 4   Sgpt            566 non-null    int64  
 5   Sgot            566 non-null    int64  
 6   TP              566 non-null    float64
 7   ALB             566 non-null    float64
 8   A/G Ratio       566 non-null    float64
 9   Selector        566 non-null    int64  
 10  Gender_Encoded  566 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 48.8 KB


In [7]:
X = df.drop(columns=['Selector'])
y = df["Selector"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
import pandas as pd

# 1. Tentukan kolom mana yang mau di-scale (Semua KECUALI 'Gender')
# Kita ambil nama semua kolom, lalu buang 'Gender' dari list
cols_to_scale = [col for col in X_train.columns if col != 'Gender']

# 2. Siapkan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num',          RobustScaler(),    cols_to_scale)
    ],
    remainder='passthrough',  # PENTING: Kolom sisa ('Gender') dibiarkan lewat tanpa diubah
    verbose_feature_names_out=False # Agar nama kolom tidak berubah jadi aneh (misal: num__Age)
).set_output(transform="pandas") # FITUR BARU: Output langsung jadi DataFrame (bukan array)

# 3. Terapkan pada Data
# fit_transform pada Train
X_train_scaled = preprocessor.fit_transform(X_train)

# transform saja pada Test
X_test_scaled = preprocessor.transform(X_test)

# Cek hasil
print("Preview Data (Gender tidak berubah, yang lain berubah):")
print(X_train_scaled.head())

Preview Data (Gender tidak berubah, yang lain berubah):
          Age      TB   DB   Alkphos      Sgpt      Sgot        TP       ALB  \
107  0.192308  3.0000  2.7  3.713115  4.082759  4.179487 -0.641509 -0.727273   
381 -0.269231 -0.1250 -0.1  0.000000 -0.220690  0.162393  0.415094  0.545455   
182  0.153846  0.6250  0.3  0.008197  0.413793 -0.145299 -0.641509 -0.090909   
296 -0.576923 -0.1250 -0.1 -0.409836 -0.220690 -0.316239  1.018868  1.272727   
277 -0.846154  0.0625  0.2 -0.139344  0.110345  0.008547  0.867925  1.090909   

     A/G Ratio  Gender_Encoded  
107     -0.675             0.0  
381      0.150             0.0  
182      0.400             0.0  
296      0.900            -1.0  
277      0.650             0.0  


In [10]:
smote_tomek = SMOTETomek(random_state=42)

X_train_bal, y_train_bal = smote_tomek.fit_resample(X_train_scaled, y_train)

print("Sebelum SMOTE-Tomek:", np.bincount(y_train))
print("Sesudah SMOTE-Tomek:", np.bincount(y_train_bal))


Sebelum SMOTE-Tomek: [129 323]
Sesudah SMOTE-Tomek: [315 315]


In [11]:
n_features_to_select = 8  # ganti jika mau

base_estimator_rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rfe_rf = RFE(
    estimator=base_estimator_rf,
    n_features_to_select=n_features_to_select,
    step=1
)

rfe_rf.fit(X_train_bal, y_train_bal)

selected_mask_rf = rfe_rf.support_
selected_features_rf = X_train_scaled.columns[selected_mask_rf]

print("Fitur terpilih (RF + RFE + SMOTE-Tomek):")
for f in selected_features_rf:
    print("-", f)

Fitur terpilih (RF + RFE + SMOTE-Tomek):
- Age
- TB
- DB
- Alkphos
- Sgpt
- Sgot
- TP
- ALB


In [12]:
# gunakan fitur terpilih
X_train_sel = X_train_bal[selected_features_rf]
X_test_sel = X_test_scaled[selected_features_rf]

clf_rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

clf_rf.fit(X_train_sel, y_train_bal)

y_pred = clf_rf.predict(X_test_sel)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7280701754385965
[[14 19]
 [12 69]]
              precision    recall  f1-score   support

           0       0.54      0.42      0.47        33
           1       0.78      0.85      0.82        81

    accuracy                           0.73       114
   macro avg       0.66      0.64      0.65       114
weighted avg       0.71      0.73      0.72       114



In [13]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

param_grid = {
    "n_estimators": [200, 300, 400, 500],
    "max_depth": [None, 6, 8, 10, 12, 15],
    "min_samples_split": [2, 3, 4, 5],
    "min_samples_leaf": [1, 2, 3, 4],
    "max_features": ["sqrt", "log2", 0.7, 0.9],
    "bootstrap": [True, False]
}

rf_base = RandomForestClassifier(
    random_state=42,
    n_jobs=-1
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    rf_base,
    param_grid,
    scoring="f1",  # boleh ganti 'roc_auc' atau 'recall'
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train_bal, y_train_bal)

print("Best params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)


Fitting 5 folds for each of 3072 candidates, totalling 15360 fits
Best params: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best CV F1: 0.8092447657858935


In [14]:
# 1. Ambil model terbaik dari GridSearch
best_rf = grid.best_estimator_

# (Opsional) Tidak perlu fit ulang jika refit=True (default) di GridSearchCV
# Tapi jika mau memastikan, boleh jalankan lagi:
best_rf.fit(X_train_bal, y_train_bal)

# 2. PREDIKSI (Gunakan X_test_scaled, BUKAN X_test_sel)
# X_test_scaled memiliki jumlah kolom yang sama dengan X_train_bal
y_pred_best = best_rf.predict(X_test_scaled) 

# 3. Evaluasi
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best, target_names=['Sehat (0)', 'Sakit (1)']))

Accuracy: 0.7543859649122807

Confusion Matrix:
 [[15 18]
 [10 71]]

Classification Report:
               precision    recall  f1-score   support

   Sehat (0)       0.60      0.45      0.52        33
   Sakit (1)       0.80      0.88      0.84        81

    accuracy                           0.75       114
   macro avg       0.70      0.67      0.68       114
weighted avg       0.74      0.75      0.74       114



In [16]:
y_proba = best_rf.predict_proba(X_test_sel)  # prob kelas 1 (liver)

for thr in [0.3, 0.4, 0.5, 0.6]:
    y_pred_thr = (y_proba >= thr).astype(int)
    print("\nThreshold:", thr)
    print("Accuracy:", accuracy_score(y_test, y_pred_thr))
    print(confusion_matrix(y_test, y_pred_thr))
    print(classification_report(y_test, y_pred_thr))


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- A/G Ratio
- Gender_Encoded
