In [1]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.neighbors import KernelDensity
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
df['Gender_Encoded'] = df['Gender'].map({'Female': 0, 'Male': 1})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             566 non-null    int64  
 1   Gender          566 non-null    object 
 2   TB              566 non-null    float64
 3   DB              566 non-null    float64
 4   Alkphos         566 non-null    int64  
 5   Sgpt            566 non-null    int64  
 6   Sgot            566 non-null    int64  
 7   TP              566 non-null    float64
 8   ALB             566 non-null    float64
 9   A/G Ratio       566 non-null    float64
 10  Selector        566 non-null    int64  
 11  Gender_Encoded  566 non-null    int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 53.2+ KB


In [5]:
df = df.drop(columns=['Gender'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             566 non-null    int64  
 1   TB              566 non-null    float64
 2   DB              566 non-null    float64
 3   Alkphos         566 non-null    int64  
 4   Sgpt            566 non-null    int64  
 5   Sgot            566 non-null    int64  
 6   TP              566 non-null    float64
 7   ALB             566 non-null    float64
 8   A/G Ratio       566 non-null    float64
 9   Selector        566 non-null    int64  
 10  Gender_Encoded  566 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 48.8 KB


In [7]:
X = df.drop(columns=['Selector'])
y = df["Selector"]

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
import pandas as pd

# 1. Tentukan kolom mana yang mau di-scale (Semua KECUALI 'Gender')
# Kita ambil nama semua kolom, lalu buang 'Gender' dari list
cols_to_scale = [col for col in X.columns if col != 'Gender']

# 2. Siapkan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # ('nama_step',  JenisScaler(),     [list_kolom_target])
        ('num',          RobustScaler(),    cols_to_scale)
    ],
    remainder='passthrough',  # PENTING: Kolom sisa ('Gender') dibiarkan lewat tanpa diubah
    verbose_feature_names_out=False # Agar nama kolom tidak berubah jadi aneh (misal: num__Age)
).set_output(transform="pandas") # FITUR BARU: Output langsung jadi DataFrame (bukan array)

# 3. Terapkan pada Data
# fit_transform pada Train
X_scaled = preprocessor.fit_transform(X)

# Cek hasil
print("Preview Data (Gender tidak berubah, yang lain berubah):")
print(X_scaled.head())

Preview Data (Gender tidak berubah, yang lain berubah):
    Age        TB        DB   Alkphos      Sgpt      Sgot        TP       ALB  \
0  0.80 -0.166667 -0.181818 -0.172131 -0.503311 -0.370968  0.142857  0.166667   
1  0.68  5.500000  4.727273  4.024590  0.768212  0.951613  0.642857  0.083333   
2  0.68  3.500000  3.454545  2.311475  0.662252  0.435484  0.285714  0.166667   
3  0.52  0.000000  0.090909 -0.213115 -0.556291 -0.338710  0.142857  0.250000   
4  1.08  1.611111  1.545455 -0.106557 -0.211921  0.290323  0.500000 -0.583333   

   A/G Ratio  Gender_Encoded  
0     -0.125            -1.0  
1     -0.525             0.0  
2     -0.150             0.0  
3      0.125             0.0  
4     -1.375             0.0  


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import classification_report

# 1. Inisialisasi Model
rf_model = RandomForestClassifier(
    n_estimators=100,       # Jumlah pohon (default 100)
    random_state=42,        # Agar hasil konsisten
    class_weight='balanced' # PENTING: Mengatasi data imbalanced (Sakit vs Sehat)
)

# 2. Latih Model (Fit)
rf_model.fit(X_train, y_train)

# 1. Lakukan Prediksi (Jika belum)
y_pred = rf_model.predict(X_test)

# 2. Tampilkan Report
# target_names membantu kita membaca label (0 itu apa, 1 itu apa)
print(classification_report(y_test, y_pred, target_names=['Sehat (0)', 'Sakit (1)']))

              precision    recall  f1-score   support

   Sehat (0)       0.62      0.24      0.35        33
   Sakit (1)       0.75      0.94      0.84        81

    accuracy                           0.74       114
   macro avg       0.68      0.59      0.59       114
weighted avg       0.71      0.74      0.69       114

