In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df = pd.read_csv('../data/cleaned_heart_disease.csv')

In [8]:
df.head()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,sex_Male,dataset_Hungary,...,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,1,1.007386,0.705176,0.303643,0.489727,1.368109,-0.3614,-0.871794,1,0,...,0,1,1,0,0,0,0,0,0,0
1,2,1.432034,1.518569,0.789967,-1.181478,0.611589,4.411152,0.879408,1,0,...,0,0,0,0,0,1,1,0,1,0
2,3,1.432034,-0.650479,0.266939,-0.345875,1.651804,2.820301,0.003807,1,0,...,0,0,0,0,0,1,1,0,0,1
3,4,-1.752828,-0.108217,0.459634,1.961979,2.502889,-0.3614,-0.871794,1,0,...,1,0,0,1,0,0,0,0,1,0
4,5,-1.32818,-0.108217,0.037541,1.36512,0.517024,-0.3614,-0.871794,0,0,...,0,0,0,0,0,0,0,1,1,0


In [9]:
X = df.drop(columns=['id','num'])
y = df['num'].round().astype(int)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2 ,random_state=42)

In [14]:
param_dist = {
    'n_estimators': [100,200,300,500],
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

random_search.fit(X,y)

print("Best Parameters (RandomizedSearchCV):")
print(random_search.best_params_)

print("\nBest Accuracy (CV):", round(random_search.best_score_, 4))

Best Parameters (RandomizedSearchCV):
{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5}

Best Accuracy (CV): 0.4761


In [15]:
best_params = random_search.best_params_

param_grid = {
    'n_estimators': [best_params['n_estimators'] - 100, best_params['n_estimators'], best_params['n_estimators'] + 100],
    'max_depth': [best_params['max_depth'], best_params['max_depth'] + 5, best_params['max_depth'] + 10] if best_params['max_depth'] else [None, 10, 20],
    'min_samples_split': [best_params['min_samples_split'], best_params['min_samples_split'] + 2],
    'min_samples_leaf': [best_params['min_samples_leaf'], best_params['min_samples_leaf'] + 1],
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("Best Parameters (GridSearchCV):")
print(grid_search.best_params_)
print("\nBest Accuracy (CV):", round(grid_search.best_score_, 4))

Best Parameters (GridSearchCV):
{'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 600}

Best Accuracy (CV): 0.5937


In [16]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Final Model Evaluation on Test Set:")
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Final Model Evaluation on Test Set:

Accuracy: 0.532608695652174

Classification Report:
               precision    recall  f1-score   support

          -1       0.61      0.91      0.73        75
           0       0.45      0.48      0.46        54
           1       0.38      0.12      0.18        25
           2       0.14      0.04      0.06        26
           3       0.00      0.00      0.00         4

    accuracy                           0.53       184
   macro avg       0.32      0.31      0.29       184
weighted avg       0.45      0.53      0.47       184



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
