In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix

In [2]:
import pandas as pd

In [3]:
df=pd.read_csv('dataset/heart.csv')

In [4]:
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
x=df.drop('target',axis=1)

In [7]:
y=df['target']

In [8]:
x.shape,y.shape

((303, 13), (303,))

In [9]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [10]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [11]:
scaler=StandardScaler()

In [12]:
x_train_scaled=scaler.fit_transform(x_train)


In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
x_test_scaled=scaler.transform(x_test)

In [15]:
model=RandomForestClassifier(random_state=42)

In [16]:
param_grid = {
    'n_estimators': [100, 300, 500],  # Higher values to ensure better performance
    'max_depth': [10, 15, 20],  # Restricting depth for simplicity
    'min_samples_split': [2, 5, 10],  # Smaller values for classification
    'min_samples_leaf': [1, 2, 4],  # Typical range for classification
    'bootstrap': [True, False]
}

In [17]:
grid_search=GridSearchCV(estimator=model,param_grid=param_grid,cv=5,n_jobs=-1,verbose=2,scoring='roc_auc')

In [18]:
grid_search

In [19]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [20]:
model=grid_search.best_estimator_

In [21]:
model

In [22]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 100}

In [23]:
cross_val_scores = cross_val_score(model, x_train_scaled, y_train, cv=5, scoring='roc_auc')

In [24]:
cross_val_scores

array([0.88888889, 0.90572391, 0.91710758, 0.93356643, 0.92307692])

In [25]:
y_pred=model.predict(x_test_scaled)

In [26]:
y_pred

array([0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1], dtype=int64)

In [27]:
print(confusion_matrix(y_test,y_pred))

[[22  7]
 [ 8 24]]


In [29]:
import pickle
with open('models/model.pkl','wb') as model_file:
    pickle.dump(model,model_file)
with open('models/scaler.pkl','wb') as scaler_file:
    pickle.dump(scaler,scaler_file)