In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [21]:
# Read the raw dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [22]:
# Number of rows per value of the label column.
target_counts = df['target'].value_counts()
target_counts

target
1    629
0    561
Name: count, dtype: int64

In [None]:
# Categorical columns that are expanded into indicator (dummy) columns.
nominal_features = [
    'chest pain type',
    'resting ecg',
    'ST slope',
]

# drop_first=True omits the first level of each encoded column.
df_encoded = pd.get_dummies(
    data=df,
    columns=nominal_features,
    drop_first=True,
)

In [None]:
# Separate the label column from the predictor columns.
y = df_encoded['target']
X = df_encoded.drop(columns='target')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Standardise the continuous columns. The scaler is fitted on the training
# rows only, then applied unchanged to the test rows.
scaler = StandardScaler()
num_features = ['age', 'resting bp s', 'cholesterol', 'max heart rate', 'oldpeak']

# Always read the unscaled values from X rather than from X_train / X_test.
# This cell overwrites those columns, so re-running it against its own output
# would fit the scaler on already-standardised data and later persist a
# near-identity scaler.
# NOTE(review): relies on X having unique index labels — confirm if the frame
# is ever re-indexed upstream.
train_raw = X.loc[X_train.index, num_features]
test_raw = X.loc[X_test.index, num_features]

# Work on explicit copies so the column assignments below cannot trigger
# SettingWithCopyWarning on the frames returned by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[num_features] = scaler.fit_transform(train_raw)
X_test[num_features] = scaler.transform(test_raw)

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
rf = RandomForestClassifier(random_state=42, n_estimators=100)

In [10]:
# Train the forest on the training split.
rf.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out rows, then print per-class precision, recall and F1.
y_pred = rf.predict(X=X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       107
           1       0.93      0.95      0.94       131

    accuracy                           0.94       238
   macro avg       0.94      0.94      0.94       238
weighted avg       0.94      0.94      0.94       238



In [None]:
from pathlib import Path

import joblib
import numpy as np

# Every artifact is written under this directory. Create it up front so the
# dumps below do not fail with FileNotFoundError when it does not exist yet.
MODEL_DIR = Path('model')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier and the scaler that was fitted on the
# training rows.
joblib.dump(rf, MODEL_DIR / 'rf_model.pkl')
joblib.dump(scaler, MODEL_DIR / 'scaler.pkl')

# Record the column layout the model was trained on: the full ordered column
# list, the scaled numeric columns, and every remaining column.
feature_names = list(X_train.columns)
joblib.dump({
    'all_features': feature_names,
    'numerical_features': num_features,
    'categorical_features': [n for n in feature_names if n not in num_features],
}, MODEL_DIR / 'feature_info.pkl')

# Classification report as a table (one row per class / average).
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df.to_csv(MODEL_DIR / 'classification_report.csv', index=True)

# Confusion matrix as a raw NumPy array.
cm = confusion_matrix(y_test, y_pred)
np.save(MODEL_DIR / 'confusion_matrix.npy', cm)

# Pair each training column NAME with its importance. The 'Feature' column
# must be the 1-D list of column names, not the X_train frame itself.
importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf.feature_importances_,
}).sort_values('Importance', ascending=False)
importances.to_csv(MODEL_DIR / 'feature_importances.csv', index=False)

In [15]:
# List every training column that is not one of the scaled numeric features.
non_numeric_columns = [name for name in X_train.columns if name not in num_features]
for name in non_numeric_columns:
    print(name)

sex
fasting blood sugar
exercise angina
chest pain type_2
chest pain type_3
chest pain type_4
resting ecg_1
resting ecg_2
ST slope_1
ST slope_2
ST slope_3
