In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\ntrjs\.cache\kagglehub\datasets\fedesoriano\heart-failure-prediction\versions\1


In [3]:
heart_df=pd.read_csv(path+'/heart.csv')

print(heart_df.head())
heart_df_encoded = pd.get_dummies(heart_df, columns=["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"], drop_first=True)

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [4]:
X = heart_df_encoded.drop(columns=["HeartDisease"])
y = heart_df_encoded["HeartDisease"] 


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Apply PCA to reduce dimensions (keeping 95% variance)
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_pca)
print()
print(X_test_pca)

[[  43.71967721   57.9559273 ]
 [  11.9826839    42.96714265]
 [  12.79025014  -16.2015614 ]
 ...
 [  53.59212817    4.11443388]
 [-200.22465536  -11.9921222 ]
 [ 191.89083904  -18.82804482]]

[[-2.05518153e+00  3.74609544e+01]
 [ 3.17455276e+02 -2.68448684e+01]
 [-1.99929491e+02 -1.11737209e+01]
 [-2.00213055e+02 -1.77398533e+00]
 [ 1.10000343e+02  1.91426290e+01]
 [ 5.56234456e+01  2.21809366e+01]
 [-1.98746103e+02  2.55060619e+01]
 [ 7.05185741e+01 -1.86308474e+01]
 [ 7.47023664e+01  3.21924038e+01]
 [-2.01332666e+02 -1.46614265e+01]
 [-5.27297839e+00  2.78582281e+01]
 [ 1.69680127e+01 -4.66993346e+01]
 [ 1.02109712e+02 -2.02591833e+01]
 [ 3.86209019e+01  4.06255306e+01]
 [ 8.87530133e+01 -2.55370097e+01]
 [-1.96831161e+02  5.33098323e+01]
 [ 1.12885599e+02 -1.29816891e+01]
 [-1.96982673e+02  6.41140225e+01]
 [ 1.94624287e+01  8.98244599e+00]
 [ 3.64701713e+02  4.43720847e+00]
 [-1.99947830e+02 -4.72730681e+00]
 [ 2.71445905e+01 -1.08769788e+01]
 [ 8.31289831e+01  2.77874497e+01]
 [

In [6]:

# Normalize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


y_pred = rf_classifier.predict(X_test)

In [7]:

# performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("accuracy =",accuracy) 
print(classification_rep)

accuracy = 0.875
              precision    recall  f1-score   support

           0       0.85      0.86      0.85        77
           1       0.90      0.89      0.89       107

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.88      0.88      0.88       184



In [8]:
new_sample = {
    "Age": 80,
    "Sex": "M",
    "ChestPainType": "ATA",
    "RestingBP": 140,
    "Cholesterol": 260,
    "FastingBS": 1,
    "RestingECG": "Normal",
    "MaxHR": 160,
    "ExerciseAngina": "N",
    "Oldpeak": 1.2,
    "ST_Slope": "Flat"
}

new_sample_df = pd.DataFrame([new_sample])

In [9]:
new_sample_encoded = pd.get_dummies(new_sample_df, columns=["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"], drop_first=True)

missing_cols = set(X.columns) - set(new_sample_encoded.columns)
for col in missing_cols:
    new_sample_encoded[col] = 0  # Add missing columns

new_sample_encoded = new_sample_encoded[X.columns]
new_sample_scaled = scaler.transform(new_sample_encoded)

In [None]:
prediction = rf_classifier.predict(new_sample_scaled)


if (prediction==1):
    print('Predicted Heart Failure: Yes')
else:
    print('Predicted Heart Failure: No')

Predicted Heart Failure: Yes
