<a href="https://colab.research.google.com/github/engmariamahmed04/NTI-ML-tasks/blob/main/heart_prediction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle --quiet
from google.colab import files
files.upload()  # upload kaggle.json


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mariamhassan11","key":"cf4bec4839c5c6e8e5e867d3f45f55a3"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d fedesoriano/heart-failure-prediction -q

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
License(s): ODbL-1.0


In [4]:
!unzip /content/heart-failure-prediction.zip


Archive:  /content/heart-failure-prediction.zip
  inflating: heart.csv               


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('/content/heart.csv')
print("Initial shape:", df.shape)
df.head()

Initial shape: (918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
print(df.isnull().sum())

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [7]:
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col] = df[col].fillna(df[col].median())#Replace any NaN (missing) values with the median value of that column.
    else:
        df[col] = df[col].fillna(df[col].mode()[0])#Replace NaN values with the most frequent value (the mode).

print("Missing values after cleanup:\n", df.isnull().sum())#Prints the count of missing values in each column after processing.

Missing values after cleanup:
 Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [8]:
# Separate features and target
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

In [9]:
categorical_cols = df.select_dtypes(include='object').columns
print("Categorical Columns:", categorical_cols.tolist())


Categorical Columns: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [10]:

from sklearn.preprocessing import OneHotEncoder

# Select categorical columns
cat_cols = X.select_dtypes(include='object').columns.tolist()
X_cat = X[cat_cols]


In [11]:
# Initialize OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')#if it encounters a new category during prediction, it won’t crash — it will just ignore it.


In [12]:
# Encode and convert to dense array
X_cat_encoded = ohe.fit_transform(X_cat).toarray()


In [13]:
# Get feature names and create DataFrame
cat_feature_names = ohe.get_feature_names_out(cat_cols)
X_cat_df = pd.DataFrame(X_cat_encoded, columns=cat_feature_names)

print(" One-hot encoding complete. Shape:", X_cat_df.shape)

from sklearn.preprocessing import StandardScaler


 One-hot encoding complete. Shape: (918, 14)


In [14]:
# Select numeric columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
X_num = X[num_cols]

# Scale numeric data
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
X_num_scaled_df = pd.DataFrame(X_num_scaled, columns=num_cols)

print(" Numeric scaling complete. Shape:", X_num_scaled_df.shape)

# Merge numeric and categorical features
X_processed = pd.concat([X_num_scaled_df.reset_index(drop=True), X_cat_df.reset_index(drop=True)], axis=1)

print(" Final feature matrix shape:", X_processed.shape)

 Numeric scaling complete. Shape: (918, 6)
 Final feature matrix shape: (918, 20)


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42,shuffle= False)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


Train shape: (642, 20)
Test shape: (276, 20)


In [17]:
# Train SVM model
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)


In [18]:
# Predict on test set
y_pred = svm.predict(X_test)

# Evaluation
print(" Accuracy:", accuracy_score(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))

 Accuracy: 0.7282608695652174

 Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.71      0.74       147
           1       0.70      0.74      0.72       129

    accuracy                           0.73       276
   macro avg       0.73      0.73      0.73       276
weighted avg       0.73      0.73      0.73       276



In [19]:
model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=42
)
model.fit(X_train, y_train)
# Predict on test set
y_pred1 = model.predict(X_test)

# Evaluation
print(" Accuracy:", accuracy_score(y_test, y_pred1))
print("\n Classification Report:\n", classification_report(y_test, y_pred1))


 Accuracy: 0.7608695652173914

 Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.80      0.78       147
           1       0.76      0.72      0.74       129

    accuracy                           0.76       276
   macro avg       0.76      0.76      0.76       276
weighted avg       0.76      0.76      0.76       276



In [20]:

model1= SVC(
    C=1.0,
    kernel='poly',
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=42
)
model1.fit(X_train, y_train)
# Predict on test set
y_pred11 = model1.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred11))
print("\nClassification Report:\n", classification_report(y_test, y_pred11))

Accuracy: 0.782608695652174

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.86      0.81       147
           1       0.81      0.70      0.75       129

    accuracy                           0.78       276
   macro avg       0.79      0.78      0.78       276
weighted avg       0.79      0.78      0.78       276



In [21]:
# Train SVM model
svm1 = SVC(kernel='linear')
svm1.fit(X_train, y_train)

# Predict on test set
y_predsvm = svm1.predict(X_test)

# Evaluation
print(" Accuracy:", accuracy_score(y_test, y_predsvm))
print("\n Classification Report:\n", classification_report(y_test, y_predsvm))


 Accuracy: 0.7282608695652174

 Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.73      0.74       147
           1       0.70      0.72      0.71       129

    accuracy                           0.73       276
   macro avg       0.73      0.73      0.73       276
weighted avg       0.73      0.73      0.73       276



In [22]:
def get_user_input():
    # Create a dictionary with all possible columns initialized to 0
    data = {col: 0 for col in X_processed.columns}

    # Get numerical inputs
    while True:
        try:
            data['Age'] = int(input("Age: "))
            break
        except ValueError:
            print("Invalid input. Please enter a valid integer for Age.")

    while True:
        try:
            data['RestingBP'] = int(input("Resting Blood Pressure: "))
            break
        except ValueError:
            print("Invalid input. Please enter a valid integer for Resting Blood Pressure.")

    while True:
        try:
            data['Cholesterol'] = int(input("Cholesterol: "))
            break
        except ValueError:
            print("Invalid input. Please enter a valid integer for Cholesterol.")

    while True:
        try:
            data['FastingBS'] = int(input("Fasting Blood Sugar (0 or 1): "))
            if data['FastingBS'] in [0, 1]:
                break
            else:
                print("Invalid input. Please enter 0 or 1 for Fasting Blood Sugar.")
        except ValueError:
            print("Invalid input. Please enter a valid integer (0 or 1) for Fasting Blood Sugar.")

    while True:
        try:
            data['MaxHR'] = int(input("Maximum Heart Rate: "))
            break
        except ValueError:
            print("Invalid input. Please enter a valid integer for Maximum Heart Rate.")

    while True:
        try:
            data['Oldpeak'] = float(input("Oldpeak (ST depression): "))
            break
        except ValueError:
            print("Invalid input. Please enter a valid number for Oldpeak.")

    # Get categorical inputs and set the corresponding one-hot encoded column to 1
    sex = input("Sex (M/F): ").upper()
    if sex == 'M':
        data['Sex_M'] = 1
    else:
        data['Sex_F'] = 1

    cp = input("Chest Pain Type (ASY/ATA/NAP/TA): ").upper()
    if cp == 'ATA':
        data['ChestPainType_ATA'] = 1
    elif cp == 'NAP':
        data['ChestPainType_NAP'] = 1
    elif cp == 'TA':
        data['ChestPainType_TA'] = 1
    else:
        data['ChestPainType_ASY'] = 1


    recg = input("Resting ECG (Normal/LVH/ST): ").upper()
    if recg == 'NORMAL':
        data['RestingECG_Normal'] = 1
    elif recg == 'ST':
        data['RestingECG_ST'] = 1
    else:
        data['RestingECG_LVH'] = 1

    exang = input("Exercise Angina (Y/N): ").upper()
    if exang == 'Y':
        data['ExerciseAngina_Y'] = 1
    else:
        data['ExerciseAngina_N'] = 1

    slope = input("ST Slope (Down/Flat/Up): ").upper()
    if slope == 'FLAT':
        data['ST_Slope_Flat'] = 1
    elif slope == 'UP':
        data['ST_Slope_Up'] = 1
    else:
        data['ST_Slope_Down'] = 1


    return pd.DataFrame([data])

In [24]:
user_df = get_user_input()

# Reindex user_df to match the columns of X_processed
user_df = user_df.reindex(columns=X_processed.columns, fill_value=0)

# Select only numerical columns from user_df before scaling
user_num_df = user_df[num_cols]

# Scaling
user_scaled = scaler.transform(user_num_df)

# Combine scaled numerical features with one-hot encoded categorical features
# The categorical features are already in user_df and in the correct order due to reindexing
user_processed = pd.concat([pd.DataFrame(user_scaled, columns=num_cols), user_df[X_cat_df.columns].reset_index(drop=True)], axis=1)

prediction = model.predict(user_processed)


print("\n Prediction:")
if prediction[0] == 1:
    print(" High Risk of Heart Disease")
else:
    print(" Low Risk (No Heart Disease)")

Age: 40
Resting Blood Pressure: 140
Cholesterol: 289
Fasting Blood Sugar (0 or 1): 1
Maximum Heart Rate: 172
Oldpeak (ST depression): 0
Sex (M/F): M
Chest Pain Type (ASY/ATA/NAP/TA): ATA
Resting ECG (Normal/LVH/ST): Normal
Exercise Angina (Y/N): Y
ST Slope (Down/Flat/Up): Up

 Prediction:
 Low Risk (No Heart Disease)
