In [None]:
#import Library

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first rows.
data_path = '/content/heart.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(5)

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Report each column's name, non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Gender          918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(918, 12)

In [None]:
# Names of all object-dtype columns in df.
objList = df.select_dtypes(include=["object"]).columns
print(objList)

Index(['Gender', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')


In [None]:
# Label encoding: convert every object-dtype column listed in objList to integer codes.
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. A single shared encoder is re-fitted on
# every iteration, which discards the category -> code mapping of all columns
# except the last one; with this dict, label_encoders[col].classes_ /
# .inverse_transform can still be used to decode any column later.
label_encoders = {}

for feat in objList:
    le = LabelEncoder()
    # astype(str) guarantees a uniform string type before the codes are assigned.
    df[feat] = le.fit_transform(df[feat].astype(str))
    label_encoders[feat] = le

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Gender          918 non-null    int64  
 2   ChestPainType   918 non-null    int64  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int64  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    int64  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int64  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 86.2 KB
None


In [None]:
# Preview the first rows of df (the object columns now hold integer codes).
df.head()

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


**Forward Selection**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Greedy forward selection: start from an empty feature set and, in each round,
# add the single feature that gives the highest 5-fold cross-validated accuracy
# when combined with the features chosen so far.

# Feature matrix (every column except the target) and target vector
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Positional indices (into X) of the features chosen so far
selected_features = []

# A fixed random_state makes the forest, and therefore the selected subset,
# reproducible; without it the chosen features change from run to run.
model = RandomForestClassifier(random_state=42)

# Number of features to select, capped at the number of available columns
num_features_to_select = min(5, X.shape[1])

while len(selected_features) < num_features_to_select:
    best_score = -np.inf
    best_feature = None

    for feature_idx in range(X.shape[1]):
        if feature_idx in selected_features:
            continue

        # Try adding the feature to the selected set
        candidate_features = selected_features + [feature_idx]

        # Evaluate the candidate subset with cross-validation
        # (iloc because the features are tracked by integer position)
        scores = cross_val_score(model, X.iloc[:, candidate_features], y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # Keep track of the best-performing feature
        if mean_score > best_score:
            best_score = mean_score
            best_feature = feature_idx

    if best_feature is None:
        # No candidate produced a comparable score (e.g. all NaN); stop instead
        # of spinning forever on an unchanged selection.
        break

    selected_features.append(best_feature)
    print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")

print("Selected feature indices:", selected_features)

# The indices refer to columns of X, so the names must be looked up in X.columns;
# df.columns only lines up while the target happens to be the last column.
print("Selected feature name:", X.columns[selected_features])

Selected Feature 1: 10, Mean Accuracy: 0.8136
Selected Feature 2: 6, Mean Accuracy: 0.8136
Selected Feature 3: 1, Mean Accuracy: 0.8103
Selected Feature 4: 8, Mean Accuracy: 0.8125
Selected Feature 5: 5, Mean Accuracy: 0.8289
Selected feature indices: [10, 6, 1, 8, 5]
Selected feature name: Index(['ST_Slope', 'RestingECG', 'Gender', 'ExerciseAngina', 'FastingBS'], dtype='object')


**Backward Elimination**

In [None]:
# Greedy backward elimination: start from all features and, in each round, drop
# the one feature the model can best do without, i.e. the feature whose removal
# leaves the HIGHEST 5-fold cross-validated accuracy.

# Feature matrix (every column except the target) and target vector
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# A fixed random_state makes the forest, and therefore the elimination order,
# reproducible.
model = RandomForestClassifier(random_state=42)

# Positional indices (into X) of the features still in play
all_features = list(range(X.shape[1]))

# Define the minimum number of features you want to retain
min_features_to_retain = 5

while len(all_features) > min_features_to_retain:
    best_score = -np.inf   # best accuracy seen this round after dropping one feature
    worst_feature = None   # the feature whose removal produced best_score

    for feature_idx in all_features:
        # Create a list of features without the current one
        candidate_features = [f for f in all_features if f != feature_idx]

        # Evaluate the reduced feature set with cross-validation
        scores = cross_val_score(model, X.iloc[:, candidate_features], y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # The least useful feature is the one whose removal hurts the least.
        # Picking the LOWEST score here would discard the most informative
        # feature in every round and steadily degrade the model.
        if mean_score > best_score:
            best_score = mean_score
            worst_feature = feature_idx

    if worst_feature is None:
        # No candidate produced a comparable score (e.g. all NaN); stop instead
        # of looping forever on an unchanged feature list.
        break

    all_features.remove(worst_feature)
    print(f"Removed Feature: {worst_feature}, Mean Accuracy: {best_score:.4f}")

print("Remaining feature indices:", all_features)

# The indices refer to columns of X, so the names are looked up in X.columns.
print("Remaining feature name:", X.columns[all_features])

Removed Feature: 10, Mean Accuracy: 0.7789
Removed Feature: 8, Mean Accuracy: 0.7582
Removed Feature: 2, Mean Accuracy: 0.7375
Removed Feature: 9, Mean Accuracy: 0.6961
Removed Feature: 7, Mean Accuracy: 0.6461
Removed Feature: 0, Mean Accuracy: 0.5980
Remaining feature indices: [1, 3, 4, 5, 6]
Remaining feature name: Index(['Gender', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG'], dtype='object')


**Recursive Feature Elimination (RFE)**

**Ranking Features:** Start with all features and rank them
based on their importance or contribution to the model.

**Iterative Removal:** In each iteration, remove the least important feature(s).

**Stopping Criterion:** Continue until a desired number of features is reached

RFE is a greedy optimization algorithm that aims to find the best-performing feature subset. It repeatedly builds a model and sets aside the best- or worst-performing feature at each iteration, then builds the next model with the remaining features until all features are exhausted. Finally, it ranks the features based on the order of their elimination.

Recursive feature elimination performs a greedy search to find the best-performing feature subset. It iteratively creates models and determines the best- or worst-performing feature at each iteration. It constructs the subsequent models with the remaining features until all the features are explored. It then ranks the features based on the order of their elimination. In the worst case, if a dataset contains $N$ features, RFE will do a greedy search over $2^N$ combinations of features.

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: let RFE repeatedly fit the forest and discard
# the least important feature until only the requested number remain, then
# cross-validate a model restricted to the surviving features.

# Feature matrix (every column except the target) and target vector
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# A fixed random_state makes the importances, and therefore the selected
# subset and the reported accuracy, reproducible.
model = RandomForestClassifier(random_state=42)

# Specify the number of features you want to retain
num_features_to_retain = 5

# Initialize the RFE selector with the model and the number of features to retain
# (n_features_to_select is passed by keyword)
rfe = RFE(model, n_features_to_select=num_features_to_retain)

# Fit the RFE selector to the data
rfe.fit(X, y)

# rfe.support_ is a boolean mask over the columns of X; convert it to positions
selected_features = np.where(rfe.support_)[0]

print("Selected feature indices:", selected_features)

# Score the features RFE actually selected. The previous version indexed X with
# `candidate_features`, a variable left over from the backward-elimination cell,
# so the reported accuracy belonged to an unrelated feature subset.
# NOTE(review): the selector above was fitted on all rows, so this CV estimate
# is optimistic; wrap RFE and the model in a Pipeline for an unbiased figure.
scores = cross_val_score(model, X.iloc[:, selected_features], y, cv=5, scoring='accuracy')
mean_accuracy = np.mean(scores)
print(f"Mean Accuracy with Selected Features: {mean_accuracy:.4f}")

# The indices refer to columns of X, so the names are looked up in X.columns.
print("Selected feature name:", X.columns[selected_features])

Selected feature indices: [ 2  4  7  9 10]
Mean Accuracy with Selected Features: 0.6394
Selected feature name: Index(['ChestPainType', 'Cholesterol', 'MaxHR', 'Oldpeak', 'ST_Slope'], dtype='object')


END

**Forward Selection with performance metrics**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np

# Greedy forward selection followed by a hold-out evaluation of the final model.

# Feature matrix (every column except the target) and target vector
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Hold out the test rows BEFORE selecting features. If the selection step sees
# the test rows, the final metrics are optimistically biased (data leakage).
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Positional indices (into X) of the features chosen so far
selected_features = []

# A fixed random_state makes the selection and the final metrics reproducible.
model = RandomForestClassifier(random_state=42)

# Number of features to select, capped at the number of available columns
num_features_to_select = min(5, X.shape[1])

while len(selected_features) < num_features_to_select:
    best_score = -np.inf
    best_feature = None

    for feature_idx in range(X.shape[1]):
        if feature_idx in selected_features:
            continue

        # Try adding the feature to the selected set
        candidate_features = selected_features + [feature_idx]

        # Cross-validate on the training rows only
        # (iloc because the features are tracked by integer position)
        scores = cross_val_score(model, X_train_all.iloc[:, candidate_features], y_train, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # Keep track of the best-performing feature
        if mean_score > best_score:
            best_score = mean_score
            best_feature = feature_idx

    if best_feature is None:
        # No candidate produced a comparable score (e.g. all NaN); stop instead
        # of spinning forever on an unchanged selection.
        break

    selected_features.append(best_feature)
    print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")

print("Selected feature indices:", selected_features)
# The indices refer to columns of X, so the names are looked up in X.columns.
print("Selected feature names:", X.columns[selected_features])

# Restrict the data to the selected columns
X_selected = X.iloc[:, selected_features]
X_train = X_train_all.iloc[:, selected_features]
X_test = X_test_all.iloc[:, selected_features]

# Train the model on the selected features
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print out the results
print("Final Model Performance with Selected Features:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Selected Feature 1: 10, Mean Accuracy: 0.8136
Selected Feature 2: 6, Mean Accuracy: 0.8136
Selected Feature 3: 8, Mean Accuracy: 0.8026
Selected Feature 4: 2, Mean Accuracy: 0.8365
Selected Feature 5: 5, Mean Accuracy: 0.8234
Selected feature indices: [10, 6, 8, 2, 5]
Selected feature names: Index(['ST_Slope', 'RestingECG', 'ExerciseAngina', 'ChestPainType',
       'FastingBS'],
      dtype='object')
Final Model Performance with Selected Features:
Accuracy: 0.8333
F1 Score: 0.8516
Precision: 0.9041
Recall: 0.8049


**Backward Elimination with performance metrics**

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Greedy backward elimination followed by a hold-out evaluation of the final model.

# Feature matrix (every column except the target) and target vector
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Hold out the test rows BEFORE eliminating features. If the elimination step
# sees the test rows, the final metrics are optimistically biased (data leakage).
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# A fixed random_state makes the elimination order and the final metrics reproducible.
model = RandomForestClassifier(random_state=42)

# Positional indices (into X) of the features still in play
all_features = list(range(X.shape[1]))

# Define the minimum number of features you want to retain
min_features_to_retain = 5

# Feature elimination loop
while len(all_features) > min_features_to_retain:
    best_score = -np.inf   # best accuracy seen this round after dropping one feature
    worst_feature = None   # the feature whose removal produced best_score

    for feature_idx in all_features:
        # Create a list of features without the current one
        candidate_features = [f for f in all_features if f != feature_idx]

        # Cross-validate the reduced feature set on the training rows only
        scores = cross_val_score(model, X_train_all.iloc[:, candidate_features], y_train, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # The least useful feature is the one whose removal hurts the least.
        # Picking the LOWEST score here would discard the most informative
        # feature in every round and steadily degrade the model.
        if mean_score > best_score:
            best_score = mean_score
            worst_feature = feature_idx

    if worst_feature is None:
        # No candidate produced a comparable score (e.g. all NaN); stop instead
        # of looping forever on an unchanged feature list.
        break

    all_features.remove(worst_feature)
    print(f"Removed Feature: {worst_feature}, Mean Accuracy: {best_score:.4f}")

print("Remaining feature indices:", all_features)
# The indices refer to columns of X, so the names are looked up in X.columns.
print("Remaining feature names:", X.columns[all_features])

# Restrict the data to the remaining columns
X_selected = X.iloc[:, all_features]
X_train = X_train_all.iloc[:, all_features]
X_test = X_test_all.iloc[:, all_features]

# Train the model on the remaining features
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the final performance metrics
print("Final Model Performance with Remaining Features:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Removed Feature: 10, Mean Accuracy: 0.7920
Removed Feature: 2, Mean Accuracy: 0.7582
Removed Feature: 8, Mean Accuracy: 0.7353
Removed Feature: 9, Mean Accuracy: 0.6939
Removed Feature: 7, Mean Accuracy: 0.6450
Removed Feature: 0, Mean Accuracy: 0.5958
Remaining feature indices: [1, 3, 4, 5, 6]
Remaining feature names: Index(['Gender', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG'], dtype='object')
Final Model Performance with Remaining Features:
Accuracy: 0.6087
F1 Score: 0.6786
Precision: 0.6628
Recall: 0.6951


**RFE with Performance metrics**

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Recursive feature elimination followed by a hold-out evaluation of the final model.

# Feature matrix (every column except the target) and target vector
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Hold out the test rows BEFORE fitting the selector. If RFE sees the test
# rows, the final metrics are optimistically biased (data leakage).
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# A fixed random_state makes the importances, the selected subset and the
# final metrics reproducible.
model = RandomForestClassifier(random_state=42)

# Specify the number of features you want to retain
num_features_to_retain = 5

# Initialize the RFE selector with the model and the number of features to retain
rfe = RFE(model, n_features_to_select=num_features_to_retain)

# Fit the RFE selector on the training rows only
rfe.fit(X_train_all, y_train)

# rfe.support_ is a boolean mask over the columns of X; convert it to positions
selected_features = np.where(rfe.support_)[0]

print("Selected feature indices:", selected_features)
# The indices refer to columns of X, so the names are looked up in X.columns.
print("Selected feature names:", X.columns[selected_features])

# Restrict the data to the selected columns for the hold-out evaluation
X_selected = X.iloc[:, selected_features]
X_train = X_train_all.iloc[:, selected_features]
X_test = X_test_all.iloc[:, selected_features]

# Train the model on the selected features
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the final performance metrics
print("Final Model Performance with Selected Features:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Selected feature indices: [ 2  4  7  9 10]
Selected feature names: Index(['ChestPainType', 'Cholesterol', 'MaxHR', 'Oldpeak', 'ST_Slope'], dtype='object')
Final Model Performance with Selected Features:
Accuracy: 0.8188
F1 Score: 0.8377
Precision: 0.8958
Recall: 0.7866
