# Capstone Project : Heart Disease Prediction 

In [None]:
# Import the necessary libraries



import numpy as np
import pandas as pd
from scipy import stats
import plotly.express as px


%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split
from scipy.stats import boxcox
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score

import ipywidgets as widgets
from IPython.display import display, HTML


import warnings
warnings.filterwarnings('ignore')



### 1. Load the dataset and become familiar with the features and their descriptions.

In [None]:
# Load the dataset into the notebook-wide DataFrame `df`.
# NOTE(review): this is an absolute path on one user's Windows machine; the
# notebook will not run elsewhere until it is pointed at a local copy.
df = pd.read_csv(r'C:\Users\CCI\OneDrive\Desktop\BIA\14. Capstone Project\heart_disease.csv')

In [None]:
# Display the first few rows of the dataset.
# The heading is printed first; the bare `df.head()` on the last line is the
# cell's value, which the notebook renders as a table.
print("First 5 rows of the dataset:")
df.head()

In [None]:
# Get an overview of the dataset: df.info() prints the column names,
# non-null counts and dtypes.
print("\nDataset Information:")
df.info()

In [None]:
# Statistical summary of the dataset (count, mean, std, min, quartiles, max).
# Transposed so that each feature is a row and each statistic a column.
df.describe().T

##  Features and Their Descriptions
   **Below is a detailed description of each feature in the dataset:**

 **1) Age** : The age of the patient in years.

**2) Sex**: The gender of the patient.

         1: Male
         0: Female
        
**3) Chest pain type:** The type of chest pain experienced by the patient.

        0: Typical angina
        1: Atypical angina
        2: Non-anginal pain
        3: Asymptomatic
        
__4) Resting blood pressure__: The resting blood pressure (in mm Hg) of the patient.

__5) Serum cholesterol levels__: The serum cholesterol level (in mg/dL) of the patient.

__6) Fasting blood sugar__: Indicates if the patient's fasting blood sugar is greater than 120 mg/dL.

       1: True (fasting blood sugar > 120 mg/dL)
       0: False (fasting blood sugar ≤ 120 mg/dL)
        
__7) Resting electrocardiographic results__: Results of the resting electrocardiogram (ECG).

        0: Normal
        1: Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
        2: Showing probable or definite left ventricular hypertrophy by Estes' criteria
        
__8) Maximum heart rate achieved__: The maximum heart rate achieved during exercise.

__9) Exercise-induced angina__: Indicates if the patient experiences angina induced by exercise.

        1: Yes
        0: No
        
__10) ST depression induced by exercise relative to rest__: The value of depression in the ST segment during exercise compared to rest.

__11) Slope of the peak exercise ST segment__: The slope of the peak exercise ST segment.

        0: Upsloping
        1: Flat
        2: Downsloping
        
__12) Number of major vessels colored by fluoroscopy__: The number of major vessels (ranging from 0 to 4) colored by fluoroscopy.

__13) Thalassemia__: A blood disorder involving lower-than-normal amounts of an oxygen-carrying protein.

        0: Normal
        1: Fixed defect
        2: Reversible defect
        3: Not described
        
__14) Target variable__: Indicates the presence or absence of heart disease.

        1: Presence of heart disease
        0: Absence of heart disease
        
These descriptions provide a comprehensive understanding of each feature in the dataset, which is crucial for data exploration and preprocessing steps.

In [None]:
# Re-print the column dtypes before the categorical columns are re-typed below.
df.info()

#### __Note:__ 

>**Even though some of our columns (sex, cp, fbs, restecg, exang, slope, ca, thal, and target) contain numbers, those numbers are codes for categories rather than measurements. For example, 'sex' uses 1 for male and 0 for female; the values label groups and have no numeric meaning. To analyze and interpret these columns correctly, we convert their data type to "object" (the pandas dtype used for text/categorical values).**

In [None]:
# Features that are genuine measurements; every other column is a coded category
cont_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Categorical columns = all columns of df that are not listed in cont_cols
cate_cols = df.columns.difference(cont_cols)

# Re-type the categorical columns as `object` in a single cast
df[cate_cols] = df[cate_cols].astype('object')

# Confirm the resulting dtype of every column
print(df.dtypes)


## 2. Summary Statistics for Numerical Variables

In [None]:
# Summary statistics, one row per feature.
# describe() covers only numeric columns by default, so the columns re-typed
# to `object` above are left out of this table.
df.describe().T

__Numerical Variables Summary:__

 1. The dataset covers a wide age range from 29 to 77 years, with a mean age of 54.43 years.
 2. Resting blood pressure ranges from 94 to 200 mm Hg, with an average of 131.61 mm Hg.
 3. Serum cholesterol levels exhibit significant variability, ranging from 126 to 564 mg/dL, with an average of 246.0 mg/dL.
 4. Maximum heart rate achieved also shows variability, with values ranging from 71 to 202 bpm, and a mean of 149.11 bpm.
 5. ST depression induced by exercise (oldpeak) ranges from 0.0 to 6.2, with an average of 1.07, indicating variability in the dataset.

## 3. Summary Statistics for Categorical Variables

In [None]:
# Summary of the object-typed (categorical) columns: count, number of unique
# values, most frequent value (top) and its frequency (freq).
df.describe(include='object')

#### Categorical Variables Summary 

 1. Sex: The majority of the dataset consists of males (1), representing about 69.56% of the population.
 2. Chest Pain Type (cp): The most common chest pain type is 0, making up approximately 48.49% of the observations.
 3. Fasting Blood Sugar (fbs): Most individuals do not have fasting blood sugar greater than 120 mg/dL, with 85.07% of the values being 0.
 4. Resting Electrocardiographic Results (restecg): The most frequent resting ECG result is 1, accounting for roughly 50.05% of the cases.
 5. Exercise Induced Angina (exang): A large portion of individuals do not experience exercise-induced angina, with 66.34% of the values being 0.
 6. Slope of the Peak Exercise ST Segment (slope): The most common slope value is 1, which appears 47.02% of the time.
 7. Number of Major Vessels Colored by Fluoroscopy (ca): Most individuals have 0 major vessels colored by fluoroscopy, comprising about 56.39% of the observations.
 8. Thalassemia (thal): The most common thalassemia value is 2, representing 53.07% of the observations.
 9. Target: There is a near-even split in the target variable, with a slight majority (51.32%) indicating the presence of heart disease (1).

## 4. Check for missing values and handle them appropriately (e.g., imputation or removal).

#### Handling Missing Values

In [None]:
# Number of missing (null) entries in each column.
df.isnull().sum()

**Since our dataset doesn't have any missing values, we don’t need to perform imputation (filling in missing data) or removal (discarding incomplete data).**

## 5. Removing Unnecessary Columns

   >**Based on our analysis, all the features in the dataset seem important. None of the columns appear to be unnecessary or unimportant. Therefore, we will keep all the features to make sure we don't lose any valuable information, especially since the dataset is small.**

## 6. EDA (Exploratory Data Analysis)

   >**Perform exploratory data analysis (EDA) to understand the distribution of features, identify outliers, and visualize relationships between features and the target variable.**

### 6.1) Visualization For Numerical Variables

In [None]:
# Age distribution: 10-bin histogram with a KDE curve drawn on top
plt.figure(figsize=(8, 6))
ax = sns.histplot(df['age'], color='#0D0DF4', bins=10, kde=True)

# Title, axis labels and grid are set on the Axes that seaborn returned
ax.set_title('Distribution of Age Variable', fontweight='bold')
ax.set_xlabel('Age', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')
ax.grid(True)

plt.show()


### Summary :

  1. The height of each bar shows how many people fall into that age range. The taller the bar, the more people there are in that age group.

 2. Looking at the chart, you can see that one bar is clearly taller than the others. This is the most common age range, which we call the mode. In this case, it seems like there are more people in their 50s and 60s compared to other age groups.

  3. As you move towards the right side of the chart (older ages), the bars generally get shorter. This suggests that there are fewer and fewer people as we go up in age. This pattern is called a positive skew.

In [None]:
# Number of rows per value of the 'sex' column
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x='sex', palette='Set1')

# Title and axis labels
ax.set_title('Distribution of Sex Variable', fontweight='bold')
ax.set_xlabel('Sex', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')

plt.show()

In [None]:
# Oldpeak - density plot (kernel density estimate of ST depression)
plt.figure(figsize=(8,4))
# `fill=True` shades the area under the curve; it replaces `shade=True`,
# which seaborn has deprecated.
sns.kdeplot(df['oldpeak'], fill=True, color='#154360')
plt.title('Density Plot of ST Depression Induced by Exercise (Oldpeak)', fontsize=10, fontweight='bold')
plt.xlabel('ST Depression (Oldpeak)', fontsize=14, fontweight='bold')
plt.ylabel('Density', fontsize=10, fontweight='bold')
plt.show()

### Summary :
   >**The shape of the curve in the plot is important. This density plot is not bell-shaped: it peaks near 0 and has a long tail toward higher values (a right-skewed distribution). This means that most people show little or no ST depression during exercise, while a small number of people have much higher values (up to 6.2).**

In [None]:
# Set the figure size
plt.figure(figsize=(12, 6))

# Histogram of serum cholesterol (20 bins) with a KDE curve on top
sns.histplot(df['chol'], bins=20, kde=True, color='#154360')

# Add a title and axis labels
plt.title('Distribution of Serum Cholesterol Levels', fontweight='bold', fontsize=16)
plt.xlabel('Serum Cholesterol (mg/dL)', fontweight='bold', fontsize=14)
plt.ylabel('Frequency', fontweight='bold', fontsize=14)

# Add grid lines for better readability
plt.grid(True, linestyle='--', alpha=0.7)

# Mark the mean and the median with labelled vertical reference lines
mean_chol = df['chol'].mean()
median_chol = df['chol'].median()
plt.axvline(mean_chol, color='red', linestyle='--', linewidth=1, label=f'Mean: {mean_chol:.2f}')
plt.axvline(median_chol, color='green', linestyle='--', linewidth=1, label=f'Median: {median_chol:.2f}')

# Draw the legend: without this call the `label=` texts passed to axvline are
# never shown, so the two reference lines cannot be told apart.
plt.legend()

# Show the plot
plt.show()

### Summary :
   >1. The chart has bars of varying heights, which means the number of people with different cholesterol levels varies. There seems to be a peak around 200-250 mg/dL, which suggests this might be the most common cholesterol level in this dataset.
   >2. We can also say that the chart is skewed to the right. This means most people have cholesterol levels at the lower-to-middle end of the range, while a smaller number of people with very high cholesterol levels stretch the tail of the distribution toward the right side of the chart.

## 6.2) Univariate Analysis: Histograms with KDE for numerical variables

In [None]:
# One histogram (with KDE) per continuous feature, laid out on a 2 x 3 grid
plt.figure(figsize=(15, 10))
for position, feature in enumerate(cont_cols, start=1):
    axis = plt.subplot(2, 3, position)
    sns.histplot(df[feature], kde=True, bins=10, color='darkblue', ax=axis)
    axis.set_title(feature)
plt.tight_layout()
# y > 1 places the overall title above the grid of subplots
plt.suptitle('Univariate Analysis of Numerical Features', y=1.02)
plt.show()

### Inference :
 1. __Age__: Most people's ages are spread out, but many are in their late 50s. The average age is about 54 years, with most people being within 9 years of that age.

2. __Resting Blood Pressure__: Most people have a resting blood pressure between 120 and 140 mm Hg. The average is about 132 mm Hg, with most people being within 18 mm Hg of that number.

3. __Serum Cholesterol__: Cholesterol levels for most people are between 200 and 280 mg/dl. The average cholesterol level is around 246 mg/dl, with most people being within 52 mg/dl of that average.

4. __Maximum Heart Rate Achieved__: During a stress test, most people reach a heart rate between 140 and 170 bpm. The average maximum heart rate is about 149 bpm, with most people being within 23 bpm of that number.

5. __ST Depression Induced by Exercise__: Most people have values close to 0, meaning they didn't experience much ST depression during exercise. The average ST depression is about 1, with most people being within 1 of that average.

__After looking at the histograms (which are like bar charts) of the continuous features and checking them against their descriptions, everything looks normal and as expected. There are no unusual or strange values in the continuous variables.__

## 6.3) Bivariate Analysis for Numerical Variables with Target column

In [None]:
# Seaborn theme shared by the plots that follow
sns.set(style="whitegrid", font_scale=1.2)

# Column whose values colour and stack the histograms
target_variable = 'target'

# One stacked histogram (with KDE) per continuous feature, split by target
plt.figure(figsize=(15, 15))
for position, col in enumerate(cont_cols, start=1):
    axis = plt.subplot(3, 2, position)
    sns.histplot(data=df, x=col, hue=target_variable, multiple='stack', kde=True, palette='Set1', ax=axis)
    axis.set_title(f'{col.capitalize()} Distribution by {target_variable.capitalize()}', fontweight='bold')
    axis.set_xlabel(col.capitalize(), fontweight='bold')
    axis.set_ylabel('Frequency', fontweight='bold')
plt.tight_layout()
plt.show()

## Inference : 

1. __Age__: People with heart disease are, on average, a bit younger than those without it. Those without heart disease are usually older.

2. __Resting Blood Pressure__: The blood pressure levels for both groups (with and without heart disease) look very similar, so it’s not very useful for telling the difference between the two groups.

3. __Serum Cholesterol__: Cholesterol levels are also similar for both groups, but people with heart disease have a slightly lower average cholesterol level.

4. __Maximum Heart Rate Achieved__: People with heart disease tend to reach a higher maximum heart rate during stress tests compared to those without heart disease.

5. __ST Depression__: People with heart disease show much less ST depression during exercise. Their results are mostly close to zero, while those without heart disease show a wider range of results.



## 6.4) Visualization for Categorical Variables

In [None]:
# Class balance of the target column, one coloured bar per class
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x='target', hue='target', palette='Set1')

ax.set_title('Distribution of Target Variable', fontweight='bold', fontsize=14)
ax.set_xlabel('Target', fontweight='bold', fontsize=12)
ax.set_ylabel('Count', fontweight='bold', fontsize=12)

# Replace the raw 0/1 legend entries with readable names
ax.legend(labels=['Absence Of Disease', 'Presence Of Disease'], loc='upper center')

# Print the exact class counts alongside the chart
print(df['target'].value_counts())
plt.tight_layout()
plt.show()

###  Count of male and female with the presence and absence of disease

In [None]:
# Number of rows for every (sex, target) combination, as a tidy table with a
# 'count' column; printed next to the plot in the following cell.
sex_target_counts = df.groupby(['sex', 'target']).size().reset_index(name='count')


In [None]:
# Bars per sex, split by target class
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x='sex', hue='target', palette='Set1')
ax.set_title('Distribution of Sex over Target Variable', fontweight='bold')
ax.set_xlabel('Sex', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')
# Relabel tick position 1 as Male and position 0 as Female
plt.xticks([1,0], ["Male", "Female"])
ax.legend(labels=['Absence Of Disease', 'Presence Of Disease'])
# Print the exact counts for each (sex, target) pair
print(sex_target_counts)
plt.show()

In [None]:
# Count values of cp (value_counts orders the result by frequency, not by code)
cp_counts = df['cp'].value_counts()

# Custom color palette, one fixed colour per chest-pain code:
# cp_type = [0: Typical angina, 1: Atypical angina, 2: Non-anginal pain, 3: Asymptomatic]
custom_colors = ['#E59866', '#5DADE2', '#F1948A', '#ABEBC6']

# Pick each slice's colour from its cp code. Passing custom_colors directly
# would hand the colours out in frequency order, so a code could receive a
# colour documented for a different code.
slice_colors = [custom_colors[int(cp_type) % len(custom_colors)] for cp_type in cp_counts.index]

# Plotting the pie chart
plt.figure(figsize=(6,6))
plt.pie(cp_counts, labels=cp_counts.index, autopct='%1.1f%%', startangle=140, colors=slice_colors)
plt.title('Distribution of Chest Pain Types (cp)', fontsize=14, fontweight='bold')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

#### Inference : 
__Typical Angina (0)__: Sometimes the heart's door gets a little blocked, making it harder for blood to get in. This can cause a squeezing or tightness in your chest, like someone is gently hugging you too hard. It usually goes away with rest.

__Atypical Angina (1)__: This is like typical angina, but the feeling might be different. It might be a pain in your arm, jaw, or back instead of your chest. It's important to tell an adult if you feel this kind of pain.

__Non-Anginal Pain (2)__: Not all chest pain comes from the heart! Sometimes you might have a cough, heartburn, or a muscle pull that makes your chest hurt. This kind of pain usually feels different than angina and goes away on its own.

__Asymptomatic (3)__: This is a big word that means "no symptoms." Some people might have problems with their heart but not feel any pain at all. It's important for grown-ups to get checkups to make sure everything is working well, even if they don't feel sick.

## Show fasting blood sugar distribution according to sex variable
        

In [None]:
# Number of rows for every (fbs, sex) combination, with a 'count' column.
# NOTE(review): despite the name, this groups by sex, not by target.
fbs_target_counts = df.groupby(['fbs', 'sex']).size().reset_index(name='count')
# 1 true = above 120 mg/dl , 0 false = below 120 mg/dl

In [None]:
# Bars per fasting-blood-sugar flag, split by sex
ax = sns.countplot(data=df, x='fbs', hue='sex', palette='Set1')
ax.set_title('Distribution of FBS over Sex Variable', fontweight='bold')
ax.set_xlabel('Fasting Blood Sugar(<= 120 mg/dl, > 120 mg/dl)', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')
ax.legend(labels=['Female','Male'])
# Show the 0/1 flag as False/True on the x axis
plt.xticks([0, 1], ['False', 'True'])
# Print the exact counts for each (fbs, sex) pair
print(fbs_target_counts)
plt.show()

In [None]:
# Keep every categorical column except the label itself, as a plain list
cate_cols = [name for name in cate_cols if name not in ('target',)]
cate_cols

## 6.5) Univariate Analysis of Categorical Features

In [None]:
def _label_bars_with_counts(axis):
    """Write each bar's height, as an integer, just above the bar."""
    for p in axis.patches:
        bar_top_center = (p.get_x() + p.get_width() / 2., p.get_height())
        axis.annotate(
            f'{int(p.get_height())}',
            bar_top_center,
            ha='center', va='center',
            fontsize=11, color='black',
            xytext=(0, 5),  # nudge the text 5 points above the bar top
            textcoords='offset points',
        )


# One count plot per categorical feature on a 4 x 2 grid
plt.figure(figsize=(12, 20))
for position, feature in enumerate(cate_cols, start=1):
    plt.subplot(4, 2, position)
    axis = sns.countplot(data=df, x=feature, color='#2980B9')
    _label_bars_with_counts(axis)
    plt.title(feature, fontsize=14, fontweight='bold')

plt.tight_layout()
# y > 1 places the overall title above the grid of subplots
plt.suptitle('Univariate Analysis of Categorical Features', fontsize=16, fontweight='bold', y=1.02)
plt.show()


## Inference : 
1. __Gender__: There are more males in the dataset than females.

2. __Type of Chest Pain__: There are different types of chest pain in the dataset. The most common type is "Typical angina," but you can see the exact numbers in the bar charts.

3. __Fasting Blood Sugar__: Most patients have a fasting blood sugar level below 120 mg/dl, so high blood sugar isn't common in this dataset.

4. __Resting Electrocardiographic Results__: There are various outcomes for the resting electrocardiogram. Some types are more common than others, and you can see the details in the plots.

5. __Exercise-Induced Angina__: Most patients do not have angina (chest pain) during exercise, suggesting it's not a common symptom here.

6. __Slope of the Peak Exercise ST Segment__: The dataset shows different types of slopes for the ST segment during peak exercise. One type might be more common, and you can check the exact numbers in the bar charts.

7. __Number of Major Vessels Colored by Fluoroscopy__: Most patients have fewer major vessels colored, with '0' being the most common number.

8. __Thallium Stress Test Result__: There are different results from the thallium stress test. One type is more common, but you can see the exact details in the plots.

9. __Presence of Heart Disease__: About 51.3% of the patients have heart disease, while 48.7% do not, so the dataset is almost evenly split.

## 6.6) Bivariate Analysis of Categorical Features by Target Variable

In [None]:
# Seaborn theme shared by the plots that follow
sns.set(style="whitegrid", font_scale=1.2)

# Column whose values colour and stack the bars
target_variable = 'target'

# One stacked histogram per categorical feature, split by target, on a 4 x 2 grid
plt.figure(figsize=(12, 20))

for position, col in enumerate(cate_cols, start=1):
    plt.subplot(4, 2, position)
    ax = sns.histplot(data=df, x=col, hue=target_variable, multiple='stack', palette='Set1', edgecolor='black')

    # Title and y label; the x label is left as seaborn sets it
    ax.set_title(f'{col.capitalize()} Distribution by {target_variable.capitalize()}', fontsize=16, fontweight='bold', pad=10)
    ax.set_ylabel('Frequency', fontsize=14, fontweight='bold')

    # Dashed, semi-transparent grid lines for readability
    ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## Inference : 
1. __Number of Major Vessels__: Most people with heart disease have fewer major vessels showing up in the test. If someone has 0 vessels colored, they are more likely to have heart disease.

2. __Chest Pain Type__: Different types of chest pain are linked to heart disease in different ways. Types 1, 2, and 3 are more common in people with heart disease, suggesting that chest pain type can help predict the disease.

3. __Exercise-Induced Angina__: People who don’t have chest pain during exercise are more likely to have heart disease compared to those who do have chest pain during exercise. This feature seems important for predicting heart disease.

4. __Fasting Blood Sugar__: There isn’t a big difference in heart disease rates between people with high fasting blood sugar and those with normal levels. This suggests that fasting blood sugar might not be very useful for predicting heart disease.

5. __Resting Electrocardiographic Results__: The type of resting electrocardiogram result can indicate heart disease. Specifically, type 1 is more common in people with heart disease.

6. __Sex__: Women are less likely to have heart disease compared to men, suggesting that gender plays a role in predicting heart disease.

7. __Slope of the Peak Exercise ST Segment__: The type 2 slope is more common in people with heart disease, making it a useful indicator for prediction.

8. __Thallium Stress Test Result__: The category showing a reversible defect (2) is more common in people with heart disease, highlighting its importance in prediction.


## 7) Exploring Data Correlations with a Heatmap

>**Correlation heatmap is to visually represent the strength and direction of relationships between multiple variables in a dataset**

In [None]:
# Create a heatmap of correlations.
# The categorical columns (including 'target') were re-typed to `object`
# earlier in the notebook. How DataFrame.corr() treats non-numeric columns
# depends on the pandas version, and they can be dropped silently, which would
# leave the target out of the heatmap. Casting to float first guarantees that
# every feature and the target are in the matrix.
corr_matrix = df.astype(float).corr()

plt.figure(figsize=(15,8))
sns.heatmap(corr_matrix, annot=True, cmap='cividis', linewidths=0.5, fmt='.2f', cbar=True)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## Inference :
>1. High Positive Correlation on Target: __cp, thalach and slope__
>2. High Negative Correlation on Target: __exang, oldpeak, ca and thal__
>3. Moderate Negative Correlation on Target: __age and sex__
>4. Less Correlation on Target: __trestbps, chol, fbs and restecg__

## 8) Fixing Outliers in the dataset
 >I'm going to use the IQR method to identify outliers in the numerical features.
 > Identifying these outliers helps improve the accuracy of our analysis.

In [None]:
# Show the list of continuous columns that the outlier check below runs on.
cont_cols

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column.
numeric_view = df[cont_cols]
Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (numeric_view < lower_fence) | (numeric_view > upper_fence)

# Number of flagged values in each continuous column
outliers_count = is_outlier.sum()

print("Outliers Count:")
print(outliers_count)

We observed outliers in the continuous variables we analyzed, i.e., data points that are distinct from the others.


1. __trestbps__ : 30 outliers
2. __chol__ : 16 outliers
3. __thalach__ : 4 outliers
4. __oldpeak__ : 7 outliers
5. __age__ : No outliers

## 9) One Hot Encoding

In [None]:
# One-hot encode the multi-class nominal features; drop_first removes the
# first level of each feature so the dummy columns are not redundant.
df_encoded = pd.get_dummies(df, columns=['cp', 'restecg', 'thal'], drop_first=True)

# Store the generated indicator columns as compact unsigned integers
dummy_columns = ['cp_1', 'cp_2', 'cp_3','restecg_1', 'restecg_2', 'thal_1', 'thal_2','thal_3']
df_encoded[dummy_columns] = df_encoded[dummy_columns].astype('uint8')

# The remaining categorical columns are not one-hot encoded; cast them from
# object back to integers in a single call
features_to_convert = ['sex', 'fbs', 'exang', 'slope', 'ca', 'target']
df_encoded = df_encoded.astype({feature: int for feature in features_to_convert})

df_encoded.dtypes

In [None]:
# Displaying the resulting DataFrame after one-hot encoding (first rows only;
# rendered by the notebook because it is the cell's last expression).
df_encoded.head()

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = df_encoded['target']
X = df_encoded.drop(columns=['target'])

In [None]:
# Splitting data into train and test sets: 20% of the rows are held out for
# testing, and random_state fixes the shuffle so the split is reproducible.
# NOTE(review): this split is not stratified, while the validation split in
# the next cell passes stratify= — confirm whether the difference is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [None]:
# Carve a stratified validation set (25% of the training rows) out of the
# training split.
# NOTE(review): X_train_cv / X_val / y_train_cv / y_val are not referenced by
# any model cell visible in this notebook; all of them fit on X_train.
X_train_cv, X_val, y_train_cv, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

In [None]:
# Report (features shape, labels shape) for each split
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print("Training set shape:", *train_shapes)
print("Testing set shape:", *test_shapes)

In [None]:


# Apply Box-Cox transformation to 'oldpeak'.
# The train/test split has already been made and the models are fitted and
# evaluated on X_train / X_test, so the transform has to be applied to those
# frames; transforming only X would never reach the models. The lambda is
# estimated on the training rows only and re-used for the test rows, so the
# test set does not influence the preprocessing.
X_train = X_train.copy()
X_test = X_test.copy()

# Add 1 to handle zeros: Box-Cox requires strictly positive input
X_train['oldpeak'], oldpeak_lambda = boxcox(X_train['oldpeak'] + 1)
X_test['oldpeak'] = boxcox(X_test['oldpeak'] + 1, lmbda=oldpeak_lambda)

# Display the first few rows of the transformed training features
X_train.head()


## 10) Decision Tree Model Building

In [None]:
# Define the base DT model; hyperparameters are left at their defaults here
# and tuned by grid search below. random_state makes the tree reproducible.
dt_base = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.utils import shuffle

def tune_clf_hyperparameters(clf, param_grid, X_train, y_train, scoring='recall', n_splits=3):
    """
    Grid-search the hyperparameters of a classifier with stratified cross-validation.

    Parameters:
        clf: the estimator (or pipeline) to tune.
        param_grid: dict mapping parameter names to the candidate values to try.
        X_train, y_train: training features and labels the search is fitted on.
        scoring: metric GridSearchCV optimises (default 'recall').
        n_splits: number of cross-validation folds (default 3).

    Returns:
        (best_estimator, best_hyperparameters): the `best_estimator_` of the
        fitted search and the parameter combination that produced it.
    """
    # Shuffle the data to ensure random distribution before cross-validation
    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # Stratified folds keep the class ratio of y_train in every fold. With a
    # plain KFold and only a few folds, the share of positive cases can differ
    # between folds, which makes a class-sensitive score such as recall noisy.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Exhaustive search over param_grid, using all available cores
    clf_grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1)

    # Fit the GridSearchCV object to the training data
    clf_grid.fit(X_train, y_train)

    # Get the best hyperparameters
    best_hyperparameters = clf_grid.best_params_

    # Return the best model and best hyperparameters
    return clf_grid.best_estimator_, best_hyperparameters


In [None]:
# Hyperparameter grid for DT: every combination of the values below is tried.
param_grid_dt = {
    'criterion': ['gini', 'entropy'],   # split-quality measure
    'max_depth': [2,3],                 # only shallow trees are considered
    'min_samples_split': [2, 3, 4],     # minimum samples needed to split a node
    'min_samples_leaf': [1, 2]          # minimum samples allowed in a leaf
}

In [None]:
# Tune the decision tree on the training split with the helper's defaults
# (scoring='recall', n_splits=3).
best_dt, best_dt_hyperparams = tune_clf_hyperparameters(dt_base, param_grid_dt, X_train, y_train)

In [None]:
# Show the parameter combination selected by the grid search.
print('DT Optimal Hyperparameters: \n', best_dt_hyperparams)

In [None]:
# Evaluate the optimized model on the train data (per-class precision, recall
# and F1); compare with the test report below to spot overfitting.
print(classification_report(y_train, best_dt.predict(X_train)))

In [None]:
# Evaluate the optimized model on the test data (rows held out of training).
print(classification_report(y_test, best_dt.predict(X_test)))

In [None]:
# Confusion matrix of the tuned decision tree on the test split
dt_test_predictions = best_dt.predict(X_test)
cm_dt = confusion_matrix(y_test, dt_test_predictions)

plt.figure(figsize=(5,5))
sns.set_context('notebook', font_scale=0.9)
ax = sns.heatmap(cm_dt, annot=True, fmt='d', cmap="Blues", cbar=False)
ax.set_title('Decision Tree Confusion Matrix')
ax.set_xlabel("Predicted_Value")
ax.set_ylabel("True_Value")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate_model(model, X_test, y_test, model_name):
    """
    Score a fitted model on held-out data and return the metrics as a table.

    The result is a one-row DataFrame indexed by `model_name`, holding
    precision, recall and F1 for class 0 and class 1 plus overall accuracy,
    each rounded to two decimals.
    """
    predictions = model.predict(X_test)

    # Per-class scores are obtained by treating each label in turn as the
    # positive class via pos_label.
    metrics = {
        "precision_0": precision_score(y_test, predictions, pos_label=0),
        "precision_1": precision_score(y_test, predictions, pos_label=1),
        "recall_0": recall_score(y_test, predictions, pos_label=0),
        "recall_1": recall_score(y_test, predictions, pos_label=1),
        "f1_0": f1_score(y_test, predictions, pos_label=0),
        "f1_1": f1_score(y_test, predictions, pos_label=1),
        "accuracy": accuracy_score(y_test, predictions),
    }

    # One row, labelled with the model's name
    return pd.DataFrame(metrics, index=[model_name]).round(2)


In [None]:
# One-row metrics table for the tuned decision tree on the test split;
# the trailing expression renders it in the notebook.
dt_evaluation = evaluate_model(best_dt, X_test, y_test, 'DT')
dt_evaluation

## 11) Logistic Regression Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Base estimator with a large iteration budget (max_iter=10000) for the solvers
log_reg = LogisticRegression(max_iter=10000, random_state=42)

# Candidate settings; every combination is evaluated
param_grid_log_reg = {
    'penalty': ['l1', 'l2'],            # Penalty terms
    'solver': ['liblinear', 'saga'],    # Solvers compatible with L1/L2 penalties
    'C': [0.01, 0.1, 1, 10, 100],       # Regularization strength
}

# 5-fold grid search that selects the combination with the best recall
grid_search_log_reg = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_log_reg,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_search_log_reg.fit(X_train, y_train)

# Winning parameters and their mean cross-validated recall
print("Logistic Regression Best Params:", grid_search_log_reg.best_params_)
print("Logistic Regression Best Recall Score:", grid_search_log_reg.best_score_)





In [None]:
# Classification report of the tuned logistic regression on the training split
y_pred = grid_search_log_reg.predict(X_train)
train_report = classification_report(y_train, y_pred)
print("Logistic Regression Classification Report:\n", train_report)

In [None]:
# Classification report of the tuned logistic regression on the test split;
# y_pred is kept because the confusion-matrix cell that follows reads it
y_pred = grid_search_log_reg.predict(X_test)
test_report = classification_report(y_test, y_pred)
print("Logistic Regression Classification Report:\n", test_report)

In [None]:
# Confusion matrix of the logistic regression, built from the y_pred
# computed in the previous cell
cm_lr = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(5,5))
sns.set_context('notebook', font_scale=0.9)
ax = sns.heatmap(cm_lr, annot=True, fmt='d', cmap="Blues", cbar=False)
ax.set_title('Logistic Regression Confusion Matrix')
ax.set_xlabel("Predicted_Value")
ax.set_ylabel("True_Value")
plt.tight_layout()
plt.show()

In [None]:
# One-row metrics table for the grid-searched logistic regression on the test
# split. The fitted GridSearchCV object is passed directly; evaluate_model
# only calls .predict() on it.
logistic_evaluation = evaluate_model(grid_search_log_reg, X_test, y_test, 'LR')
logistic_evaluation

## 12) Random Forest Model Building

In [None]:
# Base random forest with default settings; tuned by grid search below.
# random_state makes the forest reproducible.
rf_base = RandomForestClassifier(random_state=42)

In [None]:
# Hyperparameter grid for RF: every combination of the values below is tried.
param_grid_rf = {
    'n_estimators': [10, 30, 50, 70, 100],  # number of trees in the forest
    'criterion': ['gini', 'entropy'],       # split-quality measure
    'max_depth': [2, 3, 4],                 # only shallow trees are considered
    'min_samples_split': [2, 3, 4, 5],      # minimum samples needed to split a node
    'min_samples_leaf': [1, 2, 3],          # minimum samples allowed in a leaf
    'bootstrap': [True, False]              # with / without bootstrap sampling
}

In [None]:
# Using the tune_clf_hyperparameters function to get the best estimator
# (helper defaults apply: scoring='recall', n_splits=3).
best_rf, best_rf_hyperparams = tune_clf_hyperparameters(rf_base, param_grid_rf, X_train, y_train)
print('RF Optimal Hyperparameters: \n', best_rf_hyperparams)

In [None]:
# Evaluate the optimized model on the train data (per-class precision, recall
# and F1); compare with the test report below to spot overfitting.
print(classification_report(y_train, best_rf.predict(X_train)))

In [None]:
# Evaluate the optimized model on the test data (rows held out of training).
print(classification_report(y_test, best_rf.predict(X_test)))

In [None]:
# Confusion matrix of the tuned random forest on the test split
rf_test_predictions = best_rf.predict(X_test)
cm_rf = confusion_matrix(y_test, rf_test_predictions)

plt.figure(figsize=(5,5))
sns.set_context('notebook', font_scale=0.9)
ax = sns.heatmap(cm_rf, annot=True, fmt='d', cmap="Blues", cbar=False)
ax.set_title('Random Forest Confusion Matrix')
ax.set_xlabel("Predicted_Value")
ax.set_ylabel("True_Value")
plt.tight_layout()

In [None]:
# One-row metrics table for the tuned random forest on the test split;
# the trailing expression renders it in the notebook.
rf_evaluation = evaluate_model(best_rf, X_test, y_test, 'RF')
rf_evaluation

### Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the continuous columns.
#
# The scaler is fitted on the training split only, so the minimum/maximum of
# the test rows never influence preprocessing (no data leakage). The fitted
# transform is then applied to the splits themselves: X_train / X_test were
# created before this cell (the earlier models already use them), so scaling
# only `X` would never reach the models trained below.
# NOTE(review): models fitted above this cell saw the unscaled split; run the
# notebook top to bottom so they are not re-evaluated on scaled inputs.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train[cont_cols])

# Work on copies so the column assignments cannot write through to another frame
X_train = X_train.copy()
X_test = X_test.copy()
X_train[cont_cols] = min_max_scaler.transform(X_train[cont_cols])
X_test[cont_cols] = min_max_scaler.transform(X_test[cont_cols])

# Keep the full feature frame consistent with the splits (same fitted scaler)
X[cont_cols] = min_max_scaler.transform(X[cont_cols])

# Display the first few rows of the scaled dataset
X.head()

## 13) Logistic Regression Model Building

In [None]:
# Base logistic-regression estimator handed to the hyperparameter search below.
# The seed is pinned (same value as rf_base) because the liblinear solver used
# in param_grid_logistic is randomised, so unseeded fits can differ between runs.
logistic_base = LogisticRegression(random_state=42)

In [None]:
# Search space for logistic regression: both penalty types across a
# logarithmic range of C, with the liblinear solver
param_grid_logistic = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear'],
)

In [None]:
# Tune logistic regression on the training split with tune_clf_hyperparameters
# (defined earlier in the notebook)
best_logistic, best_logistic_hyperparams = tune_clf_hyperparameters(
    logistic_base,
    param_grid_logistic,
    X_train,
    y_train,
)

# Show which hyperparameters were selected
print('Logistic Regression Optimal Hyperparameters: \n', best_logistic_hyperparams)

In [None]:
# Classification report of the tuned logistic regression on the training split
print(
    classification_report(
        y_true=y_train,
        y_pred=best_logistic.predict(X_train),
    )
)

In [None]:
# Classification report of the tuned logistic regression on the held-out test split
print(
    classification_report(
        y_true=y_test,
        y_pred=best_logistic.predict(X_test),
    )
)

In [None]:
# Plotting the Confusion Matrix for the tuned Logistic Regression model.
# Stored under its own name so the Random Forest matrix (cm_rf) computed
# earlier is not overwritten by logistic-regression counts.
cm_logistic = confusion_matrix(y_test, best_logistic.predict(X_test))
plt.figure(figsize=(5, 5))
sns.set_context('notebook', font_scale=0.9)
sns.heatmap(cm_logistic, annot=True, fmt='d', cmap="Blues", cbar=False)
plt.title('Logistic Regression Confusion Matrix')
plt.xlabel("Predicted_Value")
plt.ylabel("True_Value")
plt.tight_layout()
# Render explicitly so the figure is also shown outside an inline backend
plt.show()

In [None]:
# Evaluation summary for the tuned logistic regression, labelled 'LR'
# (evaluate_model is defined earlier in the notebook)
logistic_evaluation = evaluate_model(
    best_logistic,
    X_test,
    y_test,
    'LR',
)
# Bare name as the last expression so the notebook renders it as cell output
logistic_evaluation

## 14) Support Vector Machine

In [None]:
# Two-step pipeline: standardise the features, then fit a support vector
# classifier. The step names ('scaler', 'svm') are the prefixes used by the
# 'svm__*' keys of the parameter grid.
svm_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC()),
    ]
)

In [None]:
# Search space for the SVM step of the pipeline: C is held at 5 and only the
# kernel varies
param_grid_svm = dict(
    svm__C=[5],
    svm__kernel=['linear', 'rbf', 'poly'],
)

In [None]:
# Tune the SVM pipeline on the training split with tune_clf_hyperparameters
# (defined earlier in the notebook)
best_svm, best_svm_hyperparams = tune_clf_hyperparameters(
    svm_pipeline,
    param_grid_svm,
    X_train,
    y_train,
)
print('SVM Optimal Hyperparameters: \n', best_svm_hyperparams)

In [None]:
# Classification report of the tuned SVM pipeline on the training split
print(
    classification_report(
        y_true=y_train,
        y_pred=best_svm.predict(X_train),
    )
)

In [None]:
# Classification report of the tuned SVM pipeline on the held-out test split
print(
    classification_report(
        y_true=y_test,
        y_pred=best_svm.predict(X_test),
    )
)

In [None]:
# Confusion-matrix heatmap for the tuned SVM pipeline on the test split
cm_svc = confusion_matrix(y_true=y_test, y_pred=best_svm.predict(X_test))

plt.figure(figsize=(5, 5))
sns.set_context('notebook', font_scale=0.9)
sns.heatmap(
    cm_svc,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap="Blues",
    cbar=False,
)

plt.title('Support Vector Confusion Matrix')
plt.xlabel("Predicted_Value")
plt.ylabel("True_Value")
plt.tight_layout()
plt.show()

In [None]:
# Evaluation summary for the tuned SVM pipeline, labelled 'SVM'
# (evaluate_model is defined earlier in the notebook)
svm_evaluation = evaluate_model(
    best_svm,
    X_test,
    y_test,
    'SVM',
)
# Bare name as the last expression so the notebook renders it as cell output
svm_evaluation

In [None]:
# Per-model evaluation frames, collected in one list
all_evaluations = [dt_evaluation, rf_evaluation, logistic_evaluation, svm_evaluation]

# Stack them into one table, rank by class-1 recall (best first) and round
# every metric to two decimals for display
results = (
    pd.concat(all_evaluations)
    .sort_values(by='recall_1', ascending=False)
    .round(2)
)
results

In [None]:


# Bar chart of class-1 recall per algorithm, highest recall first.
# `results` is the combined evaluation table built in the previous cell.
results_sorted = results.sort_values(by='recall_1', ascending=False)
recall_values = results_sorted['recall_1']

# Plot the bar chart
plt.figure(figsize=(12, 7), dpi=70)
plt.bar(results_sorted.index, recall_values, color='darkblue')

# Annotate each bar with its recall value, just above the top of the bar.
# Bars are drawn in row order, so the enumeration position is the x location.
for position, recall in enumerate(recall_values):
    plt.text(position, recall + 0.01, f"{recall:.2f}", ha='center', va='bottom', fontweight='bold', fontsize=12)

# Add title and labels
plt.title("Recall Values for Different Algorithms", fontweight='bold', fontsize=22)
plt.xlabel('Algorithm', fontsize=16, fontweight='bold')
plt.ylabel('Recall Value', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right', fontweight='bold')

# Leave headroom above 1.0 so the annotation of a perfect score stays visible
plt.ylim(0, 1.1)

# Show plot
plt.tight_layout()
plt.show()


The SVM model is the best at identifying patients who have heart disease: it reaches a recall of __0.97__ for class 1 (people with heart disease), which means it correctly catches almost all of them. High recall is very important in medical situations, where missing a real case is usually more costly than raising a false alarm. While the model focuses on catching as many cases as possible, it still does not produce too many false alerts.

## 15) Prediction

In [None]:
import numpy as np
import ipywidgets as widgets
from IPython.display import display, HTML

# Function to make a prediction based on user input
def make_prediction(features):
    """Predict the class of one patient with the tuned random forest.

    Parameters
    ----------
    features : sequence of numbers
        One value per model input for a single patient.

    Returns
    -------
    The result of ``best_rf.predict`` for that single row (callers read
    element ``[0]``).
    """
    # Explicit 1 x n_features row instead of a nested Python list
    row = np.asarray(features, dtype=float).reshape(1, -1)

    # If the model recorded column names when it was fitted, pass the row as a
    # DataFrame with those names so predict receives the same kind of input.
    # NOTE(review): values are matched to columns by position - confirm the
    # caller supplies them in the training column order.
    fitted_names = getattr(best_rf, "feature_names_in_", None)
    if fitted_names is not None and len(fitted_names) == row.shape[1]:
        row = pd.DataFrame(row, columns=fitted_names)

    return best_rf.predict(row)

# Model inputs, in the order in which their values are passed to make_prediction
feature_names = [
    "age", "sex", "trestbps", "chol", "fbs", "thalach", "exang", "oldpeak",
    "slope", "ca",
    "cp_1", "cp_2", "cp_3",
    "restecg_1", "restecg_2",
    "thal_1", "thal_2", "thal_3",
]

# One numeric input box per feature, keyed by feature name, starting at 0.0
input_widgets = dict(
    (name, widgets.FloatText(value=0.0, description=f'{name}:'))
    for name in feature_names
)

# Button that triggers a prediction
predict_button = widgets.Button(description="Predict")

# Area in which the prediction is rendered
output = widgets.Output()

# Function to handle button click event
def on_predict_button_click(button):
    """Read the current widget values, predict, and show the result.

    ``button`` is the widget that fired the event; it is not used.
    """
    # Collect the inputs in feature_names order
    values = [input_widgets[key].value for key in feature_names]
    predicted = make_prediction(values)
    message = HTML(f"<b>Prediction:</b> {predicted[0]}")

    # Replace whatever the output area showed before with the new prediction
    with output:
        output.clear_output()
        display(message)

# Run the handler whenever the button is clicked
predict_button.on_click(on_predict_button_click)

# Stack the input boxes with the button underneath them, and render the
# output area below that column
display(
    widgets.VBox([*input_widgets.values(), predict_button]),
    output,
)
