In [None]:
#We import main libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset from 'heart.csv' (path is relative to the notebook's
# working directory) into `df`, then preview the first rows.
df = pd.read_csv('heart.csv')
df.head()

In [None]:
# Number of missing values in each column
df.isna().sum()


In [None]:
# Summary statistics from DataFrame.describe() (covers the numeric columns by default)
df.describe()

In [None]:
# Age distribution: 20-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Age', kde=True, color='skyblue', bins=20, ax=ax)
ax.set_title('Histogram of Age', fontsize=16, fontweight='bold')
ax.set_xlabel('Age', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

# Age spread per value of 'Sex', one box per group
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Sex', y='Age', palette='Set2', ax=ax)
ax.set_title('Boxplot of Age Grouped by Sex', fontsize=16, fontweight='bold')
ax.set_xlabel('Sex', fontsize=14)
ax.set_ylabel('Age', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()


In [None]:
# Count the number of occurrences of each heart-disease class.
# value_counts() orders its result by frequency, not by class value, so the
# counts are re-ordered to 0, 1 explicitly; otherwise the labels and colours
# below would be swapped whenever class 1 is the more frequent one.
# NOTE(review): assumes 'HeartDisease' is coded 0 = no disease, 1 = disease — confirm.
class_labels = {0: 'No Heart Disease', 1: 'Heart Disease'}
heart_disease_count = (
    df['HeartDisease']
    .value_counts()
    .reindex(list(class_labels), fill_value=0)  # fixed order; a missing class counts as 0
)
labels = list(class_labels.values())
colors = ['lightgreen', 'lightcoral']

plt.figure(figsize=(8, 6))
plt.pie(heart_disease_count, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140, shadow=True)
plt.title('Distribution of Heart Disease', fontsize=16, fontweight='bold')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()




In [None]:
# Number of records for each value of 'Sex'
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Sex', palette='Paired', ax=ax)
ax.set_title('Count of Sex', fontsize=16, fontweight='bold')
ax.set_xlabel('Sex', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()




In [None]:
# Count of records per 'Sex', split by heart-disease class
plt.figure(figsize=(10, 8))
ax = sns.countplot(data=df, x='Sex', hue='HeartDisease', palette='Set2')
plt.title('Count of Heart Disease Grouped by Sex', fontsize=16, fontweight='bold')
plt.xlabel('Sex', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Relabel the legend seaborn already built instead of calling
# plt.legend(labels=...): that call rebuilds the legend by artist position,
# so the names are not tied to the hue levels. Mapping by the existing entry
# text keeps each name attached to the right colour; unknown entries are kept.
# NOTE(review): assumes 'HeartDisease' is coded 0 = normal, 1 = disease — confirm.
legend_names = {'0': 'Normal', '1': 'Heart Disease'}
legend = ax.get_legend()
legend.set_title('Heart Disease')
legend.get_title().set_fontsize(12)
for entry in legend.get_texts():
    entry.set_text(legend_names.get(entry.get_text(), entry.get_text()))
    entry.set_fontsize(12)

plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()



In [None]:
# Stacked age histogram, split by heart-disease class
plt.figure(figsize=(10, 8))
ax = sns.histplot(data=df, x='Age', hue='HeartDisease', multiple='stack', palette='viridis', alpha=0.7)
plt.title('Distribution of Heart Disease by Age', fontsize=16, fontweight='bold')
plt.xlabel('Age', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Relabel the legend seaborn already built instead of calling
# plt.legend(labels=...): that call assigns the names by artist position, and
# histplot does not add its artists in hue-level order, so the names can end
# up on the wrong colours. Mapping by the existing entry text avoids this;
# unknown entries are kept unchanged.
# NOTE(review): assumes 'HeartDisease' is coded 0 = normal, 1 = disease — confirm.
legend_names = {'0': 'Normal', '1': 'Heart Disease'}
legend = ax.get_legend()
legend.set_title('Heart Disease')
legend.get_title().set_fontsize(12)
for entry in legend.get_texts():
    entry.set_text(legend_names.get(entry.get_text(), entry.get_text()))
    entry.set_fontsize(12)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()




In [None]:
# NOTE: this changes the global seaborn style for every plot drawn afterwards.
sns.set(style="whitegrid", font_scale=1.2)

# Box plot of age per 'Sex', split by heart-disease class
g = sns.catplot(data=df, x='Sex', y='Age', hue='HeartDisease', kind='box', height=6, aspect=1.5)
g.set_axis_labels('Sex', 'Age')
# FacetGrid.set_titles() only writes titles for row/col facets; this grid has
# neither, so the title is set directly on its single Axes to make it appear.
g.axes.flat[0].set_title('Age Grouped by Sex and Heart Disease')
plt.show()


In [None]:
# Age density split by each categorical feature, one subplot per feature
plt.figure(figsize=(15,13))
cols = ['ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope']
# Use a distinct loop variable: reusing `cols` would rebind the list name to
# the last column string once the loop finishes.
for i, col in enumerate(cols):
    plt.subplot(3,3, (i+1))
    plt.title(f'Age x {col}')
    sns.kdeplot(data = df, x='Age',hue=col)

plt.tight_layout()
plt.show()

In [None]:
# Correlate numeric columns only. The frame also holds string-valued columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 (and warns in 1.5);
# selecting the numeric columns first works on every pandas version.
correlation_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
plt.show()


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Categorical columns to one-hot encode
categorical_cols = ['ChestPainType', 'Sex', 'ExerciseAngina', 'ST_Slope', 'RestingECG']

# Split the data into features (X) and target variable (y)
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# One-hot encode BEFORE splitting. Encoding train and test separately can give
# them different columns (or a different dropped baseline under
# drop_first=True) whenever a category is absent from one split. get_dummies
# learns no statistics from the data, so encoding first leaks nothing.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split the data into training and test sets (same seed and test fraction as before)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training set only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print the shape of the scaled training and test sets
print(X_train_scaled.shape)
print(X_test_scaled.shape)

# Show the scaled data using a boxplot. The scaler returns a bare array, so it
# is wrapped in a DataFrame to put the feature names on the x axis.
plt.figure(figsize=(12, 8))
sns.boxplot(data=pd.DataFrame(X_train_scaled, columns=X_train.columns), palette='viridis')
plt.title('Scaled Features', fontsize=16, fontweight='bold')
plt.xlabel('Features', fontsize=14)
plt.ylabel('Values', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()





In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create a new instance of the Random Forest Classifier.
# The seed is fixed so that Restart & Run All reproduces the same forest and metrics.
rf = RandomForestClassifier(random_state=42)

# Fit the model on the training data
rf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test_scaled)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='viridis', fmt='g')
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

print(classification_report(y_test, y_pred))

# Start the results table with this model's exact test accuracy
rf_accuracy = classification_report(y_test, y_pred, output_dict=True)['accuracy']
results = pd.DataFrame({'Method': ['Random Forest'], 'Accuracy': [rf_accuracy]})

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create a new instance of the Decision Tree Classifier.
# The seed is fixed so that re-running the notebook gives the same tree.
dt = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
dt.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = dt.predict(X_test_scaled)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='viridis', fmt='g')
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

print(classification_report(y_test, y_pred))

# Store the exact accuracy and the model name in the results table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any earlier row for this method is dropped first, so re-running the cell
# does not add a duplicate.
dt_accuracy = classification_report(y_test, y_pred, output_dict=True)['accuracy']
results = pd.concat(
    [
        results[results['Method'] != 'Decision Tree'],
        pd.DataFrame([{'Method': 'Decision Tree', 'Accuracy': dt_accuracy}]),
    ],
    ignore_index=True,
)






In [None]:
from sklearn.linear_model import LinearRegression

# Create an instance of the Linear Regression model
# NOTE(review): a regressor is being used as a classifier here;
# LogisticRegression would be the conventional choice — confirm the intent.
regression = LinearRegression()

# Fit the model on the training data
regression.fit(X_train_scaled, y_train)

# Continuous regression output for the test data
y_score = regression.predict(X_test_scaled)

# Turn the scores into class labels with a 0.5 threshold. Rounding the raw
# output instead can produce values outside {0, 1} (e.g. -1 or 2) for extreme
# predictions, which would show up as extra classes in the metrics below.
# NOTE(review): assumes the target is coded 0/1 — confirm.
y_pred = (y_score >= 0.5).astype(int)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='viridis', fmt='g')
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

print(classification_report(y_test, y_pred))

# Store the exact accuracy and the model name in the results table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any earlier row for this method is dropped first, so re-running the cell
# does not add a duplicate.
lr_accuracy = classification_report(y_test, y_pred, output_dict=True)['accuracy']
results = pd.concat(
    [
        results[results['Method'] != 'Linear Regression'],
        pd.DataFrame([{'Method': 'Linear Regression', 'Accuracy': lr_accuracy}]),
    ],
    ignore_index=True,
)





In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Create a new instance of the KNN Classifier
knn = KNeighborsClassifier()

# Fit the model on the training data
knn.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test_scaled)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='viridis', fmt='g')
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

print(classification_report(y_test, y_pred))

# Store the exact accuracy and the model name in the results table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any earlier row for this method is dropped first, so re-running the cell
# does not add a duplicate.
knn_accuracy = classification_report(y_test, y_pred, output_dict=True)['accuracy']
results = pd.concat(
    [
        results[results['Method'] != 'KNN'],
        pd.DataFrame([{'Method': 'KNN', 'Accuracy': knn_accuracy}]),
    ],
    ignore_index=True,
)



In [None]:
# Compare the test accuracy of the models collected in `results`
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=results, x='Method', y='Accuracy', palette='viridis', ax=ax)
ax.set_title('Model Accuracy', fontsize=16, fontweight='bold')
ax.set_xlabel('Method', fontsize=14)
ax.set_ylabel('Accuracy', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a new instance of the Random Forest Classifier.
# The seed is fixed so that the search and the selected model are reproducible.
rf = RandomForestClassifier(random_state=42)

# Create a GridSearchCV object: 5-fold CV scored on accuracy.
# The grid has 81 combinations (405 fits), so all CPU cores are used.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the whole training set. Reuse that model rather than
# training a second, unseeded forest that could differ from the one scored.
best_rf = grid_search.best_estimator_

# Make predictions on the test data using the tuned model
y_pred = best_rf.predict(X_test_scaled)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='viridis', fmt='g')
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

print(classification_report(y_test, y_pred))


In [None]:
# Show the ROC curve and the AUC of the tuned Random Forest
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probabilities, taking the second column of predict_proba
y_pred_proba = best_rf.predict_proba(X_test_scaled)[:, 1]

# Get the fpr, tpr, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Area under the ROC curve
auc_score = roc_auc_score(y_test, y_pred_proba)

# Plot the ROC curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Random Forest')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.title('ROC Curve', fontsize=16, fontweight='bold')
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend()
# Render the figure explicitly, so it also appears outside a notebook and
# before the printed score.
plt.show()

# Print the AUC score
print('AUC Score:', auc_score)


In [None]:
# Importance scores reported by the tuned Random Forest
importances = best_rf.feature_importances_

# Pair each score with its feature name and rank from most to least important
feature_importances = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=feature_importances, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Feature Importances', fontsize=16, fontweight='bold')
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

