In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the dataset

In [None]:
# Read the heart-failure CSV into a DataFrame and preview its first rows.
heart_csv_path = "/kaggle/input/heart-failure-prediction/heart.csv"
data = pd.read_csv(heart_csv_path)
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Print a per-column summary of the frame (dtypes and non-null counts).
data.info()

In [None]:
# (number of rows, number of columns) of the dataset.
data.shape

In [None]:
# Count the missing (null) entries in every column of the raw frame.
missing_values = data.isna().sum()
print(missing_values)

# Data Pre-processing

In [None]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the predictor columns.
y = data['HeartDisease']
X = data.drop(columns='HeartDisease')

# One-hot encode the categorical predictors; drop_first=True keeps k-1 indicator
# columns for a k-level category.
X = pd.get_dummies(X, drop_first=True)

print("Data types BEFORE conversion:")
print(X.dtypes)

# Any object/category column still present at this point was not encoded by get_dummies.
non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
print(f"\nNon-numeric columns found: {list(non_numeric_cols)}")

# Defensive pass: coerce every column to a numeric dtype. Values that cannot be
# parsed as numbers become NaN instead of raising.
X = X.apply(pd.to_numeric, errors='coerce')

# Report, and then repair, any NaN introduced by the coercion above.
nan_per_column = X.isnull().sum()
missing_after_conversion = nan_per_column.sum()
if missing_after_conversion > 0:
    print(f"\nWARNING: Conversion created {missing_after_conversion} NaN values.")
    print("NaN values per column:")
    print(nan_per_column)
    # Simple repair strategy: replace each NaN with the mean of its column.
    X = X.fillna(X.mean())
    print("\nNaN values have been filled with column means.")

print("\nData types AFTER conversion:")
print(X.dtypes)

# Hold out 20% of the rows for testing; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Training and Testing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Initialize the scaler
# Learn the scaling statistics from the training rows only, so that nothing
# about the test rows influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the training-set statistics to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

After preparing the data, we now apply several machine learning algorithms to the same dataset and compare their accuracy.

# **1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline, fitted on the (unscaled) encoded training features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, RocCurveDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the fitted logistic-regression model on the held-out test rows:
# accuracy, confusion matrix and ROC curve.

# Hard class predictions and the predicted probability of the positive class (column 1)
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1]

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate and display the confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
plt.show()

# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Display ROC curve.
# RocCurveDisplay.plot() creates its own figure when no axes are supplied, so a
# preceding plt.figure(...) call would only leave an empty figure behind and its
# figsize would be ignored. Create the axes explicitly and hand them over.
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Logistic Regression')
roc_display.plot(ax=ax_roc)
ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
plt.show()

**Key Findings from Data Analysis**

1. The logistic regression model achieved an accuracy of 85% on the held-out test set.  
2. The model's Area Under the ROC Curve (AUC) was quite high, reflecting exceptional ability to differentiate between patients who do and do not have heart disease.  
3. The most significant factors influencing the prediction of heart disease, as indicated by the model's coefficients, include ChestPainType_NAP, ChestPainType_ATA, ST_Slope_Flat, Sex_M, and ExerciseAngina_Y.

**ACCURACY - 85%**

# **2. Decision Tree**

In [None]:
# Pipeline
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

# Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import warnings 
# Suppress every Python warning from this point on: the 'ignore' action is
# registered with no restriction on category, message or module.
warnings.filterwarnings('ignore')

In [None]:
# Numeric-dtype columns of the raw frame.
# NOTE(review): x_values is not referenced by any other cell visible in this notebook — confirm whether it is still needed.
x_values = data.select_dtypes(include=['number'])

In [None]:
# Class balance of the target: number of rows for each HeartDisease value.
fig_target, ax_target = plt.subplots(figsize=(18, 8))
sns.countplot(data=data, x='HeartDisease', palette='YlOrRd', ax=ax_target)
plt.show()

In [None]:
# Separation of features and target
#
# The decision-tree pipeline performs its own imputation / scaling / encoding, so it
# is fed the raw (un-encoded) feature columns rather than the get_dummies output.

X = data.drop('HeartDisease',axis=1)
y = data['HeartDisease']

# Reuse the row partition created in the pre-processing cell instead of drawing a
# new one. Calling train_test_split again here would overwrite y_train / y_test,
# while X_train_scaled / X_test_scaled still hold the rows of the earlier split;
# the SVM, KNN and PCA cells below pair those scaled arrays with y_train / y_test,
# so their labels would no longer belong to the same rows as their features.
# Selecting by index keeps every model on the same train/test rows. Note that the
# reused partition is the original non-stratified one.
X_train, X_test = X.loc[y_train.index], X.loc[y_test.index]

# Guard against silent misalignment between features and labels.
assert X_train.index.equals(y_train.index), "X_train and y_train rows differ"
assert X_test.index.equals(y_test.index), "X_test and y_test rows differ"

# Column groups routed to the numeric and categorical branches of the preprocessor.
numerical_columns = list(X_train.select_dtypes(include=['float64', 'int64']).columns)
categorical_columns = list(X_train.select_dtypes(include=['object', 'category']).columns)


In [None]:
# Pipeline
#
# Preprocessing and model are bundled into one estimator so that imputation,
# scaling and encoding are fitted only on the rows passed to fit().

# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' means a category not seen during fit
# does not raise at transform time; sparse_output=False yields a dense array.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group to its branch; the output is the numeric columns
# followed by the encoded categorical columns.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_columns),
    ('cat', cat_pipeline, categorical_columns)
])

# random_state is fixed (42, as for the other models in this notebook) because
# tree construction involves randomness; without a seed the fitted tree and its
# scores can differ from one run to the next.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('DecisionTree', DecisionTreeClassifier(random_state=42))
])

In [None]:
# Fit the whole pipeline on the training split, then predict the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# 5-fold cross-validated accuracy computed over the complete dataset (X, y).
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='accuracy', cv=5)

# Per-class precision / recall / F1 on the test split, followed by the mean CV accuracy.
print(classification_report(y_test, y_pred))
print(f"Cross Validation: {scores.mean():.2f}")

In [None]:
from sklearn.tree import plot_tree

# Pull the fitted tree and the fitted one-hot encoder out of the pipeline.
modelo = pipeline.named_steps['DecisionTree']
preprocessing_step = pipeline.named_steps['preprocessing']
cat_encoder = preprocessing_step.named_transformers_['cat'].named_steps['encoder']

# Rebuild the feature names in the order the preprocessor emits them:
# numeric columns first, then the one-hot encoded categorical columns.
feature_names_num = numerical_columns
cat_features = cat_encoder.get_feature_names_out(categorical_columns)
feature_names = [*feature_names_num, *cat_features]

# 3. Plot the tree
plt.figure(figsize=(20, 10))
plot_tree(
    modelo,
    feature_names=feature_names,
    class_names=list(map(str, modelo.classes_)),
    filled=True,
)
plt.show()

**Key Findings from Data Analysis**

1. The Decision Tree model achieved an accuracy of 74% on the held-out test set.
2. One-hot encoding is used to convert the categorical (string) columns into numeric indicator columns.


**ACCURACY = 74%**

# **3. Support Vector Machine**

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Train-Test Split & Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Models
from sklearn.svm import SVC

# Evaluation Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import silhouette_score  # For evaluating clusters

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Train an RBF-kernel SVM on the standardised features and report test-set metrics.
print("\n" + "="*50)
print("SUPPORT VECTOR MACHINE (SVM)")
print("="*50)

# Create and train the model on SCALED data
svm_model = SVC(kernel='rbf', random_state=42) # RBF kernel is a good default
svm_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluate
# Use the built-in round() rather than the .round() method: the method exists only
# on NumPy scalars, so it raises AttributeError if accuracy_score returns a plain
# Python float. round() works for both and prints the same value.
print("Accuracy:", round(accuracy_score(y_test, y_pred_svm), 4))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

**ACCURACY - 54.35%**

# **4. KNN**

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Train-Test Split & Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Models
from sklearn.neighbors import KNeighborsClassifier

# Evaluation Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import silhouette_score  # For evaluating clusters

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Train a 5-nearest-neighbours classifier on the standardised features and report test-set metrics.
print("\n" + "="*50)
print("K-NEAREST NEIGHBORS (KNN)")
print("="*50)

# Create and train the model on SCALED data
knn_model = KNeighborsClassifier(n_neighbors=5) # You can tune this parameter
knn_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_knn = knn_model.predict(X_test_scaled)

# Evaluate
# Use the built-in round() rather than the .round() method: the method exists only
# on NumPy scalars, so it raises AttributeError if accuracy_score returns a plain
# Python float. round() works for both and prints the same value.
print("Accuracy:", round(accuracy_score(y_test, y_pred_knn), 4))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))

**ACCURACY - 50%**

# **5. PCA (Dimension Reduction)**

Principal Component Analysis

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Train-Test Split & Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Models
from sklearn.decomposition import PCA

# Evaluation Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import silhouette_score  # For evaluating clusters

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reduce the standardised features with PCA, plot the variance explained by each
# component, and check how an SVM performs on the reduced representation.
print("\n" + "="*50)
print("PRINCIPAL COMPONENT ANALYSIS (PCA)")
# Closing rule of the banner (previously printed the bare number 50).
print("="*50)

# Apply PCA to the scaled data - use the same data that was used for scaling
pca = PCA(n_components=0.95) # Keep 95% of the variance

# Fit on the TRAINING scaled data and transform both training and testing
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Original number of features: {X_train_scaled.shape[1]}")
print(f"Reduced number of components: {X_train_pca.shape[1]}")

# Plot the explained variance ratio (one bar per retained component, numbered from 1)
plt.figure(figsize=(10, 6))
plt.bar(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA: Explained Variance by Component')
plt.grid(True)
plt.show()

# Optional: Train a model on PCA-reduced data to see the impact
svm_pca_model = SVC(random_state=42)
svm_pca_model.fit(X_train_pca, y_train)
y_pred_svm_pca = svm_pca_model.predict(X_test_pca)

print("\nSVM Performance on PCA-Reduced Data:")
# Built-in round() instead of the NumPy-only .round() method, so this also works
# when accuracy_score returns a plain Python float.
print("Accuracy:", round(accuracy_score(y_test, y_pred_svm_pca), 4))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm_pca))

**ACCURACY - 53.8%**

# **Comparison**

In [None]:
# Import the plotting library
import matplotlib.pyplot as plt
import numpy as np

# 1. Define the data
# The accuracies (in percent) are literal values, not recomputed from the models;
# they have to be updated by hand whenever the cells above are re-run with different results.
models = ['Logistic Regression', 'Decision Tree', 'SVM', 'KNN', 'SVM (PCA)']
accuracy = [85.0, 74.0, 54.35, 50.0, 53.8]  # Your accuracy values
colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#6B8F71']  # A professional color palette

# 2. Create the figure and axis, one bar per model
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(models, accuracy, color=colors, edgecolor='black', linewidth=0.8, alpha=0.85)

# 3. Titles, axis labels and a fixed 0-100% scale
ax.set_title('Comparison of Model Accuracies for Heart Disease Prediction', fontsize=14, fontweight='bold', pad=20)
ax.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
ax.set_xlabel('Machine Learning Models', fontsize=12, fontweight='bold')
ax.set_ylim(0, 100)

# 4. Write each accuracy value just above its bar, centred horizontally
for rect, acc in zip(bars, accuracy):
    x_center = rect.get_x() + rect.get_width()/2.
    ax.text(x_center, rect.get_height() + 1,
            f'{acc}%', ha='center', va='bottom', fontweight='bold')

# 5. Reference line at 50% (random guess baseline for binary classification)
ax.axhline(y=50, color='r', linestyle='--', linewidth=1.2, alpha=0.7, label='Random Guess (50%)')
ax.legend()

# 6. Light horizontal grid, rotated model names, and a layout that fits everything
ax.grid(axis='y', alpha=0.3, linestyle=':')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()

# 7. Show the plot
plt.show()

# Optional: Save the figure as a high-resolution image for your article
# plt.savefig('model_accuracy_comparison.png', dpi=300, bbox_inches='tight')

# **Conclusion**

The above analysis compares the accuracy of the different machine learning algorithms implemented on the same dataset.