In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at
# your local copy of the CSV before running.
DATA_PATH = "C:/Users/Lenovo/Downloads/heart_disease_prediction.csv"
df = pd.read_csv(DATA_PATH)

# Data Exploration
print("Initial dataset shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())
print("\nMissing values:\n", df.isnull().sum())
print("\nNumber of duplicates before removal:", df.duplicated().sum())

In [None]:
# Handle missing values: most frequent value for text columns, median for the rest.
# Each column is assigned back explicitly. Calling fillna(..., inplace=True) on
# df[col] is chained assignment: it operates on an intermediate object and is not
# guaranteed to update df (it warns in pandas 2.x and is a no-op under Copy-on-Write).
for col in df.columns:
    if df[col].dtype == 'object':
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; nothing to impute with then
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    else:
        df[col] = df[col].fillna(df[col].median())

In [None]:
# Check for outliers using IQR.
# Quantiles and the </> comparisons are only defined for numeric data, so the
# report is restricted to the same numeric dtypes used for outlier removal below;
# running it on text columns raises a TypeError in recent pandas versions.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
# Count, per column, the values lying more than 1.5 * IQR outside the quartiles
outliers = ((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).sum()
print("\nOutliers detected:\n", outliers)

In [None]:
# Remove outliers using IQR
def remove_outliers(df, columns):
    """Drop rows whose value in any of `columns` falls outside the 1.5 * IQR fences.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns' filters.
    Rows holding NaN in a processed column are dropped as well, because NaN
    never satisfies the range check.

    Args:
        df: DataFrame to filter; the caller's object is not modified.
        columns: Iterable of column names to apply the IQR rule to.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    for name in columns:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        # Inclusive fences: values sitting exactly on a fence are kept
        within_fences = values.between(
            first_quartile - 1.5 * spread,
            third_quartile + 1.5 * spread,
        )
        df = df[within_fences]
    return df

# Identify numerical columns for outlier removal.
# Two kinds of column are skipped:
#   * the target 'tenYearCHD' (outliers must not be removed from the label), and
#   * columns with at most two distinct values. For such a column the IQR rule
#     either removes nothing or, when the IQR is 0, removes every row carrying
#     the rarer value, leaving the column constant.
numerical_cols = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != 'tenYearCHD' and df[col].nunique() > 2
]

original_size = df.shape[0]
df = remove_outliers(df, numerical_cols)
removed_rows = original_size - df.shape[0]
# Guard the percentage against an empty input frame
removed_pct = (removed_rows / original_size) * 100 if original_size else 0.0
print(f"Removed {removed_rows} outliers ({removed_pct:.2f}% of data)")
print("Dataset shape after outlier removal:", df.shape)

In [None]:
# Feature selection with the ANOVA F-test (f_classif)
# NOTE(review): the scores are computed on every row of df, including rows that
# are held out for evaluation further down; consider fitting the selector on the
# training rows only.
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']

# Select the top features by ANOVA F-value: 10 of them, or all features when
# fewer than 10 are available (an oversized k is rejected or warned about by
# SelectKBest, depending on the scikit-learn version).
n_selected = min(10, X.shape[1])
selector = SelectKBest(f_classif, k=n_selected)
selector.fit(X, y)
selected_features = X.columns[selector.get_support()]
print("\nTop 10 features based on p-values:\n", selected_features)

# Update dataframe with selected features
df_selected = df[list(selected_features) + ['tenYearCHD']]

In [None]:
# Inspect how unevenly the target classes are represented
print("\nClass distribution:\n", df_selected['tenYearCHD'].value_counts())

# Oversample with SMOTE so the classes are balanced.
# The target is separated from the selected feature columns first.
y = df_selected['tenYearCHD']
X = df_selected.drop(columns='tenYearCHD')

# Fixed seed so the synthetic samples are reproducible between runs
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
print("\nClass distribution after SMOTE:\n", y_res.value_counts())

In [None]:
# Data Splitting
# The splits are taken from the original (non-resampled) X and y. Splitting the
# SMOTE output instead would place synthetic rows, and rows interpolated from
# held-out samples, into the evaluation sets and inflate every reported metric.
# First split: 10% for final unseen test, 90% for model development
X_dev, X_unseen, y_dev, y_unseen = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Second split: 80% training, 20% testing from the development set
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.2, random_state=42, stratify=y_dev)

# Balance the classes in the training portion only; the test and unseen sets
# keep the real class distribution the model will face.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Standardize the data: the scaling statistics are learned from the training
# split only and then applied unchanged to the training, test and unseen splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, X_unseen_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_unseen)
)

In [None]:
# Train Logistic Regression
# (LogisticRegression comes from the import cell at the top of the notebook;
# the duplicate import that used to sit here has been removed.)
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train_scaled, y_train)

# Predict on test set
y_pred = lr.predict(X_test_scaled)

# Evaluate model
print("\nModel Evaluation on Test Set:")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
# Rows of the matrix are the true labels, columns the predicted labels
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Test Set)')
plt.show()
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Calculate metrics (precision / recall / F1 use scikit-learn's default binary averaging)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test Data Accuracy: {accuracy:.2f}")
print(f"Test Data Precision: {precision:.2f}")
print(f"Test Data Recall: {recall:.2f}")
print(f"Test Data F1 Score: {f1:.2f}")

In [None]:
# Pair each feature with its fitted logistic-regression weight, largest first.
# The model was trained on standardized inputs, so the weights are on a common scale.
coefficients = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': lr.coef_[0]})
    .sort_values('Coefficient', ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(data=coefficients, x='Coefficient', y='Feature')
# Dashed reference line at zero separates negative from positive weights
plt.axvline(0, color='k', linestyle='--')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Logistic Regression Coefficients')
plt.show()

In [None]:
# Predict on unseen data
y_unseen_pred = lr.predict(X_unseen_scaled)

# Evaluate on unseen data
print("\nUnseen Data Evaluation:")
# Build the matrix once, from the unseen-set predictions, and reuse it for both
# the printed table and the heatmap. (The heatmap previously used y_pred, the
# test-set predictions, which do not correspond to y_unseen.)
cm = confusion_matrix(y_unseen, y_unseen_pred)
print("Confusion Matrix:\n", cm)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Unseen Data)')
plt.show()
print("\nClassification Report:\n", classification_report(y_unseen, y_unseen_pred))

# Summary metrics on the held-out unseen split
unseen_accuracy = accuracy_score(y_unseen, y_unseen_pred)
unseen_precision = precision_score(y_unseen, y_unseen_pred)
unseen_recall = recall_score(y_unseen, y_unseen_pred)
unseen_f1 = f1_score(y_unseen, y_unseen_pred)

print(f"Unseen Data Accuracy: {unseen_accuracy:.2f}")
print(f"Unseen Data Precision: {unseen_precision:.2f}")
print(f"Unseen Data Recall: {unseen_recall:.2f}")
print(f"Unseen Data F1 Score: {unseen_f1:.2f}")