# Heart Disease Prediction - Midterm Project
## Minimum Requirements Implementation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

# Global plotting defaults: seaborn "whitegrid" theme, 10x6 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. DATA COLLECTION AND PREPARATION (25%)

In [None]:
# Read the primary dataset from 'heart.csv' (path is relative to the working directory)
df = pd.read_csv('heart.csv')
row_count, column_count = df.shape
print(f"Dataset loaded: {row_count} rows, {column_count} columns")

In [None]:
# Preview the top of the table; as the cell's last expression it is rendered by the notebook
print("First 5 rows:")
df.head(n=5)

In [None]:
# Data types
# Show the dtype of every column; as the cell's last expression the
# resulting Series is rendered by the notebook.
print("Data types:")
df.dtypes

In [None]:
# Basic info
# DataFrame.info() prints its own report (per-column non-null counts, dtypes,
# memory usage) and returns None, so it needs no print() wrapper.
print("Dataset info:")
df.info()

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags
dup_mask = df.duplicated()
duplicates = dup_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

# Keep only the first occurrence of each row when repeats were found
if duplicates:
    df = df[~dup_mask]
    print(f"Duplicates removed. New shape: {df.shape}")

In [None]:
# Count missing entries in each column (isna is the same check as isnull)
print("Missing values per column:")
df.isna().sum()

In [None]:
# Column groups reused by the imputation, summary and plotting cells.
# Columns treated as categorical (the label 'target' is included here):
categorical_vars = [
    'sex', 'cp', 'fbs', 'restecg', 'exang',
    'slope', 'ca', 'thal', 'target',
]
# Columns treated as continuous measurements:
continuous_vars = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
]

print(f"Categorical variables: {categorical_vars}")
print(f"Continuous variables: {continuous_vars}")

## 2. DATA PROCESSING - HANDLING MISSING DATA (15%)

In [None]:
# Work on a copy so the de-duplicated `df` stays untouched
df_processed = df.copy()

# Median imputation for continuous variables (ONE technique as requested)
imputer = SimpleImputer(strategy='median')

for col in continuous_vars:
    if df_processed[col].isnull().any():
        # fit_transform returns a 2-D (n_rows, 1) array; flatten it so the
        # column is assigned as a plain 1-D sequence.
        df_processed[col] = imputer.fit_transform(df_processed[[col]]).ravel()
        print(f"Imputed {col} with median")

# Mode imputation for categorical variables, only where values are missing
for col in categorical_vars:
    if df_processed[col].isnull().any():
        mode_value = df_processed[col].mode()[0]
        # Assign the filled column back explicitly. Calling
        # fillna(..., inplace=True) on `df_processed[col]` is chained
        # assignment: it warns on recent pandas and, with Copy-on-Write
        # enabled, leaves `df_processed` unchanged.
        df_processed[col] = df_processed[col].fillna(mode_value)
        print(f"Imputed {col} with mode: {mode_value}")

# Total count of missing cells left in the whole frame (0 when imputation succeeded)
print("\nMissing values after imputation:")
print(df_processed.isnull().sum().sum())

## 3. EXPLORATORY DATA ANALYSIS AND VISUALIZATION (25%)

In [None]:
# Descriptive statistics for the continuous columns only
print("Statistical Summary - Continuous Variables:")
df_processed.loc[:, continuous_vars].describe()

In [None]:
# Summarize the categorical columns as categories rather than as numbers.
# When these columns are stored as numeric codes, a plain describe() reports
# mean/std/quartiles, which say nothing useful about category labels. Casting
# to 'category' makes describe() report count, unique, top and freq instead.
print("Statistical Summary - Categorical Variables:")
df_processed[categorical_vars].astype('category').describe()

In [None]:
# Class counts and class proportions of the label column
target_column = df_processed['target']
print("Target variable distribution:")
print(target_column.value_counts())
print(f"\nClass balance: {target_column.value_counts(normalize=True)}")

### VISUALIZATION 1: Histogram (Distribution of Age)

In [None]:
# Histogram of the 'age' column in 20 bins, drawn through the Axes API
age_fig, age_ax = plt.subplots(figsize=(10, 6))
age_ax.hist(df_processed['age'], bins=20, edgecolor='black', alpha=0.7)
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
age_ax.set_title('Distribution of Patient Age')
# Faint horizontal grid lines only
age_ax.grid(axis='y', alpha=0.3)
age_fig.tight_layout()
# Write the figure to disk before showing it
age_fig.savefig('viz1_age_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### VISUALIZATION 2: Bar Chart (Target Variable)

In [None]:
# Bar chart of how many rows fall in each 'target' class.
plt.figure(figsize=(8, 6))
# value_counts() orders classes by frequency, so without sorting the first
# bar (and the first color) belongs to whichever class is more common.
# Sorting by label pins the order to 0, 1 so the color list below always maps
# green -> 0 ('Absent') and red -> 1 ('Present'), matching the tick labels.
target_counts = df_processed['target'].value_counts().sort_index()
plt.bar(target_counts.index, target_counts.values, color=['#2ecc71', '#e74c3c'], edgecolor='black')
plt.xlabel('Heart Disease')
plt.ylabel('Count')
plt.title('Distribution of Heart Disease Presence')
# Replace the numeric class codes on the x axis with readable names
plt.xticks([0, 1], ['Absent', 'Present'])
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
# Save before show() so the written file contains the rendered figure
plt.savefig('viz2_target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### VISUALIZATION 3: Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the continuous
# columns and the label, drawn on an explicitly created Axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
heatmap_columns = continuous_vars + ['target']
correlation_matrix = df_processed[heatmap_columns].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap - Continuous Variables and Target')
heatmap_fig.tight_layout()
heatmap_fig.savefig('viz3_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. DATA ENCODING FOR FUTURE MODELING

In [None]:
# Columns listed as binary; they are left as-is
# (assumed to already hold 0/1 codes -- TODO confirm against the data)
binary_vars = ['sex', 'fbs', 'exang', 'target']

# Start the modeling dataset from an independent copy of the imputed data
df_encoded = df_processed.copy()

print(f"Binary variables (already encoded): {binary_vars}")

In [None]:
# Expand each multi-class column into indicator columns. drop_first=True
# omits the first level of every variable, leaving k-1 indicators for k levels.
multi_class_vars = ['cp', 'restecg', 'slope', 'thal']
df_encoded = pd.get_dummies(
    df_encoded,
    columns=multi_class_vars,
    prefix=multi_class_vars,
    drop_first=True,
)

# Report how the column count changed
columns_before = df_processed.shape[1]
columns_after = df_encoded.shape[1]
print(f"Original shape: {df_processed.shape}")
print(f"Encoded shape: {df_encoded.shape}")
print(f"\nNew columns after encoding: {columns_after - columns_before} additional columns")

In [None]:
# List every column name of the encoded frame
print("Encoded column names:")
print(list(df_encoded.columns))

In [None]:
# Save processed and encoded data for future modeling
# index=False keeps the DataFrame index out of the file, so the CSV holds
# only the data columns. The path is relative to the working directory.
df_encoded.to_csv('heart_processed_encoded.csv', index=False)
print("Processed and encoded data saved to 'heart_processed_encoded.csv'")

## 5. FINAL SUMMARY

In [None]:
# Closing report on the encoded dataset.
banner = "=" * 60
# The label column is not a feature, so leave it out of the feature count
# (the original reported every column, 'target' included).
feature_count = sum(col != 'target' for col in df_encoded.columns)

print(banner)
print("SUMMARY")
print(banner)
print(f"Total observations: {df_encoded.shape[0]}")
print(f"Total features (after encoding): {feature_count}")
print("Target variable: 'target' (0=No disease, 1=Disease present)")
print("\nData is now prepared for future model training!")
print(banner)