# EDA & Visualisations Notebook

This notebook performs quick exploratory data analysis (EDA) and generates basic visualisations.

### How to use
1. Set `DATA_PATH` below to your cleaned dataset (CSV or Excel) **or** an HTML export (first table will be used).
2. Run the cells top to bottom.
3. Figures will be saved under `../figures/` if that directory already exists, otherwise under `./figures/` (created automatically).


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# Dataset to analyse; load_dataset accepts CSV, Excel, or an HTML file with a table.
DATA_PATH = Path('heart_2025_cleaned.csv')

# Figures go to ../figures when that path already exists,
# otherwise to a local ./figures directory.
REPORT_DIR = Path('../figures')
if REPORT_DIR.exists():
    FIG_DIR = REPORT_DIR
else:
    FIG_DIR = Path('./figures')
# Creating the directory up front means every later savefig() can assume it is there.
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Widen pandas' rendering so wide frames are not truncated in cell output.
pd.set_option('display.width', 160)
pd.set_option('display.max_columns', 100)
print('Figures will be saved to:', FIG_DIR)


Figures will be saved to: figures


In [2]:
# ---- LOAD DATA ----
def load_dataset(path: Path) -> pd.DataFrame:
    """Read a tabular dataset from disk, choosing the reader by file extension.

    Parameters
    ----------
    path : Path
        Location of a ``.csv``, ``.xlsx``/``.xls`` or ``.html``/``.htm`` file.
        The extension is matched case-insensitively.

    Returns
    -------
    pd.DataFrame
        The parsed table. For HTML input, the first table found in the file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If the extension is not supported, or an HTML file yields no tables.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"Dataset not found: {source}")

    extension = source.suffix.lower()
    if extension == '.csv':
        return pd.read_csv(source)
    if extension in ('.xlsx', '.xls'):
        return pd.read_excel(source)
    if extension not in ('.html', '.htm'):
        raise ValueError('Unsupported file type. Use CSV, Excel, or HTML (table).')

    # Only the first table of an HTML export is used.
    html_tables = pd.read_html(str(source))
    if not html_tables:
        raise ValueError('No HTML tables found in the file')
    return html_tables[0]

# Load the configured dataset; load_dataset raises if DATA_PATH is missing
# or has an unsupported extension, which stops the notebook here.
df = load_dataset(DATA_PATH)
print('Loaded shape:', df.shape)
# Last expression of the cell: shown as the cell output.
df.head()


FileNotFoundError: Dataset not found: ../data/cleaned/heart_2025_cleaned.csv

## Basic Info & Summary

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" to the output.
df.info()
display(df.describe(include='all').T)

# Null count per column, largest first, keeping only columns that have any nulls.
missing = df.isna().sum().sort_values(ascending=False)
missing = missing[missing > 0]
missing


## Categorical Value Counts (first 20 columns, top 10 values each)

In [None]:
# Columns stored as plain objects (typically strings) or as pandas categoricals.
cat_cols = [
    col_name
    for col_name in df.columns
    if df[col_name].dtype == 'object' or str(df[col_name].dtype).startswith('category')
]

# For the first 20 such columns, keep the 10 most frequent values (NaN counted as a value).
card_snapshot = {
    col_name: df[col_name].value_counts(dropna=False).head(10)
    for col_name in cat_cols[:20]
}
card_snapshot


## Numeric Distributions (Histograms)

In [None]:
# One 30-bin histogram per numeric column, written to FIG_DIR as hist_<column>.png.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    fig, ax = plt.subplots()
    df[c].dropna().hist(bins=30, ax=ax)
    ax.set_title(f'Distribution: {c}')
    ax.set_xlabel(c)
    ax.set_ylabel('Count')
    out = FIG_DIR / f"hist_{c}.png"
    fig.savefig(out, bbox_inches='tight')
    # Figures are only saved, not shown; close each one so they do not pile up in memory.
    plt.close(fig)
print(f'Saved {len(num_cols)} histogram(s) to', FIG_DIR)


## Boxplots for Numeric Columns

In [None]:
# One boxplot per numeric column, written to FIG_DIR as box_<column>.png.
for c in num_cols:
    fig, ax = plt.subplots()
    # Vertical orientation is matplotlib's default; the explicit `vert=` keyword
    # is deprecated in recent releases and only produced a warning.
    ax.boxplot(df[c].dropna())
    ax.set_title(f'Boxplot: {c}')
    ax.set_xlabel(c)
    out = FIG_DIR / f"box_{c}.png"
    fig.savefig(out, bbox_inches='tight')
    # Figures are only saved, not shown; close each one so they do not pile up in memory.
    plt.close(fig)
print(f'Saved {len(num_cols)} boxplot(s) to', FIG_DIR)


## Correlation Matrix (Numeric)

In [None]:
# Heatmap of pairwise correlations between numeric columns.
if len(num_cols) >= 2:
    corr = df[num_cols].corr(numeric_only=True)
    # Label the axes from the matrix that is actually plotted.
    tick_labels = [str(name) for name in corr.columns]
    tick_positions = range(len(tick_labels))

    fig, ax = plt.subplots(figsize=(8, 6))
    # Pin the colour scale to the full correlation range [-1, 1] with a diverging
    # map: zero always gets the neutral colour and the sign is readable, instead
    # of the scale being stretched to whatever range this dataset happens to have.
    im = ax.imshow(corr.to_numpy(), interpolation='nearest', cmap='coolwarm', vmin=-1, vmax=1)
    ax.set_title('Correlation Matrix')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels)
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    out = FIG_DIR / 'correlation_matrix.png'
    fig.tight_layout()
    fig.savefig(out, bbox_inches='tight')
    plt.close(fig)
    print('Saved correlation matrix to', out)
else:
    print('Not enough numeric columns for correlation matrix')


## Auto-detect a Likely Binary Target (Optional)

In [None]:
def guess_binary_target(dataframe: pd.DataFrame):
    """Heuristically pick a column that looks like a binary outcome.

    Columns whose name matches a common target name are preferred; otherwise the
    first column (in frame order) with a small number of distinct values wins.

    A column is "binary-like" when it has 2 or 3 distinct non-null values. The
    upper bound of 3 is kept for tolerance with slightly messy encodings.
    Constant and all-null columns (0 or 1 distinct values) are rejected because
    they cannot separate two classes.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to inspect. Column labels may be of any type.

    Returns
    -------
    The label of the chosen column, or ``None`` if no column qualifies.
    """
    # Names that often indicate a target/outcome
    name_hints = {'target', 'outcome', 'label', 'y', 'disease', 'heart_disease', 'has_disease', 'class'}

    def _key(name) -> str:
        # str() copes with non-string labels (e.g. integer headers); dropping
        # separators lets 'HeartDisease' and 'Heart Disease' match 'heart_disease'.
        return ''.join(ch for ch in str(name).lower() if ch.isalnum())

    def _is_binary_like(series: pd.Series) -> bool:
        return 2 <= series.nunique(dropna=True) <= 3

    hint_keys = {_key(hint) for hint in name_hints}

    # First pass: columns whose name suggests an outcome.
    for name, series in dataframe.items():
        if _key(name) in hint_keys and _is_binary_like(series):
            return name
    # Second pass: any column that looks binary.
    for name, series in dataframe.items():
        if _is_binary_like(series):
            return name
    return None

# Run the heuristic on the loaded frame. The result may be None, in which case
# the target-based plotting cells below print a message and skip their plots.
TARGET_COL = guess_binary_target(df)
print('Guessed binary target column:', TARGET_COL)


## Categorical vs Target (Bar Plots)

In [None]:
# Stacked count bars: one figure per categorical column, one segment per target value.
if TARGET_COL is not None:
    cat_cols_lite = [
        c for c in df.columns
        if c != TARGET_COL and (df[c].dtype == 'object' or str(df[c].dtype).startswith('category'))
    ]
    for c in cat_cols_lite[:10]:
        # Rows missing either the category or the target cannot be counted.
        ct = df[[c, TARGET_COL]].dropna()
        if ct.empty:
            continue
        # Rows: category values; columns: target values; cells: row counts.
        counts = ct.groupby([c, TARGET_COL]).size().unstack(fill_value=0)

        fig, ax = plt.subplots()
        x_labels = counts.index.astype(str)
        # Running top of the stack; each target value is drawn on the previous ones.
        bottom = np.zeros(len(counts))
        for target_value in counts.columns:
            vals = counts[target_value].to_numpy()
            # The label feeds the legend, without which the segments are indistinguishable.
            ax.bar(x_labels, vals, bottom=bottom, label=str(target_value))
            bottom = bottom + vals
        ax.set_title(f'{c} vs {TARGET_COL} (counts)')
        ax.set_xlabel(str(c))
        ax.set_ylabel('Count')
        ax.legend(title=str(TARGET_COL))
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        out = FIG_DIR / f"cat_vs_{TARGET_COL}_{c}.png"
        fig.savefig(out, bbox_inches='tight')
        plt.close(fig)
    print('Saved categorical-vs-target bar plots (up to 10) to', FIG_DIR)
else:
    print('No binary-like target detected; skipping categorical vs target plots')


## Top Numeric Correlations with Target (Scatter Plots)

In [None]:
# Scatter plots of the (up to) five numeric columns most correlated with the target.
if TARGET_COL is not None and TARGET_COL in df.columns and df[TARGET_COL].dropna().nunique() <= 3:
    t = df[TARGET_COL]
    if pd.api.types.is_numeric_dtype(t):
        # Already numeric (bool included): keep the real values so the y-axis
        # shows the actual classes; missing targets stay NaN.
        t_enc = t.astype(float)
    else:
        # Integer codes in order of first appearance. factorize marks missing
        # values with -1; turn those back into NaN so they are ignored rather
        # than treated as an extra class.
        codes, _ = pd.factorize(t)
        t_enc = pd.Series(codes, index=t.index, dtype=float).where(codes >= 0)

    # Candidate features: every numeric column except the target itself, which
    # would otherwise rank first with |r| = 1 and be plotted against itself.
    features = df.select_dtypes(include=[np.number]).drop(columns=[TARGET_COL], errors='ignore')
    if features.shape[1] == 0:
        corr = pd.Series(dtype=float)
    else:
        # Absolute correlation with the target; NaN results (e.g. constant
        # columns) are dropped so they cannot occupy a top-5 slot.
        corr = features.corrwith(t_enc).abs().dropna().sort_values(ascending=False)
    top = corr.head(5).index.tolist()

    for c in top:
        fig, ax = plt.subplots()
        ax.scatter(features[c], t_enc, s=10)
        ax.set_title(f'{c} vs {TARGET_COL}')
        ax.set_xlabel(c)
        ax.set_ylabel(str(TARGET_COL))
        out = FIG_DIR / f"scatter_{c}_vs_{TARGET_COL}.png"
        fig.savefig(out, bbox_inches='tight')
        plt.close(fig)
    print('Saved scatter plots for top numeric correlates to', FIG_DIR)
else:
    print('No suitable binary-like target for correlation-based scatter plots')


## Save a Sample Preview (first 50 rows, Optional)

In [None]:
# Write the first 50 rows next to the figures as a small CSV preview (row index omitted).
snapshot_path = FIG_DIR.joinpath('eda_sample_preview.csv')
df.iloc[:50].to_csv(snapshot_path, index=False)
# Last expression of the cell: shows where the preview was written.
snapshot_path
