In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# 1. Load and clean data
# The file is read with explicit column names (so its first line is treated
# as data, not as a header) and "?" is parsed as a missing value.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
col_names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
             "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"]

df = pd.read_csv(url, names=col_names, na_values="?")

# Discard every row that has a missing value in any column.
# NOTE(review): this runs before "ca"/"thal"/"slope" are dropped below, so a
# row whose only missing values are in those columns is discarded as well --
# confirm that is intended.
df = df.dropna()

# 2. Sample 30 rows for simplicity (fixed seed keeps the subset reproducible)
df = df.sample(n=30, random_state=42)

# Drop complex columns that are hard to encode (missing values, many categories)
df = df.drop(columns=["ca", "thal", "slope"])

# 3. Collapse the target into 4 classes: 0, 1, 2, 3
# pd.cut uses right-closed intervals by default, so the mapping is
# (-1, 0] -> 0, (0, 1] -> 1, (1, 2] -> 2, (2, 4] -> 3; every value above 2
# (up to 4) therefore lands in class 3.
target_bins = [-1, 0, 1, 2, 4]
target_labels = [0, 1, 2, 3]
binned_target = pd.cut(df["target"], bins=target_bins, labels=target_labels)
df["target"] = binned_target.astype(int)

# 4. Handle categorical variables: cp and restecg
cat_cols = ["cp", "restecg"]

# Debug: Check unique values and data types in categorical columns
for col in cat_cols:
    print(f"Unique values in '{col}':", df[col].unique())
    print(f"Data type of '{col}':", df[col].dtype)

# Cast to the pandas categorical dtype so get_dummies expands these columns
# even if they were read in as numeric values.
df[cat_cols] = df[cat_cols].astype('category')

# One-hot encode categorical features. Output columns are named
# "<column>_<value>" automatically. dtype=int pins the indicators to 0/1
# integers on every pandas version: pandas >= 2.0 returns booleans by
# default, which would make the combined feature matrix a mixed bool/float
# object array once it is concatenated with the other features.
X_cat = pd.get_dummies(df[cat_cols], dtype=int)

# Debug: Check X_cat (one-hot encoded categorical features)
print("Shape of X_cat:", X_cat.shape)
print("Columns in X_cat:", X_cat.columns)
print("Unique values in X_cat:", np.unique(X_cat.values))

# 5. Remove original categorical cols from df (their information now lives
# in X_cat)
df = df.drop(columns=cat_cols)

# 6. Bin continuous variables
cont_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

# Each continuous feature is discretized with a quantile strategy
# (n_bins=3) and emitted as dense one-hot indicator columns.
binner = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile')
binned_cont = binner.fit_transform(df[cont_cols])

# Wrapping the raw array gives X_cont a fresh default index and integer
# column labels rather than the row labels / column names of df.
X_cont = pd.DataFrame(binned_cont)

# Debug: Check X_cont (binned continuous features)
print("Shape of X_cont:", X_cont.shape)
print("Unique values in X_cont:", np.unique(X_cont.values))

# 7. Binary variables we want to keep as-is
# These columns are copied through without any encoding; the debug output
# below is what confirms they really contain only two distinct values.
bin_cols = ["sex", "fbs", "exang"]
X_bin = df[bin_cols].copy()

# Debug: Check X_bin (binary features)
print("Shape of X_bin:", X_bin.shape)
print("Unique values in X_bin:", np.unique(X_bin.values))

# 8. Combine all features
# Every part gets a fresh 0..n-1 index first so that concat lines the rows
# up by position: X_bin and X_cat still carry the row labels of the sampled
# df, while X_cont was built from a bare array and has a default index.
feature_parts = [part.reset_index(drop=True) for part in (X_bin, X_cat, X_cont)]
X_all = pd.concat(feature_parts, axis=1)

# Debug: Check combined features
print("Shape of X_all:", X_all.shape)
print("Unique values in X_all:", np.unique(X_all.values))

# 9. Convert to NumPy arrays
# Features and labels are both cast to integers; the feature matrix is
# expected to hold only 0/1 indicators at this point, which the unique-value
# print below lets you verify.
X_final = X_all.to_numpy().astype(int)
y_final = df["target"].to_numpy().astype(int)

# 10. Final output check
print(" Unique values in X_final:", np.unique(X_final))
print(" Shape of X_final:", X_final.shape)
# Only the header line is emitted here; the matrix itself is intentionally
# not printed.
print(" Final Feature Matrix (X_all):")