# Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import joblib
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

#  Load the dataset

In [None]:
# Location of the publicly hosted CSV copy of the heart-disease dataset
data_path = (
    "https://storage.googleapis.com/edulabs-public-datasets/"
    "heart_disease_uci.csv"
)

# pandas reads the CSV straight from the URL into a DataFrame
df = pd.read_csv(data_path)


**Column description**

- id: unique id
- age: age in years
- sex: gender
- dataset: location of data collection
- cp: chest pain type
- trestbps: resting blood pressure
- chol: cholesterol measure
- fbs: fasting blood sugar
- restecg: ecg observation at resting condition
- thalch: maximum heart rate achieved
- exang: exercise induced angina
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal: thal
- num: target [0=no heart disease; 1,2,3,4 = stages of heart disease ]

In [None]:
# Render the DataFrame as the cell output for a first look at the data
df

# Data preprocessing

Perform quick data preprocessing:

- Remove redundant columns
- Fill / Drop missing values
- Convert column types if needed

In [None]:
# Remove the row identifier and the collection-site column
df = df.drop(columns=['id', 'dataset'])

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Remove the 'thal' and 'slope' columns entirely
# (presumably because of their missing-value counts in the output above; confirm)
df = df.drop(columns=['thal', 'slope'])

In [None]:
# Preview the 'num' class distribution that would remain after dropping
# every row with a missing value (df itself is not modified here)
df.dropna()['num'].value_counts()

In [None]:
# Keep only the rows that have no missing values
df = df.dropna()

In [None]:
# Show the DataFrame again after the column and row removals
df

In [None]:
# Class distribution of the target column 'num'
df['num'].value_counts()

# Multinomial Logistic Regression

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear
df = pd.get_dummies(
    df,
    columns=['sex', 'fbs', 'restecg', 'exang', 'cp'],
    drop_first=True,
)

In [None]:
# The target is the 'num' column; every other column is a feature
y = df["num"]
X = df.drop(columns=["num"])

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep
# the same class proportions, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Standardize features: the scaler is fitted on the training rows only and
# the same transformation is then applied to the test rows.
# The scaled arrays are wrapped back into DataFrames that keep the original
# row index, so X_train / X_test stay label-aligned with y_train / y_test
# (without index=..., the frames would get a fresh 0..n-1 index that no
# longer matches the shuffled index carried by the target Series).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Train Logistic Regression with Softmax
# (presumably relies on scikit-learn's default multiclass handling; confirm
# against the installed version). max_iter caps the solver iterations.
model = LogisticRegression(max_iter=100)
# Left as the last expression so the fitted estimator is shown as cell output
model.fit(X_train, y_train)

In [None]:
# Per-class precision / recall / F1 on the held-out test rows
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

# Using class weights

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Distinct class labels present in the training target
classes = np.unique(y_train)
# 'balanced' gives each class a weight inversely proportional to its frequency
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weights

In [None]:
# Let scikit-learn derive the per-class weights itself ('balanced' mode)
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

# Evaluate on the untouched test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Start from the 'balanced' weights and store them as a {label: weight}
# mapping so individual entries can be adjusted by hand
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = {label: weight for label, weight in zip(classes, weights)}
class_weights

In [None]:
# Manually override the weight of class 4 with a hand-picked value
# (presumably because it is the rarest class; confirm against y_train.value_counts())
class_weights[4] = 20
# Show the adjusted mapping
class_weights

In [None]:
# Train with the hand-adjusted {label: weight} mapping
model = LogisticRegression(class_weight=class_weights)
model.fit(X_train, y_train)

# Evaluate on the untouched test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

# Undersampling

**Note: this is just a demo - in reality you will not perform undersampling on such a small dataset!**

In [None]:
# Class counts in the training target before any resampling
y_train.value_counts()

In [None]:
# --- Undersampling with the default strategy ---
from imblearn.under_sampling import RandomUnderSampler

# With the default sampling_strategy ('auto'), every class except the
# minority one is randomly reduced to the minority-class size.
# random_state is fixed so the same rows are kept on every run, in line with
# the other seeded steps of this notebook.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
y_res.value_counts()

In [None]:
# 'majority' resamples only the single largest class; all other classes are
# left as they are. Seeded so the retained rows are reproducible.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
y_res.value_counts()

In [None]:
# Explicit target size per class. For under-sampling each requested count
# must not exceed the number of rows that class actually has in y_train.
# Seeded so the retained rows are reproducible.
rus = RandomUnderSampler(
    sampling_strategy={0: 70, 1: 45, 2: 29, 3: 28, 4: 10},
    random_state=42,
)
X_res, y_res = rus.fit_resample(X_train, y_train)
y_res.value_counts()

In [None]:
# Train on the under-sampled training data
model = LogisticRegression()
model.fit(X_res, y_res)

# Evaluate on the original (not resampled) test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

# Oversampling

### Random Oversampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly over-sample the training data only (the test set is never
# resampled); the seed makes the result reproducible
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [None]:
# Class counts after random over-sampling
y_res.value_counts()

In [None]:
# Train on the randomly over-sampled training data
model = LogisticRegression()
model.fit(X_res, y_res)

# Evaluate on the original (not resampled) test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

### SMOTE - suitable for numeric features

In [None]:
import numpy as np
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE

# Tiny 2D dataset
# NOTE: this demo rebinds the notebook-level names X, y, X_res and y_res.
X = np.array([
    [1, 1],       # minority
    [1.2, 1.1],   # minority
    [4, 4],       # majority
    [4.2, 3.8],   # majority
    [3.9, 4.1],   # majority
])
y = np.array([1, 1, 0, 0, 0])  # 1 = minority, 0 = majority

# Apply SMOTE with 1 neighbor for simplicity.
# random_state is fixed so the synthetic point (and therefore the plot)
# is the same on every run.
smote = SMOTE(k_neighbors=1, random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Identify synthetic samples: this slicing relies on the resampled array
# holding the original rows first and the generated rows after them
synthetic_X = X_res[len(X):]

# Split the original points by class once instead of re-masking per trace
majority_X = X[y == 0]
minority_X = X[y == 1]

# Create interactive plot
fig = go.Figure()

# Majority class (blue)
fig.add_trace(go.Scatter(
    x=majority_X[:, 0],
    y=majority_X[:, 1],
    mode='markers',
    marker=dict(color='blue', size=10),
    name='Majority class'
))

# Original minority (red)
fig.add_trace(go.Scatter(
    x=minority_X[:, 0],
    y=minority_X[:, 1],
    mode='markers',
    marker=dict(color='red', size=10, symbol='triangle-up'),
    name='Original Minority'
))

# Synthetic samples (orange X)
fig.add_trace(go.Scatter(
    x=synthetic_X[:, 0],
    y=synthetic_X[:, 1],
    mode='markers',
    marker=dict(color='orange', size=12, symbol='x'),
    name='Synthetic Minority (SMOTE)'
))

# Optional: draw a dotted line from every original minority point to every
# synthetic point, to show that the new points lie between the originals
for original in minority_X:
    for synthetic in synthetic_X:
        fig.add_trace(go.Scatter(
            x=[original[0], synthetic[0]],
            y=[original[1], synthetic[1]],
            mode='lines',
            line=dict(color='gray', dash='dot'),
            showlegend=False
        ))

fig.update_layout(
    title='🔍 SMOTE Demo – Synthetic Samples via Interpolation',
    xaxis_title='Feature 1',
    yaxis_title='Feature 2',
    legend=dict(bgcolor='rgba(0,0,0,0)', borderwidth=0),
    width=700,
    height=600
)

fig.show()


In [None]:
# SMOTE on the real training data. k_neighbors must be smaller than the
# number of rows in the smallest class. Seeded so the synthetic rows, and
# therefore the metrics in the next cell, are reproducible.
smote = SMOTE(k_neighbors=5, random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [None]:
# Train on the SMOTE-resampled training data
model = LogisticRegression()
model.fit(X_res, y_res)

# Evaluate on the original (not resampled) test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

### SMOTENC - suitable for numeric and categorical

In [None]:
# Reload the raw data and repeat the earlier cleaning steps, but keep the
# categorical columns un-encoded (no get_dummies)
df_orig = (
    pd.read_csv(data_path)
    .drop(columns=['id', 'dataset', 'thal', 'slope'])
    .dropna()
)

In [None]:
# The target is the 'num' column; every other column is a feature
y = df_orig["num"]
X = df_orig.drop(columns=["num"])

In [None]:
# Inspect the feature table (categorical columns still in raw form)
X

In [None]:
# Feature columns grouped by how they are preprocessed in the cells below:
# categorical columns are encoded, numerical columns are scaled
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang']
numerical_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
# SMOTENC was not imported anywhere in the notebook
from imblearn.over_sampling import SMOTENC

# Split first, so the scaler and the encoder below learn their statistics
# from the training rows only (no information from the test rows leaks in)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# 1. Create the preprocessor: scale numeric columns, integer-encode
#    categorical ones. A category seen only in the test rows is encoded as -1
#    instead of raising an error.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# 2. Use SMOTENC. The ColumnTransformer emits its outputs in transformer
#    order, so the numeric columns come first and the categorical columns
#    occupy the positions right after them. These are NOT their positions in
#    the raw DataFrame.
first_cat = len(numerical_cols)
categorical_idx = list(range(first_cat, first_cat + len(categorical_cols)))
smote = SMOTENC(categorical_features=categorical_idx, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# 3. Train Logistic Regression on the resampled data, evaluate on the
#    untouched test set
model = LogisticRegression()
model.fit(X_resampled, y_resampled)
print(metrics.classification_report(y_test, model.predict(X_test)))

# Mix resampling and weighted learning

In [None]:
# Start again from the raw CSV: drop the unused columns and every row with a
# missing value, exactly as in the first part of the notebook
df = (
    pd.read_csv(data_path)
    .drop(columns=['id', 'dataset', 'thal', 'slope'])
    .dropna()
)

# One-hot encode the categorical columns (first level of each is dropped)
df = pd.get_dummies(
    df,
    columns=['sex', 'fbs', 'restecg', 'exang', 'cp'],
    drop_first=True,
)

In [None]:
# The target is the 'num' column; every other column is a feature
y = df["num"]
X = df.drop(columns=["num"])

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardize features: the scaler is fitted on the training rows only and
# the same transformation is then applied to the test rows.
# The scaled arrays are wrapped back into DataFrames that keep the original
# row index, so X_train / X_test stay label-aligned with y_train / y_test
# (without index=..., the frames would get a fresh 0..n-1 index that no
# longer matches the shuffled index carried by the target Series).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Class counts in the training target before the combined resampling
y_train.value_counts()


In [None]:
# Step 1: randomly under-sample class 0 down to 100 rows; classes not listed
# in the mapping are left unchanged. Seeded so the kept rows are reproducible.
rus = RandomUnderSampler(sampling_strategy={0: 100}, random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed
print(y_res.value_counts())


# Step 2: Apply SMOTE to grow classes 2, 3 and 4 to the requested sizes.
# NOTE: despite its name, smote_nc holds a plain SMOTE instance, not SMOTENC.
smote_nc = SMOTE(random_state=42, sampling_strategy={2: 60, 3: 60, 4: 40})
X_res, y_res = smote_nc.fit_resample(X_res, y_res)

In [None]:
# Class counts after under-sampling followed by SMOTE
y_res.value_counts()

In [None]:
# Combine both techniques: train on the resampled data AND weight the
# classes inversely to their (remaining) frequency
model = LogisticRegression(class_weight="balanced")
model.fit(X_res, y_res)

# Evaluate on the original (not resampled) test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

# BONUS - ADASYN

In [None]:
from imblearn.over_sampling import ADASYN

# Over-sample the training data with ADASYN (seeded for reproducibility);
# the test set is left untouched
adasyn = ADASYN(random_state=42)
X_res, y_res = adasyn.fit_resample(X_train, y_train)

In [None]:
# Class counts after ADASYN over-sampling
y_res.value_counts()

In [None]:
# Train on the ADASYN-resampled training data
model = LogisticRegression()
model.fit(X_res, y_res)

# Evaluate on the original (not resampled) test set
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))