# Import necessary libraries



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import joblib
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

# Load the dataset

In [None]:
# Location of the heart-disease CSV (publicly hosted; read directly over HTTPS)
data_path = "https://storage.googleapis.com/edulabs-public-datasets/heart_disease_uci.csv"

# Parse the remote CSV straight into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)


In [None]:
# Display the freshly loaded DataFrame (a notebook renders the value of a cell's last expression)
df

**Column description**

- id: unique id
- age: age in years
- sex: gender
- dataset: location of data collection
- cp: chest pain type
- trestbps: resting blood pressure
- chol: cholesterol measure
- fbs: fasting blood sugar
- restecg: ecg observation at resting condition
- thalch: maximum heart rate achieved
- exang: exercise induced angina
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal: thal
- num: target [0=no heart disease; 1,2,3,4 = stages of heart disease ]

# Data preprocessing

Perform quick data preprocessing:

- Remove redundant columns
- Fill / Drop missing values
- Convert column types if needed

In [None]:
# 'id' is only a unique row id and 'dataset' is the data-collection location
# (see the column description), so neither is kept as a model feature.
redundant_columns = ['id', 'dataset']
df.drop(redundant_columns, axis=1, inplace=True)

In [None]:
# Number of missing values in each remaining column
df.isna().sum()

In [None]:
# Discard the 'thal' and 'slope' columns entirely.
# NOTE(review): presumably chosen because of their missing-value counts — confirm.
dropped_columns = ['thal', 'slope']
df.drop(dropped_columns, axis=1, inplace=True)

In [None]:
# Preview the class balance of the target once incomplete rows are removed.
# dropna() returns a copy here, so df itself is not modified by this cell.
complete_rows = df.dropna()
complete_rows['num'].value_counts()

In [None]:
# Remove, in place, every row that still contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

# Multinomial Logistic Regression

In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each column so the remaining dummies are not redundant.
categorical_columns = ['sex', 'fbs', 'restecg', 'exang', 'cp']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# Target is the 'num' column; the feature matrix is every other column
y = df["num"]
X = df.drop("num", axis=1)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit one logistic-regression classifier on all classes at once
# (the multinomial / softmax model this section is about).
model = LogisticRegression(max_iter=200)
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions: the predicted class label for every row of the scaled test set
model.predict(X_test)

In [None]:
# True test-set labels as a NumPy array, for comparison with the predictions
y_test.to_numpy()

In [None]:
# Side-by-side table of true labels and model predictions (first 30 test rows)
comparison = pd.DataFrame({
    "true": y_test.to_numpy(),
    "pred": model.predict(X_test),
})
comparison.head(30)

In [None]:
# Test-set confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_true=y_test, y_pred=model.predict(X_test))

In [None]:
# prompt: display confusion matrix as heatmap with plotly

# Test-set confusion matrix. labels=model.classes_ pins the matrix to one
# row/column per class the model was fitted on, even if a rare class is
# absent from this split, so the tick labels below always line up with it.
cm = metrics.confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)

# Axis tick labels come from the fitted classes instead of a hard-coded list
class_names = [str(label) for label in model.classes_]

fig = px.imshow(cm,
                labels=dict(x="Predicted", y="True", color="Count"),
                x=class_names,
                y=class_names,
                text_auto=True)
fig.update_layout(title='Confusion Matrix')
fig.show()


In [None]:
# train

# Training-set confusion matrix, with rows/columns ordered as model.classes_
cm = metrics.confusion_matrix(y_train, model.predict(X_train), labels=model.classes_)

# Axis tick labels come from the fitted classes instead of a hard-coded list
class_names = [str(label) for label in model.classes_]

fig = px.imshow(cm,
                labels=dict(x="Predicted", y="True", color="Count"),
                x=class_names,
                y=class_names,
                text_auto=True)
fig.update_layout(title='Confusion Matrix')
fig.show()

In [None]:
# Per-class precision, recall and F1 on the test split
test_report = metrics.classification_report(y_true=y_test, y_pred=model.predict(X_test))
print(test_report)

In [None]:
# Per-class precision, recall and F1 on the training split
train_report = metrics.classification_report(y_true=y_train, y_pred=model.predict(X_train))
print(train_report)

### But what if confusing neighbouring stages (e.g. predicting 2 when the actual value is 3, or vice versa) is acceptable?

In [None]:
# prompt: set display config for numpy to display numbers without scientific notation and 2 decimal digits

import numpy as np

# Print floats with two decimal places and in fixed-point (non-scientific) notation
np.set_printoptions(precision=2, suppress=True)


In [None]:
# Predicted class probabilities for every test row (one column per class, ordered as model.classes_)
model.predict_proba(X_test)

In [None]:
# Top-2 accuracy on the test split: a row counts as correct when its true class
# is among the two classes with the highest predicted probability.
# labels=model.classes_ tells the metric which class each probability column
# belongs to, so the call does not fail if a rare class is missing from y_test.
metrics.top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2, labels=model.classes_)

In [None]:
# Top-2 accuracy on the training split, for comparison with the test figure
metrics.top_k_accuracy_score(
    y_true=y_train,
    y_score=model.predict_proba(X_train),
    k=2,
)

# One-vs-Rest / One-vs-All

In [None]:
# One-vs-Rest: wrap a binary logistic regression so that one classifier is fitted per class
ovr_model = OneVsRestClassifier(estimator=LogisticRegression(max_iter=200))
ovr_model.fit(X=X_train, y=y_train)

In [None]:
# Class labels predicted by the One-vs-Rest model for the test set
ovr_model.predict(X_test)

In [None]:
# Per-class probabilities from the One-vs-Rest model for the test set
ovr_model.predict_proba(X_test)

In [None]:
# OneVsRestClassifier.predict_proba already rescales each row to sum to 1 in the
# single-label multiclass case, so `probs` needs no further normalisation.
# Softmax is meant for raw, unbounded scores: applied to values that are already
# probabilities in [0, 1] it only flattens them towards a uniform distribution.
# If a softmax-based distribution is wanted, apply it to the per-class decision
# scores of the underlying binary classifiers instead:
from scipy.special import softmax
probs = ovr_model.predict_proba(X_test)  # rows already sum to 1
softmax(ovr_model.decision_function(X_test), axis=1)

In [None]:
# Test-set confusion matrix for the One-vs-Rest model. labels=ovr_model.classes_
# fixes the matrix to one row/column per fitted class, so the tick labels
# below always match its shape even if a rare class is absent from the split.
cm = metrics.confusion_matrix(y_test, ovr_model.predict(X_test), labels=ovr_model.classes_)

# Axis tick labels come from the fitted classes instead of a hard-coded list
class_names = [str(label) for label in ovr_model.classes_]

fig = px.imshow(cm,
                labels=dict(x="Predicted", y="True", color="Count"),
                x=class_names,
                y=class_names,
                text_auto=True)
fig.update_layout(title='Confusion Matrix')
fig.show()


In [None]:
# The fitted binary classifiers held by the One-vs-Rest wrapper (one per class)
ovr_model.estimators_

In [None]:
# Top-2 accuracy of the One-vs-Rest model on the test split.
# labels=ovr_model.classes_ maps each probability column to its class, so the
# call does not fail if a rare class happens to be missing from y_test.
metrics.top_k_accuracy_score(y_test, ovr_model.predict_proba(X_test), k=2, labels=ovr_model.classes_)

In [None]:
# Top-2 accuracy of the One-vs-Rest model on the training split
metrics.top_k_accuracy_score(
    y_true=y_train,
    y_score=ovr_model.predict_proba(X_train),
    k=2,
)

# One-vs-One

In [None]:
# One-vs-One: wrap a binary logistic regression so that one classifier is fitted per pair of classes
ovo_model = OneVsOneClassifier(estimator=LogisticRegression(max_iter=200))
ovo_model.fit(X=X_train, y=y_train)

In [None]:
# The fitted pairwise binary classifiers held by the One-vs-One wrapper
ovo_model.estimators_

In [None]:
# Test-set confusion matrix for the One-vs-One model. labels=ovo_model.classes_
# fixes the matrix to one row/column per fitted class, so the tick labels
# below always match its shape even if a rare class is absent from the split.
cm = metrics.confusion_matrix(y_test, ovo_model.predict(X_test), labels=ovo_model.classes_)

# Axis tick labels come from the fitted classes instead of a hard-coded list
class_names = [str(label) for label in ovo_model.classes_]

fig = px.imshow(cm,
                labels=dict(x="Predicted", y="True", color="Count"),
                x=class_names,
                y=class_names,
                text_auto=True)
fig.update_layout(title='Confusion Matrix')
fig.show()

In [None]:
# OneVsOneClassifier does not provide predict_proba, so the call below is left commented out
# ovo_model.predict_proba(X_test)