In [140]:
import sys
import os
import numpy as np

# Add the 'oracle' directory to the Python path
sys.path.append(os.path.join(os.getcwd(), 'oracle'))
import oracle as oracle 

In [141]:
res = oracle.q3_hyper(23607)
print(res)

('gini', 'best', 7)


Csv headers are: <br>
age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal


In [142]:
%pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [143]:
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('processed.cleveland.data')
# insert column names
data.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','goal']

In [144]:
from sklearn.impute import SimpleImputer

# Clean the data
data = data.replace('?', np.nan)

# numeric columns
# age, trestbps, chol, thalach, oldpeak ; impute with mean
fields = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
imputer = SimpleImputer(strategy='mean')
data[fields] = imputer.fit_transform(data[fields])


# Catetorical columns
# sex, cp, fbs, restecg, exang, slope, ca, thal; impute with mode
fields = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
imputer = SimpleImputer(strategy='most_frequent')
data[fields] = imputer.fit_transform(data[fields])

# we have to check disease or no-disease, so make the goal column binary
data['goal'] = data['goal'].replace([1, 2, 3, 4], 1)    

# check if all are filled
print(data.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
goal        0
dtype: int64


In [145]:
from sklearn.model_selection import train_test_split

# split the data
X = data.drop('goal', axis=1)
y = data['goal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [146]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report
from tabulate import tabulate

clf = DecisionTreeClassifier(random_state=69, criterion=res[0], splitter=res[1], max_depth=res[2])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# precision, accuracy, recall, and F1 score
accuracy = accuracy_score(y_test, y_pred)
print(tabulate([[f'Accuracy: {accuracy:.4f}']], tablefmt="rounded_grid"))
report = classification_report(y_test, y_pred)
report_table = [row.split() for row in report.split('\n') if row]
print(report)





╭──────────────────╮
│ Accuracy: 0.8197 │
╰──────────────────╯
              precision    recall  f1-score   support

           0       0.91      0.79      0.85        38
           1       0.71      0.87      0.78        23

    accuracy                           0.82        61
   macro avg       0.81      0.83      0.81        61
weighted avg       0.84      0.82      0.82        61



In [168]:
# Visualization of decision tree

import dtreeviz
import matplotlib.pyplot as plt

# Convert 'ca' and 'thal' to numeric
X_train['ca'] = pd.to_numeric(X_train['ca'], errors='coerce')
X_train['thal'] = pd.to_numeric(X_train['thal'], errors='coerce')


# Visualize the Decision Tree

viz = dtreeviz.model(
    clf,
    X_train,
    y_train,
    target_name="Heart Disease",
    feature_names=X_train.columns.tolist(),
    class_names=["No Disease", "Disease"],
)

v = viz.view()
v.show()  

# Save the visualization to a specific location
v.save("decision_tree.svg")

Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x











In [161]:
for col in X_train.columns:
    print(f"{col}: {X_train[col].map(type).unique()}")


age: [<class 'float'>]
sex: [<class 'float'>]
cp: [<class 'float'>]
trestbps: [<class 'float'>]
chol: [<class 'float'>]
fbs: [<class 'float'>]
restecg: [<class 'float'>]
thalach: [<class 'float'>]
exang: [<class 'float'>]
oldpeak: [<class 'float'>]
slope: [<class 'float'>]
ca: [<class 'str'>]
thal: [<class 'str'>]
