In [1]:
import pandas as pd
import altair as alt
import numpy as np
from IPython.display import display

In [2]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Simplify working with large datasets in Altair
alt.data_transformers.disable_max_rows()

# Output dataframes instead of arrays
set_config(transform_output="pandas")

In [3]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [4]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 

In [5]:
#dataset 
data = pd.read_csv("https://archive.ics.uci.edu/static/public/45/data.csv")
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1


In [6]:
#define heart disease as num values greater than 0
data['heart_disease'] = data['num'] > 0
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,heart_disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0,False
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2,True
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1,True
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0,False
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1,True
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2,True
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3,True
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1,True


In [7]:
#drop rows with empty values
data.dropna()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,heart_disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0,False
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2,True
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1,True
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0,False
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,4,140,241,0,0,123,1,0.2,2,0.0,7.0,1,True
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1,True
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2,True
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3,True


In [8]:
data1 = data[["sex", "fbs", "trestbps", "restecg", "cp", "thal", "thalach", "heart_disease"]]
data1

Unnamed: 0,sex,fbs,trestbps,restecg,cp,thal,thalach,heart_disease
0,1,1,145,2,1,6.0,150,False
1,1,0,160,2,4,3.0,108,True
2,1,0,120,2,4,7.0,129,True
3,1,0,130,0,3,3.0,187,False
4,0,0,130,2,2,3.0,172,False
...,...,...,...,...,...,...,...,...
298,1,0,110,0,1,7.0,132,True
299,1,1,144,0,4,7.0,141,True
300,1,0,130,0,4,7.0,115,True
301,0,0,130,2,2,3.0,174,True


Split data into 75% training, 25% testing. 

In [9]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(
    data1, train_size = 0.75, random_state = 57
)

Scale numerical variables, passthrough all categorical variables. 

In [10]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
data_preprocessor = make_column_transformer(
    (StandardScaler(), ["trestbps", "thalach"]),
    verbose_feature_names_out = True
)
data_preprocessor

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

knn = KNeighborsClassifier(n_neighbors = 3)

X_train = data_train[["sex", "fbs", "trestbps", "restecg", "cp", "thal", "thalach"]]
y_train = data_train["heart_disease"]

knn_pipeline = make_pipeline(data_preprocessor, knn)
knn_pipeline.fit(X_train, y_train)

data_test_predictions = data_test.assign(
    predict = knn_pipeline.predict(data_test[["sex", "fbs", "trestbps", "restecg", "cp", "thal", "thalach"]])
)
data_test_predictions

X_test = data_test[["sex", "fbs", "trestbps", "restecg", "cp", "thal", "thalach"]]
y_test = data_test["heart_disease"]

data_prediction_accuracy = knn_pipeline.score(X_test, y_test)
data_prediction_accuracy

data_mat = pd.crosstab(
    data_test_predictions["heart_disease"], 
    data_test_predictions["predict"]
)
data_mat

predict,False,True
heart_disease,Unnamed: 1_level_1,Unnamed: 2_level_1
False,27,13
True,15,21


In [12]:
np.random.seed(2000)

data_pipe = make_pipeline(data_preprocessor, knn)
data_vfold_score = pd.DataFrame(
    cross_validate(
        estimator = data_pipe, 
        cv = 5, 
        X = X_train, 
        y = y_train, 
        return_train_score = True
    )
)
data_vfold_score

data_metrics = data_vfold_score.agg(["mean", "sem"])
data_metrics

param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 50, 1)
}

data_tune_pipe = make_pipeline(data_preprocessor, KNeighborsClassifier())

data_tune_grid = GridSearchCV(
    estimator = data_tune_pipe,
    param_grid = param_grid, 
    cv = 5, 
    return_train_score = True, 
    n_jobs = -1,
)
data_tune_grid

model_grid = data_tune_grid.fit(X_train, y_train)
accuracies_grid = pd.DataFrame(model_grid.cv_results_)
accuracies_grid

cv_plot = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("K"),
    y=alt.Y("mean_test_score").title("Accuracy Estimate").scale(zero=False)
)
cv_plot

In [13]:
knn1 = KNeighborsClassifier(n_neighbors = 1)

knn_pipeline1 = make_pipeline(data_preprocessor, knn1)
knn_pipeline1.fit(X, y)

data_test_predictions1 = data_test.assign(
    predict = knn_pipeline1.predict(data_test[["sex", "fbs", "trestbps", "restecg", "cp", "thal", "thalach"]])
)
data_test_predictions1

data_test_predictions1["predicted"] = data_test_predictions1["predict"] > 0
data_test_predictions1

data_prediction_accuracy = knn_pipeline1.score(X_test, y_test)
data_prediction_accuracy

data_mat1 = pd.crosstab(
    data_test_predictions1["heart_disease"], 
    data_test_predictions1["predicted"]
)
data_mat1

  return self._fit(X, y)


predicted,False,True
heart_disease,Unnamed: 1_level_1,Unnamed: 2_level_1
False,39,1
True,0,36


In [14]:
from sklearn.metrics import recall_score, precision_score
recall_score(
    y_true = data_test_predictions1["heart_disease"], 
    y_pred = data_test_predictions1["predicted"], 
    pos_label = True
)

1.0

In [15]:
data_test_predictions1

Unnamed: 0,sex,fbs,trestbps,restecg,cp,thal,thalach,heart_disease,predict,predicted
239,1,0,120,0,2,3.0,162,False,0,False
56,1,0,140,0,3,7.0,163,True,1,True
241,0,0,126,0,2,3.0,163,False,0,False
174,1,0,145,2,4,6.0,132,True,4,True
60,0,0,130,0,4,7.0,142,True,2,True
...,...,...,...,...,...,...,...,...,...,...
251,1,0,146,0,4,7.0,105,True,1,True
276,0,0,146,2,3,3.0,152,False,0,False
277,0,0,138,0,3,3.0,152,False,0,False
168,1,0,126,2,4,7.0,156,True,1,True


In [16]:
data_prediction_accuracy = knn_pipeline1.score(X_test, y_test)
data_prediction_accuracy

0.6710526315789473

In [17]:
knn_pipeline1.score(
    data_test_predictions1[["sex", "fbs", "trestbps", "restecg", "cp", "thal", "thalach"]], 
    data_test_predictions1["heart_disease"]
)

NameError: name 'knn_pipeline25' is not defined