In [1]:
import pandas as pd
import altair as alt
import numpy as np
from IPython.display import display

In [2]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 

In [4]:
np.random.seed(57)
data = pd.read_csv("https://archive.ics.uci.edu/static/public/45/data.csv")
data['heart_disease'] = data['num'] > 0
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,heart_disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0,False
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2,True
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1,True
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0,False
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1,True
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2,True
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3,True
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1,True


In [5]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split (
    data, train_size = 0.75, stratify=  data["heart_disease"]
)

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
data_preprocessor = make_column_transformer(
    (StandardScaler(), ["thalach", "age"])
)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
knn = KNeighborsClassifier(n_neighbors = 3)
X = data_train[["thalach", "age"]] 
y = data_train["heart_disease"]
knn_pipeline = make_pipeline(data_preprocessor, knn)
knn_pipeline.fit(X, y)
knn_pipeline

In [8]:
data_test["predicted"] = knn_pipeline.predict(data_test[["thalach", "age"]])
data_test[["predicted", "heart_disease"]]

Unnamed: 0,predicted,heart_disease
89,False,False
190,False,False
171,True,False
206,True,True
261,True,True
...,...,...
43,True,False
237,False,True
32,False,True
217,False,False


In [9]:
#accuracy
knn_pipeline.score(
    data_test[["thalach", "age"]], 
    data_test["heart_disease"]
)

0.6578947368421053

In [10]:
#recall
from sklearn.metrics import recall_score, precision_score
recall_score(
    y_true = data_test["heart_disease"], 
    y_pred = data_test["predicted"], 
    pos_label = True
)

0.5428571428571428

In [11]:
#precision
precision_score(
    y_true = data_test["heart_disease"], 
    y_pred = data_test["predicted"], 
    pos_label = True
)

0.6551724137931034

In [12]:
from sklearn.model_selection import cross_validate
data_vfold_score = pd.DataFrame(
    cross_validate(
        estimator = knn_pipeline, 
        cv = 10, 
        X = X, 
        y = y
    )
)
data_vfold_score

Unnamed: 0,fit_time,score_time,test_score
0,0.004519,0.005018,0.695652
1,0.003592,0.003747,0.478261
2,0.003563,0.003631,0.608696
3,0.00344,0.003623,0.565217
4,0.003453,0.003587,0.565217
5,0.003393,0.003739,0.521739
6,0.003431,0.024451,0.652174
7,0.003394,0.003511,0.772727
8,0.003345,0.003532,0.636364
9,0.003358,0.003527,0.681818


In [13]:
data_metrics = data_vfold_score.agg(["mean", "sem"])
data_metrics

Unnamed: 0,fit_time,score_time,test_score
mean,0.003549,0.005837,0.617787
sem,0.000111,0.002073,0.027889


In [14]:
parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(0, 50, 1)
}
data_tune_pipe = make_pipeline(data_preprocessor, KNeighborsClassifier())

In [15]:

from sklearn.model_selection import GridSearchCV
data_tune_pipe.get_params()
data_tune_grid = GridSearchCV(
    estimator = data_tune_pipe, 
    param_grid = parameter_grid,
    cv = 4,
)
data_tune_grid

In [16]:
knn_model_grid = data_tune_grid.fit(X, y)
accuracies_grid = pd.DataFrame(knn_model_grid.cv_results_)
accuracies_grid

4 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrap

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003228,0.000226,0.0,0.0,0,{'kneighborsclassifier__n_neighbors': 0},,,,,,,50
1,0.003446,3.8e-05,0.004922,0.000164,1,{'kneighborsclassifier__n_neighbors': 1},0.54386,0.561404,0.54386,0.607143,0.564066,0.025881,49
2,0.003689,0.000559,0.00484,9.1e-05,2,{'kneighborsclassifier__n_neighbors': 2},0.561404,0.596491,0.526316,0.642857,0.581767,0.043123,48
3,0.003446,9.9e-05,0.007489,0.004705,3,{'kneighborsclassifier__n_neighbors': 3},0.614035,0.631579,0.631579,0.75,0.656798,0.054285,47
4,0.003358,4.6e-05,0.004793,2.3e-05,4,{'kneighborsclassifier__n_neighbors': 4},0.649123,0.666667,0.596491,0.767857,0.670034,0.062102,46
5,0.003314,8e-06,0.004837,6.4e-05,5,{'kneighborsclassifier__n_neighbors': 5},0.701754,0.614035,0.736842,0.75,0.700658,0.05303,9
6,0.003361,6.3e-05,0.004794,2.3e-05,6,{'kneighborsclassifier__n_neighbors': 6},0.666667,0.649123,0.719298,0.75,0.696272,0.040362,16
7,0.003302,1e-05,0.004793,2.2e-05,7,{'kneighborsclassifier__n_neighbors': 7},0.736842,0.649123,0.736842,0.75,0.718202,0.040243,1
8,0.00333,5.2e-05,0.004785,1.4e-05,8,{'kneighborsclassifier__n_neighbors': 8},0.684211,0.666667,0.701754,0.803571,0.714051,0.053153,3
9,0.003298,1.1e-05,0.004786,1.4e-05,9,{'kneighborsclassifier__n_neighbors': 9},0.684211,0.684211,0.754386,0.75,0.718202,0.034027,1


In [17]:
accuracies_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("K").scale(zero=False),
    y=alt.Y("mean_test_score").title("Accuracy").scale(zero=False)
)
accuracies_vs_k

In [18]:
#K=9

In [32]:
knn9 = KNeighborsClassifier(n_neighbors = 25)
X = data_train[["thalach", "age"]] 
y = data_train["heart_disease"]
knn_pipeline9 = make_pipeline(data_preprocessor, knn9)
knn_pipeline9.fit(X, y)
data_test["predicted"] = knn_pipeline9.predict(data_test[["thalach", "age"]])
data_test[["predicted", "heart_disease"]]
knn_pipeline9.score(
    data_test[["thalach", "age"]], 
    data_test["heart_disease"]
)

ValueError: A given column is not a column of the dataframe

In [33]:
#oldpeak vs thalach
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split (
    data, train_size = 0.75, stratify=  data["heart_disease"], random_state=57
)

from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
data_preprocessor = make_column_transformer(
    (StandardScaler(), ["thalach", "oldpeak"])
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
knn = KNeighborsClassifier(n_neighbors = 3)
X = data_train[["thalach", "oldpeak"]] 
y = data_train["heart_disease"]
knn_pipeline = make_pipeline(data_preprocessor, knn)
knn_pipeline.fit(X, y)
knn_pipeline

data_test["predicted"] = knn_pipeline.predict(data_test[["thalach", "oldpeak"]])
data_test[["predicted", "heart_disease"]]

knn_pipeline.score(
    data_test[["thalach", "oldpeak"]], 
    data_test["heart_disease"]
)

from sklearn.model_selection import cross_validate
data_vfold_score = pd.DataFrame(
    cross_validate(
        estimator = knn_pipeline, 
        cv = 5, 
        X = X, 
        y = y
    )
)
data_vfold_score

data_metrics = data_vfold_score.agg(["mean", "sem"])
data_metrics

parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(50)
}
data_tune_pipe = make_pipeline(data_preprocessor, KNeighborsClassifier())

from sklearn.model_selection import GridSearchCV
data_tune_pipe.get_params()
data_tune_grid = GridSearchCV(
    estimator = data_tune_pipe, 
    param_grid = parameter_grid,
    cv = 10,
)
data_tune_grid

knn_model_grid = data_tune_grid.fit(X, y)
accuracies_grid = pd.DataFrame(knn_model_grid.cv_results_)
accuracies_grid

accuracies_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("K").scale(zero=False),
    y=alt.Y("mean_test_score").title("Accuracy").scale(zero=False)
)
accuracies_vs_k

10 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wr

In [36]:
np.random.seed()
knn = KNeighborsClassifier(n_neighbors = 25)
X = data_train[["thalach", "oldpeak"]] 
y = data_train["heart_disease"]
knn_pipeline = make_pipeline(data_preprocessor, knn)
knn_pipeline.fit(X, y)
data_test["predicted"] = knn_pipeline.predict(data_test[["thalach", "oldpeak"]])
data_test[["predicted", "heart_disease"]]
knn_pipeline.score(
    data_test[["thalach", "oldpeak"]], 
    data_test["heart_disease"]
)

0.6578947368421053

In [35]:
from sklearn.metrics import recall_score, precision_score
recall_score(
    y_true = data_test["heart_disease"], 
    y_pred = data_test["predicted"], 
    pos_label = True
)

0.45714285714285713

In [23]:
data = pd.read_csv("https://archive.ics.uci.edu/static/public/45/data.csv")
data['heart_disease'] = data['num'] > 0
data1 = data.dropna()
data2 = data1[["age", "trestbps", "chol", "thalach", "oldpeak", "heart_disease"]]

In [24]:
import numpy as np
import pandas as pd

from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

names = list(data1.drop(
    columns=["heart_disease"]
).columns.values)

accuracy_dict = {"size": [], "selected_predictors": [], "accuracy": []}

# store the total number of predictors
n_total = len(names)

# start with an empty list of selected predictors
selected = []

# create the pipeline and CV grid search objects
param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 100, 5),
}
cancer_preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number"))
)
cancer_tune_pipe = make_pipeline(cancer_preprocessor, KNeighborsClassifier())
cancer_tune_grid = GridSearchCV(
    estimator=cancer_tune_pipe,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1
)

# for every possible number of predictors
for i in range(1, n_total + 1):
    accs = np.zeros(len(names))
    # for every possible predictor to add
    for j in range(len(names)):
        # Add remaining predictor j to the model
        X = data1[selected + [names[j]]]
        y = data1["heart_disease"]

        # Find the best K for this set of predictors
        cancer_tune_grid.fit(X, y)
        accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)

        # Store the tuned accuracy for this set of predictors
        accs[j] = accuracies_grid["mean_test_score"].max()

    # get the best new set of predictors that maximize cv accuracy
    best_set = selected + [names[accs.argmax()]]

    # store the results for this round of forward selection
    accuracy_dict["size"].append(i)
    accuracy_dict["selected_predictors"].append(", ".join(best_set))
    accuracy_dict["accuracy"].append(accs.max())

    # update the selected & available sets of predictors
    selected = best_set
    del names[accs.argmax()]

accuracies2 = pd.DataFrame(accuracy_dict)
accuracies2


Unnamed: 0,size,selected_predictors,accuracy
0,1,num,1.0
1,2,"num, age",1.0
2,3,"num, age, sex",1.0
3,4,"num, age, sex, restecg",1.0
4,5,"num, age, sex, restecg, fbs",0.993333
5,6,"num, age, sex, restecg, fbs, thal",0.976552
6,7,"num, age, sex, restecg, fbs, thal, exang",0.966437
7,8,"num, age, sex, restecg, fbs, thal, exang, ca",0.92931
8,9,"num, age, sex, restecg, fbs, thal, exang, ca, cp",0.922299
9,10,"num, age, sex, restecg, fbs, thal, exang, ca, ...",0.91931


In [25]:
data = pd.read_csv("https://archive.ics.uci.edu/static/public/45/data.csv")
data['heart_disease'] = data['num'] > 0
data1 = data.dropna()
data2 = data1[["age", "trestbps", "chol", "thalach", "oldpeak", "heart_disease"]]

In [26]:
data2

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,heart_disease
0,63,145,233,150,2.3,False
1,67,160,286,108,1.5,True
2,67,120,229,129,2.6,True
3,37,130,250,187,3.5,False
4,41,130,204,172,1.4,False
...,...,...,...,...,...,...
297,57,140,241,123,0.2,True
298,45,110,264,132,1.2,True
299,68,144,193,141,3.4,True
300,57,130,131,115,1.2,True


In [27]:
import numpy as np
import pandas as pd

from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

names = list(data2.drop(
    columns=["heart_disease"]
).columns.values)

accuracy_dict = {"size": [], "selected_predictors": [], "accuracy": []}

# store the total number of predictors
n_total = len(names)

# start with an empty list of selected predictors
selected = []

# create the pipeline and CV grid search objects
param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 100, 5),
}
cancer_preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number"))
)
cancer_tune_pipe = make_pipeline(cancer_preprocessor, KNeighborsClassifier())
cancer_tune_grid = GridSearchCV(
    estimator=cancer_tune_pipe,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1
)

# for every possible number of predictors
for i in range(1, n_total + 1):
    accs = np.zeros(len(names))
    # for every possible predictor to add
    for j in range(len(names)):
        # Add remaining predictor j to the model
        X = data2[selected + [names[j]]]
        y = data2["heart_disease"]

        # Find the best K for this set of predictors
        cancer_tune_grid.fit(X, y)
        accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)

        # Store the tuned accuracy for this set of predictors
        accs[j] = accuracies_grid["mean_test_score"].max()

    # get the best new set of predictors that maximize cv accuracy
    best_set = selected + [names[accs.argmax()]]

    # store the results for this round of forward selection
    accuracy_dict["size"].append(i)
    accuracy_dict["selected_predictors"].append(", ".join(best_set))
    accuracy_dict["accuracy"].append(accs.max())

    # update the selected & available sets of predictors
    selected = best_set
    del names[accs.argmax()]

accuracies = pd.DataFrame(accuracy_dict)
accuracies

Unnamed: 0,size,selected_predictors,accuracy
0,1,thalach,0.714023
1,2,"thalach, oldpeak",0.750805
2,3,"thalach, oldpeak, age",0.744483
3,4,"thalach, oldpeak, age, trestbps",0.730575
4,5,"thalach, oldpeak, age, trestbps, chol",0.72092


In [28]:
data3 = data1[["age", "chol", "thalach", "heart_disease"]]
data3

Unnamed: 0,age,chol,thalach,heart_disease
0,63,233,150,False
1,67,286,108,True
2,67,229,129,True
3,37,250,187,False
4,41,204,172,False
...,...,...,...,...
297,57,241,123,True
298,45,264,132,True
299,68,193,141,True
300,57,131,115,True


In [29]:
import numpy as np
import pandas as pd

from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

names = list(data3.drop(
    columns=["heart_disease"]
).columns.values)

accuracy_dict = {"size": [], "selected_predictors": [], "accuracy": []}

# store the total number of predictors
n_total = len(names)

# start with an empty list of selected predictors
selected = []

# create the pipeline and CV grid search objects
param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 100, 5),
}
cancer_preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number"))
)
cancer_tune_pipe = make_pipeline(cancer_preprocessor, KNeighborsClassifier())
cancer_tune_grid = GridSearchCV(
    estimator=cancer_tune_pipe,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1
)

# for every possible number of predictors
for i in range(1, n_total + 1):
    accs = np.zeros(len(names))
    # for every possible predictor to add
    for j in range(len(names)):
        # Add remaining predictor j to the model
        X = data3[selected + [names[j]]]
        y = data3["heart_disease"]

        # Find the best K for this set of predictors
        cancer_tune_grid.fit(X, y)
        accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)

        # Store the tuned accuracy for this set of predictors
        accs[j] = accuracies_grid["mean_test_score"].max()

    # get the best new set of predictors that maximize cv accuracy
    best_set = selected + [names[accs.argmax()]]

    # store the results for this round of forward selection
    accuracy_dict["size"].append(i)
    accuracy_dict["selected_predictors"].append(", ".join(best_set))
    accuracy_dict["accuracy"].append(accs.max())

    # update the selected & available sets of predictors
    selected = best_set
    del names[accs.argmax()]

accuracies1 = pd.DataFrame(accuracy_dict)
accuracies1

Unnamed: 0,size,selected_predictors,accuracy
0,1,thalach,0.714023
1,2,"thalach, age",0.731264
2,3,"thalach, age, chol",0.69023


In [30]:
data4 = data1.drop(columns=["num"])
data4

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,False
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,True
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,True
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,False
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,4,140,241,0,0,123,1,0.2,2,0.0,7.0,True
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,True
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,True
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,True


In [31]:
import numpy as np
import pandas as pd

from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

names = list(data4.drop(
    columns=["heart_disease"]
).columns.values)

accuracy_dict = {"size": [], "selected_predictors": [], "accuracy": []}

# store the total number of predictors
n_total = len(names)

# start with an empty list of selected predictors
selected = []

# create the pipeline and CV grid search objects
param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 100, 5),
}
cancer_preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number"))
)
cancer_tune_pipe = make_pipeline(cancer_preprocessor, KNeighborsClassifier())
cancer_tune_grid = GridSearchCV(
    estimator=cancer_tune_pipe,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1
)

# for every possible number of predictors
for i in range(1, n_total + 1):
    accs = np.zeros(len(names))
    # for every possible predictor to add
    for j in range(len(names)):
        # Add remaining predictor j to the model
        X = data4[selected + [names[j]]]
        y = data4["heart_disease"]

        # Find the best K for this set of predictors
        cancer_tune_grid.fit(X, y)
        accuracies_grid = pd.DataFrame(cancer_tune_grid.cv_results_)

        # Store the tuned accuracy for this set of predictors
        accs[j] = accuracies_grid["mean_test_score"].max()

    # get the best new set of predictors that maximize cv accuracy
    best_set = selected + [names[accs.argmax()]]

    # store the results for this round of forward selection
    accuracy_dict["size"].append(i)
    accuracy_dict["selected_predictors"].append(", ".join(best_set))
    accuracy_dict["accuracy"].append(accs.max())

    # update the selected & available sets of predictors
    selected = best_set
    del names[accs.argmax()]

accuracies3 = pd.DataFrame(accuracy_dict)
accuracies3

Unnamed: 0,size,selected_predictors,accuracy
0,1,thal,0.764253
1,2,"thal, ca",0.781034
2,3,"thal, ca, cp",0.838161
3,4,"thal, ca, cp, chol",0.834713
4,5,"thal, ca, cp, chol, exang",0.834943
5,6,"thal, ca, cp, chol, exang, thalach",0.851839
6,7,"thal, ca, cp, chol, exang, thalach, slope",0.868391
7,8,"thal, ca, cp, chol, exang, thalach, slope, fbs",0.865172
8,9,"thal, ca, cp, chol, exang, thalach, slope, fbs...",0.861839
9,10,"thal, ca, cp, chol, exang, thalach, slope, fbs...",0.858506
