In [1]:
import numpy as np
import pandas as pd

In [63]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
bank = pd.read_csv('https://raw.githubusercontent.com/christianolivamoya/MIAX11-ML/main/data/bank.csv', sep=';')
bank.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1248,33,services,married,secondary,no,90,yes,no,cellular,28,jul,263,2,-1,0,unknown,no
3036,73,retired,married,primary,no,279,no,no,cellular,28,jan,399,3,-1,0,unknown,yes
3500,41,technician,married,secondary,no,0,no,no,cellular,29,aug,141,2,-1,0,unknown,no
3839,51,technician,divorced,secondary,no,2323,yes,yes,cellular,18,aug,151,10,-1,0,unknown,no
1555,34,entrepreneur,married,tertiary,no,412,yes,yes,cellular,8,jul,164,1,-1,0,unknown,no


In [16]:
bank.drop(columns=['day', 'month'], inplace=True)
educ_mapping = {'unknown':0,
                'primary':1,
                'secondary':2, 
                'tertiary':3}
noyes_mapping = {'no':False, 'yes':True}
bank['education'].replace(educ_mapping, inplace=True)
bank['y'].replace(noyes_mapping, inplace=True) # Cambio el target a True (1) o False (0) 
bank['default'].replace(noyes_mapping, inplace=True)
bank['housing'].replace(noyes_mapping, inplace=True)
bank['loan'].replace(noyes_mapping, inplace=True)

features = bank.drop(columns='y')
target = bank['y']

features = pd.get_dummies(features)
features.sample(5)

Unnamed: 0,age,education,default,balance,housing,loan,duration,campaign,pdays,previous,...,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
4004,34,2,False,179,False,False,294,3,-1,0,...,False,False,True,True,False,False,False,False,False,True
920,38,3,False,221,False,False,79,3,-1,0,...,False,False,True,True,False,False,False,False,False,True
2048,54,1,False,3859,False,True,104,3,-1,0,...,False,True,False,True,False,False,False,False,False,True
1531,46,1,False,2749,True,False,332,1,-1,0,...,False,True,False,True,False,False,False,False,False,True
3458,71,2,False,2850,False,False,185,1,-1,0,...,False,True,False,True,False,False,False,False,False,True


Entrenaremos K-Nearest Neighbors (KNN)

In [23]:
knn = KNeighborsClassifier()

Validación cruzada KFold con `K=4` con la función cross_validate:

In [24]:
scoring = ['accuracy', 'precision', 'recall', 'f1']

In [25]:
scores = cross_validate(knn, features, target, scoring=scoring, cv=4)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.008471,0.165769,0.879752,0.45283,0.183206,0.26087
1,0.004259,0.163964,0.872566,0.393939,0.2,0.265306
2,0.004539,0.190531,0.871681,0.384615,0.192308,0.25641
3,0.004743,0.196036,0.863717,0.337838,0.192308,0.245098


In [26]:
pd.DataFrame(scores).mean()

fit_time          0.005503
score_time        0.179075
test_accuracy     0.871929
test_precision    0.392306
test_recall       0.191955
test_f1           0.256921
dtype: float64

Vamos a buscar el mejor valor de K como hiperparámetro:

In [51]:
pd.DataFrame(row.values[None, :], columns=scores, index=[k])

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
27,0.005606,0.219626,0.885866,0.530119,0.107516,0.178013


In [53]:
results = pd.DataFrame()

for k in range(1, 51, 2):
    print(" > Procesando k = {}".format(k))
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, features, target, scoring=scoring, cv=4)
    row = pd.DataFrame(scores).mean()
    row_df = pd.DataFrame(row.values[None, :], columns=scores, index=[k])
    results = pd.concat((results, row_df), axis=0)
results.head()

 > Procesando k = 1
 > Procesando k = 3
 > Procesando k = 5
 > Procesando k = 7
 > Procesando k = 9
 > Procesando k = 11
 > Procesando k = 13
 > Procesando k = 15
 > Procesando k = 17
 > Procesando k = 19
 > Procesando k = 21
 > Procesando k = 23
 > Procesando k = 25
 > Procesando k = 27
 > Procesando k = 29
 > Procesando k = 31
 > Procesando k = 33
 > Procesando k = 35
 > Procesando k = 37
 > Procesando k = 39
 > Procesando k = 41
 > Procesando k = 43
 > Procesando k = 45
 > Procesando k = 47
 > Procesando k = 49


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
1,0.004824,0.157117,0.845832,0.325914,0.320655,0.322609
3,0.005094,0.225607,0.865735,0.376146,0.243746,0.295238
5,0.005861,0.195791,0.871929,0.392306,0.191955,0.256921
7,0.005397,0.213359,0.876354,0.410763,0.161274,0.230339
9,0.005435,0.202923,0.878345,0.423887,0.147813,0.218436


In [62]:
metric = "test_accuracy" # test_f1
results[results[metric] == results[metric].max()]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
1,0.004824,0.157117,0.845832,0.325914,0.320655,0.322609


¿Cómo buscamos un umbral?

In [89]:
kfold = KFold(n_splits=4, shuffle=True)
results = pd.DataFrame()
for umbral in np.arange(0.01, 1.0, 0.01):
    print(" > Procesando umbral {:.2f}".format(umbral))
    scores = pd.DataFrame()
    for train, test in kfold.split(features):

        fold_train_x = features.iloc[train]
        fold_train_y = target.iloc[train]
        fold_test_x = features.iloc[test]
        fold_test_y = target.iloc[test]

        knn = KNeighborsClassifier()
        knn.fit(fold_train_x, fold_train_y)
        fold_test_pred = knn.predict_proba(fold_test_x)[:, 1]
        fold_test_pred = fold_test_pred > umbral

        tn, fp, fn, tp = confusion_matrix(fold_test_y, fold_test_pred).ravel()
        acc = (tp + tn) / (tp + tn + fp + fn)
        recall = tp / (tp + fn)
        prec = tp / (tp + fp + 1e-8)
        f1 = 2 * prec * recall / (prec + recall + 1e-8) # Evitamos division entre 0
        row = pd.Series([acc, recall, prec, f1])
        row_df = pd.DataFrame(row.values[None, :], columns=["Acc", "Recall", "Prec", "F1"])
        scores = pd.concat((scores, row_df), axis=0)

    row = scores.mean()
    row_df = pd.DataFrame(row.values[None, :], columns=["Acc", "Recall", "Prec", "F1"], index=[round(umbral, 2)])
    results = pd.concat((results, row_df), axis=0)
results.head()

 > Procesando umbral 0.01
 > Procesando umbral 0.02
 > Procesando umbral 0.03
 > Procesando umbral 0.04
 > Procesando umbral 0.05
 > Procesando umbral 0.06
 > Procesando umbral 0.07
 > Procesando umbral 0.08
 > Procesando umbral 0.09
 > Procesando umbral 0.10
 > Procesando umbral 0.11
 > Procesando umbral 0.12
 > Procesando umbral 0.13
 > Procesando umbral 0.14
 > Procesando umbral 0.15
 > Procesando umbral 0.16
 > Procesando umbral 0.17
 > Procesando umbral 0.18
 > Procesando umbral 0.19
 > Procesando umbral 0.20
 > Procesando umbral 0.21
 > Procesando umbral 0.22
 > Procesando umbral 0.23
 > Procesando umbral 0.24
 > Procesando umbral 0.25
 > Procesando umbral 0.26
 > Procesando umbral 0.27
 > Procesando umbral 0.28
 > Procesando umbral 0.29
 > Procesando umbral 0.30
 > Procesando umbral 0.31
 > Procesando umbral 0.32
 > Procesando umbral 0.33
 > Procesando umbral 0.34
 > Procesando umbral 0.35
 > Procesando umbral 0.36
 > Procesando umbral 0.37
 > Procesando umbral 0.38
 > Procesand

Unnamed: 0,Acc,Recall,Prec,F1
0.01,0.715552,0.665719,0.23778,0.349501
0.02,0.720643,0.69209,0.247512,0.363894
0.03,0.726609,0.716039,0.255156,0.37576
0.04,0.719534,0.692327,0.246343,0.362874
0.05,0.721963,0.701498,0.248904,0.367002


In [91]:
metric = "F1" # F1
results[results[metric] == results[metric].max()]

Unnamed: 0,Acc,Recall,Prec,F1
0.2,0.854899,0.443531,0.388594,0.413165


**Pregunta**: ¿Cómo podríamos buscar el mejor umbral y el mejor valor de K?