In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
bank = pd.read_csv('https://raw.githubusercontent.com/christianolivamoya/MIAX11-ML/main/data/bank.csv', sep=';')
bank.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3449,28,unemployed,single,secondary,no,304,no,no,cellular,14,jun,516,1,-1,0,unknown,no
4425,35,services,married,secondary,no,505,yes,yes,unknown,27,may,371,2,-1,0,unknown,no
1803,31,services,single,secondary,no,1987,yes,no,telephone,20,nov,55,2,-1,0,unknown,no
2173,55,services,married,unknown,no,1210,yes,no,unknown,30,may,868,1,-1,0,unknown,no
1754,29,management,single,tertiary,no,1445,no,no,cellular,26,apr,328,2,143,2,success,yes


In [12]:
bank.drop(columns=['day', 'month'], inplace=True)
educ_mapping = {'unknown':0,
                'primary':1,
                'secondary':2,
                'tertiary':3}
noyes_mapping = {'no':False, 'yes':True}
bank['education'].replace(educ_mapping, inplace=True)
bank['y'].replace(noyes_mapping, inplace=True) # Cambio el target a True (1) o False (0)
bank['default'].replace(noyes_mapping, inplace=True)
bank['housing'].replace(noyes_mapping, inplace=True)
bank['loan'].replace(noyes_mapping, inplace=True)

features = bank.drop(columns='y')
target = bank['y']

features = pd.get_dummies(features)
features.sample(5)

Unnamed: 0,age,education,default,balance,housing,loan,duration,campaign,pdays,previous,...,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
3360,79,1,False,429,False,False,1006,2,-1,0,...,False,True,False,False,True,False,False,False,False,True
3788,46,2,False,390,True,False,208,2,-1,0,...,False,True,False,True,False,False,False,False,False,True
2875,29,2,False,778,True,False,1994,2,-1,0,...,False,False,True,False,False,True,False,False,False,True
2489,55,1,False,-966,False,True,104,1,-1,0,...,False,True,False,True,False,False,False,False,False,True
1024,36,2,False,2357,True,False,228,2,-1,0,...,False,True,False,False,False,True,False,False,False,True


In [17]:
# Normalización de los datos
features = (features - features.mean()) / features.std()
features.sample(5)

Unnamed: 0,age,education,default,balance,housing,loan,duration,campaign,pdays,previous,...,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
3081,1.023987,-0.084408,-0.130744,-0.379666,-1.141925,-0.424709,0.496577,0.709488,-0.407173,-0.320377,...,2.749696,-1.27359,-0.599683,0.748996,-0.267042,-0.643465,-0.348613,-0.213423,-0.171362,0.469249
1958,0.173021,-0.084408,7.646823,-0.859458,0.875521,2.354032,-0.723327,0.387925,-0.407173,-0.320377,...,-0.363596,-1.27359,1.667179,0.748996,-0.267042,-0.643465,-0.348613,-0.213423,-0.171362,0.469249
374,0.078469,-0.084408,-0.130744,-0.604278,0.875521,-0.424709,-0.707934,-0.576766,2.679089,2.041508,...,-0.363596,0.785009,-0.599683,0.748996,-0.267042,-0.643465,2.867876,-0.213423,-0.171362,-2.130595
4339,1.118539,-1.364972,-0.130744,-0.397941,-1.141925,-0.424709,-0.357741,1.352615,-0.407173,-0.320377,...,-0.363596,0.785009,-0.599683,0.748996,-0.267042,-0.643465,-0.348613,-0.213423,-0.171362,0.469249
2166,0.551228,-0.084408,-0.130744,-0.198249,0.875521,-0.424709,-0.265382,0.066361,0.72146,4.403393,...,-0.363596,0.785009,-0.599683,-1.334826,3.743906,-0.643465,-0.348613,4.684483,-0.171362,-2.130595


Entrenaremos K-Nearest Neighbors (KNN)

In [18]:
knn = KNeighborsClassifier(n_neighbors=5)

Validación cruzada KFold con `K=4` con la función cross_validate:

In [19]:
scoring = ['accuracy', 'precision', 'recall', 'f1']

In [20]:
scores = cross_validate(knn, features, target, scoring=scoring, cv=4)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.014089,0.136936,0.890363,0.561404,0.244275,0.340426
1,0.006611,0.174116,0.881416,0.458333,0.169231,0.247191
2,0.006382,0.175878,0.89469,0.634146,0.2,0.304094
3,0.007309,0.148751,0.889381,0.539683,0.261538,0.352332


In [21]:
pd.DataFrame(scores).mean()

fit_time          0.008598
score_time        0.158920
test_accuracy     0.888962
test_precision    0.548391
test_recall       0.218761
test_f1           0.311010
dtype: float64

Vamos a buscar el mejor valor de K como hiperparámetro:

In [22]:
results = pd.DataFrame()

for k in range(1, 51, 2):
    print(" > Procesando k = {}".format(k))
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, features, target, scoring=scoring, cv=4)
    row = pd.DataFrame(scores).mean()
    row_df = pd.DataFrame(row.values[None, :], columns=scores, index=[k])
    results = pd.concat((results, row_df), axis=0)
results.head()

 > Procesando k = 1
 > Procesando k = 3
 > Procesando k = 5
 > Procesando k = 7
 > Procesando k = 9
 > Procesando k = 11
 > Procesando k = 13
 > Procesando k = 15
 > Procesando k = 17
 > Procesando k = 19
 > Procesando k = 21
 > Procesando k = 23
 > Procesando k = 25
 > Procesando k = 27
 > Procesando k = 29
 > Procesando k = 31
 > Procesando k = 33
 > Procesando k = 35
 > Procesando k = 37
 > Procesando k = 39
 > Procesando k = 41
 > Procesando k = 43
 > Procesando k = 45
 > Procesando k = 47
 > Procesando k = 49


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
1,0.008957,0.335584,0.859985,0.384382,0.353171,0.367619
3,0.005389,0.120571,0.884316,0.492433,0.268614,0.346698
5,0.006041,0.313311,0.888962,0.548391,0.218761,0.31101
7,0.006586,0.203577,0.886751,0.537331,0.172681,0.258757
9,0.004576,0.092574,0.890732,0.596951,0.165018,0.256751


In [23]:
metric = "test_accuracy" # test_f1
results[results[metric] == results[metric].max()]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
29,0.004229,0.113359,0.892502,0.64681,0.153538,0.247199


¿Cómo buscamos un umbral?

In [24]:
kfold = KFold(n_splits=4, shuffle=True)
results = pd.DataFrame()
for umbral in np.arange(0.01, 1.0, 0.01):
    print(" > Procesando umbral {:.2f}".format(umbral))
    scores = pd.DataFrame()
    for train, test in kfold.split(features):

        fold_train_x = features.iloc[train]
        fold_train_y = target.iloc[train]
        fold_test_x = features.iloc[test]
        fold_test_y = target.iloc[test]

        knn = KNeighborsClassifier()
        knn.fit(fold_train_x, fold_train_y)
        fold_test_pred = knn.predict_proba(fold_test_x)[:, 1]
        fold_test_pred = fold_test_pred > umbral

        tn, fp, fn, tp = confusion_matrix(fold_test_y, fold_test_pred).ravel()
        acc = (tp + tn) / (tp + tn + fp + fn)
        recall = tp / (tp + fn)
        prec = tp / (tp + fp + 1e-8)
        f1 = 2 * prec * recall / (prec + recall + 1e-8) # Evitamos division entre 0
        row = pd.Series([acc, recall, prec, f1])
        row_df = pd.DataFrame(row.values[None, :], columns=["Acc", "Recall", "Prec", "F1"])
        scores = pd.concat((scores, row_df), axis=0)

    row = scores.mean()
    row_df = pd.DataFrame(row.values[None, :], columns=["Acc", "Recall", "Prec", "F1"], index=[round(umbral, 2)])
    results = pd.concat((results, row_df), axis=0)
results.head()

 > Procesando umbral 0.01
 > Procesando umbral 0.02
 > Procesando umbral 0.03
 > Procesando umbral 0.04
 > Procesando umbral 0.05
 > Procesando umbral 0.06
 > Procesando umbral 0.07
 > Procesando umbral 0.08
 > Procesando umbral 0.09
 > Procesando umbral 0.10
 > Procesando umbral 0.11
 > Procesando umbral 0.12
 > Procesando umbral 0.13
 > Procesando umbral 0.14
 > Procesando umbral 0.15
 > Procesando umbral 0.16
 > Procesando umbral 0.17
 > Procesando umbral 0.18
 > Procesando umbral 0.19
 > Procesando umbral 0.20
 > Procesando umbral 0.21
 > Procesando umbral 0.22
 > Procesando umbral 0.23
 > Procesando umbral 0.24
 > Procesando umbral 0.25
 > Procesando umbral 0.26
 > Procesando umbral 0.27
 > Procesando umbral 0.28
 > Procesando umbral 0.29
 > Procesando umbral 0.30
 > Procesando umbral 0.31
 > Procesando umbral 0.32
 > Procesando umbral 0.33
 > Procesando umbral 0.34
 > Procesando umbral 0.35
 > Procesando umbral 0.36
 > Procesando umbral 0.37
 > Procesando umbral 0.38
 > Procesand

Unnamed: 0,Acc,Recall,Prec,F1
0.01,0.725057,0.692329,0.249855,0.366978
0.02,0.72993,0.672295,0.249626,0.363829
0.03,0.725725,0.695545,0.251181,0.368603
0.04,0.731475,0.683981,0.253693,0.369847
0.05,0.729706,0.699376,0.25485,0.373375


In [27]:
metric = "F1" # F1
results[results[metric] == results[metric].max()]

Unnamed: 0,Acc,Recall,Prec,F1
0.32,0.870604,0.432636,0.438914,0.434499


**Pregunta**: ¿Cómo podríamos buscar el mejor umbral y el mejor valor de K?