# Clasificación por medio del método K-NN

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('Data/aps_failure_training_set.csv',na_values='na')
df_test = pd.read_csv('Data/aps_failure_test_set.csv', na_values='na')

In [3]:
feature_columns = df_train.drop(columns=['class']).columns.values
features = np.empty((1,1))

Se eliminarán aquellas columnas que posean menos de 50000 datos (número total de datos de entrenamiento : 60000).

In [4]:
for feature in feature_columns:
    if len(df_train.loc[df_train[feature].notnull()]) > 50000:
        features = np.append(features,feature)
        
features = np.delete(features,0,0)

Se completarán los datos faltantes con la mediana.

In [5]:
df_train[features]=df_train[features].fillna(df_train.median())
df_test[features]=df_test[features].fillna(df_test.median())

Se procederá a reemplazar la clase {neg, pos} por {-1, 1} respectivamente.

In [6]:
df_train['target'] = df_train['class'].replace ({'neg':-1, 'pos':1})
df_test['target'] = df_test['class'].replace ({'neg':-1, 'pos':1})
print( 'df_train target:'), print(df_train['target'].value_counts())
print( 'df_test target:'), print(df_test['target'].value_counts())

df_train target:
-1    59000
 1     1000
Name: target, dtype: int64
df_test target:
-1    15625
 1      375
Name: target, dtype: int64


(None, None)

In [7]:
X = df_train[features]
y = df_train['target']

In [8]:
def projectData(pcaModel, X, K):
    Z = pcaModel.transform(X)[:,:K]
    return Z

In [9]:
def recoverData(pcaModel, Z):
    K = Z.shape[1]
    X_rec = np.dot(Z, pcaModel.components_[:K,:])
    return X_rec

Se procederá a realizar una reducción de dimensionalidad usando el método PCA. Se escogieron los primeros 60 componentes principales.

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

pca = PCA().fit(X_scaled)
pca_scores = projectData(pca,X_scaled,75)
X_pca = recoverData(pca,pca_scores)



In [11]:
X_train = np.empty((int(0.9*X_pca.shape[0]),X_pca.shape[1],10))
X_val = np.empty((int(0.1*X_pca.shape[0]),X_pca.shape[1],10))
y_train = np.empty((int(0.9*y.shape[0]),10))
y_val = np.empty((int(0.1*y.shape[0]),10))

In [12]:
from sklearn.model_selection import train_test_split

X_train = np.empty((int(0.9*X_pca.shape[0]),X_pca.shape[1],10))
X_val = np.empty((int(0.1*X_pca.shape[0]),X_pca.shape[1],10))
y_train = np.empty((int(0.9*y.shape[0]),10))
y_val = np.empty((int(0.1*y.shape[0]),10))

for i in range(10):
    X_t, X_v, y_t, y_v = train_test_split(X,y,test_size=0.1, random_state = i)
    X_train[:,:,i] = X_t
    X_val[:,:,i] = X_v
    y_train[:,i] = y_t
    y_val[:,i] = y_v

Se define la función de costo a utilizar para mejorar el modelo. Se decidió variar el umbral de clasificación con el objetivo de disminuir la cantidad de falsos negativos.

In [13]:
def Costo_total(probabilidades, y, umbral):
    U_check = 10 #cost that an unnecessary check
    M_check = 500 # cost of missing a faulty truck

    FN = ((probabilidades < umbral) & (y == 1)).sum()
    TN = ((probabilidades < umbral) & (y == -1)).sum()
    FP = ((probabilidades >= umbral) & (y == -1)).sum()
    TP = ((probabilidades >= umbral) & (y == 1)).sum()

    Costo_total = FP*U_check + FN*M_check
    return Costo_total

Se entrena un modelo K-NN con los datos con dimensionalidad reducida.

In [14]:
from sklearn.neighbors import KNeighborsClassifier

n_vecinos = np.arange(5,105,5)
umbrales = np.arange(0,1,0.05)
Costos = np.empty((len(n_vecinos),len(umbrales),10))
#Costos = np.load('Costos.npy')
for i in range(len(n_vecinos)):
    for j in range(10):
        print(i,j)
        modelo = KNeighborsClassifier(n_neighbors = n_vecinos[i])
        modelo.fit(X_train[:,:,j],y_train[:,j])
        prob = modelo.predict_proba(X_val[:,:,j])[:,1]
        for k in range(len(umbrales)):
            Costos[i,k,j] = Costo_total(prob,y_val[:,j],umbrales[k])

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
3 0
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
4 0
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
5 0
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
6 0
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
7 0
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
8 0
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
9 0
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
10 0
10 1
10 2
10 3
10 4
10 5
10 6
10 7
10 8
10 9
11 0
11 1
11 2
11 3
11 4
11 5
11 6
11 7
11 8
11 9
12 0
12 1
12 2
12 3
12 4
12 5
12 6
12 7
12 8
12 9
13 0
13 1
13 2
13 3
13 4
13 5
13 6
13 7
13 8
13 9
14 0
14 1
14 2
14 3
14 4
14 5
14 6
14 7
14 8
14 9
15 0
15 1
15 2
15 3
15 4
15 5
15 6
15 7
15 8
15 9
16 0
16 1
16 2
16 3
16 4
16 5
16 6
16 7
16 8
16 9
17 0
17 1
17 2
17 3
17 4
17 5
17 6
17 7
17 8
17 9
18 0
18 1
18 2
18 3
18 4
18 5
18 6
18 7
18 8
18 9
19 0
19 1
19 2
19 3
19 4
19 5
19 6
19 7
19 8
19 9


In [15]:
np.save('Costos_NoBalanceados',Costos)

In [16]:
n_vecinos = np.arange(5,105,5)
umbrales = np.arange(0,1,0.05)
Costos_medias = np.empty((len(n_vecinos),len(umbrales)))
Costos_desv = np.empty((len(n_vecinos),len(umbrales)))
mejor_Costo = 20000
mejor_desv = 0
mejor_n_vec = 0
mejor_umbral = 0
#Costos = np.load('Costos.npy')
for i in range(len(n_vecinos)):
    for j in range(len(umbrales)):
        Costos_medias[i,j] = np.mean(Costos[i,j,:])
        Costos_desv[i,j] = np.std(Costos[i,j,:])
        if Costos_medias[i,j] < mejor_Costo:
            mejor_Costo = Costos_medias[i,j]
            mejor_n_vec = n_vecinos[i]
            mejor_umbral = umbrales[j]
            mejor_desv = Costos_desv[i,j]

In [17]:
print(mejor_Costo, mejor_desv, mejor_umbral, mejor_n_vec)

5716.0 661.4710878035411 0.05 20


In [18]:
from sklearn.neighbors import KNeighborsClassifier

modelo = KNeighborsClassifier(n_neighbors = mejor_n_vec)
modelo.fit(X_pca,y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [19]:
X_test = df_test[features]
y_test = df_test['target']
X_test_scaled = scaler.transform(X_test)


pca_scores_test = projectData(pca,X_test_scaled,75)
X_test_pca = recoverData(pca,pca_scores_test)

probabilidades_test = modelo.predict_proba(X_test_pca)[:,1]
Costo_total(probabilidades_test,y_test, mejor_umbral)

16130