In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('aps_failure_training_set.csv',na_values='na')
df_test = pd.read_csv('aps_failure_test_set.csv', na_values='na')

In [3]:
feature_columns = df_train.drop(columns=['class']).columns.values
features = np.empty((1,1))

In [4]:
for feature in feature_columns:
    if len(df_train.loc[df_train[feature].notnull()]) > 50000:
        features = np.append(features,feature)
        
features = np.delete(features,0,0)

In [5]:
df_train[features]=df_train[features].fillna(df_train.median())
df_test[features]=df_test[features].fillna(df_test.median())

In [6]:
df_train['target'] = df_train['class'].replace ({'neg':-1, 'pos':1})
df_test['target'] = df_test['class'].replace ({'neg':-1, 'pos':1})
print( 'df_train target:'), print(df_train['target'].value_counts())
print( 'df_test target:'), print(df_test['target'].value_counts())

df_train target:
-1    59000
 1     1000
Name: target, dtype: int64
df_test target:
-1    15625
 1      375
Name: target, dtype: int64


(None, None)

In [7]:
from sklearn.model_selection import train_test_split

X = df_train[features]
y = df_train['target']

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1, random_state = 1)
print('X_train:', X_train.shape); print('y_train:', y_train.shape)
print('X_val:', X_val.shape); print('y_val:', y_val.shape)

X_train: (54000, 145)
y_train: (54000,)
X_val: (6000, 145)
y_val: (6000,)


In [8]:
def projectData(pcaModel, X, K):
    Z = pcaModel.transform(X)[:,:K]
    return Z

In [9]:
def recoverData(pcaModel, Z):
    K = Z.shape[1]
    X_rec = np.dot(Z, pcaModel.components_[:K,:])
    return X_rec

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


pca = PCA().fit(X_train_scaled)
pca_loadings = pca.components_
pca_scores_train = projectData(pca,X_train_scaled,110)
pca_scores_val = projectData(pca,X_val_scaled,110)
X_train_pca = recoverData(pca,pca_scores_train)
X_val_pca = recoverData(pca,pca_scores_val)

In [11]:
X_train_pca.shape

(54000, 145)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
modelo = KNeighborsClassifier(n_neighbors = 5)
modelo.fit(X_train_pca, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [13]:
from sklearn import metrics

predicciones_val = modelo.predict(X_val_pca)
num_predicciones_correctas = (y_val == predicciones_val).sum()
num_total_de_muestras = len(y_val)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

Predicciones correctas :  5947
Número de muestras     :  6000
Exactitud (manual)     :  0.9911666666666666


In [14]:
X_test = df_test[features]
y_test = df_test['target']
X_test_scaled = scaler.transform(X_test)
pca_scores_test = projectData(pca,X_test_scaled,110)
X_test_pca = recoverData(pca,pca_scores_test)

predicciones_test = modelo.predict(X_test_pca)
num_predicciones_correctas = (y_test == predicciones_test).sum()
num_total_de_muestras = len(y_test)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )
print ( 'Exactitud (score)      : ', modelo.score(X_test, y_test) )
print ( 'Exactitud (metrics)    : ', metrics.accuracy_score(y_test, predicciones_test) )

Predicciones correctas :  15796
Número de muestras     :  16000
Exactitud (manual)     :  0.98725
Exactitud (score)      :  0.3619375
Exactitud (metrics)    :  0.98725


In [15]:
probabilidades_val = modelo.predict_proba(X_test_pca)[:,1]

In [16]:
U_check = 10 #cost that an unnecessary check
M_check = 500 # cost of missing a faulty truck

FN = ((probabilidades_val < 0.05) & (y_test == 1)).sum()
TN = ((probabilidades_val < 0.05) & (y_test == -1)).sum()
FP = ((probabilidades_val >= 0.05) & (y_test == -1)).sum()
TP = ((probabilidades_val >= 0.05) & (y_test == 1)).sum()

Total_Cost = FP*U_check + FN*M_check
print('Total Cost: ', Total_Cost, FP,  FN, TP, TN)

Total Cost:  34970 197 66 309 15428
