In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('data/aps_failure_training_set.csv',na_values='na')
df_test = pd.read_csv('data/aps_failure_test_set.csv', na_values='na')

In [3]:
df_train['class'].value_counts()

neg    59000
pos     1000
Name: class, dtype: int64

In [4]:
feature_columns = df_train.drop(columns=['class']).columns.values
features = np.empty((1,1))
for feature in feature_columns:
    if len(df_train.loc[df_train[feature].notnull()]) > 50000:
        features = np.append(features,feature)
        
features = np.delete(features,0,0)

In [5]:
df_train[features]=df_train[features].fillna(df_train.median())
df_test[features]=df_test[features].fillna(df_test.median())

In [6]:
df_train['target'] = df_train['class'].replace ({'neg':-1, 'pos':1})
df_test['target'] = df_test['class'].replace ({'neg':-1, 'pos':1})
print( 'df_train target:'), print(df_train['target'].value_counts())
print( 'df_test target:'), print(df_test['target'].value_counts())

df_train target:
-1    59000
 1     1000
Name: target, dtype: int64
df_test target:
-1    15625
 1      375
Name: target, dtype: int64


(None, None)

In [7]:
X = df_train[features]
Y = df_train['target']

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X, Y,test_size = .1,random_state=12)

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_val_scaled = scaler.transform(x_val)



In [10]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train_scaled)
X_t_train = pca.transform(X_train_scaled)
X_t_test = pca.transform(X_val_scaled)

In [23]:
pca.components_.shape

(145, 145)

In [11]:
X_t_train.shape

(54000, 145)

In [12]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_t_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
clf.score(X_t_test, y_val)

0.9895

In [14]:
from sklearn import metrics

predicciones_val = clf.predict(X_t_test)
num_predicciones_correctas = (y_val == predicciones_val).sum()
num_total_de_muestras = len(y_val)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

Predicciones correctas :  5937
Número de muestras     :  6000
Exactitud (manual)     :  0.9895


In [15]:
X_test = df_test[features]
y_test = df_test['target']
X_test_scaled = scaler.transform(X_test)
X_t_test = pca.transform(X_test_scaled)

predicciones_val = clf.predict(X_t_test)
num_predicciones_correctas = (y_test == predicciones_val).sum()
num_total_de_muestras = len(y_test)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

Predicciones correctas :  15734
Número de muestras     :  16000
Exactitud (manual)     :  0.983375


In [18]:
from scipy.stats import itemfreq
itemfreq(predicciones_val)

array([[   -1, 15871],
       [    1,   129]])

In [19]:
itemfreq(y_test)

array([[   -1, 15625],
       [    1,   375]])

In [20]:
#Costo de Pronostico (Santosh)

U_check = 10 #cost that an unnecessary check
M_check = 500 # cost of missing a faulty truck

FP = ((predicciones_val == 1) & (y_test == -1)).sum()
FN = ((predicciones_val == -1) & (y_test == 1)).sum()

Total_Cost = FP*U_check + FN*M_check
print('Total Cost: ', Total_Cost)

Total Cost:  128100
