In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('data/aps_failure_training_set.csv',na_values='na')
df_test = pd.read_csv('data/aps_failure_test_set.csv', na_values='na')

In [3]:
df_train['class'].value_counts()

neg    59000
pos     1000
Name: class, dtype: int64

In [4]:
feature_columns = df_train.drop(columns=['class']).columns.values
features = np.empty((1,1))
for feature in feature_columns:
    if len(df_train.loc[df_train[feature].notnull()]) > 50000:
        features = np.append(features,feature)
        
features = np.delete(features,0,0)

In [5]:
df_train[features]=df_train[features].fillna(df_train.median())
df_test[features]=df_test[features].fillna(df_test.median())

In [6]:
df_train['target'] = df_train['class'].replace ({'neg':-1, 'pos':1})
df_test['target'] = df_test['class'].replace ({'neg':-1, 'pos':1})
print( 'df_train target:'), print(df_train['target'].value_counts())
print( 'df_test target:'), print(df_test['target'].value_counts())

df_train target:
-1    59000
 1     1000
Name: target, dtype: int64
df_test target:
-1    15625
 1      375
Name: target, dtype: int64


(None, None)

In [7]:
X = df_train[features]
Y = df_train['target']

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X, Y,test_size = .1,random_state=12)

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_val_scaled = scaler.transform(x_val)



In [10]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(X_train_scaled, y_train)



In [17]:
from scipy.stats import itemfreq
itemfreq(y_train_res)

array([[   -1, 53091],
       [    1, 53091]])

In [21]:
pd.DataFrame(x_train_res).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,135,136,137,138,139,140,141,142,143,144
0,0.26856,2.311223,-0.040839,-0.050846,-0.010687,-0.028368,-0.05695,-0.115844,-0.056276,0.551121,...,0.517241,0.610436,0.497912,0.411771,0.162321,-0.046565,0.445747,-0.070528,-0.02052,-0.023745
1,-0.209435,-0.432672,-0.040839,-0.050846,-0.010687,-0.028368,-0.05695,-0.115867,-0.178533,-0.017392,...,-0.063901,-0.146789,-0.154709,-0.17597,-0.177147,-0.16374,-0.192965,-0.187892,-0.02052,-0.023745
2,-0.404205,-0.432672,-0.040839,-0.050846,-0.010687,-0.028368,-0.05695,-0.115867,-0.171976,-0.335567,...,-0.382494,-0.384327,-0.378234,-0.353607,-0.308513,-0.19024,-0.310446,-0.188123,-0.02052,-0.023745
3,0.006216,-0.432672,-0.040839,-0.050846,-0.010687,-0.028368,-0.05695,-0.115867,-0.178934,0.029125,...,0.041503,0.124958,0.09275,0.052627,0.06809,-0.023637,0.201816,-0.1878,-0.02052,-0.023745
4,-0.248588,-0.432672,-0.040839,-0.050846,-0.010687,-0.028368,-0.05695,-0.115867,-0.182223,-0.337355,...,-0.383318,-0.3855,-0.379147,-0.354507,-0.309314,-0.198781,-0.310446,-0.188123,-0.02052,-0.023745


In [22]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_res)
X_t_train = pca.transform(x_train_res)
X_t_test = pca.transform(X_val_scaled)

In [27]:
X_t_train.shape

(106182, 145)

In [28]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_t_train, y_train_res)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
clf.score(X_t_test, y_val)

0.9845

In [32]:
from sklearn import metrics

predicciones_val = clf.predict(X_t_test)
num_predicciones_correctas = (y_val == predicciones_val).sum()
num_total_de_muestras = len(y_val)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

Predicciones correctas :  5907
Número de muestras     :  6000
Exactitud (manual)     :  0.9845


In [34]:
X_test = df_test[features]
y_test = df_test['target']
X_test_scaled = scaler.transform(X_test)
X_t_test = pca.transform(X_test_scaled)

predicciones_val = clf.predict(X_t_test)
num_predicciones_correctas = (y_test == predicciones_val).sum()
num_total_de_muestras = len(y_test)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

Predicciones correctas :  15767
Número de muestras     :  16000
Exactitud (manual)     :  0.9854375


In [38]:
#Costo de Pronostico (Santosh)

U_check = 10 #cost that an unnecessary check
M_check = 500 # cost of missing a faulty truck

FP = ((predicciones_val == 1) & (y_test == -1)).sum()
FN = ((predicciones_val == -1) & (y_test == 1)).sum()

Total_Cost = FP*U_check + FN*M_check
print('Total Cost: ', Total_Cost)

Total Cost:  22910
