In [1]:
import pandas as pd
import numpy as np
import sys
import os

In [2]:
import winsound

In [3]:
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 '2015_failures.csv',
 '2015_sample.csv',
 'boosting.ipynb',
 'data_Q1_2018_failures.csv',
 'data_Q2_2018_failures.csv',
 'data_Q3_2018_failures.csv',
 'hard_drive_intro.ipynb',
 'random_sample.py',
 'README.md',
 'save_failures.py']

In [4]:
good = pd.read_csv('2015_sample.csv')

In [5]:
bad = pd.read_csv('2015_failures.csv')

In [31]:
df = pd.concat([good,bad])

In [32]:
df = df.dropna(0,thresh=30).dropna(1)
columns = [i for i in df.columns if 'capacity' in i or 'normalized' in i]
X = df[columns]
y = df['failure']

In [35]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

In [9]:
accuracy = lambda actual,predicted : 1- np.mean(np.abs(actual-predicted))

def precision(actual,predicted):
    tp = np.all([actual, predicted],axis=0).sum()
    fp = np.all([1-actual, predicted],axis=0).sum()
    tn = np.all([1-actual, 1-predicted],axis=0).sum()
    fn = np.all([actual, 1-predicted],axis=0).sum()
    return tp/(tp+fp) if (tp+fp>0) else 0

def recall(actual,predicted):
    tp = np.all([actual, predicted],axis=0).sum()
    fp = np.all([1-actual, predicted],axis=0).sum()
    tn = np.all([1-actual, 1-predicted],axis=0).sum()
    fn = np.all([actual, 1-predicted],axis=0).sum()
    return tp/(tp+fn) if (tp+fn>0) else 0

In [10]:
lr = LogisticRegression(solver='lbfgs')
gnb = GaussianNB()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier(n_estimators=100)
ada = AdaBoostClassifier(gnb, algorithm="SAMME",
                         n_estimators=200)
gb = GradientBoostingClassifier( 
                         n_estimators=200)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.66, random_state=42)

In [43]:
for model in [lr,gnb,svc,rfc,ada,gb]:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print("Trained: ",model)
    print("Precision:",precision(y_test,y_pred))
    print("Recall:",recall(y_test,y_pred))
    print("Accuracy:",accuracy(y_test,y_pred))
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    print("FPR: %s, TPR: %s, Thresh: %s" % (fpr, tpr, thresholds))

Trained:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)
Precision: 0
Recall: 0.0
Accuracy: 0.5029177718832891
FPR: [0. 1.], TPR: [0. 1.], Thresh: [1 0]
Trained:  GaussianNB(priors=None)
Precision: 0.6512096774193549
Recall: 0.3447171824973319
Accuracy: 0.5824933687002652
FPR: [0.         0.18248945 1.        ], TPR: [0.         0.34471718 1.        ], Thresh: [2 1 0]
Trained:  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Precision: 0
Recall: 0.0
Accuracy: 0.5029177718832891
FPR: [0. 1.], TPR: [0. 1.], Thresh: [1 0]
Trained:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        

In [62]:
lr = LogisticRegression(solver='newton-cg')

In [63]:
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print("Trained: ",lr)
print("Precision:",precision(y_test,y_pred))
print("Recall:",recall(y_test,y_pred))
print("Accuracy:",accuracy(y_test,y_pred))

Trained:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)
Precision: 0
Recall: 0.0
Accuracy: 0.503711558854719




In [15]:
X_train.head()

Unnamed: 0,capacity_bytes,failure,smart_1_normalized,smart_3_normalized,smart_4_normalized,smart_5_normalized,smart_7_normalized,smart_9_normalized,smart_10_normalized,smart_12_normalized,smart_197_normalized,smart_198_normalized,smart_199_normalized
227,1500301910016,0,119.0,100.0,100.0,100.0,93.0,42.0,100.0,100.0,100.0,100.0,200.0
852,4000787030016,1,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,200.0
1182,4000787030016,0,116.0,93.0,100.0,100.0,78.0,96.0,100.0,100.0,100.0,100.0,200.0
805,4000787030016,0,100.0,123.0,100.0,100.0,100.0,98.0,100.0,100.0,100.0,100.0,200.0
820,1500301910016,1,120.0,99.0,100.0,75.0,92.0,43.0,100.0,100.0,100.0,100.0,200.0


In [44]:
new_bad = pd.read_csv('data_Q1_2018_failures.csv')

In [45]:
new_bad = new_bad.dropna(1)
X_new = new_bad[columns]
y_new = new_bad['failure']

KeyError: "['smart_1_normalized' 'smart_3_normalized' 'smart_4_normalized'\n 'smart_5_normalized' 'smart_7_normalized' 'smart_9_normalized'\n 'smart_10_normalized' 'smart_12_normalized' 'smart_194_normalized'\n 'smart_197_normalized' 'smart_198_normalized' 'smart_199_normalized'] not in index"

In [46]:
new_bad.head()

Unnamed: 0,date,serial_number,model,capacity_bytes,failure
0,2018-01-01,Z300GZ1B,ST4000DM000,4000787030016,1
1,2018-01-01,Z300GZCV,ST4000DM000,4000787030016,1
2,2018-01-01,VKGKRG7X,HGST HUH728080ALE600,8001563222016,1
3,2018-01-02,S300V3AD,ST4000DM000,4000787030016,1
4,2018-01-02,ZA11THZ7,ST8000DM002,8001563222016,1


In [38]:
for model in [lr,gnb,svc,rfc,ada,gb]:
    y_pred = model.predict(X_new)
    print("Testing with new data")
    print("Trained: ",model)
    print("Precision:",precision(y_new,y_pred))
    print("Recall:",recall(y_new,y_pred))
    print("Accuracy:",accuracy(y_new,y_pred))

NameError: name 'X_new' is not defined

In [82]:
y_pred = lr.predict(X_test)

In [83]:
accuracy(y_test,y_pred)

0.9999592040714337

In [84]:
precision(y_test,y_pred)

0

In [85]:
recall(y_test,y_pred)

0.0

In [70]:
y_pred.shape

(196098,)

In [None]:
future_nominal = future_nominal[features]
    future_failure = future_failure[features]
    future_nominial = future_nominal[future_nominal['smart_1_normalized'] >= 0]
    future_failure = future_failure[future_failure['smart_1_normalized'] >= 0]
    # future_nominal[future_nominal.isnull().values]
    # future_failure[future_failure.isnull().values]

    train_x, train_y = data_split(nominal_sample, failure_sample, head=True, split=0.8)
    val_x, val_y = data_split(nominal_sample, failure_sample, head=False, split=0.2)
    future_x, future_y = data_split(future_nominal, future_failure, head=True, split=1.0)