#### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

warnings.filterwarnings("ignore")

#### Import Data

In [2]:
df = pd.read_csv('failure.csv')

#### EDA - Exploratory Data Analysis

In [3]:
df.shape

(124494, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        124494 non-null  object
 1   device      124494 non-null  object
 2   failure     124494 non-null  int64 
 3   attribute1  124494 non-null  int64 
 4   attribute2  124494 non-null  int64 
 5   attribute3  124494 non-null  int64 
 6   attribute4  124494 non-null  int64 
 7   attribute5  124494 non-null  int64 
 8   attribute6  124494 non-null  int64 
 9   attribute7  124494 non-null  int64 
 10  attribute8  124494 non-null  int64 
 11  attribute9  124494 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 11.4+ MB


In [5]:
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [6]:
df.describe()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
count,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0
mean,0.000851,122386800.0,159.484762,9.940455,1.74112,14.222693,260172.858025,0.292528,0.292528,12.451524
std,0.029167,70459600.0,2179.65773,185.747321,22.908507,15.943021,99151.009852,7.436924,7.436924,191.425623
min,0.0,0.0,0.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0
25%,0.0,61276750.0,0.0,0.0,0.0,8.0,221452.0,0.0,0.0,0.0
50%,0.0,122795700.0,0.0,0.0,0.0,10.0,249799.5,0.0,0.0,0.0
75%,0.0,183308400.0,0.0,0.0,0.0,12.0,310266.0,0.0,0.0,0.0
max,1.0,244140500.0,64968.0,24929.0,1666.0,98.0,689161.0,832.0,832.0,18701.0


In [7]:
df.failure.unique()

array([0, 1])

Description of column failure: 0 for non-failure and 1 for failure

In [8]:
df.failure.value_counts()

0    124388
1       106
Name: failure, dtype: int64

#### Feature Engineering

In [9]:
df['Date']  = pd.to_datetime(df['date'])

In [10]:
df["year"]=df["Date"].dt.year
df["month"]=df["Date"].dt.month_name()
df["day"]=df["Date"].dt.day_name()

In [11]:
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,Date,year,month,day
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7,2015-01-01,2015,January,Thursday
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0,2015-01-01,2015,January,Thursday
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0,2015-01-01,2015,January,Thursday
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0,2015-01-01,2015,January,Thursday
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3,2015-01-01,2015,January,Thursday


In [12]:
del df['Date']

In [13]:
df.groupby('month')['failure'].value_counts(ascending=True)

month      failure
April      1              9
           0          12003
August     1              4
           0           8342
February   1             14
           0          19486
January    1             24
           0          25008
July       1             16
           0          10515
June       1              6
           0          10463
March      1              9
           0          19824
May        1             21
           0          11309
November   0             31
October    1              3
           0           2937
September  0           4470
Name: failure, dtype: int64

In [14]:
df.corr()['failure'].sort_values(ascending=False)

failure       1.000000
attribute7    0.119055
attribute8    0.119055
attribute4    0.067398
attribute2    0.052902
attribute5    0.002270
attribute1    0.001984
attribute9    0.001622
attribute6   -0.000550
attribute3   -0.000948
year               NaN
Name: failure, dtype: float64

Check Attribute7 and Attribute8 (seem to be equal)

In [15]:
df['attribute7'].equals(df['attribute8'])

True

In [16]:
del df['attribute8']

In [17]:
df[['attribute3','attribute4','attribute7','attribute9']] = df[['attribute3','attribute4','attribute7','attribute9']].astype('object')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        124494 non-null  object
 1   device      124494 non-null  object
 2   failure     124494 non-null  int64 
 3   attribute1  124494 non-null  int64 
 4   attribute2  124494 non-null  int64 
 5   attribute3  124494 non-null  object
 6   attribute4  124494 non-null  object
 7   attribute5  124494 non-null  int64 
 8   attribute6  124494 non-null  int64 
 9   attribute7  124494 non-null  object
 10  attribute9  124494 non-null  object
 11  year        124494 non-null  int64 
 12  month       124494 non-null  object
 13  day         124494 non-null  object
dtypes: int64(6), object(8)
memory usage: 13.3+ MB


In [19]:
del df['date']

In [20]:
df.year

0         2015
1         2015
2         2015
3         2015
4         2015
          ... 
124489    2015
124490    2015
124491    2015
124492    2015
124493    2015
Name: year, Length: 124494, dtype: int64

In [21]:
df.year.value_counts()

2015    124494
Name: year, dtype: int64

In [22]:
del df['year']

#### Prediciton and selecting the Algorithm

In [23]:
x=df.drop('failure',axis=1)
y=df['failure']

In [24]:
x=pd.get_dummies(x,drop_first=True)

In [25]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split

In [41]:
def classification(x,y):
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from xgboost import XGBClassifier #yüklemek için !pip install xgboost kullandım
    from sklearn.ensemble import GradientBoostingClassifier
    
    import imblearn
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy, classification_report_imbalanced

    from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
    from sklearn.metrics import confusion_matrix,classification_report
              
    g=GaussianNB()
    b=BernoulliNB()
    KN=KNeighborsClassifier()
    SVC=SVC() 
    D=DecisionTreeClassifier()
    R=RandomForestClassifier()
    Log=LogisticRegression()
    XGB=XGBClassifier()
    G=GradientBoostingClassifier()
      
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=13)
    
    
    algos=[g,b,KN,SVC,D,R,Log,XGB,G]
    algo_names=['GaussianNB','BernoulliNB','KNeighborsClassifier','SVC','DecisionTreeClassifier','RandomForestClassifier','LogisticRegression','XGBClassifier','GradientBoostingClassifier']
    
    accuracy_scored=[]
    precision_scored=[]
    recall_scored=[]
    f1_scored=[]
    
    RUS=RandomUnderSampler(random_state=42)
    x_RUS,y_RUS=RUS.fit_resample(x_train,y_train)
    
    for item in algos:

        #item.fit(x_RUS,y_RUS)
        predict=item.fit(x_RUS,y_RUS).predict(x_test)
        
        
        accuracy_scored.append(accuracy_score(y_test,predict))
        precision_scored.append(precision_score(y_test,predict))
        recall_scored.append(recall_score(y_test,predict))
        f1_scored.append(f1_score(y_test,predict))

    result=pd.DataFrame(columns=['accuracy_score','f1_score','recall_score','precision_score'],index=algo_names)
    result['accuracy_score']=accuracy_scored
    result['f1_score']=f1_scored
    result['recall_score']=recall_scored
    result['precision_score']=precision_scored
    
    return result.sort_values('accuracy_score',ascending=False)  

In [43]:
classification(x,y)

Unnamed: 0,accuracy_score,f1_score,recall_score,precision_score
GaussianNB,0.990907,0.020761,0.081081,0.011905
BernoulliNB,0.915885,0.011329,0.405405,0.005745
RandomForestClassifier,0.901427,0.009684,0.405405,0.0049
GradientBoostingClassifier,0.883884,0.00932,0.459459,0.004708
DecisionTreeClassifier,0.806741,0.005621,0.459459,0.002828
XGBClassifier,0.708681,0.004829,0.594595,0.002425
SVC,0.645868,0.002895,0.432432,0.001452
KNeighborsClassifier,0.490265,0.002264,0.486486,0.001135
LogisticRegression,0.001189,0.002375,1.0,0.001189


### Deep Learning

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy, classification_report_imbalanced

In [45]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=13)

RUS=RandomUnderSampler(random_state=42)
x_RUS,y_RUS=RUS.fit_resample(x_train,y_train)

In [46]:
model=Sequential()
model.add(Dense(500,activation='relu'))
model.add(Dense(500,activation='relu'))
model.add(Dense(500,activation='relu'))
model.add(Dense(500,activation='relu'))
model.add(Dense(500,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [47]:
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=["accuracy"])
history = model.fit(x_RUS,y_RUS, batch_size=128, epochs=40, verbose=1, validation_data=(x_test,y_test))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [48]:
_, accuracy=model.evaluate(x_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 99.88
