In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
#from keras.utils import np_utils
#from sklearn.preprocessing import LabelEncoder

In [2]:
# Fix NumPy's global random seed so NumPy-driven randomness is repeatable.
# NOTE(review): only NumPy is seeded here; whether the Keras backend draws its
# initial weights from this generator depends on the backend in use - confirm.
seed = 10
np.random.seed(seed)
# Read the space-separated data file; it has no header row, so the columns
# are labelled with integers starting at 0.
df = pd.read_csv(
    'Sensorless_drive_diagnosis.txt',
    sep = ' ',
    header = None,
)
# Show the first 10 samples
first_rows = df.head(10)
print(first_rows)

             0             1         2             3             4         5   \
0 -3.014600e-07  8.260300e-06 -0.000012 -2.309800e-06 -1.438600e-06 -0.000021   
1  2.913200e-06 -5.247700e-06  0.000003 -6.056100e-06  2.778900e-06 -0.000004   
2 -2.951700e-06 -3.184000e-06 -0.000016 -1.208400e-06 -1.575300e-06  0.000017   
3 -1.322600e-06  8.820100e-06 -0.000016 -4.811100e-06 -7.282900e-07  0.000004   
4 -6.836600e-08  5.666300e-07 -0.000026 -6.490100e-06 -7.940600e-07  0.000013   
5 -9.584900e-07  5.214300e-08 -0.000047  6.453700e-07 -2.304100e-06  0.000055   
6 -1.783700e-06  1.303600e-06 -0.000038 -7.083900e-06  6.706300e-06  0.000022   
7 -2.566600e-06 -1.679500e-07  0.000015 -1.598400e-06  8.709200e-07  0.000015   
8 -5.474000e-06  1.086500e-07 -0.000011 -1.815600e-06  4.757800e-07  0.000024   
9  2.825600e-06  6.067600e-06  0.000118 -4.347500e-06  1.492300e-06 -0.000003   

         6         7         8         9   ...       39       40       41  \
0  0.031718  0.031710  0.031721

In [3]:
# Count the missing entries in every column (all zeros means nothing to impute)
missing_per_column = df.isna().sum()
print(missing_per_column)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
dtype: int64


In [4]:
# Columns labelled 0..47 are the features X; column 48 holds the class label Y
X = df.loc[:, :47]
Y = df[48]
# Report both shapes, one per line
print(X.shape, Y.shape, sep = "\n")

(58509, 48)
(58509,)


In [5]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max)
feature_summary = X.describe()
print(feature_summary)

                 0             1             2             3             4   \
count  58509.000000  5.850900e+04  5.850900e+04  58509.000000  5.850900e+04   
mean      -0.000003  1.439648e-06  1.412013e-06     -0.000001  1.351239e-06   
std        0.000072  5.555429e-05  2.353009e-04      0.000063  5.660943e-05   
min       -0.013721 -5.414400e-03 -1.358000e-02     -0.012787 -8.355900e-03   
25%       -0.000007 -1.444400e-05 -7.239600e-05     -0.000005 -1.475300e-05   
50%       -0.000003  8.804600e-07  5.137700e-07     -0.000001  7.540200e-07   
75%        0.000002  1.877700e-05  7.520000e-05      0.000004  1.906200e-05   
max        0.005784  4.525300e-03  5.237700e-03      0.001453  8.245100e-04   

                 5             6             7             8             9   \
count  5.850900e+04  58509.000000  58509.000000  58509.000000  58509.000000   
mean  -2.654483e-07      0.001915      0.001913      0.001912     -0.011897   
std    2.261907e-04      0.036468      0.036465    

In [6]:
# Number of samples per class label, to reveal any class imbalance
class_counts = df.groupby(Y).size()
print(class_counts)

48
1     5319
2     5319
3     5319
4     5319
5     5319
6     5319
7     5319
8     5319
9     5319
10    5319
11    5319
dtype: int64


In [7]:
# Rescale every feature linearly so its minimum maps to 0 and its maximum to 1.
# Note: the scaler is fit on all rows here, i.e. before any train/validation split.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X))

In [8]:
# Convert target Y to one hot encoded Y for the neural network (one column per class).
# The dtype is pinned explicitly: pandas < 2.0 produced uint8 indicator columns by
# default, while pandas >= 2.0 produces booleans. Pinning keeps the target numeric
# (0/1) regardless of the installed pandas version.
Y = pd.get_dummies(Y, dtype = 'uint8')
# If target is in string form, use following code:
# First encode target values as integers from string
# Then perform one hot encoding
# encoder = LabelEncoder()
# encoder.fit(Y)
# Y = encoder.transform(Y)
# Y = np_utils.to_categorical(Y)

In [9]:
# For Keras, convert the pandas objects to plain NumPy arrays.
# np.asarray is used instead of the `.values` attribute because it also accepts
# an ndarray, so re-running this cell does not fail once X and Y are already arrays.
X = np.asarray(X)
Y = np.asarray(Y)

In [10]:
# First define baseline model. Then use it in Keras Classifier for the training
def baseline_model(n_features = 48, n_classes = 11, hidden_units = 15):
    """Build and compile the baseline fully-connected classifier.

    The network has two hidden ReLU layers of equal width followed by a
    softmax output layer, and is compiled with categorical cross-entropy
    loss, the Adam optimizer and accuracy as the reported metric.

    Parameters
    ----------
    n_features : int, default 48
        Number of input features per sample.
    n_classes : int, default 11
        Number of target classes, i.e. the width of the softmax output.
    hidden_units : int, default 15
        Number of units in each of the two hidden layers.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.

    Notes
    -----
    Calling the function without arguments builds the 48-15-15-11 network,
    so it can be handed to ``KerasClassifier(build_fn = baseline_model)``
    unchanged.
    """
    # Create model here
    model = Sequential()
    # Rectified Linear Unit activation in both hidden layers
    model.add(Dense(hidden_units, input_dim = n_features, activation = 'relu'))
    model.add(Dense(hidden_units, activation = 'relu'))
    # Softmax for multi-class classification
    model.add(Dense(n_classes, activation = 'softmax'))
    # Compile model here
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [11]:
# Wrap the predefined baseline model in a scikit-learn style Keras classifier.
# Epoch count and batch size are worth tuning; try different values.
training_options = {'epochs': 100, 'batch_size': 10, 'verbose': 0}
estimator = KerasClassifier(build_fn = baseline_model, **training_options)

  estimator = KerasClassifier(build_fn = baseline_model, epochs = 100, batch_size = 10, verbose = 0)


In [12]:
# K-fold cross-validation splitter: 5 folds, rows shuffled with the fixed seed.
# Other split counts are worth trying, e.g., 10.
kfold = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = seed,
)

In [None]:
# Cross-validated accuracy of the network.
# The classifier is evaluated inside a Pipeline that re-fits a MinMaxScaler on the
# training part of every fold, so the validation part never contributes to the
# scaling statistics (no preprocessing leakage into the score).
# If X was already min-max scaled on the full data, this still gives the per-fold
# scaling of the raw features: min-max scaling is unaffected by an earlier
# positive linear rescaling of a column.
cv_model = Pipeline([
    ('scale', MinMaxScaler(feature_range=(0, 1))),
    ('net', estimator),
])
# One accuracy score per fold
results = cross_val_score(cv_model, X, Y, cv = kfold)
# Result: mean accuracy and its standard deviation across folds, in percent
print("Result: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))