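**Dataset Information**: The dataset is extracted from electric current drive signals. The drive contains intact as well as defective components; therefore, the dataset has 11 classes based on the condition of the components. The aim is to predict the correct component condition from the input variables using a **Deep Learning** technique. Tools used: **Keras** and **TensorFlow**. **Note**: the code below loads `actions.csv`, which has 43 input columns (`1`–`42` plus `winner`) and a 7-class `target`, so this description does not match the data actually used and should be updated.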

**Dataset Rights**: This dataset has been taken from the "University of California Irvine Machine Learning Repository" for educational purposes, and all rights to this dataset are reserved by them. For more details, such as the content of the dataset, the owner of the dataset and the reference research paper, please refer to the following link: https://archive.ics.uci.edu/ml/datasets/Dataset+for+Sensorless+Drive+Diagnosis

In [75]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Conv2D, AlphaDropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
#from keras.utils import np_utils
#from sklearn.preprocessing import LabelEncoder

In [52]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        # Keep only GPU entries; every other device type is skipped.
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

['/device:GPU:0']

Step 2. Import and print data

In [53]:
# Random seed for reproducibility
seed = 10
np.random.seed(seed)
# NumPy's seed does not reach TensorFlow's own random generator, which Keras
# draws from for weight initialisation and dropout, so seed that as well.
# (Some GPU kernels may still be nondeterministic.)
tf.random.set_seed(seed)
# Import data
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(r'D:\source\repos\KaggleConnectX\data\working\actions.csv',index_col=0)
# Print first 10 samples
print(df.head(10))

   1  2  3  4  5  6  7  8  9  10  ...  35  36  37  38  39  40  41  42  target  \
0  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   0       3   
1  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   0       3   
2  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   2   0   0       1   
3  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   2   1   0   0   0       5   
4  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   2       3   
5  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   2       3   
6  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   2       2   
7  0  0  0  0  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   2       4   
8  0  0  0  2  0  0  0  0  0   0  ...   0   0   0   0   1   0   0   0       2   
9  0  0  0  2  0  0  0  0  0   0  ...   0   0   2   1   1   0   0   0       5   

   winner  
0     1.0  
1     1.0  
2     1.0  
3     1.0  
4     1.0  
5     1.0  
6     1.0  
7     1.0  


In [54]:
# Show the loaded frame; pandas renders a truncated preview of rows and columns.
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,35,36,37,38,39,40,41,42,target,winner
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,3,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,3,1.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,0,0,1,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,0,0,0,5,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,2,3,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455736,0,0,1,1,2,0,0,0,0,1,...,2,1,2,1,1,2,1,2,5,2.0
1455737,0,0,1,1,2,0,0,0,0,1,...,0,2,2,1,1,1,2,1,5,2.0
1455738,0,0,1,1,2,0,0,0,0,1,...,2,0,2,1,1,2,1,1,1,2.0
1455739,0,0,1,1,2,0,0,0,0,1,...,0,2,2,2,1,1,2,2,5,1.0


In [55]:
# Binary helper label: 1.0 where the target class is 3, 0.0 for every other class.
df['target_three'] = (df['target'] == 3).astype(float)

In [56]:
# Show the frame again to confirm that the new 'target_three' column was added.
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,36,37,38,39,40,41,42,target,winner,target_three
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3,1.0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3,1.0,1.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,0,1,1.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,2,1,0,0,0,5,1.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,2,3,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455736,0,0,1,1,2,0,0,0,0,1,...,1,2,1,1,2,1,2,5,2.0,0.0
1455737,0,0,1,1,2,0,0,0,0,1,...,2,2,1,1,1,2,1,5,2.0,0.0
1455738,0,0,1,1,2,0,0,0,0,1,...,0,2,1,1,2,1,1,1,2.0,0.0
1455739,0,0,1,1,2,0,0,0,0,1,...,2,2,2,1,1,2,2,5,1.0,0.0


Columns `1` to `42` plus `winner` are the input variables (43 columns in total). The `target` column is the label and contains 7 different classes (0 to 6).

Step 3. Data pre-processing

In [57]:
# Count the missing entries in every column and print the totals
missing_per_column = df.isna().sum()
print(missing_per_column)

1               0
2               0
3               0
4               0
5               0
6               0
7               0
8               0
9               0
10              0
11              0
12              0
13              0
14              0
15              0
16              0
17              0
18              0
19              0
20              0
21              0
22              0
23              0
24              0
25              0
26              0
27              0
28              0
29              0
30              0
31              0
32              0
33              0
34              0
35              0
36              0
37              0
38              0
39              0
40              0
41              0
42              0
target          0
winner          0
target_three    0
dtype: int64


No missing values. 

In [58]:
# Remove missing values IF AVAILABLE and print first 10 samples
# df = df.dropna()
# print(df.head(10))
# print(df.shape)


In [59]:
# Features X: the numbered columns '1'..'42' plus 'winner'. Label Y: the 'target' column.
columns = [str(item) for item in range(1, 43)] + ['winner']
X = df.loc[:, columns]
Y = df.loc[:, 'target']
# Print both shapes, one per line, as a sanity check.
print(X.shape, Y.shape, sep='\n')

(1455741, 43)
(1455741,)


In [60]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every feature
feature_summary = X.describe()
print(feature_summary)

                  1             2             3             4             5  \
count  1.455741e+06  1.455741e+06  1.455741e+06  1.455741e+06  1.455741e+06   
mean   3.588420e-01  5.071541e-01  6.404738e-01  1.342433e+00  6.306788e-01   
std    6.886931e-01  7.565136e-01  8.191852e-01  7.635096e-01  8.172292e-01   
min   -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00   
25%    0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00  0.000000e+00   
50%    0.000000e+00  0.000000e+00  0.000000e+00  2.000000e+00  0.000000e+00   
75%    0.000000e+00  1.000000e+00  1.000000e+00  2.000000e+00  1.000000e+00   
max    2.000000e+00  2.000000e+00  2.000000e+00  2.000000e+00  2.000000e+00   

                  6             7             8             9            10  \
count  1.455741e+06  1.455741e+06  1.455741e+06  1.455741e+06  1.455741e+06   
mean   4.797845e-01  3.148094e-01  4.965361e-01  6.938439e-01  7.402835e-01   
std    7.442279e-01  6.572117e-01  7.617138e-01  8.

Scale of all the variables is different. Therefore, feature scaling is important.    

In [61]:
# Number of samples per target class, used to judge class imbalance
class_sizes = df.groupby(Y).size()
print(class_sizes)

target
0    280362
1    213333
2    197202
3     57142
4    200343
5    221437
6    285922
dtype: int64


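The classes do not have the same sample size: class 3 has only about 57,000 samples, while the other classes have between roughly 197,000 and 286,000 each. There is therefore a class imbalance, with class 3 under-represented.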

In [62]:
# Rescale every feature column so its minimum maps to 0 and its maximum to 1,
# then wrap the resulting array in a DataFrame again.
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(X))

To train the Neural Network, the single target column must be converted into one-hot encoded format. For more details, visit this link: https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/

In [63]:
# Convert target Y into a one-hot encoded frame (one indicator column per class)
# for the neural network.
Y = pd.get_dummies(Y)
# If the target is in string form, use the following code instead:
# first encode the string target values as integers,
# then perform one-hot encoding.
# encoder = LabelEncoder()
# encoder.fit(Y)
# Y = encoder.transform(Y)
# Y = np_utils.to_categorical(Y)

In [64]:
# Keras is fed plain arrays here, so unwrap both DataFrames into NumPy arrays
X, Y = X.to_numpy(), Y.to_numpy()

Step 4. Define Neural Network Model 

Three hidden layers with the "Rectified Linear Unit" (relu) activation are defined, with 43, 10000 and 10000 neurons, and an AlphaDropout layer sits between the two wide layers. Furthermore, this is a multi-class classification problem with 7 target classes in total, therefore the "softmax" activation function and 7 neurons are used in the output layer. As a rule of thumb, the number of neurons in a hidden layer should be between the input data dimension and the output data dimension. In this case, the input data has 43 variable columns and there are 7 output classes, so the rule of thumb suggests between 7 and 43 neurons per hidden layer; note that the 10000-neuron layers used below go well beyond that range. You can try different values for the number of neurons as well as different numbers of hidden layers.

In [96]:
# Builder for the multi-class network; handed to KerasClassifier as its build function.
def baseline_model(input_dim=None, n_classes=7, hidden_units=10000, dropout_rate=.142):
    """Create and compile the multi-class feed-forward classifier.

    Called without arguments it builds the same network as before:
    Dense(43, relu) -> Dense(10000, relu) -> AlphaDropout(0.142)
    -> Dense(10000, relu) -> Dense(7, softmax).

    Parameters
    ----------
    input_dim : int or None, optional
        Number of input features. When None (the default) no input size is
        declared on the first layer, exactly as in the original definition.
    n_classes : int, optional
        Number of units in the softmax output layer (one per target class).
    hidden_units : int, optional
        Width of each of the two wide hidden layers.
    dropout_rate : float, optional
        Rate passed to the AlphaDropout layer between the wide layers.

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    # Only pass `input_dim` when it was requested, so the default call is unchanged.
    first_layer_kwargs = {} if input_dim is None else {'input_dim': input_dim}

    model = Sequential()
    model.add(Dense(43, activation='relu', **first_layer_kwargs))
    model.add(Dense(hidden_units, activation='relu'))
    # NOTE(review): AlphaDropout is documented as a companion to SELU activations;
    # with ReLU layers plain Dropout may be the better match - confirm.
    model.add(AlphaDropout(dropout_rate))
    model.add(Dense(hidden_units, activation='relu'))
    # Softmax output for multi-class classification
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Builder for the small sigmoid-output network trained with binary cross-entropy.
def baseline_model_binary(input_dim=None, n_outputs=2):
    """Create and compile the binary-cross-entropy classifier.

    Called without arguments it builds the same network as before:
    Dense(43, relu) -> Dense(2, sigmoid).

    Parameters
    ----------
    input_dim : int or None, optional
        Number of input features. When None (the default) no input size is
        declared on the first layer, exactly as in the original definition.
    n_outputs : int, optional
        Number of sigmoid output units. The default of 2 keeps the original
        behaviour.
        NOTE(review): a single 0/1 label column such as `target_three`
        would normally pair with one output unit (n_outputs=1); two units
        presumably expect a two-column target - confirm the intended use.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    """
    # Only pass `input_dim` when it was requested, so the default call is unchanged.
    first_layer_kwargs = {} if input_dim is None else {'input_dim': input_dim}

    model = Sequential()
    model.add(Dense(43, activation='relu', **first_layer_kwargs))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

The input dimension (input_dim) is 43, because there are 43 input variable columns (`1` to `42` plus `winner`). It changes with the dimension of the input variables.

**Note**: If you use only one hidden layer, the model is a simple (shallow) neural network. If you use more than one hidden layer, for example 3, it is considered a deep learning model.

In [97]:
# Wrap the predefined baseline model in a scikit-learn style Keras classifier.
# The epoch count and batch size are tunable - try different values.
estimator = KerasClassifier(
    build_fn=baseline_model,
    epochs=10,
    batch_size=1000,
    verbose=1,
)

  estimator = KerasClassifier(build_fn = baseline_model, epochs = 10, batch_size = 1000, verbose = 1)


Step 5. Define cross-validation and train pre-defined model 

In [98]:
# KFold Cross Validation
#kfold = KFold(n_splits = 5, shuffle = True, random_state = seed)
# Try different values of splits e.g., 10

In [99]:
# Object to describe the result
#results = cross_val_score(estimator, X, Y, cv = kfold)
# Result
#print("Result: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

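The cross-validation cells above are commented out in this run, so no cross-validated result is shown; the 94% accuracy quoted here presumably comes from an earlier run (TODO: confirm against the current data and model). The accuracy can be improved by techniques like feature extraction, selection and feature engineering.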

In [100]:
# Train on the full X / Y arrays. The cross-validation cells above are commented
# out, so no held-out data is used for evaluation here.
estimator.fit(X,Y)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f6f99ca530>

In [101]:
# Predict the class for the sample at index 20, passed as a one-row batch.
estimator.predict([X[20].tolist()])



array([2])

In [102]:
# Persist the trained network twice: once under the bare model name and once
# as a single HDF5 file with the '.h5' extension.
model_name = 'test6'
trained_model = estimator.model
trained_model.save(f'{model_name}')
trained_model.save(f'{model_name}.h5', save_format='h5')

INFO:tensorflow:Assets written to: test6\assets


In [103]:
import keras
# Reload the network from the '.h5' file to check that the saved model can be used.
model2 = keras.models.load_model(f'{model_name}.h5')

In [104]:
# Index of the highest-scoring class for the first sample, using the reloaded model.
np.argmax(model2.predict([X[0].tolist()]))



3

In [105]:
# Raw model outputs for the first 40 samples: one row per sample, one value per class.
model2.predict([X[0:40]])



array([[4.71371114e-02, 7.00341910e-02, 1.10560380e-01, 5.72111785e-01,
        1.50601208e-01, 3.14760767e-02, 1.80792566e-02],
       [6.07902221e-02, 1.14336640e-01, 1.88136876e-01, 2.40623027e-01,
        3.08017224e-01, 5.16343713e-02, 3.64616066e-02],
       [7.10691363e-02, 1.02814525e-01, 1.40789792e-01, 3.87978762e-01,
        2.44963229e-01, 2.49597058e-02, 2.74247956e-02],
       [8.88352543e-02, 2.60418337e-02, 1.90343529e-01, 3.78577501e-01,
        2.05746740e-01, 6.40231669e-02, 4.64320369e-02],
       [7.84418806e-02, 1.08233169e-01, 1.42414555e-01, 3.14706713e-01,
        1.96287706e-01, 9.19836611e-02, 6.79323673e-02],
       [7.20046163e-02, 1.34557366e-01, 1.10945910e-01, 3.96525115e-01,
        1.26604930e-01, 9.75930542e-02, 6.17690310e-02],
       [7.87516236e-02, 1.97839856e-01, 1.46889314e-01, 8.44208300e-02,
        1.67666569e-01, 1.83230951e-01, 1.41200885e-01],
       [7.31838867e-02, 1.37587085e-01, 2.10872933e-01, 9.96471420e-02,
        2.66621023e-01, 9