In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import GridSearchCV
plt.style.use('ggplot')

%matplotlib inline



In [2]:
import keras

Using TensorFlow backend.


Reading data. Defining classes.

In [30]:
# Name of the target column; every other column is treated as a feature.
label_col = 'DriveTrain'

# Load the raw table (first row holds the column names).
data = pd.read_csv("data/initial_data.csv", header=0)

# Distinct target values and how many of them there are; the count later
# sizes the softmax output layer.
classes = data[label_col].unique()
classes_num = len(classes)

## Preprocessing

Detecting type of features

In [31]:
# Keep the target column aside before the frame is transformed.
data_label = data[label_col]

# Split the columns by dtype in a single pass: 'object' columns are treated
# as categorical, everything else as numerical.
categorical_features, numerical_features = [], []
for col, col_dtype in data.dtypes.items():
    if col_dtype.name == 'object':
        categorical_features.append(col)
    else:
        numerical_features.append(col)

# The label is an object column too, but it is the target, not a feature.
categorical_features.remove(label_col)

Filling N/A

In [32]:
# Numerical columns: replace missing values with the column median.
# `numeric_only=True` restricts the median to numeric columns; without it
# pandas >= 2.0 attempts the median over the string columns as well and
# raises a TypeError.
data_processed = data.fillna(data.median(axis=0, numeric_only=True), axis=0)

# Categorical columns: replace missing values with the most frequent value
# ('top' in the describe() summary of the object columns).
# `data_describe` is kept at module level because the vectorization step
# reads its 'unique' and 'top' rows.
data_describe = data.describe(include=[object])
for col in categorical_features:
    data_processed[col] = data_processed[col].fillna(data_describe[col]['top'])


Normalization of numerical data

In [33]:
# Standardize every numerical column to zero mean and unit (sample) standard
# deviation, then write the scaled columns back into the working frame.
data_numerical = data_processed[numerical_features]
data_numerical = data_numerical.sub(data_numerical.mean()).div(data_numerical.std())

for col, scaled_values in data_numerical.items():
    data_processed[col] = scaled_values

Vectorization

In [34]:
# Split the categorical columns by cardinality (number of distinct non-null
# values reported by describe()).
# NOTE(review): a column with a single distinct value lands in neither list
# and is left untouched as strings - confirm no such column exists.
binary_features = [col for col in categorical_features if data_describe[col]['unique'] == 2]
nonbinary_features = [col for col in categorical_features if data_describe[col]['unique'] > 2]

# Two-valued columns become a single 0/1 indicator: 0 for the most frequent
# value, 1 for the other one. Building the column from a boolean comparison
# yields a real integer dtype; assigning 0/1 through `.loc` into the existing
# string column would leave it as dtype object.
for col in binary_features:
    top = data_describe[col]['top']
    data_processed[col] = (data_processed[col] != top).astype(int)

# Columns with more than two values are one-hot encoded.
data_not_binary = pd.get_dummies(data_processed[nonbinary_features])

# Swap the original multi-valued columns for their dummy columns in one
# concatenation; inserting hundreds of columns one at a time fragments the
# frame. The resulting column order is unchanged: remaining columns first,
# dummy columns appended at the end.
data_processed = pd.concat(
    [data_processed.drop(nonbinary_features, axis=1), data_not_binary],
    axis=1,
)

Dropping the label from the dataset

In [35]:
# The target must not be part of the feature matrix; it was saved earlier
# in `data_label`.
data_processed = data_processed.drop(label_col, axis='columns')

## Feature selection

Dropping correlated features

In [36]:
# Hand-picked subset of feature columns kept for the "uncorrelated" variant
# of the dataset.
features_list = [
    'Price',
    'Luggage.room',
    'RPM',
    'Horsepower',
]
data_uncorr = data_processed.loc[:, features_list]

Dimensionality reduction

In [37]:
from sklearn.decomposition import PCA

# Project the full preprocessed feature matrix onto its first two principal
# components. The fitted estimator is kept in `pca` for later inspection.
data_for_pca = data_processed
pca = PCA(n_components=2).fit(data_for_pca)
data_pca = pca.transform(data_for_pca)

Size reduction:

In [38]:
# Column counts before and after the PCA projection.
n_features_before = data_processed.shape[1]
n_features_after = data_pca.shape[1]
print("Number of features decreased from %d to %d after PCA." % (n_features_before, n_features_after))

Number of features decreased from 253 to 2 after PCA.


In [39]:
# From here on both feature sets are plain NumPy arrays rather than frames.
# NOTE: this rebinds the names, so column labels are no longer available
# after this cell.
data_processed, data_uncorr = (
    np.array(frame) for frame in (data_processed, data_uncorr)
)

# Building classifiers

In [121]:
# Feature matrix used by the classifiers: the full preprocessed array.
X = data_processed
# Number of input features (columns of X); sizes the network's input layer.
features_num = X.shape[-1]

In [122]:
# Target vector: the raw (not yet one-hot encoded) values of the label column
# that were set aside from `data` before preprocessing.
y = data_label

In [123]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

# One seed for all three samplers. Previously only RandomOverSampler was
# seeded, so the SMOTE/ADASYN synthetic samples (and every score computed
# from them) changed on each run.
RESAMPLING_SEED = 0

# NOTE(review): `fit_sample` matches the older imbalanced-learn release this
# notebook was written against; newer releases expose the same operation as
# `fit_resample` - switch when the environment is upgraded.

# Balance the classes three different ways: duplicating minority rows,
# SMOTE interpolation, and ADASYN adaptive synthesis.
ros = RandomOverSampler(random_state=RESAMPLING_SEED)
X_resampled, y_resampled = ros.fit_sample(X, y)
X_smote, y_smote = SMOTE(random_state=RESAMPLING_SEED).fit_sample(X, y)
X_adasyne, y_adasyne = ADASYN(random_state=RESAMPLING_SEED).fit_sample(X, y)


In [124]:
# One-hot encode every label vector so it matches the softmax output /
# categorical cross-entropy loss of the network.
# NOTE: the resampled vectors are overwritten in place, so this cell is not
# safe to run twice without re-running the oversampling cell first.
y = np.array(pd.get_dummies(data_label))
y_resampled = np.array(pd.get_dummies(y_resampled))
y_smote = np.array(pd.get_dummies(y_smote))
# Fixed typo: the result used to be stored under the misspelled name
# `y_adsyne`, leaving `y_adasyne` as raw labels, unlike its siblings above.
y_adasyne = np.array(pd.get_dummies(y_adasyne))
# Backward-compatible alias in case the misspelled name is referenced elsewhere.
y_adsyne = y_adasyne

In [125]:
# Hold out 30% of the SMOTE-balanced data for evaluation (fixed seed for a
# repeatable split).
# NOTE(review): the split happens after oversampling, so synthetic samples
# generated from rows that end up in the test set can also appear in the
# training set; the reported test accuracy is therefore likely optimistic.
# Consider splitting first and oversampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X_smote,
    y_smote,
    test_size=0.3,
    random_state=42,
)

In [126]:
import tensorflow as tf
# Open a TensorFlow session and keep a handle to it in `s`.
# NOTE(review): `tf.InteractiveSession` is a TensorFlow 1.x-style API and
# presumably installs itself as the default session - confirm it is still
# needed; nothing visible in this notebook reads `s` afterwards.
s = tf.InteractiveSession()

In [127]:
import keras
from keras.models import Sequential
import keras.layers as ll

# Baseline multilayer perceptron: input of `features_num` values, two tanh
# hidden layers (200 and 50 units) and a softmax output with one unit per
# class. The layers are passed as a list, which adds them in order.
model = Sequential(
    [
        ll.InputLayer([features_num]),
        ll.Dense(200),
        ll.Activation('tanh'),
        ll.Dense(50),
        ll.Activation('tanh'),
        ll.Dense(classes_num, activation='softmax'),
    ],
    name="mlp",
)

# Adam optimizer with categorical cross-entropy (labels are one-hot encoded);
# accuracy is tracked as the reporting metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [128]:
# Print the layer-by-layer architecture and parameter counts of the baseline model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 253)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 200)               50800     
_________________________________________________________________
activation_15 (Activation)   (None, 200)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 50)                10050     
_________________________________________________________________
activation_16 (Activation)   (None, 50)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 3)                 153       
Total params: 61,003
Trainable params: 61,003
Non-trainable params: 0
_________________________________________________________________


In [129]:
# Train the baseline model for 8 passes over the training split; the trailing
# semicolon hides the returned History object from the cell output.
model.fit(
    x=X_train,
    y=y_train,
    epochs=8,
);

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [130]:
# evaluate() returns [loss, accuracy] on the held-out split, matching the
# metrics the model was compiled with.
print(
    "\nLoss, Accuracy = ",
    model.evaluate(x=X_test, y=y_test),
)

Loss, Accuracy =  [0.22424462058993636, 0.93442622950819676]


### Adding regularization

In [195]:
from keras import regularizers

# Regularized variant of the MLP: the same 200 -> 50 tanh hidden stack, but
# each hidden Dense layer carries an L2 penalty on its weights and an L1
# penalty on its output (both with coefficient 0.01).
model_reg = Sequential(name="mlp")
model_reg.add(ll.InputLayer([features_num]))

for hidden_units in (200, 50):
    # Fresh regularizer objects are created for every layer.
    model_reg.add(
        ll.Dense(
            hidden_units,
            kernel_regularizer=regularizers.l2(0.01),
            activity_regularizer=regularizers.l1(0.01),
        )
    )
    model_reg.add(ll.Activation('tanh'))

# Softmax output, one unit per class; left unregularized.
model_reg.add(ll.Dense(classes_num, activation='softmax'))

model_reg.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [196]:
# Print the layer-by-layer architecture and parameter counts of the regularized model.
model_reg.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        (None, 253)               0         
_________________________________________________________________
dense_79 (Dense)             (None, 200)               50800     
_________________________________________________________________
activation_53 (Activation)   (None, 200)               0         
_________________________________________________________________
dense_80 (Dense)             (None, 50)                10050     
_________________________________________________________________
activation_54 (Activation)   (None, 50)                0         
_________________________________________________________________
dense_81 (Dense)             (None, 3)                 153       
Total params: 61,003
Trainable params: 61,003
Non-trainable params: 0
_________________________________________________________________


In [197]:
# Train the regularized model for 8 passes over the training split; the
# trailing semicolon hides the returned History object from the cell output.
model_reg.fit(
    x=X_train,
    y=y_train,
    epochs=8,
);

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [198]:
# evaluate() returns [loss, accuracy] on the held-out split.
# NOTE(review): the loss presumably includes the regularization penalties, so
# it is not directly comparable with the unregularized model's loss - confirm.
print(
    "\nLoss, Accuracy = ",
    model_reg.evaluate(x=X_test, y=y_test),
)


Loss, Accuracy =  [14.563441276550293, 0.6428571343421936]
