# Car test-time prediction

## Loading MB dataset

In [1]:
import pandas as pd
# Read the raw table from CSV. The pre-processing cell below expects it to
# contain an 'ID' column and the target column 'y'.
data = pd.read_csv('mercedes_test.csv')

## Data pre-processing

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split (and every score reported on it) is
# the same after a kernel restart.
RANDOM_STATE = 42

# Choose the string-typed columns and mark them as "categorical" dtype
cf = data.select_dtypes(include=['object']).columns
data[cf] = data[cf].astype('category')
# One-hot encode the categorical columns
data = pd.get_dummies(data)
# Obtain X from data (excluding 'ID' and 'y')
X_df = data.drop(['ID', 'y'], axis=1)
# Obtain y from data
y_df = data['y']

# Convert y_df into binary labels: 1.0 when y is strictly below the median,
# 0.0 otherwise
TF_vector = (y_df < np.median(y_df))
y_df = TF_vector.astype(float)

# Convert the data frames into numpy arrays. X is cast to float32 explicitly:
# depending on the pandas version the dummy columns may be bool, and mixing
# them with integer columns would otherwise yield an object-dtype array that
# Keras cannot consume.
X, y = X_df.values.astype('float32'), y_df.values

# Split into train and test datasets (90% / 10%), keeping the class ratio of y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=RANDOM_STATE)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3788, 563)
(421, 563)
(3788,)
(421,)


## DNN: Hyperparameter search via cross-validation

In [3]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier 
# Enables the use of Scikit-learn APIs for Keras models

In [5]:
def build_model(n_layer=2,lambda_=0,lr=1e-3,n_units=20):
    """Build and compile a fully connected binary classifier.

    Parameters
    ----------
    n_layer : int
        Total number of Dense layers, counting the sigmoid output layer.
        The network therefore gets ``n_layer - 1`` hidden ReLU layers.
    lambda_ : float
        L2 penalty coefficient applied to the kernel and the bias of every
        Dense layer (hidden and output).
    lr : float
        Learning rate passed to the Adam optimizer.
    n_units : int
        Number of units in each hidden layer. Defaults to 20, the width that
        was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    A compiled ``Sequential`` model using binary cross-entropy loss and
    reporting the 'acc' metric.
    """
    model = Sequential()
    # Hidden stack: n_layer - 1 identical ReLU layers.
    for _ in range(n_layer-1):
        model.add(Dense(n_units,activation='relu',
                  kernel_regularizer=l2(lambda_),bias_regularizer=l2(lambda_)))

    # Single sigmoid unit for the binary output.
    model.add(Dense(1, activation='sigmoid',
                  kernel_regularizer=l2(lambda_),bias_regularizer=l2(lambda_)))
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['acc'])
    return model 

In [6]:
# Wrap the Keras builder so it exposes a scikit-learn-style estimator API
model = KerasClassifier(build_model)

# Candidate values: number of layers and L2 regularization strength
n_layer = [2, 5, 10]
lambda_ = [1e-3, 1e-2, 1e-1, 1, 10]
grid = dict(n_layer=n_layer, lambda_=lambda_)

# n_iter=15 matches the 3 x 5 candidate combinations; each setting is scored
# with 5-fold cross-validation
cv = RandomizedSearchCV(estimator=model,
                        param_distributions=grid,
                        n_iter=15,
                        cv=5)

  model = KerasClassifier(build_model)


In [7]:
# Run the hyperparameter search on the training split. epochs=10 and verbose=0
# are given to the fit call (10 training epochs, no progress output).
cv.fit(X_train,y_train,epochs=10,verbose=0)



RandomizedSearchCV(cv=5,
                   estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x000002570053BA30>,
                   n_iter=15,
                   param_distributions={'lambda_': [0.001, 0.01, 0.1, 1, 10],
                                        'n_layer': [2, 5, 10]})

In [20]:
# Display the raw search log: per-candidate fit/score timings, the sampled
# parameter values and the cross-validation scores.
cv.cv_results_ # dictionary of arrays, one entry per candidate

{'mean_fit_time': array([0.92690701, 1.26939411, 1.62701445, 0.99394994, 1.23856187,
        1.62207046, 0.93899837, 1.18809657, 1.53056607, 0.97977591,
        1.18282275, 1.54517779, 0.98006988, 1.41293292, 1.56782312]),
 'std_fit_time': array([0.03289318, 0.07189314, 0.09928691, 0.07646787, 0.06153695,
        0.06841646, 0.01440546, 0.07872124, 0.02847128, 0.0260221 ,
        0.01594512, 0.07968916, 0.07618274, 0.20161439, 0.03207019]),
 'mean_score_time': array([0.11459241, 0.14710979, 0.19308376, 0.1104126 , 0.14343796,
        0.19170918, 0.11299796, 0.14011912, 0.24366808, 0.11489229,
        0.17347822, 0.17830563, 0.11238298, 0.18486242, 0.22163424]),
 'std_score_time': array([0.00530147, 0.0074842 , 0.00823786, 0.00353421, 0.00380523,
        0.00739789, 0.0030169 , 0.01190786, 0.07656981, 0.00263164,
        0.06815025, 0.00298402, 0.00291887, 0.05490424, 0.05793637]),
 'param_n_layer': masked_array(data=[2, 5, 10, 2, 5, 10, 2, 5, 10, 2, 5, 10, 2, 5, 10],
              mask

## Store logs into csv file

In [21]:
# Export a compact summary of the search results to CSV
import pandas as pd

# Only these result columns are kept in the exported log
columns = ['params','mean_test_score','std_test_score','rank_test_score']

# Build the results table and restrict it to the selected columns in one step
df_DNN = pd.DataFrame(cv.cv_results_).loc[:, columns]
df_DNN.to_csv(path_or_buf="logs_DNN.csv")

## Save the best model

In [22]:
# Keep the estimator selected by the search, then persist its underlying
# Keras model under the path 'best_model_DNN'.
best_model_DNN=cv.best_estimator_
best_model_DNN.model.save('best_model_DNN')

INFO:tensorflow:Assets written to: best_model_DNN\assets


## Load the best model

In [23]:
from tensorflow.keras.models import load_model
# Reload the saved model from disk and score it on the held-out test split.
# evaluate() reports the loss followed by the metric the model was compiled
# with ('acc').
loaded_model = load_model('best_model_DNN')
loaded_model.evaluate(X_test, y_test)



[0.3684735596179962, 0.900237500667572]