### Imports

In [None]:
#Basic imports
import pandas as pd
import numpy as np

#SkLearn imports
from sklearn.preprocessing import LabelEncoder

#Keras imports
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

### Dataset acquisition and preprocessing

In [None]:
#Load dataset
#The file is decoded as ISO-8859-1 (Latin-1) instead of the pandas default (UTF-8)
#NOTE(review): 'autos.csv' is a relative path, so it is resolved against the
#kernel's working directory
df = pd.read_csv('autos.csv', encoding = 'ISO-8859-1')

1. Treating data:

The data treatment is divided into three steps:


*   Remove attributes based on their importance to the regression;
*   Handle inconsistent values;
*   Handle missing data.

In [None]:
#Drop the attributes judged unimportant for the regression
to_remove = [
  'dateCrawled', 'name', 'yearOfRegistration', 'monthOfRegistration',
  'dateCreated', 'nrOfPictures', 'lastSeen', 'postalCode',
]
df = df.drop(to_remove, axis = 1)

In [None]:
#Investigate remaining attributes
import matplotlib.pyplot as plt
import seaborn as sns

#One figure per attribute: category counts for text (object) columns and a
#distribution plot for every other column
for col in df.columns.values:
  fig, ax = plt.subplots()
  if df[col].dtype == object:
    #Pass the column by keyword: newer seaborn releases bind the first
    #positional argument to `data` instead of `x`
    sns.countplot(x = df[col], ax = ax)
  else:
    #NOTE(review): distplot is deprecated in recent seaborn releases; switch to
    #histplot/displot once the installed seaborn version is confirmed
    sns.distplot(df[col], ax = ax)
  #Title each figure so it can be identified when skimming the output
  ax.set_title(col)

Based on the previous plots, both the 'seller' and 'offerType' attributes are unlikely to be useful for the regression model. This is simply due to how unevenly their values are distributed in this specific dataset. It is important to emphasize that the decision to remove these attributes is not based on any analysis of their intrinsic importance to the problem.

In [None]:
#Remove attributes based on plot distributions
#NOTE(review): 'model' is dropped here together with 'seller' and 'offerType',
#but the reason for removing it is not stated -- confirm and document it
plt_based_remove = ['seller', 'offerType', 'model']
df = df.drop(columns = plt_based_remove)

The data inconsistencies are essentially atypical 'price' values, i.e. values far from usual car prices. These values are likely due to errors in the ad or in the data crawler.

In [None]:
#Handle inconsistent values: keep only instances with 10 < 'price' < 350,000
#(i.e. remove instances in which 'price' <= 10 or 'price' >= 350,000)
valid_price = (df['price'] > 10) & (df['price'] < 350000)
#Take an explicit copy so that later column assignments on df operate on an
#independent frame and do not raise SettingWithCopyWarning
df = df[valid_price].copy()

In [None]:
#Count the missing entries of every remaining attribute
is_null = df.isna().sum()
print(is_null)

price                    0
abtest                   0
vehicleType          33546
gearbox              17236
powerPS                  0
kilometer                0
fuelType             29391
brand                    0
notRepairedDamage    65986
dtype: int64


In [None]:
#Define function to replace missing values
def nan_to_category(data, col):
  """Fill the missing values of one column with its most frequent value.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame containing the column to impute. It is not modified.
  col : str
    Name of the column whose missing values are replaced.

  Returns
  -------
  pandas.DataFrame
    A copy of `data` in which every missing entry of `col` holds the most
    frequent non-missing value of that column. If the column has no
    non-missing value at all, the copy is returned unchanged.
  """
  #value_counts() ignores missing entries and sorts by descending frequency,
  #so the first index label is the most common category
  counts = data[col].value_counts()
  #Work on a copy so the caller's frame (possibly a slice of another frame)
  #is never written to
  filled = data.copy()
  if counts.empty:
    #Nothing to impute from: the column is empty or entirely missing
    return filled
  filled[col] = filled[col].fillna(counts.index[0])
  return filled

In [None]:
#Impute each categorical attribute that has gaps with its most common value
for col in ('vehicleType', 'gearbox', 'fuelType', 'notRepairedDamage'):
  df = nan_to_category(df, col)

2. Encoding data

In [None]:
#Separate the predictors from the regression target ('price')
atr = df.drop(columns = ['price'])
target = df['price'].copy()

In [None]:
#Define function to encode data using LabelEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def encode_data(data, is_hot):
  """Encode the text (object dtype) columns of a frame as numbers.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame to encode. It is not modified.
  is_hot : bool
    If falsy, every object column is replaced by integer codes assigned in
    sorted label order (the same codes a LabelEncoder produces).
    If truthy, every object column is expanded into one indicator column
    per category.

  Returns
  -------
  pandas.DataFrame or numpy.ndarray
    With `is_hot` falsy: a copy of `data` with integer-coded object columns.
    With `is_hot` truthy: a dense float array whose first columns are the
    indicator columns (object columns in their original order, categories
    in sorted order) followed by the remaining columns in their original
    order.
  """
  encoded = data.copy()
  cat_cols = [col for col in encoded.columns if encoded[col].dtype == object]

  if not is_hot:
    for col in cat_cols:
      #sort=True numbers the categories in sorted order
      codes, _ = pd.factorize(encoded[col], sort = True)
      encoded[col] = codes
    return encoded

  other_cols = [col for col in encoded.columns if col not in cat_cols]
  #Indicator blocks come first and untouched columns are stacked to the
  #right, matching the layout of the encoder call this replaces
  blocks = [pd.get_dummies(encoded[col]).to_numpy(dtype = float)
            for col in cat_cols]
  blocks.append(encoded[other_cols].to_numpy(dtype = float))
  return np.hstack(blocks)

In [None]:
#One-hot encode the predictor attributes
coded_atr = encode_data(data = atr, is_hot = True)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### Network model 

Neural network model:

We define the model as a function. This way we can tune the hyperparameters and optimize performance.

In [None]:
#Define function for neural network
def regression_net(n_in, n_out, n_layers, n_nodes, opt, act, loss_f, drop):
  """Build and compile a fully connected regression network.

  The network has one input-facing hidden layer followed by `n_layers`
  further hidden layers (so `n_layers + 1` hidden Dense layers in total),
  each followed by a Dropout layer, and a linear output layer.

  Parameters
  ----------
  n_in : number of input features.
  n_out : number of output units.
  n_layers : number of hidden layers added after the first one.
  n_nodes : units in every hidden layer.
  opt : optimizer passed to `compile`.
  act : activation used by every hidden layer.
  loss_f : loss passed to `compile`.
  drop : rate given to every Dropout layer.

  Returns
  -------
  The compiled Sequential model, tracking 'mean_absolute_error' as metric.
  """
  #First hidden layer declares the input size
  stack = [Dense(n_nodes, activation = act, input_dim = n_in), Dropout(drop)]
  #Additional hidden layers, each paired with its own Dropout
  for _ in range(n_layers):
    stack.append(Dense(n_nodes, activation = act))
    stack.append(Dropout(drop))
  #Linear output for regression
  stack.append(Dense(n_out, activation = 'linear'))

  net = Sequential(stack)
  net.compile(loss = loss_f, optimizer = opt,
              metrics = ['mean_absolute_error'])
  return net

In [None]:
#Create model
#Hyperparameters of the baseline network, gathered in one place
net_params = {
  'n_in': coded_atr.shape[1],
  'n_out': 1,
  'n_layers': 1,
  'n_nodes': 160,
  'opt': 'adam',
  'act': 'relu',
  'loss_f': 'mean_absolute_error',
  'drop': 0.0,
}
model = regression_net(**net_params)

In [None]:
#Fit model to dataset
#NOTE(review): the whole of coded_atr/target is used for training and no
#validation data is passed, so the metric reported while fitting is a
#training metric only
model.fit(coded_atr, target, batch_size = 300, epochs = 500)

In [None]:
#Cross-validate model
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score

#Arguments forwarded to regression_net, plus the fit settings
#(batch_size, epochs) used for every fold
cv_params = {
  'n_in': coded_atr.shape[1],
  'n_out': 1,
  'n_layers': 1,
  'n_nodes': 160,
  'opt': 'adam',
  'act': 'relu',
  'loss_f': 'mean_absolute_error',
  'drop': 0.0,
  'batch_size': 300,
  'epochs': 100,
}
new_model = KerasRegressor(build_fn = regression_net, **cv_params)

#5-fold cross-validation scored with R2, run on 4 parallel workers
score = cross_val_score(new_model, coded_atr, target,
                        scoring = 'r2', cv = 5, n_jobs = 4, verbose = 2)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed: 23.7min finished


In [None]:
#Per-fold R2 scores returned by the 5-fold cross-validation
score

array([0.58509757, 0.54303759, 0.49647767, 0.5377635 , 0.60483546])

In [None]:
#Tune loss function for the model
from sklearn.model_selection import GridSearchCV

#Every hyperparameter is pinned to a single value except 'loss_f', so the
#grid compares loss functions only
#NOTE(review): 'squared_hinge' is normally used for classification with
#-1/1 targets; consider whether it belongs in a regression grid
param = {'n_in': [coded_atr.shape[1]],
        'n_out': [1],
        'n_layers': [1],
        'n_nodes': [160],
        'opt': ['adam'],
        'act': ['relu'],
        'loss_f': ['mean_absolute_error', 'mean_squared_error', 
                   'mean_absolute_percentage_error', 'squared_hinge',
                   'mean_squared_logarithmic_error'],
        'drop': [0.0],
        'batch_size': [300],
        'epochs': [100]}

#4-fold cross-validated search scored with R2, reusing the KerasRegressor
#wrapper (new_model) as the estimator
grid = GridSearchCV(estimator = new_model, 
                   param_grid = param,
                   scoring = 'r2',
                   cv = 4,
                   n_jobs = 4,
                   verbose = 2)
grid.fit(coded_atr, target)

In [None]:
#Tabulate the grid-search results (one row per loss function tried)
grid_df = pd.DataFrame(grid.cv_results_)
grid_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_act,param_batch_size,param_drop,param_epochs,param_loss_f,param_n_in,param_n_layers,param_n_nodes,param_n_out,param_opt,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,1015.367427,6.833976,1.243595,0.162364,relu,300,0,100,mean_absolute_error,63,1,160,1,adam,"{'act': 'relu', 'batch_size': 300, 'drop': 0.0...",0.573164,0.555776,0.555072,0.54462,0.557158,0.010243,2
1,1025.594013,8.365594,1.576637,0.214552,relu,300,0,100,mean_squared_error,63,1,160,1,adam,"{'act': 'relu', 'batch_size': 300, 'drop': 0.0...",0.596448,0.562235,0.549427,0.621583,0.582423,0.028401,1
2,1043.126174,6.267975,1.6737,0.522014,relu,300,0,100,mean_absolute_percentage_error,63,1,160,1,adam,"{'act': 'relu', 'batch_size': 300, 'drop': 0.0...",-0.035417,-0.001506,-0.106616,0.062177,-0.020341,0.060899,3
3,1040.560825,5.508215,2.061972,0.339477,relu,300,0,100,squared_hinge,63,1,160,1,adam,"{'act': 'relu', 'batch_size': 300, 'drop': 0.0...",-21.790041,-17.713348,-17.071467,-13.378319,-17.488305,2.984176,4
4,1051.98692,4.865683,1.408378,0.889068,relu,300,0,100,mean_squared_logarithmic_error,63,1,160,1,adam,"{'act': 'relu', 'batch_size': 300, 'drop': 0.0...",-1640.294319,-801.057365,-36.39829,-1493.396512,-992.785228,636.654724,5
