# Car test-time prediction

## Loading MB dataset

In [16]:
import pandas as pd
data = pd.read_csv('mercedes_test.csv')

## Data pre-processing

In [17]:
# Choose categorical data columns
cf = data.select_dtypes(include=['object']).columns
# To change it into "categorical" data type
data[cf]=data[cf].astype('category')
# One hot encoding
data = pd.get_dummies(data)
# Obtain X from data (excluding 'ID' and 'y')
X_df = data.drop(['ID','y'],axis=1)
# Obtain y from data
y_df = data['y']

# Convert y_df into binary labels
import numpy as np
TF_vector= (y_df<np.median(y_df))
y_df=TF_vector.astype(float)

# Conver data frame into numpy array
X,y = X_df.values, y_df.values

# Split into train and test datasets
from sklearn.model_selection import train_test_split
#X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,stratify=y)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,shuffle=False)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3788, 563)
(421, 563)
(3788,)
(421,)


## LR: Hyperparameter search via cross validation 

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV 

model_LR = LogisticRegression(solver='liblinear',max_iter=10000)
penalty_list = ['l1','l2']
C_list = [10,1,1e-1,1e-2,1e-3]
grid_LR = {'penalty':penalty_list,'C':C_list}
#grid_LR = dict(penalty=penalty_list,C=C_list)
cv_LR = RandomizedSearchCV(model_LR,grid_LR,n_iter=5,cv=5)
cv_LR.fit(X_train,y_train)

RandomizedSearchCV(cv=5,
                   estimator=LogisticRegression(max_iter=10000,
                                                solver='liblinear'),
                   n_iter=5,
                   param_distributions={'C': [10, 1, 0.1, 0.01, 0.001],
                                        'penalty': ['l1', 'l2']})

In [19]:
cv_LR.cv_results_ #logs results

{'mean_fit_time': array([0.02395062, 0.02944255, 0.03764901, 0.09356132, 0.1758369 ]),
 'std_fit_time': array([0.00108053, 0.00148292, 0.00251484, 0.01141583, 0.06606092]),
 'mean_score_time': array([0.00137501, 0.00179806, 0.00180025, 0.0013999 , 0.0017858 ]),
 'std_score_time': array([0.00049618, 0.0004035 , 0.00039876, 0.00048803, 0.00074348]),
 'param_penalty': masked_array(data=['l1', 'l1', 'l2', 'l2', 'l1'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[0.01, 0.1, 0.1, 10, 1],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'penalty': 'l1', 'C': 0.01},
  {'penalty': 'l1', 'C': 0.1},
  {'penalty': 'l2', 'C': 0.1},
  {'penalty': 'l2', 'C': 10},
  {'penalty': 'l1', 'C': 1}],
 'split0_test_score': array([0.86543536, 0.90501319, 0.90105541, 0.88126649, 0.90369393]),
 'split1_test_score': array([0.84432718, 0.88918206, 0.8

## Store logs into csv file

In [20]:
# Store logs into csv file
import pandas as pd 
df_LR = pd.DataFrame.from_dict(cv_LR.cv_results_,orient='columns')
# Select columns to be stored
columns = ['params','mean_test_score','std_test_score','rank_test_score'] 
df_LR = df_LR[columns]
df_LR.to_csv("logs_LR.csv")

## Save the best model

In [21]:
best_model_LR=cv_LR.best_estimator_
from joblib import dump
dump(best_model_LR, 'best_model_LR.joblib')

['best_model_LR.joblib']

## Load "best_model_LS.joblib'

In [22]:
from joblib import load
loaded_model_LR = load('best_model_LR.joblib')
loaded_model_LR.score(X_test, y_test)

0.8931116389548693