In [60]:
# basic data analysis and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="ticks", palette="Paired")
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# statistic
from scipy.stats import normaltest,kruskal

# preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,RobustScaler
import category_encoders as ce

# model regression
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# model selection & hyperparameter tuning
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, KFold

# metric regression
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')


In [1]:
# Restore the `df` variable previously persisted with IPython's `%store`
# magic, then show it as the cell output.
# NOTE(review): this depends on another session having run `%store df`;
# the notebook cannot be reproduced from a fresh environment without it.
%store -r df
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand,age
0,A1,2017,12500,Manual,15735,Petrol,150.0,55.4,1.4,audi,3
1,A6,2016,16500,Automatic,36203,Diesel,20.0,64.2,2.0,audi,4
2,A1,2016,11000,Manual,29946,Petrol,30.0,55.4,1.4,audi,4
3,A4,2017,16800,Automatic,25952,Diesel,145.0,67.3,2.0,audi,3
4,A3,2019,17300,Manual,1998,Petrol,145.0,49.6,1.0,audi,1
...,...,...,...,...,...,...,...,...,...,...,...
108535,Eos,2012,5990,Manual,74000,Diesel,125.0,58.9,2.0,vw,8
108536,Fox,2008,1799,Manual,88102,Petrol,145.0,46.3,1.2,vw,12
108537,Fox,2009,1590,Manual,70000,Petrol,200.0,42.0,1.4,vw,11
108538,Fox,2006,1250,Manual,82704,Petrol,150.0,46.3,1.2,vw,14


# Splitting Data

In [66]:
# Target: the listed price.
y = df['price']

# Features: every remaining column except the target and the two columns
# excluded from modelling ('model' and 'year').
# NOTE(review): `drop` returns a new DataFrame, so columns added to `df`
# in later cells do not appear in `x` unless this cell is re-run afterwards.
x = df.drop(columns=['model', 'year', 'price'])

In [67]:
# Hold out a test set using scikit-learn's default split proportions;
# the fixed seed makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)

# Handling Missing Values

## Tax

In [90]:
# Flag the rows that have no tax value *before* imputing them with 0.
# Deriving the flag from `tax == 0` after the fill (as was done previously)
# cannot tell an imputed 0 apart from a tax that really is 0.
# The guard keeps an existing flag intact when this cell is re-run after the
# NaNs have already been replaced (otherwise the flag would be reset to 0).
if 'tax_is_missing' not in df.columns:
    df['tax_is_missing'] = df['tax'].isna().astype(int)

# Impute missing tax with 0; the indicator above preserves the information.
df['tax'] = df['tax'].fillna(0)

## MPG

In [95]:
# Flag the rows that have no mpg value *before* imputing them with 0, so the
# indicator reflects missingness rather than "value equals 0 after the fill".
# The guard keeps an existing flag intact when this cell is re-run after the
# NaNs have already been replaced (otherwise the flag would be reset to 0).
if 'mpg_is_missing' not in df.columns:
    df['mpg_is_missing'] = df['mpg'].isna().astype(int)

# Impute missing mpg with 0; the indicator above preserves the information.
df['mpg'] = df['mpg'].fillna(0)

## engineSize

In [103]:
# Indicator column: 1 where engineSize equals 0, otherwise 0.
# NOTE(review): presumably an engine size of 0 stands for "unknown" - confirm.
df['engineSize_is_missing'] = df['engineSize'].eq(0).astype(int)

# Preprocessing

In [68]:
# Column-wise preprocessing:
#   - transmission, fuelType -> one-hot encoded, first level dropped
#   - brand                  -> binary encoded (category_encoders)
#   - numeric columns        -> RobustScaler
# Any column not listed is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        (
            'one hot encoding',
            OneHotEncoder(drop='first'),
            ['transmission', 'fuelType'],
        ),
        (
            'binary encoding',
            ce.BinaryEncoder(),
            ['brand'],
        ),
        (
            'Robust Scaler',
            RobustScaler(),
            ['mileage', 'tax', 'mpg', 'engineSize', 'age'],
        ),
    ],
    remainder='passthrough',
)

# Data Transform

In [69]:
# Fit the preprocessing on the training features only, then apply the fitted
# transformer to the test features. Both results are wrapped in DataFrames
# with default (positional) row and column labels.
x_train_preprocessed = pd.DataFrame(data=transformer.fit_transform(x_train))
x_test_preprocessed = pd.DataFrame(data=transformer.transform(x_test))

# Model Benchmark

In [70]:
# Candidate regressors for the benchmark.
# The tree-based estimators are seeded with the same value used for the
# train/test split and the KFold splitter (2020); without a random_state
# their fitted models, and therefore the benchmark scores, can differ
# between runs.
modelLinReg = LinearRegression()
modelRidge = Ridge(alpha=0.5)
modelLasso = Lasso(alpha=0.5)
modelTree = DecisionTreeRegressor(max_depth=5, random_state=2020)
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelRF = RandomForestRegressor(max_depth=5, random_state=2020)

In [71]:
# Benchmark candidates, in order: linear, ridge, lasso, tree, KNN, random forest.
models = [modelLinReg, modelRidge, modelLasso, modelTree, modelKNN, modelRF]

In [None]:
# NOTE(review): disabled draft of a cross-validated benchmark over `models`;
# nothing in this cell executes.
# If it is re-enabled:
#   - `kf` is created inside the loop but `cross_val_score` is called with
#     `cv=5`, so the shuffled, seeded KFold is never used; pass `cv=kf` if
#     that was the intent.
#   - the 'model' labels are hard-coded and must stay in the same order as
#     the `models` list.
# def model_selection():

#     cv_score = []
#     cv_mean = []
#     cv_std = []

#     for i in models:
#         kf = KFold(n_splits=5, shuffle=True, random_state=2020)
#         estimator = Pipeline([
#             ('preprocess', transformer),
#             ('model', i)
#         ])

#         model_cv = cross_val_score(estimator, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')
#         cv_score.append(model_cv)
#         cv_mean.append(model_cv.mean())
#         cv_std.append(model_cv.std())
    
#     return pd.DataFrame({
#         'model': ['linreg', 'ridge', 'lasso', 'tree', 'knn', 'rf'],
#         'score': cv_score,
#         'mean': cv_mean,
#         'std': cv_std
#     })

In [74]:
# Manual 5-fold cross-validation of the linear regression on the
# already-preprocessed training data; prints one MAE per fold.
# NOTE(review): the transformer was fitted on the full training set before
# this loop, so each validation fold has been seen by the preprocessing step.
kf = KFold(n_splits=5, shuffle=True, random_state=2020)

for train_index, val_index in kf.split(x_train_preprocessed):
    # Rows are selected by position, so features and target stay aligned
    # even though their index labels differ.
    x1_train = x_train_preprocessed.iloc[train_index]
    x_val = x_train_preprocessed.iloc[val_index]
    y1_train = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]

    modelLinReg.fit(x1_train, y1_train)
    print(mean_absolute_error(y_true=y_val, y_pred=modelLinReg.predict(x_val)))

7105.976621397925
6967.1048343992525
7080.706577770745
7069.322212316754
7039.4338049175385
