# **Data Modeling**

In [20]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, KFold

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
import category_encoders as ce

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error


## Data Spliting

In [21]:
cars = pd.read_csv("1. Combined Dataset V.2.csv")

In [22]:
cars

Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,0,SLK,2005,5200,Automatic,63000,Petrol,325,32.1,1.8,Mercedes
1,1,S Class,2017,34948,Automatic,27000,Hybrid,20,61.4,2.1,Mercedes
2,2,SL CLASS,2016,49948,Automatic,6200,Petrol,555,28.0,5.5,Mercedes
3,3,G Class,2016,61948,Automatic,16000,Petrol,325,30.4,4.0,Mercedes
4,4,G Class,2016,73948,Automatic,4000,Petrol,325,30.1,4.0,Mercedes
...,...,...,...,...,...,...,...,...,...,...,...
63581,69909,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
63582,69910,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
63583,69911,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
63584,69912,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [23]:
cars[cars['transmission'] == "Other"]

Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
9334,9650,GLA Class,2016,18700,Other,30895,Other,125,56.5,0.0,Mercedes
12333,12762,SLK,2015,12995,Other,39000,Diesel,150,56.5,2.1,Mercedes
44225,47878,Scala,2019,15999,Other,3500,Petrol,145,47.1,1.0,Skoda


In [24]:
cars.drop(cars[cars['transmission'] == "Other"].index, inplace=True)

In [25]:
cars[cars['fuelType'] == "Other"]

Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
12463,12894,C Class,2020,40999,Automatic,400,Other,135,217.3,2.0,Mercedes
44112,47762,Kamiq,2019,20980,Automatic,200,Other,145,43.5,1.5,Skoda
44113,47763,Kamiq,2019,18480,Automatic,200,Other,145,45.6,1.0,Skoda
44114,47764,Kamiq,2019,18980,Automatic,200,Other,145,44.1,1.5,Skoda
44117,47767,Fabia,2017,10980,Manual,24500,Other,145,58.9,1.2,Skoda
...,...,...,...,...,...,...,...,...,...,...,...
62325,68611,Touareg,2015,19995,Automatic,59115,Other,235,42.8,3.0,Volkswagen
62564,68855,Arteon,2019,24989,Automatic,1413,Other,150,50.4,2.0,Volkswagen
62568,68859,Arteon,2019,27495,Automatic,3500,Other,145,47.1,2.0,Volkswagen
62899,69193,Touran,2019,25990,Automatic,4305,Other,145,38.7,1.5,Volkswagen


In [26]:
cars.drop(cars[cars['fuelType'] == "Other"].index, inplace=True)

In [27]:
X = cars.drop(['price'], axis = 1)
y = cars['price']

# Data Splitting dengan proporsi test size 80:20
xtrain, xtest, ytrain, ytest = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state= 2023)

## Encoding

Encoding is a technique used in machine learning to convert categorical data into numerical format. In this project, I've chosen three methods: one-hot encoding for nominal categories, robust scaler Using the robust scaler to standardize the scale of numerical data. One-hot encoding is suitable for categories without a specific order and a limited set of unique values. Here's a breakdown of how each encoding type is applied.

- Onehot: Transmission, FuelType
- Robust: Mileage, Mpg,
- Binary: Model, Brand

In [28]:
transform = ColumnTransformer([
    ('Scaler', RobustScaler(), ['mileage', 'mpg']),
    ('OHE', OneHotEncoder(drop='first'), ['transmission', 'fuelType']),
    ('Binary Encoder', ce.BinaryEncoder(), ['model', 'brand'])
],remainder = "passthrough")

transform

## Choose a Benchmark Model

In the initial phase, we will perform modeling on the 7 selected benchmark models. The results from these benchmark models, in terms of scoring, will be based on the chosen evaluation metrics: RMSE, MAE, and MAPE. The selection of these three metrics is to compare the model's performance by examining its residuals.

In [29]:
# Define the algorithm

lr = LinearRegression()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state= 2023)
rf = RandomForestRegressor(random_state= 2023)
ada = AdaBoostRegressor(random_state= 2023)
xgb = XGBRegressor(random_state= 2023)
gbr = GradientBoostingRegressor(random_state= 2023)

models = [lr, knn, dt, rf, ada, xgb, gbr]

score_rmse = []
nilai_mean_rmse = []
nilai_std_rmse = []

score_mae = []
nilai_mean_mae = []
nilai_std_mae = []

score_mape = []
nilai_mean_mape = []
nilai_std_mape = []

score_r2 = []
nilai_mean_r2 = []
nilai_std_r2 = []

# Finding the best algorithm based on each metrics

for i in models:
    
    crossval = KFold(n_splits=5, shuffle=True, random_state=2023)

    estimator = Pipeline([
        ('preprocessing', transform),
        ('model', i)
    ])

    # RMSE
    model_cv_rmse = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_root_mean_squared_error', 
        error_score='raise'
        )

    print(model_cv_rmse, i)

    score_rmse.append(model_cv_rmse)
    nilai_mean_rmse.append(model_cv_rmse.mean())
    nilai_std_rmse.append(model_cv_rmse.std())

    # MAE
    model_cv_mae = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_mean_absolute_error', 
        error_score='raise'
        )

    print(model_cv_mae, i)

    score_mae.append(model_cv_mae)
    nilai_mean_mae.append(model_cv_mae.mean())
    nilai_std_mae.append(model_cv_mae.std())

    # MAPE
    model_cv_mape = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_mean_absolute_percentage_error', 
        error_score='raise'
        )

    print(model_cv_mape, i)

    score_mape.append(model_cv_mape)
    nilai_mean_mape.append(model_cv_mape.mean())
    nilai_std_mape.append(model_cv_mape.std())
    
    model_cv_r2 = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='r2', 
        error_score='raise'
        )

    print(model_cv_mape, i)

    score_r2.append(model_cv_r2)
    nilai_mean_r2.append(model_cv_r2.mean())
    nilai_std_r2.append(model_cv_r2.std())

[-4662.00038271 -4650.17748805 -4689.16487273 -4989.46128739
 -4675.11050247] LinearRegression()
[-3003.90132899 -2946.10156564 -2873.21586657 -3005.36139361
 -2887.43983085] LinearRegression()
[-0.21957138 -0.2072236  -0.20429747 -0.20433129 -0.20434215] LinearRegression()
[-0.21957138 -0.2072236  -0.20429747 -0.20433129 -0.20434215] LinearRegression()
[-7082.60731902 -7216.38893351 -7477.45043457 -7458.52051814
 -7197.67518692] KNeighborsRegressor()
[-4510.2949049  -4591.11164991 -4620.32393061 -4631.98974966
 -4509.83029765] KNeighborsRegressor()
[-0.28871704 -0.28739051 -0.30348303 -0.28790979 -0.29473596] KNeighborsRegressor()
[-0.28871704 -0.28739051 -0.30348303 -0.28790979 -0.29473596] KNeighborsRegressor()
[-2948.11234239 -2716.78519805 -2578.88393656 -2777.98235461
 -2999.50606297] DecisionTreeRegressor(random_state=2023)
[-1561.43884892 -1562.32525133 -1549.33540311 -1593.06938695
 -1579.39138577] DecisionTreeRegressor(random_state=2023)
[-0.09150552 -0.09251293 -0.09170694 -

In [30]:
pd.DataFrame({
    'Model': ['Linear Regression', 'KNN Regressor', 'DecisionTree Regressor',
              'RandomForest Regressor', 'AdaBoost Regressor', 'XGBoost Regressor', 'GradientBoosting Regressor'],
    'Mean_RMSE': nilai_mean_rmse,
    'Std_RMSE': nilai_std_rmse,
    'Mean_MAE': nilai_mean_mae,
    'Std_MAE': nilai_std_mae,
    'Mean_MAPE': nilai_mean_mape,
    'Std_MAPE': nilai_std_mape,
    'Mean_R2' : nilai_mean_r2,
    'Std_R2' : nilai_std_r2
}).sort_values('Mean_MAPE',ascending = False)

Unnamed: 0,Model,Mean_RMSE,Std_RMSE,Mean_MAE,Std_MAE,Mean_MAPE,Std_MAPE,Mean_R2,Std_R2
3,RandomForest Regressor,-2022.439426,100.061937,-1197.127175,8.632857,-0.070594,0.001316,0.959408,0.00474
5,XGBoost Regressor,-2067.318894,108.921501,-1294.366628,10.42064,-0.077525,0.001628,0.957588,0.005199
2,DecisionTree Regressor,-2804.253979,153.591176,-1569.112055,15.3341,-0.092482,0.001192,0.921869,0.010105
6,GradientBoosting Regressor,-3085.988156,51.35604,-2044.855199,29.736231,-0.122947,0.001133,0.905818,0.004199
0,Linear Regression,-4733.182907,128.798642,-2943.203997,55.792624,-0.207953,0.005917,0.778606,0.008414
1,KNN Regressor,-7286.528478,155.197138,-4572.710107,52.856971,-0.292447,0.006117,0.475412,0.011806
4,AdaBoost Regressor,-7127.656878,287.627818,-6092.116408,249.231255,-0.511382,0.01818,0.497969,0.028057


In [31]:
# Benchmarking 2 Best Model

rf = RandomForestRegressor(random_state= 2023)
xgb = XGBRegressor(random_state= 2023)

models = [rf, xgb]

score_rmse = []
score_mae = []
score_mape = []
score_r2 = []

# Finding the best algorithm based on each metrics

for i in models:
    model = Pipeline([
        ('preprocessing', transform),
        ('model', i)
    ])

    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    score_rmse.append(np.sqrt(mean_squared_error(ytest, y_pred)))
    score_mae.append(mean_absolute_error(ytest, y_pred))
    score_mape.append(mean_absolute_percentage_error(ytest, y_pred))
    score_r2.append(r2_score(ytest, y_pred))
    
score_before_tuning = pd.DataFrame({'RMSE': score_rmse, 'MAE': score_mae, 'MAPE': score_mape, 'R2': score_r2}, index=['Random Forest', 'XGBoost'])
score_before_tuning

Unnamed: 0,RMSE,MAE,MAPE,R2
Random Forest,1863.637495,1155.982202,0.069529,0.964002
XGBoost,1913.571521,1283.982089,0.077638,0.962047


Random Forest Best Model

### Random Forest Hyperparameter Tuning

In [32]:
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest Regressor
rf = RandomForestRegressor(random_state=2023)

# Create a pipeline with data preprocessing and Random Forest model
pipe_rf = Pipeline([
    ('prep', transform),
    ('model', rf)
])

# Hyperparameter tuning with GridSearchCV
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    cv=5,
    scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'r2'],
    refit='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

In [33]:
grid_rf.fit(xtrain, ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [34]:
pd.DataFrame(grid_rf.cv_results_).sort_values(\
    by=['rank_test_neg_root_mean_squared_error','rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_absolute_percentage_error', 'rank_test_r2']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,param_model__n_estimators,params,split0_test_neg_root_mean_squared_error,...,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
56,74.905018,4.459802,0.747004,0.039473,20.0,1,2,200,"{'model__max_depth': 20, 'model__min_samples_l...",-2095.120862,...,0.000983,1,0.95891,0.952244,0.961818,0.962066,0.965489,0.960106,0.004449,1
2,87.359933,10.330962,0.859502,0.039956,,1,2,200,"{'model__max_depth': None, 'model__min_samples...",-2094.790868,...,0.000987,7,0.958923,0.952053,0.96166,0.962084,0.965445,0.960033,0.004496,2
83,78.779064,5.274843,0.825195,0.064797,30.0,1,2,200,"{'model__max_depth': 30, 'model__min_samples_l...",-2094.334901,...,0.000952,8,0.958941,0.952006,0.961652,0.962147,0.965271,0.960003,0.004475,3
1,44.657652,5.703853,0.564292,0.155229,,1,2,100,"{'model__max_depth': None, 'model__min_samples...",-2104.732425,...,0.000946,12,0.958532,0.952075,0.961738,0.961997,0.965246,0.959918,0.00446,4
55,39.074175,2.514318,0.38537,0.025409,20.0,1,2,100,"{'model__max_depth': 20, 'model__min_samples_l...",-2104.155813,...,0.000932,3,0.958555,0.951924,0.961812,0.961807,0.965378,0.959895,0.004533,5


In [35]:
print('Random Forest (by GridSearchCV')
print('Best_score:', grid_rf.best_score_)
print('Best_params:', grid_rf.best_params_)

Random Forest (by GridSearchCV
Best_score: -2007.7408276780272
Best_params: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}


In [36]:
# Model Random Forest
model = {'RF': RandomForestRegressor(random_state= 2023)}

# Define model terhadap estimator terbaik
rf_tuning = grid_rf.best_estimator_

# Fitting model
rf_tuning.fit(xtrain, ytrain)

# Predict test set
y_pred_rf_tuning = rf_tuning.predict(xtest)

# Simpan nilai metrics RMSE, MAE & MAPE setelah tuning
rmse_rf_tuning = np.sqrt(mean_squared_error(ytest, y_pred_rf_tuning))
mae_rf_tuning = mean_absolute_error(ytest, y_pred_rf_tuning)
mape_rf_tuning = mean_absolute_percentage_error(ytest, y_pred_rf_tuning)
r2_rf_tuning = r2_score(ytest, y_pred_rf_tuning)

score_after_tuning_rf = pd.DataFrame({'RMSE': rmse_rf_tuning, 'MAE': mae_rf_tuning, 'MAPE': mape_rf_tuning, 'R2': r2_rf_tuning}, index=model.keys())
score_after_tuning_rf

Unnamed: 0,RMSE,MAE,MAPE,R2
RF,1843.87244,1148.349673,0.069106,0.964761


RANDOM FOREST BEST MODEL 