# 3. **Data Modeling**
---

In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, KFold

from sklearn.preprocessing import OneHotEncoder, RobustScaler
import category_encoders as ce
from category_encoders import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error


## Data Spliting

In [2]:
cars = pd.read_csv("1. Combined Dataset.csv")

In [3]:
cars

Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,mileage,fuelType,engineSize,tax,mpg,brand
0,0,C Class,2020,30495,Automatic,1200,Diesel,2.0,145.0,61.4,Mercedes
1,1,C Class,2020,29989,Automatic,1000,Petrol,1.5,145.0,46.3,Mercedes
2,2,C Class,2020,37899,Automatic,500,Diesel,2.0,145.0,61.4,Mercedes
3,3,C Class,2019,30399,Automatic,5000,Diesel,2.0,145.0,61.4,Mercedes
4,5,C Class,2019,29899,Automatic,4500,Diesel,2.0,145.0,61.4,Mercedes
...,...,...,...,...,...,...,...,...,...,...,...
72402,78027,Eos,2012,5990,Manual,74000,Diesel,2.0,125.0,58.9,Volkswagen
72403,78028,Fox,2008,1799,Manual,88102,Petrol,1.2,145.0,46.3,Volkswagen
72404,78029,Fox,2009,1590,Manual,70000,Petrol,1.4,200.0,42.0,Volkswagen
72405,78030,Fox,2006,1250,Manual,82704,Petrol,1.2,150.0,46.3,Volkswagen


In [4]:
cars.drop(cars[cars['fuelType'] == "Other"].index, inplace=True)

In [5]:
cars.drop(cars[cars['transmission'] == "Other"].index, inplace=True)

In [6]:
X = cars.drop(['price', 'Unnamed: 0'], axis = 1)
y = cars['price']

# Data Splitting dengan proporsi test size 80:20
xtrain, xtest, ytrain, ytest = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state= 2023)

In [7]:
X

Unnamed: 0,model,year,transmission,mileage,fuelType,engineSize,tax,mpg,brand
0,C Class,2020,Automatic,1200,Diesel,2.0,145.0,61.4,Mercedes
1,C Class,2020,Automatic,1000,Petrol,1.5,145.0,46.3,Mercedes
2,C Class,2020,Automatic,500,Diesel,2.0,145.0,61.4,Mercedes
3,C Class,2019,Automatic,5000,Diesel,2.0,145.0,61.4,Mercedes
4,C Class,2019,Automatic,4500,Diesel,2.0,145.0,61.4,Mercedes
...,...,...,...,...,...,...,...,...,...
72402,Eos,2012,Manual,74000,Diesel,2.0,125.0,58.9,Volkswagen
72403,Fox,2008,Manual,88102,Petrol,1.2,145.0,46.3,Volkswagen
72404,Fox,2009,Manual,70000,Petrol,1.4,200.0,42.0,Volkswagen
72405,Fox,2006,Manual,82704,Petrol,1.2,150.0,46.3,Volkswagen


## Encoding

Encoding is a technique used in machine learning to convert categorical data into numerical format. In this project, I've chosen three methods: one-hot encoding for nominal categories, robust scaler Using the robust scaler to standardize the scale of numerical data. One-hot encoding is suitable for categories without a specific order and a limited set of unique values. Here's a breakdown of how each encoding type is applied.

- Onehot: Transmission, FuelType
- Robust: Mileage, Mpg,
- Binary: Model, Brand

In [8]:
transform = ColumnTransformer([
    ('Scaler', RobustScaler(), ['mileage', 'mpg']),
    ('OHE', OneHotEncoder(drop='first'), ['transmission', 'fuelType']),
    ('Binary Encoder', ce.BinaryEncoder(), ['model', 'brand'])
],remainder = "passthrough")

transform

## Choose a Benchmark Model

In the initial phase, we will perform modeling on the 7 selected benchmark models. The results from these benchmark models, in terms of scoring, will be based on the chosen evaluation metrics: RMSE, MAE, and MAPE. The selection of these three metrics is to compare the model's performance by examining its residuals.

In [9]:
# Define the algorithm

lr = LinearRegression()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state= 2023)
rf = RandomForestRegressor(random_state= 2023)
ada = AdaBoostRegressor(random_state= 2023)
xgb = XGBRegressor(random_state= 2023)
gbr = GradientBoostingRegressor(random_state= 2023)

models = [lr, knn, dt, rf, ada, xgb, gbr]

score_rmse = []
nilai_mean_rmse = []
nilai_std_rmse = []

score_mae = []
nilai_mean_mae = []
nilai_std_mae = []

score_mape = []
nilai_mean_mape = []
nilai_std_mape = []

score_r2 = []
nilai_mean_r2 = []
nilai_std_r2 = []

# Finding the best algorithm based on each metrics

for i in models:
    
    crossval = KFold(n_splits=5, shuffle=True, random_state=2023)

    estimator = Pipeline([
        ('preprocessing', transform),
        ('model', i)
    ])

    # RMSE
    model_cv_rmse = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_root_mean_squared_error', 
        error_score='raise'
        )

    print(model_cv_rmse, i)

    score_rmse.append(model_cv_rmse)
    nilai_mean_rmse.append(model_cv_rmse.mean())
    nilai_std_rmse.append(model_cv_rmse.std())

    # MAE
    model_cv_mae = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_mean_absolute_error', 
        error_score='raise'
        )

    print(model_cv_mae, i)

    score_mae.append(model_cv_mae)
    nilai_mean_mae.append(model_cv_mae.mean())
    nilai_std_mae.append(model_cv_mae.std())

    # MAPE
    model_cv_mape = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_mean_absolute_percentage_error', 
        error_score='raise'
        )

    print(model_cv_mape, i)

    score_mape.append(model_cv_mape)
    nilai_mean_mape.append(model_cv_mape.mean())
    nilai_std_mape.append(model_cv_mape.std())
    
    model_cv_r2 = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='r2', 
        error_score='raise'
        )

    print(model_cv_mape, i)

    score_r2.append(model_cv_r2)
    nilai_mean_r2.append(model_cv_r2.mean())
    nilai_std_r2.append(model_cv_r2.std())

[-4669.78478666 -4609.99112746 -4404.87632642 -4430.32708416
 -4535.75902669] LinearRegression()
[-3052.88322127 -2982.10046798 -2949.77304173 -2992.45402196
 -2911.93409955] LinearRegression()
[-0.21654742 -0.21301334 -0.21935511 -0.22505293 -0.21643872] LinearRegression()
[-0.21654742 -0.21301334 -0.21935511 -0.22505293 -0.21643872] LinearRegression()
[-2345.4269617  -2563.3087754  -2276.10001604 -2239.77249231
 -2458.68886118] KNeighborsRegressor()
[-1385.46864561 -1402.57772393 -1372.88051926 -1364.08912159
 -1366.32461485] KNeighborsRegressor()
[-0.08176879 -0.08229922 -0.0816115  -0.08231486 -0.08134199] KNeighborsRegressor()
[-0.08176879 -0.08229922 -0.0816115  -0.08231486 -0.08134199] KNeighborsRegressor()
[-2655.09525978 -2535.85648812 -2741.72481131 -2478.76282237
 -2640.45512016] DecisionTreeRegressor(random_state=2023)
[-1515.75967557 -1516.61342637 -1546.24229981 -1500.68586012
 -1517.21372767] DecisionTreeRegressor(random_state=2023)
[-0.09100956 -0.09062355 -0.09318149 -

In [10]:
pd.DataFrame({
    'Model': ['Linear Regression', 'KNN Regressor', 'DecisionTree Regressor',
              'RandomForest Regressor', 'AdaBoost Regressor', 'XGBoost Regressor', 'GradientBoosting Regressor'],
    'Mean_RMSE': nilai_mean_rmse,
    'Std_RMSE': nilai_std_rmse,
    'Mean_MAE': nilai_mean_mae,
    'Std_MAE': nilai_std_mae,
    'Mean_MAPE': nilai_mean_mape,
    'Std_MAPE': nilai_std_mape,
    'Mean_R2' : nilai_mean_r2,
    'Std_R2' : nilai_std_r2
}).sort_values('Mean_MAPE',ascending = False)

Unnamed: 0,Model,Mean_RMSE,Std_RMSE,Mean_MAE,Std_MAE,Mean_MAPE,Std_MAPE,Mean_R2,Std_R2
3,RandomForest Regressor,-2021.742268,46.094226,-1231.086014,10.324458,-0.074334,0.000637,0.957586,0.002046
5,XGBoost Regressor,-1998.254259,60.462728,-1285.250135,14.029206,-0.078245,0.000934,0.958562,0.002402
1,KNN Regressor,-2376.659421,119.492679,-1378.268125,14.25147,-0.081867,0.000384,0.941349,0.00521
2,DecisionTree Regressor,-2610.3789,92.770967,-1519.302998,14.808631,-0.091415,0.000931,0.92928,0.004617
6,GradientBoosting Regressor,-3032.968413,77.43148,-2056.587405,70.731832,-0.127645,0.003986,0.904598,0.003691
0,Linear Regression,-4530.14767,101.550035,-2977.82897,46.892531,-0.218082,0.004024,0.787166,0.007255
4,AdaBoost Regressor,-6469.626345,356.736007,-5391.945931,427.175676,-0.467432,0.045799,0.563687,0.054279


In [11]:
# Benchmarking 2 Best Model

rf = RandomForestRegressor(random_state= 2023)
xgb = XGBRegressor(random_state= 2023)

models = [rf, xgb]

score_rmse = []
score_mae = []
score_mape = []
score_r2 = []

# Finding the best algorithm based on each metrics

for i in models:
    model = Pipeline([
        ('preprocessing', transform),
        ('model', i)
    ])

    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    score_rmse.append(np.sqrt(mean_squared_error(ytest, y_pred)))
    score_mae.append(mean_absolute_error(ytest, y_pred))
    score_mape.append(mean_absolute_percentage_error(ytest, y_pred))
    score_r2.append(r2_score(ytest, y_pred))
    
score_before_tuning = pd.DataFrame({'RMSE': score_rmse, 'MAE': score_mae, 'MAPE': score_mape, 'R2': score_r2}, index=['Random Forest', 'XGBoost'])
score_before_tuning

Unnamed: 0,RMSE,MAE,MAPE,R2
Random Forest,2127.335381,1194.770224,0.072294,0.955736
XGBoost,2024.792153,1264.56837,0.077615,0.959901


Random Forest Best Model

### Random Forest Hyperparameter Tuning

In [12]:
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest Regressor
rf = RandomForestRegressor(random_state=2023)

# Create a pipeline with data preprocessing and Random Forest model
pipe_rf = Pipeline([
    ('prep', transform),
    ('model', rf)
])

# Hyperparameter tuning with GridSearchCV
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    cv=5,
    scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'r2'],
    refit='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

In [13]:
grid_rf.fit(xtrain, ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [14]:
pd.DataFrame(grid_rf.cv_results_).sort_values(\
    by=['rank_test_neg_root_mean_squared_error','rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_absolute_percentage_error', 'rank_test_r2']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,param_model__n_estimators,params,split0_test_neg_root_mean_squared_error,...,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
59,62.836552,2.32783,0.72978,0.017646,20.0,1,5,200,"{'model__max_depth': 20, 'model__min_samples_l...",-1889.053538,...,0.001147,1,0.960673,0.953154,0.961205,0.960898,0.956603,0.958507,0.003161,1
58,31.870027,1.761755,0.426776,0.027584,20.0,1,5,100,"{'model__max_depth': 20, 'model__min_samples_l...",-1891.564083,...,0.001206,2,0.960569,0.953112,0.960926,0.960766,0.956488,0.958372,0.003108,2
5,65.869338,1.128739,0.995269,0.110579,,1,5,200,"{'model__max_depth': None, 'model__min_samples...",-1898.515635,...,0.001156,12,0.960278,0.952858,0.960864,0.960557,0.956314,0.958174,0.003132,3
86,57.79936,0.727828,0.781647,0.016043,30.0,1,5,200,"{'model__max_depth': 30, 'model__min_samples_l...",-1897.619078,...,0.001141,11,0.960316,0.952784,0.960867,0.960526,0.956323,0.958163,0.003158,4
85,28.54591,0.540613,0.406983,0.005816,30.0,1,5,100,"{'model__max_depth': 30, 'model__min_samples_l...",-1900.931897,...,0.001207,16,0.960177,0.952919,0.960677,0.960435,0.956264,0.958094,0.003054,5


In [15]:
print('Random Forest (by GridSearchCV')
print('Best_score:', grid_rf.best_score_)
print('Best_params:', grid_rf.best_params_)

Random Forest (by GridSearchCV
Best_score: -1998.1381262765485
Best_params: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}


In [16]:
# Model Random Forest
model = {'RF': RandomForestRegressor(random_state= 2023)}

# Define model terhadap estimator terbaik
rf_tuning = grid_rf.best_estimator_

# Fitting model
rf_tuning.fit(xtrain, ytrain)

# Predict test set
y_pred_rf_tuning = rf_tuning.predict(xtest)

# Simpan nilai metrics RMSE, MAE & MAPE setelah tuning
rmse_rf_tuning = np.sqrt(mean_squared_error(ytest, y_pred_rf_tuning))
mae_rf_tuning = mean_absolute_error(ytest, y_pred_rf_tuning)
mape_rf_tuning = mean_absolute_percentage_error(ytest, y_pred_rf_tuning)
r2_rf_tuning = r2_score(ytest, y_pred_rf_tuning)

score_after_tuning_rf = pd.DataFrame({'RMSE': rmse_rf_tuning, 'MAE': mae_rf_tuning, 'MAPE': mape_rf_tuning, 'R2': r2_rf_tuning}, index=model.keys())
score_after_tuning_rf

Unnamed: 0,RMSE,MAE,MAPE,R2
RF,2125.234434,1173.51856,0.070912,0.955824


### XGBOOST

In [17]:
param_grid_xgb = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 6, 9],  # XGBoost uses maximum depth instead of None
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}

# Create an XGBoost Regressor
xgb = XGBRegressor(random_state=2023)

# Create a pipeline with data preprocessing and XGBoost model
pipe_xgb = Pipeline([
    ('prep', transform),
    ('model', xgb)
])

# Hyperparameter tuning with GridSearchCV
grid_xgb = GridSearchCV(
    estimator=pipe_xgb,
    param_grid=param_grid_xgb,
    cv=5,
    scoring={
        'neg_root_mean_squared_error': 'neg_root_mean_squared_error',
        'neg_mean_absolute_error': 'neg_mean_absolute_error',
        'neg_mean_absolute_percentage_error': 'neg_mean_absolute_percentage_error',
        'r2': 'r2'
    },
    refit='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_xgb.fit(xtrain, ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [18]:
pd.DataFrame(grid_xgb.cv_results_).sort_values(\
    by=['rank_test_neg_root_mean_squared_error','rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_absolute_percentage_error', 'rank_test_r2']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__colsample_bytree,param_model__learning_rate,param_model__max_depth,param_model__n_estimators,param_model__subsample,params,...,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
34,5.100914,0.123128,0.344959,0.078126,0.8,0.1,9,200,0.8,"{'model__colsample_bytree': 0.8, 'model__learn...",...,0.001538,2,0.964138,0.964083,0.964055,0.966491,0.961629,0.964079,0.001538,1
35,3.941443,0.0473,0.300563,0.016063,0.8,0.1,9,200,1.0,"{'model__colsample_bytree': 0.8, 'model__learn...",...,0.001132,1,0.963914,0.963559,0.96345,0.965524,0.960692,0.963428,0.001558,2
88,5.306451,0.21867,0.336173,0.073466,1.0,0.1,9,200,0.8,"{'model__colsample_bytree': 1.0, 'model__learn...",...,0.001393,5,0.962668,0.962797,0.96303,0.965546,0.960613,0.962931,0.001569,3
32,3.074023,0.077661,0.229645,0.050869,0.8,0.1,9,100,0.8,"{'model__colsample_bytree': 0.8, 'model__learn...",...,0.001331,14,0.962385,0.963119,0.963114,0.965096,0.959837,0.96271,0.001696,4
50,2.957511,0.044185,0.187145,0.038547,0.8,0.2,9,100,0.8,"{'model__colsample_bytree': 0.8, 'model__learn...",...,0.001161,9,0.96092,0.964268,0.962615,0.965075,0.960589,0.962694,0.001774,5


In [19]:
print('XGBoost (by GridSearchCV')
print('Best_score:', grid_xgb.best_score_)
print('Best_params:', grid_xgb.best_params_)

XGBoost (by GridSearchCV
Best_score: -1859.8141773475468
Best_params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 9, 'model__n_estimators': 200, 'model__subsample': 0.8}


In [20]:
# Model Random Forest
model = {'XGB': XGBRegressor(random_state= 2023)}

# Define model terhadap estimator terbaik
xgb_tuning = grid_xgb.best_estimator_

# Fitting model
xgb_tuning.fit(xtrain, ytrain)

# Predict test set
y_pred_xgb_tuning = xgb_tuning.predict(xtest)

# Simpan nilai metrics RMSE, MAE & MAPE setelah tuning
rmse_xgb_tuning = np.sqrt(mean_squared_error(ytest, y_pred_xgb_tuning))
mae_xgb_tuning = mean_absolute_error(ytest, y_pred_xgb_tuning)
mape_xgb_tuning = mean_absolute_percentage_error(ytest, y_pred_xgb_tuning)
r2_xgb_tuning = r2_score(ytest, y_pred_xgb_tuning)

score_after_tuning_xgb = pd.DataFrame({'RMSE': rmse_xgb_tuning, 'MAE': mae_xgb_tuning, 'MAPE': mape_xgb_tuning, 'R2': r2_xgb_tuning}, index=model.keys())
score_after_tuning_xgb

Unnamed: 0,RMSE,MAE,MAPE,R2
XGB,1952.686709,1132.706045,0.068183,0.962706


### Decission Tree

In [21]:
param_grid_dt = {
    'model__max_depth': [None, 3, 6, 9],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create a Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=2023)

# Create a pipeline with data preprocessing and Decision Tree model
pipe_dt = Pipeline([
    ('prep', transform),
    ('model', dt)
])

# Hyperparameter tuning with GridSearchCV
grid_dt = GridSearchCV(
    estimator=pipe_dt,
    param_grid=param_grid_dt,
    cv=5,
    scoring={
        'neg_root_mean_squared_error': 'neg_root_mean_squared_error',
        'neg_mean_absolute_error': 'neg_mean_absolute_error',
        'neg_mean_absolute_percentage_error': 'neg_mean_absolute_percentage_error',
        'r2': 'r2'
    },
    refit='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

# Fit the model
grid_dt.fit(xtrain, ytrain)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [22]:
pd.DataFrame(grid_dt.cv_results_).sort_values(\
    by=['rank_test_neg_root_mean_squared_error','rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_absolute_percentage_error', 'rank_test_r2']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,...,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
8,0.636831,0.04325,0.050474,0.003622,,4,10,"{'model__max_depth': None, 'model__min_samples...",-2204.417256,-2608.285978,...,0.001286,2,0.946447,0.93022,0.94339,0.939323,0.934273,0.938731,0.005897,1
5,0.662724,0.017608,0.046384,0.008483,,2,10,"{'model__max_depth': None, 'model__min_samples...",-2235.593033,-2659.68871,...,0.00147,1,0.944921,0.927443,0.943973,0.938023,0.938692,0.93861,0.006223,2
6,0.650655,0.020804,0.049079,0.011867,,4,2,"{'model__max_depth': None, 'model__min_samples...",-2219.324991,-2624.092965,...,0.00136,3,0.94572,0.929372,0.944279,0.939184,0.933353,0.938382,0.006257,3
7,0.659735,0.041141,0.050171,0.008879,,4,5,"{'model__max_depth': None, 'model__min_samples...",-2219.324991,-2624.092965,...,0.00136,3,0.94572,0.929372,0.944279,0.939184,0.933353,0.938382,0.006257,3
2,0.651123,0.02266,0.045482,0.006252,,1,10,"{'model__max_depth': None, 'model__min_samples...",-2247.667997,-2589.884946,...,0.00151,5,0.944325,0.931201,0.941305,0.931338,0.939192,0.937472,0.005321,5


In [23]:
print('Decission Tree (by GridSearchCV')
print('Best_score:', grid_dt.best_score_)
print('Best_params:', grid_dt.best_params_)

Decission Tree (by GridSearchCV
Best_score: -2427.5414952744377
Best_params: {'model__max_depth': None, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10}


In [24]:
# Model Random Forest
model = {'DT': DecisionTreeRegressor(random_state= 2023)}

# Define model terhadap estimator terbaik
dt_tuning = grid_dt.best_estimator_

# Fitting model
dt_tuning.fit(xtrain, ytrain)

# Predict test set
y_pred_dt_tuning = dt_tuning.predict(xtest)

# Simpan nilai metrics RMSE, MAE & MAPE setelah tuning
rmse_dt_tuning = np.sqrt(mean_squared_error(ytest, y_pred_dt_tuning))
mae_dt_tuning = mean_absolute_error(ytest, y_pred_dt_tuning)
mape_dt_tuning = mean_absolute_percentage_error(ytest, y_pred_dt_tuning)
r2_dt_tuning = r2_score(ytest, y_pred_dt_tuning)

score_after_tuning_dt = pd.DataFrame({'RMSE': rmse_dt_tuning, 'MAE': mae_dt_tuning, 'MAPE': mape_dt_tuning, 'R2': r2_dt_tuning}, index=model.keys())
score_after_tuning_dt

Unnamed: 0,RMSE,MAE,MAPE,R2
DT,2639.085108,1376.203049,0.082476,0.931878


XGBOOST BEST MODEL 