# **Data Modeling**

In [1]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, KFold

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
import category_encoders as ce

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error


## Data Spliting

In [2]:
cars = pd.read_csv("1. Combined Dataset.csv")

In [3]:
cars

Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,0,SLK,2005,5200,Automatic,63000,Petrol,325,32.1,1.8,Mercedes
1,1,S Class,2017,34948,Automatic,27000,Hybrid,20,61.4,2.1,Mercedes
2,2,SL CLASS,2016,49948,Automatic,6200,Petrol,555,28.0,5.5,Mercedes
3,3,G Class,2016,61948,Automatic,16000,Petrol,325,30.4,4.0,Mercedes
4,4,G Class,2016,73948,Automatic,4000,Petrol,325,30.1,4.0,Mercedes
...,...,...,...,...,...,...,...,...,...,...,...
75165,76156,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,Volkswagen
75166,76157,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,Volkswagen
75167,76158,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,Volkswagen
75168,76159,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,Volkswagen


In [4]:
X = cars.drop(['price'], axis = 1)
y = cars['price']

# Data Splitting dengan proporsi test size 80:20
xtrain, xtest, ytrain, ytest = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state= 2023)

## Encoding

Encoding is a technique used in machine learning to convert categorical data into numerical format. In this project, I've chosen three methods: one-hot encoding for nominal categories, robust scaler Using the robust scaler to standardize the scale of numerical data. One-hot encoding is suitable for categories without a specific order and a limited set of unique values. Here's a breakdown of how each encoding type is applied.

- Onehot: Transmission, FuelType
- Robust: Mileage, Mpg,
- Binary: Model, Brand

In [5]:
transform = ColumnTransformer([
    ('Scaler', RobustScaler(), ['mileage', 'mpg']),
    ('OHE', OneHotEncoder(drop='first'), ['transmission', 'fuelType']),
    ('Binary Encoder', ce.BinaryEncoder(), ['model', 'brand'])
],remainder = "passthrough")

transform

## Choose a Benchmark Model

In the initial phase, we will perform modeling on the 7 selected benchmark models. The results from these benchmark models, in terms of scoring, will be based on the chosen evaluation metrics: RMSE, MAE, and MAPE. The selection of these three metrics is to compare the model's performance by examining its residuals.

In [6]:
# Define the algorithm

lr = LinearRegression()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state= 2023)
rf = RandomForestRegressor(random_state= 2023)
ada = AdaBoostRegressor(random_state= 2023)
xgb = XGBRegressor(random_state= 2023)
gbr = GradientBoostingRegressor(random_state= 2023)

models = [lr, knn, dt, rf, ada, xgb, gbr]

score_rmse = []
nilai_mean_rmse = []
nilai_std_rmse = []

score_mae = []
nilai_mean_mae = []
nilai_std_mae = []

score_mape = []
nilai_mean_mape = []
nilai_std_mape = []

score_r2 = []
nilai_mean_r2 = []
nilai_std_r2 = []

# Finding the best algorithm based on each metrics

for i in models:
    
    crossval = KFold(n_splits=5, shuffle=True, random_state=2023)

    estimator = Pipeline([
        ('preprocessing', transform),
        ('model', i)
    ])

    # RMSE
    model_cv_rmse = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_root_mean_squared_error', 
        error_score='raise'
        )

    print(model_cv_rmse, i)

    score_rmse.append(model_cv_rmse)
    nilai_mean_rmse.append(model_cv_rmse.mean())
    nilai_std_rmse.append(model_cv_rmse.std())

    # MAE
    model_cv_mae = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_mean_absolute_error', 
        error_score='raise'
        )

    print(model_cv_mae, i)

    score_mae.append(model_cv_mae)
    nilai_mean_mae.append(model_cv_mae.mean())
    nilai_std_mae.append(model_cv_mae.std())

    # MAPE
    model_cv_mape = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='neg_mean_absolute_percentage_error', 
        error_score='raise'
        )

    print(model_cv_mape, i)

    score_mape.append(model_cv_mape)
    nilai_mean_mape.append(model_cv_mape.mean())
    nilai_std_mape.append(model_cv_mape.std())
    
    model_cv_r2 = cross_val_score(
        estimator, 
        xtrain, 
        ytrain, 
        cv=crossval, 
        scoring='r2', 
        error_score='raise'
        )

    print(model_cv_mape, i)

    score_r2.append(model_cv_r2)
    nilai_mean_r2.append(model_cv_r2.mean())
    nilai_std_r2.append(model_cv_r2.std())

[-4722.59195669 -4430.98769878 -4627.43512669 -4398.45756075
 -4687.66350878] LinearRegression()
[-2884.65312138 -2847.16862896 -2915.95511229 -2836.85541484
 -2921.24764019] LinearRegression()
[-0.21574798 -0.20937981 -0.21226257 -0.20371318 -0.21407117] LinearRegression()
[-0.21574798 -0.20937981 -0.21226257 -0.20371318 -0.21407117] LinearRegression()
[-7276.36817664 -6847.2696906  -7041.62539042 -6667.24241649
 -7089.19845128] KNeighborsRegressor()
[-4447.1269704  -4325.5728278  -4404.8821485  -4254.41187329
 -4382.66683296] KNeighborsRegressor()
[-0.28693982 -0.28313608 -0.28087833 -0.27347919 -0.28310094] KNeighborsRegressor()
[-0.28693982 -0.28313608 -0.28087833 -0.27347919 -0.28310094] KNeighborsRegressor()
[-3172.77430258 -2754.72453428 -2649.44360337 -2672.38871569
 -2561.37349506] DecisionTreeRegressor(random_state=2023)
[-1587.6462421  -1561.07133949 -1575.22915108 -1562.75214102
 -1542.83304232] DecisionTreeRegressor(random_state=2023)
[-0.09431743 -0.09311897 -0.09472817 -

In [7]:
pd.DataFrame({
    'Model': ['Linear Regression', 'KNN Regressor', 'DecisionTree Regressor',
              'RandomForest Regressor', 'AdaBoost Regressor', 'XGBoost Regressor', 'GradientBoosting Regressor'],
    'Mean_RMSE': nilai_mean_rmse,
    'Std_RMSE': nilai_std_rmse,
    'Mean_MAE': nilai_mean_mae,
    'Std_MAE': nilai_std_mae,
    'Mean_MAPE': nilai_mean_mape,
    'Std_MAPE': nilai_std_mape,
    'Mean_R2' : nilai_mean_r2,
    'Std_R2' : nilai_std_r2
}).sort_values('Mean_MAPE',ascending = False)

Unnamed: 0,Model,Mean_RMSE,Std_RMSE,Mean_MAE,Std_MAE,Mean_MAPE,Std_MAPE,Mean_R2,Std_R2
3,RandomForest Regressor,-2034.370148,77.377968,-1205.897074,16.853813,-0.072074,0.001124,0.956967,0.003909
5,XGBoost Regressor,-2035.152529,99.612979,-1290.292198,18.927505,-0.078566,0.001048,0.95685,0.005121
2,DecisionTree Regressor,-2762.14093,214.354137,-1565.906383,15.001609,-0.093323,0.001106,0.92066,0.010746
6,GradientBoosting Regressor,-2997.353259,20.854404,-1995.20503,28.595274,-0.122923,0.001717,0.906811,0.002481
0,Linear Regression,-4573.42717,133.50678,-2881.175984,34.491242,-0.211035,0.004226,0.783165,0.007581
1,KNN Regressor,-6984.340825,209.235042,-4362.932131,66.948334,-0.281507,0.004462,0.494363,0.016196
4,AdaBoost Regressor,-7510.461249,646.901744,-6522.477025,681.642149,-0.549553,0.053805,0.411623,0.092515


In [8]:
# Benchmarking 2 Best Model

rf = RandomForestRegressor(random_state= 2023)
xgb = XGBRegressor(random_state= 2023)

models = [rf, xgb]

score_rmse = []
score_mae = []
score_mape = []
score_r2 = []

# Finding the best algorithm based on each metrics

for i in models:
    model = Pipeline([
        ('preprocessing', transform),
        ('model', i)
    ])

    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    score_rmse.append(np.sqrt(mean_squared_error(ytest, y_pred)))
    score_mae.append(mean_absolute_error(ytest, y_pred))
    score_mape.append(mean_absolute_percentage_error(ytest, y_pred))
    score_r2.append(r2_score(ytest, y_pred))
    
score_before_tuning = pd.DataFrame({'RMSE': score_rmse, 'MAE': score_mae, 'MAPE': score_mape, 'R2': score_r2}, index=['Random Forest', 'XGBoost'])
score_before_tuning

Unnamed: 0,RMSE,MAE,MAPE,R2
Random Forest,1806.894499,1166.452441,0.070427,0.965383
XGBoost,1888.900552,1269.234766,0.077125,0.962169


Random Forest Best Model

### Random Forest Hyperparameter Tuning

In [9]:
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest Regressor
rf = RandomForestRegressor(random_state=2023)

# Create a pipeline with data preprocessing and Random Forest model
pipe_rf = Pipeline([
    ('prep', transform),
    ('model', rf)
])

# Hyperparameter tuning with GridSearchCV
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    cv=5,
    scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'r2'],
    refit='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

In [10]:
grid_rf.fit(xtrain, ytrain)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [11]:
pd.DataFrame(grid_rf.cv_results_).sort_values(\
    by=['rank_test_neg_root_mean_squared_error','rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_absolute_percentage_error', 'rank_test_r2']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,param_model__n_estimators,params,split0_test_neg_root_mean_squared_error,...,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
56,92.546472,2.593522,0.897402,0.024457,20.0,1,2,200,"{'model__max_depth': 20, 'model__min_samples_l...",-2005.321614,...,0.000743,1,0.9594,0.95388,0.954177,0.956167,0.962275,0.95718,0.003219,1
55,47.036695,0.247258,0.474332,0.018951,20.0,1,2,100,"{'model__max_depth': 20, 'model__min_samples_l...",-2013.827149,...,0.000668,2,0.959055,0.954097,0.954336,0.956262,0.961753,0.9571,0.002927,2
83,99.842374,2.382117,1.061563,0.018526,30.0,1,2,200,"{'model__max_depth': 30, 'model__min_samples_l...",-2003.207471,...,0.000722,7,0.959486,0.953663,0.953996,0.955906,0.962034,0.957017,0.003252,3
59,84.856224,0.912061,0.744211,0.021699,20.0,1,5,200,"{'model__max_depth': 20, 'model__min_samples_l...",-2018.157028,...,0.000727,3,0.958879,0.953754,0.95424,0.956012,0.962167,0.95701,0.003142,4
2,112.269154,10.788045,1.399366,0.211095,,1,2,200,"{'model__max_depth': None, 'model__min_samples...",-2005.532795,...,0.000728,8,0.959392,0.953402,0.954077,0.956223,0.961935,0.957006,0.00323,5


In [12]:
print('Random Forest (by GridSearchCV')
print('Best_score:', grid_rf.best_score_)
print('Best_params:', grid_rf.best_params_)

Random Forest (by GridSearchCV
Best_score: -2029.8375423411821
Best_params: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}


In [13]:
# Model Random Forest
model = {'RF': RandomForestRegressor(random_state= 2023)}

# Define model terhadap estimator terbaik
rf_tuning = grid_rf.best_estimator_

# Fitting model
rf_tuning.fit(xtrain, ytrain)

# Predict test set
y_pred_rf_tuning = rf_tuning.predict(xtest)

# Simpan nilai metrics RMSE, MAE & MAPE setelah tuning
rmse_rf_tuning = np.sqrt(mean_squared_error(ytest, y_pred_rf_tuning))
mae_rf_tuning = mean_absolute_error(ytest, y_pred_rf_tuning)
mape_rf_tuning = mean_absolute_percentage_error(ytest, y_pred_rf_tuning)
r2_rf_tuning = r2_score(ytest, y_pred_rf_tuning)

score_after_tuning_rf = pd.DataFrame({'RMSE': rmse_rf_tuning, 'MAE': mae_rf_tuning, 'MAPE': mape_rf_tuning, 'R2': r2_rf_tuning}, index=model.keys())
score_after_tuning_rf

Unnamed: 0,RMSE,MAE,MAPE,R2
RF,1800.454333,1158.646305,0.069834,0.965629


RANDOM FOREST BEST MODEL 