# 👋 Welcome to Ngoding "Hyperparameter Tuning"

Notebook ini dirancang untuk mengoptimalkan performance dari Machine Learning Model

---

## 📚 Apa yang Akan Anda Pelajari?

Dalam pelatihan ini, Anda akan mempelajari:

- ✅ Implementasi Model yang Paling Bagus (Sebelum Tuning)
- ✅ Memahami Optimasi ML Model dengan GridSearchCV
- ✅ Implementasi Model yang sudah dioptimasi + Menyimpan untuk Deployment

In [3]:
# !pip install -r "/content/drive/MyDrive/Ari Folders/Data_Ari/Data_Science/Digital Skola after Fazz/Corporate Training/Astra Honda Motor/For Ari/requirements - Colab.txt"

In [15]:
# import library
import pandas as pd
import numpy as np
import os
import glob

# import visualization library
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# import machine learning model
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Regression Model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Classification Model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.model_selection import GridSearchCV

import pickle

#import evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, accuracy_score, confusion_matrix

import sys
sys.path.append('/content/drive/MyDrive/Ari Folders/Data_Ari/Data_Science/Digital Skola after Fazz/Corporate Training/Astra Honda Motor/For Ari')
# import collection_function as cus_viz

import warnings
warnings.filterwarnings('ignore')

## Regression Model

In [5]:
# # Upload file from Google Drive

# from google.colab import files

# uploaded = files.upload()

In [6]:
# data = pd.read_csv("insurance.csv")

# display(data.info())
# data.head()

In [7]:
# Load the USA cars dataset directly from GitHub.
data = pd.read_csv("https://raw.githubusercontent.com/densaiko/data_science_learning/refs/heads/main/dataset/USA_cars_datasets%20.csv")

# DataFrame.info() prints its summary and returns None, so wrapping it in
# display() only appends a stray "None" to the cell output.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         2499 non-null   int64 
 1   price         2499 non-null   int64 
 2   brand         2499 non-null   object
 3   model         2499 non-null   object
 4   year          2499 non-null   int64 
 5   title_status  2499 non-null   object
 6   mileage       2499 non-null   int64 
 7   color         2499 non-null   object
 8   vin           2499 non-null   object
 9   lot           2499 non-null   int64 
 10  state         2499 non-null   object
 11  country       2499 non-null   object
 12  condition     2499 non-null   object
dtypes: int64(5), object(8)
memory usage: 253.9+ KB


None

Unnamed: 0,index,price,brand,model,year,title_status,mileage,color,vin,lot,state,country,condition
0,0,6300,toyota,cruiser,2008,clean vehicle,274117,black,jtezu11f88k007763,159348797,new jersey,usa,10 days left
1,1,2899,ford,se,2011,clean vehicle,190552,silver,2fmdk3gc4bbb02217,166951262,tennessee,usa,6 days left
2,2,5350,dodge,mpv,2018,clean vehicle,39590,silver,3c4pdcgg5jt346413,167655728,georgia,usa,2 days left
3,3,25000,ford,door,2014,clean vehicle,64146,blue,1ftfw1et4efc23745,167753855,virginia,usa,22 hours left
4,4,27700,chevrolet,1500,2018,clean vehicle,6654,red,3gcpcrec2jg473991,167763266,florida,usa,22 hours left


### Data Pre-processing

In [8]:
# Drop the 'index', 'vin' and 'lot' columns; every other column is kept.
data_new = data.drop(['index', 'vin', 'lot'], axis=1)

In [9]:
# Integer-encode each categorical column with its own LabelEncoder.
# The codes go to a sibling "<column>_encoded" column (the raw text column is
# kept), and every fitted encoder is stored so new rows can be encoded the
# same way later.
categorical_cols = ['brand', 'model', 'title_status', 'color', 'state', 'country', 'condition']
encoders = {}

for col in categorical_cols:
    encoded_col = col + "_encoded"
    le = LabelEncoder().fit(data_new[col])
    data_new[encoded_col] = le.transform(data_new[col])
    encoders[col] = le  # keep the fitted encoder for inference

data_new

Unnamed: 0,price,brand,model,year,title_status,mileage,color,state,country,condition,brand_encoded,model_encoded,title_status_encoded,color_encoded,state_encoded,country_encoded,condition_encoded
0,6300,toyota,cruiser,2008,clean vehicle,274117,black,new jersey,usa,10 days left,27,25,0,2,24,1,3
1,2899,ford,se,2011,clean vehicle,190552,silver,tennessee,usa,6 days left,8,92,0,39,35,1,39
2,5350,dodge,mpv,2018,clean vehicle,39590,silver,georgia,usa,2 days left,7,75,0,39,7,1,16
3,25000,ford,door,2014,clean vehicle,64146,blue,virginia,usa,22 hours left,8,32,0,4,39,1,20
4,27700,chevrolet,1500,2018,clean vehicle,6654,red,florida,usa,22 hours left,5,0,0,34,6,1,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,7800,nissan,versa,2019,clean vehicle,23609,red,california,usa,1 days left,24,120,0,34,3,1,0
2495,9200,nissan,versa,2018,clean vehicle,34553,silver,florida,usa,21 hours left,24,120,0,39,6,1,19
2496,9200,nissan,versa,2018,clean vehicle,31594,silver,florida,usa,21 hours left,24,120,0,39,6,1,19
2497,9200,nissan,versa,2018,clean vehicle,32557,black,florida,usa,2 days left,24,120,0,2,6,1,16


In [10]:
def show_encoding_mappings(df, encoded_columns):
    """Print the original-value -> integer-code table for each encoded column.

    Args:
        df: DataFrame holding both the raw columns and their "*_encoded"
            counterparts.
        encoded_columns: Names of the encoded columns to report. The raw
            column name is obtained by removing '_encoded' from each name;
            entries whose raw column is not in ``df`` are skipped silently.

    Returns:
        None. The tables are written to stdout, sorted by code.
    """
    for encoded_name in encoded_columns:
        raw_name = encoded_name.replace('_encoded', '')
        if raw_name not in df.columns:
            continue

        print(f"\nMapping for '{raw_name}' ➜ '{encoded_name}':")
        pairs = df[[raw_name, encoded_name]]
        lookup = pairs.drop_duplicates().sort_values(encoded_name)
        print(lookup.to_string(index=False))

In [11]:
# Encoded columns whose value -> code tables should be printed.
encoded_columns = [
    f"{name}_encoded"
    for name in ('brand', 'model', 'title_status', 'color',
                 'state', 'country', 'condition')
]

show_encoding_mappings(data_new, encoded_columns)



Mapping for 'brand' ➜ 'brand_encoded':
          brand  brand_encoded
          acura              0
           audi              1
            bmw              2
          buick              3
       cadillac              4
      chevrolet              5
       chrysler              6
          dodge              7
           ford              8
            gmc              9
harley-davidson             10
      heartland             11
          honda             12
        hyundai             13
       infiniti             14
         jaguar             15
           jeep             16
            kia             17
           land             18
          lexus             19
        lincoln             20
       maserati             21
          mazda             22
  mercedes-benz             23
         nissan             24
      peterbilt             25
            ram             26
         toyota             27

Mapping for 'model' ➜ 'model_encoded':
      model  model_en

In [12]:
# Modelling table: the target ('price') plus the numeric and encoded features.
# 'country_encoded' is not part of this list.
cols = [
    'price', 'year', 'mileage',
    'brand_encoded', 'model_encoded', 'title_status_encoded',
    'color_encoded', 'state_encoded', 'condition_encoded',
]
data_ml = data_new.loc[:, cols]
data_ml.head()

Unnamed: 0,price,year,mileage,brand_encoded,model_encoded,title_status_encoded,color_encoded,state_encoded,condition_encoded
0,6300,2008,274117,27,25,0,2,24,3
1,2899,2011,190552,8,92,0,39,35,39
2,5350,2018,39590,7,75,0,39,7,16
3,25000,2014,64146,8,32,0,4,39,20
4,27700,2018,6654,5,0,0,34,6,20


In [13]:
# Features (independent variables) are everything except the target.
X = data_ml.drop("price", axis=1)
# Target (dependent variable).
y = data_ml.loc[:, "price"]

# Hold out 20% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

### Modelling & Evaluation

In [14]:
# Baseline (untuned) model. The variable is called `lr`, but the estimator is
# a GradientBoostingRegressor with default hyperparameters; the name is kept
# because the prediction cell further down calls `lr.predict(...)`.
lr = GradientBoostingRegressor()
lr.fit(X_train, y_train)

# Evaluation
y_predict_train = lr.predict(X_train)
y_predict_test = lr.predict(X_test)

# Label the report with the estimator that was actually fitted (it used to
# say "Linear Regression").
print("Model: Gradient Boosting Regressor (before tuning)")
print("MAE: {:.2f}".format(mean_absolute_error(y_test, y_predict_test)))
print("MSE: {:.2f}".format(mean_squared_error(y_test, y_predict_test)))
print("R2 Score: {:.2f}".format(r2_score(y_test, y_predict_test)))

Model: Linear Regression
MAE: 5296.07
MSE: 64472886.77
R2 Score: 0.54


### Hyperparameter Tuning

In [16]:
# # Model
# gbr = GradientBoostingRegressor()

# # Define the param grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 3],
#     'subsample': [0.8, 1.0],
#     'max_features': [None, 'sqrt', 'log2']  # 'auto' was removed in scikit-learn 1.3; None (use all features) is its equivalent
# }

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid,
#                            cv=5, scoring='neg_mean_squared_error',
#                            n_jobs=-1, verbose=2)

# # Fit on training data
# grid_search.fit(X_train, y_train)

# # Get the best model
# best_model = grid_search.best_estimator_

# # Evaluate
# y_predict_test = best_model.predict(X_test)
# print("Model: Tuned Gradient Boosting")
# print("Best Params:", grid_search.best_params_)
# print("MAE: {:.2f}".format(mean_absolute_error(y_test, y_predict_test)))
# print("MSE: {:.2f}".format(mean_squared_error(y_test, y_predict_test)))
# print("R2 Score: {:.2f}".format(r2_score(y_test, y_predict_test)))

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Model: Tuned Gradient Boosting
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'subsample': 1.0}
MAE: 4194.17
MSE: 47648240.86
R2 Score: 0.66


### Predict New Data for Regression Model

In [18]:
# To save time, reuse the best hyperparameters reported by the grid search
# instead of running the search again.
params = dict(
    learning_rate=0.1,
    max_depth=5,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=300,
    subsample=1.0,
)

gbr = GradientBoostingRegressor(**params)

# Train the tuned model on the training split.
gbr.fit(X_train, y_train)

# Score it on the held-out test split.
y_predict_test = gbr.predict(X_test)
print("Model: Tuned Gradient Boosting")
print(f"MAE: {mean_absolute_error(y_test, y_predict_test):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_predict_test):.2f}")
print(f"R2 Score: {r2_score(y_test, y_predict_test):.2f}")

Model: Tuned Gradient Boosting
MAE: 4123.19
MSE: 45858188.51
R2 Score: 0.68


In [43]:
# A new record without a price, to be scored by the models.
new_data = pd.DataFrame([dict(
    year=2018,
    mileage=6654,
    brand="chevrolet",
    model="1500",
    title_status="clean vehicle",
    color="red",
    state="florida",
    condition="22 hours left",
)])

# Show the new record.
new_data


Unnamed: 0,year,mileage,brand,model,title_status,color,state,condition
0,2018,6654,chevrolet,1500,clean vehicle,red,florida,22 hours left


In [44]:
# new_data holds raw text such as "chevrolet" or "1500"; convert it with the
# encoders fitted on the training data so the codes match what the models saw.
categorical_cols = ['brand', 'model', 'title_status', 'color', 'state', 'condition']

for col in categorical_cols:
    le = encoders[col]  # encoder fitted earlier for this column
    encoded_col = col + "_encoded"
    new_data[encoded_col] = le.transform(new_data[col])

new_data.head()

Unnamed: 0,year,mileage,brand,model,title_status,color,state,condition,brand_encoded,model_encoded,title_status_encoded,color_encoded,state_encoded,condition_encoded
0,2018,6654,chevrolet,1500,clean vehicle,red,florida,22 hours left,5,0,0,34,6,20


In [48]:
# Score the same encoded feature columns with the baseline (`lr`) and the
# tuned (`gbr`) model.
prediction_old_model = lr.predict(
    new_data[['year', 'mileage', 'brand_encoded', 'model_encoded',
              'title_status_encoded', 'color_encoded', 'state_encoded',
              'condition_encoded']]
)

prediction_new_model = gbr.predict(
    new_data[['year', 'mileage', 'brand_encoded', 'model_encoded',
              'title_status_encoded', 'color_encoded', 'state_encoded',
              'condition_encoded']]
)

print(f"Hasil Prediksi {prediction_old_model[0]:.2f} with Old Model")
print(f"Hasil Prediksi {prediction_new_model[0]:.2f} with New Model")
# The price stored at position 4 of the original dataset is printed as the
# reference value.
print(f"Actual Price {data['price'].iloc[4].item()}")

Hasil Prediksi 30170.14 with Old Model
Hasil Prediksi 27084.74 with New Model
Actual Price 27700


## Classification Model

In [49]:
# Load the Human Capital (promotion) dataset directly from GitHub.
data_clf = pd.read_csv("https://raw.githubusercontent.com/densaiko/data_science_learning/main/dataset/Human%20Capital.csv")
# DataFrame.info() prints its summary and returns None, so wrapping it in
# display() only appends a stray "None" to the cell output.
data_clf.info()
data_clf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  awards_won            54808 non-null  int64  
 11  avg_training_score    52248 non-null  float64
 12  is_promoted           54808 non-null  int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 5.4+ MB


None

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50.0,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73.0,0


### Data Pre-processing

In [50]:
# Remove every row that has at least one missing value.
data_clf = data_clf.dropna(axis=0, how='any')

# Drop the 'employee_id' column; every other column is kept.
data_clf_new = data_clf.drop('employee_id', axis=1)

# Every object-dtype column is treated as categorical.
categorical_cols = list(data_clf_new.select_dtypes(include='object').columns)
# NOTE: this rebinds `encoders`, replacing the dictionary built earlier for
# the cars dataset.
encoders = {}

for col in categorical_cols:
    encoded_col = col + "_encoded"
    le = LabelEncoder().fit(data_clf_new[col])
    data_clf_new[encoded_col] = le.transform(data_clf_new[col])
    encoders[col] = le  # keep the fitted encoder for inference

data_clf_new

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted,department_encoded,region_encoded,education_encoded,gender_encoded,recruitment_channel_encoded
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0,7,31,2,0,2
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0,4,14,0,1,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0,7,10,0,1,2
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50.0,0,7,15,0,1,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73.0,0,8,18,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54802,Sales & Marketing,region_14,Bachelor's,m,other,2,31,1.0,2,0,49.0,0,7,5,0,1,0
54803,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,78.0,0,8,5,0,1,2
54804,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,56.0,0,4,19,2,0,0
54805,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,0,79.0,0,0,0,0,1,0


In [51]:
def show_encoding_mappings(df, encoded_columns):
    """Print, per encoded column, which raw value maps to which integer code.

    Args:
        df: DataFrame containing the raw columns next to their "*_encoded"
            versions.
        encoded_columns: Encoded column names to report. Removing '_encoded'
            from a name gives the raw column; if that column is absent from
            ``df`` the entry is ignored.

    Returns:
        None. Output goes to stdout, one table per column, ordered by code.
    """
    available = df.columns
    for code_col in encoded_columns:
        text_col = code_col.replace('_encoded', '')
        if text_col in available:
            header = f"\nMapping for '{text_col}' ➜ '{code_col}':"
            print(header)
            distinct = df[[text_col, code_col]].drop_duplicates()
            print(distinct.sort_values(code_col).to_string(index=False))

In [52]:
# Encoded columns whose value -> code tables should be printed.
encoded_columns = [
    f"{name}_encoded"
    for name in ("department", "region", "education", "gender", "recruitment_channel")
]

show_encoding_mappings(data_clf_new, encoded_columns)


Mapping for 'department' ➜ 'department_encoded':
       department  department_encoded
        Analytics                   0
          Finance                   1
               HR                   2
            Legal                   3
       Operations                   4
      Procurement                   5
              R&D                   6
Sales & Marketing                   7
       Technology                   8

Mapping for 'region' ➜ 'region_encoded':
   region  region_encoded
 region_1               0
region_10               1
region_11               2
region_12               3
region_13               4
region_14               5
region_15               6
region_16               7
region_17               8
region_18               9
region_19              10
 region_2              11
region_20              12
region_21              13
region_22              14
region_23              15
region_24              16
region_25              17
region_26              18
region_2

In [53]:
# Modelling table: encoded categoricals, numeric features, and the target
# ('is_promoted') as the last column.
cols = [
    "department_encoded", "region_encoded", "education_encoded",
    "gender_encoded", "recruitment_channel_encoded",
    "no_of_trainings", 'age', "previous_year_rating",
    "length_of_service", "awards_won", "avg_training_score",
    "is_promoted",
]

data_ml = data_clf_new.loc[:, cols]
data_ml.head()

Unnamed: 0,department_encoded,region_encoded,education_encoded,gender_encoded,recruitment_channel_encoded,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,7,31,2,0,2,1,35,5.0,8,0,49.0,0
1,4,14,0,1,0,1,30,5.0,4,0,60.0,0
2,7,10,0,1,2,1,34,3.0,7,0,50.0,0
3,7,15,0,1,0,2,39,1.0,10,0,50.0,0
4,8,18,0,1,0,1,45,3.0,2,0,73.0,0


In [54]:
# Features (independent variables) are everything except the target.
X = data_ml.drop("is_promoted", axis=1)
# Target (dependent variable).
y = data_ml.loc[:, "is_promoted"]

# Hold out 20% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

### Modelling & Evaluation

In [55]:
# Baseline classifier: logistic regression with default settings.
legReg = LogisticRegression()
legReg.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_predict_train = legReg.predict(X_train)
y_predict_test = legReg.predict(X_test)

print("Model: Logistic Regression")
print(f"Training Accuracy: {accuracy_score(y_train, y_predict_train):.2f}")
print(f"Testing Accuracy: {accuracy_score(y_test, y_predict_test):.2f}")
print(classification_report(y_test, y_predict_test))

Model: Logistic Regression
Training Accuracy: 0.91
Testing Accuracy: 0.92
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      8504
           1       0.61      0.06      0.12       772

    accuracy                           0.92      9276
   macro avg       0.77      0.53      0.54      9276
weighted avg       0.90      0.92      0.89      9276



### Hyperparameter Tuning

In [57]:
# # Define model
# xgb = XGBClassifier()

# # Define parameter grid
# param_grid = {
#     'n_estimators': [100, 200],
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [3, 5],
#     'min_child_weight': [1, 3],
#     'subsample': [0.8, 1],
#     'colsample_bytree': [0.8, 1]
# }

# # Grid search with 5-fold CV
# grid_search = GridSearchCV(estimator=xgb,
#                            param_grid=param_grid,
#                            scoring='accuracy',
#                            cv=5,
#                            n_jobs=-1,
#                            verbose=2)

# # Fit to training data
# grid_search.fit(X_train, y_train)

# # Best model
# best_model = grid_search.best_estimator_

# # Predict and evaluate
# y_predict_test = best_model.predict(X_test)
# y_predict_train = best_model.predict(X_train)

# print("Model: Tuned XGB Classifier")
# print("Best Params:", grid_search.best_params_)
# print("Training Accuracy: {:.2f}".format(accuracy_score(y_train, y_predict_train)))
# print("Testing Accuracy: {:.2f}".format(accuracy_score(y_test, y_predict_test)))
# print(classification_report(y_test, y_predict_test))

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Model: Tuned XGB Classifier
Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}
Training Accuracy: 0.94
Testing Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8504
           1       0.96      0.36      0.52       772

    accuracy                           0.95      9276
   macro avg       0.95      0.68      0.75      9276
weighted avg       0.95      0.95      0.93      9276



### Predict New Data for Classification Model

In [59]:
# To save time, reuse the best hyperparameters reported by the grid search
# instead of running the search again.
params = {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5,
          'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1}

xgb_best = XGBClassifier(**params)

# Fit on training data
xgb_best.fit(X_train, y_train)

# Evaluate. Both prediction vectors must come from xgb_best. If
# y_predict_train is not recomputed here, the "Training Accuracy" line scores
# whichever model last assigned that variable: the logistic-regression
# baseline, as long as the grid-search cell stays commented out.
y_predict_train = xgb_best.predict(X_train)
y_predict_test = xgb_best.predict(X_test)
print("Model: Tuned XGB Classifier")
print("Training Accuracy: {:.2f}".format(accuracy_score(y_train, y_predict_train)))
print("Testing Accuracy: {:.2f}".format(accuracy_score(y_test, y_predict_test)))
print(classification_report(y_test, y_predict_test))

Model: Tuned XGB Classifier
Training Accuracy: 0.94
Testing Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8504
           1       0.96      0.36      0.52       772

    accuracy                           0.95      9276
   macro avg       0.95      0.68      0.75      9276
weighted avg       0.95      0.95      0.93      9276



In [60]:
# A new employee record without an 'is_promoted' label, to be scored by the
# models.
new_data = pd.DataFrame([dict(
    department="Sales & Marketing",
    region="region_7",
    education="Master's & above",
    gender="f",
    recruitment_channel="sourcing",
    no_of_trainings=1,
    age=35,
    previous_year_rating=5.0,
    length_of_service=8,
    awards_won=0,
    avg_training_score=49.0,
)])

# Show the new record.
new_data


Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0


In [61]:
# new_data holds raw text such as "Sales & Marketing" or "region_7"; convert
# it with the encoders fitted on the training data so the codes match what
# the models saw.
categorical_cols = ["department", "region", "education", "gender", "recruitment_channel"]

for col in categorical_cols:
    le = encoders[col]  # encoder fitted earlier for this column
    encoded_col = col + "_encoded"
    new_data[encoded_col] = le.transform(new_data[col])

new_data.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,department_encoded,region_encoded,education_encoded,gender_encoded,recruitment_channel_encoded
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,7,31,2,0,2


In [65]:
# Feature columns, in the same order the classifiers were trained on.
cols = ['department_encoded', 'region_encoded', 'education_encoded',
        'gender_encoded', 'recruitment_channel_encoded', 'no_of_trainings',
        'age', 'previous_year_rating', 'length_of_service', 'awards_won',
        'avg_training_score']

# Score the new record with the baseline and the tuned model.
old_model_prediction = legReg.predict(new_data[cols])
new_model_prediction = xgb_best.predict(new_data[cols])

print("Hasil Prediksi OLD ML Model: {}".format(old_model_prediction[0]))
print("Hasil Prediksi NEW ML Model: {}".format(new_model_prediction[0]))
# new_data repeats the feature values of the FIRST row of data_clf
# (Sales & Marketing / region_7 / Master's & above / f / sourcing ...), so the
# matching ground-truth label is at position 0, not 4.
print("Nilai Sebenarnya: {}".format(data_clf['is_promoted'].iloc[0].item()))

Hasil Prediksi OLD ML Model: 0
Hasil Prediksi NEW ML Model: 0
Nilai Sebenarnya: 0


# 🎉 Congratulations! 🎉

Terima kasih telah mengikuti dan menyelesaikan pelatihan **Hyperparameter Tuning**! 👏

Anda telah berhasil mempelajari bagaimana melakukan optimasi Machine Learning Model menggunakan bahasa pemrograman Python, mulai dari:
- ✅ Implementasi Model yang Paling Bagus (Sebelum Tuning)
- ✅ Memahami Optimasi ML Model dengan GridSearchCV
- ✅ Implementasi Model yang sudah dioptimasi + Menyimpan untuk Deployment


---