In [4]:
#requirements
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV
from itertools import product
from queue import PriorityQueue
import xgboost as xgb
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [5]:
# Load the dataset
base = pd.read_csv('base.csv')
train = pd.read_csv('train.csv')

In [6]:
train

Unnamed: 0,id,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,reservation_status
0,0,Resort Hotel,312,2017,March,10,5,2,5,2,...,A,0,No Deposit,298.0,,0,Transient-Party,56.0,0,0
1,1,City Hotel,2,2015,December,51,18,0,2,2,...,D,1,No Deposit,9.0,,0,Transient,97.0,0,1
2,2,City Hotel,41,2016,March,14,31,0,3,2,...,A,0,No Deposit,9.0,,0,Transient,117.9,0,1
3,3,Resort Hotel,228,2016,August,36,29,2,5,2,...,D,0,No Deposit,175.0,,0,Transient,86.4,0,1
4,4,City Hotel,128,2017,May,19,13,0,1,3,...,A,0,No Deposit,9.0,,0,Transient,144.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94541,94541,City Hotel,26,2016,October,40,1,2,2,2,...,F,0,No Deposit,9.0,,0,Transient,294.0,0,0
94542,94542,City Hotel,269,2016,November,48,24,0,2,1,...,A,1,No Deposit,14.0,,0,Transient,93.0,0,0
94543,94543,City Hotel,302,2015,August,33,15,2,2,2,...,A,0,No Deposit,1.0,,0,Transient-Party,62.0,0,0
94544,94544,City Hotel,53,2017,June,25,19,1,3,2,...,D,0,No Deposit,42.0,,0,Transient,153.0,0,1


In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1,test_size = 0.6)
for train_indices,val_indices in split.split(train,train[["reservation_status"]]):
    trainset = train.loc[train_indices]
    valset = train.loc[val_indices]

numeric_columns = [col for col in train.select_dtypes(include=['int64', 'float64']).columns if train[col].nunique() > 2]

In [8]:
def cleanse(df):
    cleaned_df = df.copy()
    cleaned_df = cleaned_df.replace("", pd.NA)
    cols_to_check = [col for col in cleaned_df.columns if col != 'company' and col != 'agent']
    cleaned_df = cleaned_df.dropna(subset=cols_to_check)
    return cleaned_df
trainset = cleanse(trainset)

In [9]:
class Impute(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='most_frequent'):
        self.strategy = strategy
    def fit(self, X, y=None):
        self.imputer_dict = {}
        num_cols = X.select_dtypes(include=['float64', 'int64']).columns
        cat_cols = X.select_dtypes(exclude=['float64', 'int64']).columns
        for col in num_cols:
            num_imputer = SimpleImputer(missing_values=np.nan, strategy=self.strategy)
            num_imputer.fit(X[col].values.reshape(-1, 1))
            self.imputer_dict[col] = num_imputer
        for col in cat_cols:
            cat_imputer = SimpleImputer(missing_values=np.nan, strategy=self.strategy)
            cat_imputer.fit(X[col].values.reshape(-1, 1))
            self.imputer_dict[col] = cat_imputer
        return self
    def transform(self, X):
        new_X = X.copy()
        for col, imputer in self.imputer_dict.items():
            if col in new_X.columns: 
                new_X[col] = imputer.transform(new_X[col].values.reshape(-1, 1)).ravel()
            else:
                pass
        return new_X

In [10]:
class Scaler(BaseEstimator, TransformerMixin):    
    def fit(self, X, y=None):
        self.numerical_cols = [col for col in X.columns if X[col].dtype != 'object' and X[col].nunique() > 2]
        self.sc = StandardScaler().fit(X[self.numerical_cols])
        return self
    
    def transform(self, X):
        X[self.numerical_cols] = self.sc.transform(X[self.numerical_cols])
        return X

In [11]:
class Drop(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.drop(["reserved_room_type","assigned_room_type","id","country"], axis=1, errors="ignore")
        return X


In [12]:
class Encode(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.encoders = {}
        self.categories = {
            'arrival_date_month': [
                'January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December'
            ],
            'hotel': ['Resort Hotel', 'City Hotel'],
            'meal': ["HB", "BB", "FB", "SC", "Undefined"],
            'market_segment': ["Groups", "Online TA", "Offline TA/TO", "Direct", "Aviation", "Corporate", "Complementary", "Undefined"],
            'distribution_channel': ['TA/TO', 'Direct', 'Corporate', 'GDS', 'Undefined'],
            'deposit_type': ["No Deposit", "Non Refund", "Refundable"],
            'customer_type': ['Transient-Party', 'Transient', 'Contract', 'Group']
        }
        for column, categories in self.categories.items():
            try:
                encoder = OneHotEncoder(categories=[categories], drop=None, sparse_output=False, handle_unknown='ignore')
            except TypeError:
                encoder = OneHotEncoder(categories=[categories], drop=None, sparse=False, handle_unknown='ignore')
            encoder.fit(X[[column]])
            self.encoders[column] = encoder
        return self
    
    def transform(self, X):
        encoded_X = X.copy()
        for column, encoder in self.encoders.items():
            matrix = encoder.transform(X[[column]])
            column_names = encoder.get_feature_names_out([column])
            for i in range(len(matrix.T)):
                encoded_X[column_names[i]] = matrix.T[i]
            encoded_X.drop(column, axis=1, inplace=True)
        return encoded_X

In [13]:
pipeline = Pipeline([("Impute",Impute()),("Scaler",Scaler()),("Drop",Drop()),("Encode",Encode())])

In [14]:
trainset = pipeline.fit_transform(trainset)
valset = pipeline.transform(valset)
baseset = pipeline.transform(base)



In [15]:
X_trainset = trainset.drop(columns=['reservation_status'])  
X_valset = valset.drop(columns=['reservation_status'])  
y_trainset = trainset['reservation_status'] 
y_valset = valset['reservation_status'] 

In [16]:
param_grid = {
    'n_estimators': [80, 120, 200],        
    'max_depth': [4, 5, 10],               
    'learning_rate': [0.03, 0.07, 0.15],   
    'subsample': [0.6, 0.75, 0.9],        
    'colsample_bytree': [0.65, 0.85, 1.0]  
}

In [17]:
classifier = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
classifier.fit(X_trainset, y_trainset)

y_pred = classifier.predict(X_valset)
accuracy = accuracy_score(y_valset, y_pred)
print(f'Tingkat Akurasi sebelum tuning: {accuracy:.4f}')

Tingkat Akurasi sebelum tuning: 0.8204


In [18]:
xgb_classifier = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
random_search = RandomizedSearchCV(estimator=xgb_classifier, param_distributions=param_grid,
                                   n_iter=10,
                                   scoring='accuracy', n_jobs=-1, cv=3, verbose=2, random_state=42)
random_search.fit(X_trainset, y_trainset)
best_params_random = random_search.best_params_
best_xgb_random = random_search.best_estimator_
y_pred_random = best_xgb_random.predict(X_valset)
accuracy_random = accuracy_score(y_valset, y_pred_random)
print(f'Tingkat akurasi setelah tuning dengan RandomizedSearchCV: {accuracy_random:.4f}')
print(f'Parameter terbaik berdasarkan RandomizedSearchCV: {best_params_random}')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Tingkat akurasi setelah tuning dengan RandomizedSearchCV: 0.8259
Parameter terbaik berdasarkan RandomizedSearchCV: {'subsample': 0.75, 'n_estimators': 120, 'max_depth': 10, 'learning_rate': 0.15, 'colsample_bytree': 1.0}
Tingkat akurasi setelah tuning dengan RandomizedSearchCV: 0.8259
Parameter terbaik berdasarkan RandomizedSearchCV: {'subsample': 0.75, 'n_estimators': 120, 'max_depth': 10, 'learning_rate': 0.15, 'colsample_bytree': 1.0}


In [19]:
xgb_classifier = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid,
                           scoring='accuracy', n_jobs=-1, cv=3, verbose=2)
grid_search.fit(X_trainset, y_trainset)
best_params_grid = grid_search.best_params_
best_xgb_grid = grid_search.best_estimator_
y_pred_grid = best_xgb_grid.predict(X_valset)
accuracy_grid = accuracy_score(y_valset, y_pred_grid)
print(f'Tingkat akurasi setelah tuning dengan GridSearchCV: {accuracy_grid:.4f}')
print(f'Parameter terbaik berdasarkan GridSearchCV: {best_params_grid}')

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Tingkat akurasi setelah tuning dengan GridSearchCV: 0.8281
Parameter terbaik berdasarkan GridSearchCV: {'colsample_bytree': 0.85, 'learning_rate': 0.15, 'max_depth': 10, 'n_estimators': 120, 'subsample': 0.9}
Tingkat akurasi setelah tuning dengan GridSearchCV: 0.8281
Parameter terbaik berdasarkan GridSearchCV: {'colsample_bytree': 0.85, 'learning_rate': 0.15, 'max_depth': 10, 'n_estimators': 120, 'subsample': 0.9}


In [22]:
import time

def manhattan_heuristic(params, param_grid):
    distance = 0
    for key, value in params.items():
        grid = list(param_grid[key])
        mid_index = len(grid) // 2
        try:
            distance += abs(grid.index(value) - mid_index)
        except ValueError:
            distance += mid_index 
    return distance 

def calc_cost(params, X, y):
    rf = RandomForestClassifier(**params, random_state=42).fit(X, y)
    scores = cross_val_score(rf, X, y, cv=3, scoring=make_scorer(accuracy_score))
    return np.mean(scores)

def ida(param_grid, X, y, max_depth=5, timeout=60):
    best_score = float("-inf")
    best_params = {}
    start_time = time.time()

    def ids(params, keys, depth, threshold):
        nonlocal best_score, best_params
        # Cek timeout
        if time.time() - start_time > timeout:
            raise TimeoutError("Pencarian parameter dihentikan karena melebihi batas waktu.")
        if depth > threshold or not keys:
            score = calc_cost(params, X, y)
            f_cost = -score + manhattan_heuristic(params, param_grid)
            if score > best_score:
                best_score = score
                best_params = params.copy()
                print(f"Parameter baru berdasarkan IDA*: {best_params}, dengan Hasil: {best_score:.4f}")
            return f_cost

        next_key = keys[0]
        min_f = float("inf")
        for value in param_grid[next_key]:
            new_params = params.copy()
            new_params[next_key] = value
            f = ids(new_params, keys[1:], depth + 1, threshold)
            min_f = min(min_f, f)
        return min_f

    try:
        for depth in range(1, max_depth + 1):
            print(f"Depth limit: {depth}")
            ids({}, list(param_grid.keys()), 0, depth)
    except TimeoutError as e:
        print(str(e))

    return best_params

best_params_ida = ida(param_grid, X_trainset, y_trainset, max_depth=5, timeout=60)
best_xgb_ida = xgb.XGBClassifier(**best_params_ida, random_state=42, eval_metric='logloss')
best_xgb_ida.fit(X_trainset, y_trainset)
y_pred_ida = best_xgb_ida.predict(X_valset)
accuracy_ida = accuracy_score(y_valset, y_pred_ida)
print(f'Tingkat akurasi setelah tuning dengan IDA*: {accuracy_ida:.4f}')
print(f'Parameter terbaik berdasarkan IDA*: {best_params_ida}')

Depth limit: 1
Parameter baru berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 4}, dengan Hasil: 0.7702
Parameter baru berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 4}, dengan Hasil: 0.7702
Parameter baru berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 5}, dengan Hasil: 0.7739
Parameter baru berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 5}, dengan Hasil: 0.7739
Parameter baru berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 10}, dengan Hasil: 0.7810
Parameter baru berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 10}, dengan Hasil: 0.7810
Pencarian parameter dihentikan karena melebihi batas waktu.
Pencarian parameter dihentikan karena melebihi batas waktu.
Tingkat akurasi setelah tuning dengan IDA*: 0.8254
Parameter terbaik berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 10}
Tingkat akurasi setelah tuning dengan IDA*: 0.8254
Parameter terbaik berdasarkan IDA*: {'n_estimators': 80, 'max_depth': 10}
