In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score as acc

import warnings
warnings.filterwarnings('ignore')

In [29]:
train = pd.read_csv('/kaggle/input/playground-series-s3e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e7/test.csv')
sub = pd.read_csv('/kaggle/input/playground-series-s3e7/sample_submission.csv')
extra = pd.read_csv('/kaggle/input/reservation-cancellation-prediction/train__dataset.csv')
train.drop("id", axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)

In [30]:
df = pd.concat([train,extra,test])

In [31]:
x = df.drop(columns=['booking_status'])
kmeans = KMeans(n_clusters=3, random_state=2023, n_init="auto").fit(x)

df['knn'] = kmeans.predict(x)

In [32]:
df['date'] = pd.to_datetime(df['arrival_year'].astype(str) + df['arrival_month'].astype(str) + df['arrival_month'].astype(str), format='%Y%m%d')
df['week'] = df['date'].dt.week
df['weekday'] = df['date'].dt.weekday

In [33]:
df.drop(columns=['date'],inplace=True)
train_df = df.iloc[:-len(test),:]
test_df = df.iloc[-len(test):,:]

X = train_df.drop(['booking_status'], axis=1)
y = train_df.booking_status

X_test = test_df.drop('booking_status', axis=1)

In [34]:
n_folds = 10
PATIENCE = 2000
predsCB = []

k_fold = StratifiedKFold(n_splits=n_folds, random_state=2023, shuffle=True)

In [35]:
def train_model(X,y,modelo):
    vl = []
    for fold, (train_index, test_index) in enumerate (k_fold.split(X, y)):        
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        mod = modelo()
        mod.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
        mod_preds = mod.predict_proba(X_valid)[:,1]
        mod_test_preds = mod.predict_proba(X_test)[:,1]
        vl.append(round(roc_auc_score(y_valid, mod_preds),4))
    dic = {'model':str(modelo), 'result':round(np.mean(vl),4)}
    return dic, mod

In [36]:
df_result = pd.DataFrame(columns=['model','result'])

dic, xgb_cla = train_model(X,y,XGBClassifier)
df_result = df_result.append(dic, ignore_index=True)

dic, lgc_cla = train_model(X,y,LGBMClassifier)
df_result = df_result.append(dic, ignore_index=True)

dic, cat_cla = train_model(X,y,CatBoostClassifier)
df_result = df_result.append(dic, ignore_index=True)

In [37]:
df_result.sort_values(by='result', ascending=False)

Unnamed: 0,model,result
2,<class 'catboost.core.CatBoostClassifier'>,0.9132
0,<class 'xgboost.sklearn.XGBClassifier'>,0.9122
1,<class 'lightgbm.sklearn.LGBMClassifier'>,0.9079


In [38]:
sub['booking_status'] = cat_cla.predict_proba(X_test)[:,1]
sub.to_csv("submission.csv", index=False)