# IMPORTS

In [14]:
import pandas as pd
import numpy as np
import joblib
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

# DATA LOAD

In [2]:
# Read the raw test set from the CSV in the working directory.
df_hotels_test = pd.read_csv('hotels_test.csv')

In [3]:
# Number of rows in the test set.
len(df_hotels_test)

26535

In [4]:
# Remove the reservation_status_date column before any further processing.
df_hotels_test = df_hotels_test.drop(columns=['reservation_status_date'])

In [5]:
# Cast every text-valued column (and the id) to pandas' nullable string dtype in one pass.
string_columns = [
    'hotel',
    'arrival_date_month',
    'meal',
    'country',
    'id',
    'customer_type',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
]
df_hotels_test = df_hotels_test.astype({name: 'string' for name in string_columns})

# FILLNA

In [6]:
# Replace missing values column by column: company and agent get the sentinel -9999,
# country gets 'PRT', and children gets 0.
#
# This is done with a single frame-level fillna and a rebind instead of
# `df[col].fillna(..., inplace=True)`: the latter mutates a temporary Series selected
# from the frame, which pandas deprecates (FutureWarning on 2.x, silenced by the
# warnings filter in this notebook) and which leaves the frame untouched once
# Copy-on-Write is active.
df_hotels_test = df_hotels_test.fillna({
    'company': -9999,
    'agent': -9999,
    'country': 'PRT',
    'children': 0,
})

# ENCODING

In [7]:
# Keep the ids aside for the submission files, then remove them from the feature frame.
df_submit = df_hotels_test[['id']]
df_hotels_test = df_hotels_test.drop(columns=['id'])

In [8]:
# Work on a copy so the un-encoded frame stays available.
df_hotels_test_to_encode = df_hotels_test.copy()
# Load the previously saved transformer from disk and apply it to the test features
# (transform only, no fitting here).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code — only
# load 'tr.joblib' from a trusted source. Presumably this is the transformer fitted on
# the training data; confirm.
transformer = joblib.load('tr.joblib')
hotels_test_encoded = transformer.transform(df_hotels_test_to_encode)

In [9]:
# Turn the transformer output into a dense, labelled DataFrame. The .toarray() call
# implies the transformer returned a sparse matrix; column labels come from the
# transformer's own output feature names.
df_hotels_test_encoded = pd.DataFrame(
    data=hotels_test_encoded.toarray(),
    columns=transformer.get_feature_names_out(),
)

In [10]:
# Keep only ten country indicator columns and fold every other country_* column into a
# single 'country_other' column; the remaining countries have very few rows and add little.
country_cols = df_hotels_test_encoded.columns[
    df_hotels_test_encoded.columns.str.startswith('country_')
]
# Count of non-zero entries per country column.
selected_columns = df_hotels_test_encoded[country_cols].ne(0).sum()
# The kept columns are hard-coded rather than derived with
# selected_columns.nlargest(10).index.
# NOTE(review): presumably fixed so the columns match the ones used at training time — confirm.
top_cols = [
    'country_PRT', 'country_GBR', 'country_FRA', 'country_ESP', 'country_DEU',
    'country_ITA', 'country_IRL', 'country_BRA', 'country_BEL', 'country_USA',
]
other_countries = [name for name in country_cols if name not in top_cols]
# Row-wise sum of the dropped indicators becomes the aggregate column (appended last).
df_hotels_test_encoded['country_other'] = df_hotels_test_encoded[other_countries].sum(axis=1)
df_hotels_test_encoded = df_hotels_test_encoded.drop(columns=other_countries)

In [11]:
# Feature matrix for the submission (same object as the encoded frame, not a copy).
x_submit = df_hotels_test_encoded

In [12]:
# Standardize the submission features.
# NOTE(review): a new StandardScaler is fitted here on the test set itself, so the
# features are scaled with test-set statistics. If the model was trained on features
# scaled with training-set statistics, these inputs will not match — confirm, and if so
# persist the training scaler and call only transform() here.
scaler = StandardScaler()
x_submit_scaled = scaler.fit_transform(x_submit)

# PREDICTIONS

In [19]:
# Load the saved Keras model from disk.
# NOTE(review): the file is named "test.pkl" but is read with Keras' load_model, not
# pickle — confirm the on-disk format is one load_model accepts.
nn = tf.keras.models.load_model("test.pkl")

In [32]:
# Score the scaled features with the network, then binarize the outputs:
# anything strictly above the 0.4 cut-off is labelled 1 (canceled), everything else 0.
y_pred_nn = nn.predict(x_submit_scaled)
y_pred_nn = (y_pred_nn > 0.4).astype(int)



In [36]:
# Display the predicted labels as a flat 1-D array.
y_pred_nn.flatten()

array([1, 1, 0, ..., 1, 1, 1])

# SUBMIT

In [39]:
def write_submission(ids, predictions, path):
    """Build an (id, is_canceled) frame, write it to `path` as CSV, and return it.

    ids: the id column for the submission rows.
    predictions: one predicted label per id.
    path: destination CSV file; the index is not written.
    """
    submission = pd.DataFrame({'id': ids, 'is_canceled': predictions})
    submission.to_csv(path, index=False)
    return submission


df_submit_nn = write_submission(df_submit['id'], y_pred_nn.flatten(), 'Submit_nn.csv')

# Only the neural-network predictions are computed in the cells shown in this notebook.
# y_pred_rf / y_pred_knn / y_pred_xgb are not assigned there, so referencing them
# unconditionally raises NameError on a fresh kernel. Their submissions are written
# only when the corresponding predictions actually exist in the namespace.
for model_name in ('rf', 'knn', 'xgb'):
    model_preds = globals().get(f'y_pred_{model_name}')
    if model_preds is None:
        print(f'Skipping Submit_{model_name}.csv: y_pred_{model_name} is not defined.')
        continue
    # Keep the original df_submit_<model> names available to any later cell.
    globals()[f'df_submit_{model_name}'] = write_submission(
        df_submit['id'], model_preds, f'Submit_{model_name}.csv'
    )