In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
os.listdir()

['hotel_booking_sim.ipynb']

In [3]:
df = pd.read_csv("../data/hotel.csv", index_col=0, parse_dates=True)
df.not_canceled = df.not_canceled.astype(float)
df = pd.get_dummies(df)
df = df.replace(np.nan, 0)

In [4]:
def train_test_splitter(df, pct=0.8):
    from datetime import timedelta
    timespan = (df.index.max() - df.index.min()).days
    train_date = round(timespan*pct)
    train = df.index.min() + timedelta(days=train_date)
    return [df[df.index <= train], df[df.index > train]]


train, test = train_test_splitter(df, 0.8)

In [6]:
X_train = train.drop(["not_canceled"], axis=1)
X_test = test.drop(["not_canceled"], axis=1)
y_train = train.not_canceled
y_test = test.not_canceled

In [7]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, max_iter=5000).fit(X_train, y_train)
y_pred = clf.predict(X_train)

In [8]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_pred)
# bad but oh well, this is the first thing I would imporve

array([[12250, 19086],
       [  987, 50082]])

In [9]:
p_arrival = clf.predict_proba(X_test)
p_arrival = p_arrival[:,1]

In [10]:
from scipy.stats import binom

In [11]:
def capacity_max(p, rooms=100, loss=4, min_p=0.065):
    E, E_m1, Fx, P_overbook, bookings = 0, 0, 0, 0, rooms
    if p < min_p:  
        bookings = rooms * 10
    else:
        while(E_m1 >= E):
            E = bookings * Fx - loss * ((bookings - rooms) * P_overbook)
            Fx = binom.cdf(rooms, bookings, p)
            bookings = bookings + 1
            P_overbook =  1 - Fx
            E_m1 = bookings * Fx - loss * ((bookings - rooms) * P_overbook)
    return bookings

In [12]:
# sanity check
for p in np.arange(0, 1, 0.1):
    print(capacity_max(p), p)

1000 0.0
797 0.1
406 0.2
276 0.30000000000000004
211 0.4
172 0.5
147 0.6000000000000001
129 0.7000000000000001
116 0.8
106 0.9


In [13]:
from tqdm import tqdm # progress bar
cap_max = []
for p in tqdm(p_arrival):
    cf = capacity_max(p)
    cap_max.append(cf)

100%|██████████| 36985/36985 [01:14<00:00, 498.63it/s]


In [32]:
sim_df = pd.DataFrame(
    {
    "arrived":y_test,
    "p_arrival":p_arrival,
    "cap_max":cap_max}, index=X_test.index
)
sim_df["cap_frac"] = 1/sim_df.cap_max
test = sim_df[sim_df.index == "2016-10-31"]
samp = test.sample(1)
samp.cap_frac.values

array([0.00862069])

In [52]:
from random import randint

In [53]:
test.iloc[2,:].values

array([0.00000000e+00, 7.74169687e-01, 1.19000000e+02, 8.40336134e-03])

In [77]:
# there is a better way to set that threashold 
# with renewal theory. well see if i get to it


def resample_from_reservations(df, p_thresh=0.99):
    success = False
    i = 0
    pct_capacity = []
    sample_rows = np.zeros(4) # winning ugly
    while not success:
        rand_index = randint(0, df.shape[0])
        rand_row = test.iloc[rand_index,:].values
        sample_rows = np.concatenate(([sample_rows], [rand_row]), axis=0)
        capacity_frac = sum(sample_rows[:, 3]) 
        success = sum(capacity_frac) > p_thresh
    return sum(sample.arrived)
resample_from_reservations(test) 

TypeError: 'numpy.float64' object is not iterable

In [None]:
for date in sim_df.index.unique():
    temp = sim_df[sim_df.index == date]

              adults  children  babies  lead_time  stay_length  \
booking_date                                                     
2016-10-30         2       0.0       0         57            4   
2016-10-30         2       0.0       0          0            1   
2016-10-30         1       0.0       0          0            1   
2016-10-30         2       0.0       0          1            1   
2016-10-30         1       0.0       0          0            2   
...              ...       ...     ...        ...          ...   
2016-10-30         2       0.0       0         66            3   
2016-10-30         2       0.0       0         19            3   
2016-10-30         2       0.0       0        169            3   
2016-10-30         2       0.0       0        280            2   
2016-10-30         2       0.0       0        280            2   

              customer_type_Contract  customer_type_Group  \
booking_date                                                
2016-10-30         