In [8]:
import pandas as pd
import numpy as np

In [9]:
# Load the bookings: the first CSV column becomes the index and pandas is asked
# to parse it as dates (the chronological train/test split relies on that).
df = pd.read_csv("../data/hotel.csv", index_col=0, parse_dates=True)
# Cast the target to float. Bracket assignment is used instead of attribute
# assignment, which only works when the column already exists.
df["not_canceled"] = df["not_canceled"].astype(float)
# NOTE(review): this cell used to contain the bare expression
# `df[df.hotel=="Resort Hotel"]`. Its result was discarded, so no hotel filter
# was ever applied and every downstream number is based on all hotels. The dead
# statement is removed to keep that behaviour explicit. If only Resort Hotel
# bookings were intended, add `df = df[df["hotel"] == "Resort Hotel"]` here,
# before the column is dropped, and re-run the notebook.
df = df.drop(columns=["hotel"])

In [10]:

# Expand the frame with pd.get_dummies, then replace every NaN with 0,
# as one chained expression.
df = pd.get_dummies(df).replace(np.nan, 0)

In [11]:
def train_test_splitter(df, pct=0.8):
    """Split ``df`` chronologically on its datetime index.

    The cut-off is the earliest index value plus ``round(pct * span)`` days,
    where ``span`` is the whole-day difference between the latest and earliest
    index values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supports ``min``/``max`` and date subtraction.
    pct : float, default 0.8
        Fraction of the covered time span assigned to the first part.

    Returns
    -------
    list
        ``[rows with index <= cut-off, rows with index > cut-off]``.
    """
    from datetime import timedelta

    first_day = df.index.min()
    last_day = df.index.max()
    span_days = (last_day - first_day).days
    cutoff = first_day + timedelta(days=round(span_days * pct))

    on_or_before = df.index <= cutoff
    after = df.index > cutoff
    return [df[on_or_before], df[after]]


# Chronological split: the first 80% of the covered time span is used for training.
train, test = train_test_splitter(df, pct=0.8)

In [13]:
# Pull the target column out of each split; everything else is a feature.
y_train = train["not_canceled"]
y_test = test["not_canceled"]
X_train = train.drop(columns="not_canceled")
X_test = test.drop(columns="not_canceled")

In [14]:
from sklearn.linear_model import LogisticRegression
# Build the classifier first, then fit it on the training split.
clf = LogisticRegression(random_state=0, max_iter=5000)
clf.fit(X_train, y_train)
# In-sample predictions: the model scores the same rows it was fitted on.
y_pred = clf.predict(X_train)

In [15]:
from sklearn.metrics import confusion_matrix
# In-sample confusion matrix (y_pred was produced from X_train).
# The result is poor; the classifier is the first thing I would improve.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[12250, 19086],
       [  987, 50082]])

In [16]:
# Keep only column 1 of the predicted class probabilities for the test split.
# NOTE(review): column 1 presumably corresponds to not_canceled == 1.0
# (i.e. the guest arrives) -- confirm against clf.classes_.
p_arrival = clf.predict_proba(X_test)[:, 1]

In [17]:
from scipy.stats import binom

In [18]:
def capacity_max(p, rooms=100, loss=4, min_p=0.065):
    """Search upward from ``rooms`` for a booking limit given show-up probability ``p``.

    Starting at ``bookings = rooms``, each step evaluates
    ``binom.cdf(rooms, bookings, p)``, advances ``bookings`` by one, and scores
    the advanced count as
    ``bookings * cdf - loss * ((bookings - rooms) * (1 - cdf))``.
    The search stops the first time a score is not ``>=`` the previous one
    (the initial reference score is 0), and the already-advanced ``bookings``
    value is returned.

    Parameters
    ----------
    p : float
        Per-reservation show-up probability.
    rooms : int, default 100
        Number of rooms; also the starting point of the search.
    loss : float, default 4
        Weight applied to the overbooking term of the score.
    min_p : float, default 0.065
        When ``p`` is below this value the search is skipped entirely.

    Returns
    -------
    int
        ``rooms * 10`` when ``p < min_p``, otherwise the booking count at
        which the search stopped.
    """
    # Very small show-up probabilities get a flat cap instead of a search.
    if p < min_p:
        return rooms * 10

    bookings = rooms
    previous_score = 0
    while True:
        # The CDF uses the current count; the score below uses the count
        # after it has been advanced by one.
        not_overbooked = binom.cdf(rooms, bookings, p)
        bookings += 1
        overbooked = 1 - not_overbooked
        score = bookings * not_overbooked - loss * ((bookings - rooms) * overbooked)
        # Written as `not (a >= b)` so a NaN score also ends the search.
        if not (score >= previous_score):
            return bookings
        previous_score = score

In [19]:
# Sanity check: print the booking cap next to each show-up probability
# on a coarse 0.0 .. 0.9 grid.
sanity_grid = np.arange(0, 1, 0.1)
for p in sanity_grid:
    print(capacity_max(p), p)

1000 0.0
797 0.1
406 0.2
276 0.30000000000000004
211 0.4
172 0.5
147 0.6000000000000001
129 0.7000000000000001
116 0.8
106 0.9


In [20]:
from tqdm import tqdm # progress bar
# Booking cap for every predicted show-up probability in the test split.
# tqdm only wraps the iterable to draw a progress bar (the recorded run took
# about 1m23s for 36,985 rows).
cap_max = [capacity_max(p) for p in tqdm(p_arrival)]

100%|██████████| 36985/36985 [01:23<00:00, 444.85it/s]


In [21]:
# One row per test-set reservation. The column order matters:
# resample_from_reservations reads columns by position (0 = arrived,
# 3 = cap_frac).
sim_df = pd.DataFrame(
    {
        "arrived": y_test,
        "p_arrival": p_arrival,
        "cap_max": cap_max,
    },
    index=X_test.index,
)
# Reciprocal of the reservation's booking cap.
sim_df["cap_frac"] = 1 / sim_df["cap_max"]

# Spot-check one reservation from a single arrival date. A dedicated name is
# used here instead of rebinding `test`, which still holds the held-out split
# needed to re-run the X_test / y_test cell. The sample is seeded so the
# displayed value is reproducible.
spot_check_day = sim_df[sim_df.index == "2016-10-31"]
samp = spot_check_day.sample(1, random_state=0)
samp["cap_frac"].values

array([0.00806452])

In [22]:
from random import randint

In [25]:
# TODO: there is a better way to set that threshold,
# using renewal theory. We'll see if I get to it.
def resample_from_reservations(df, strategy="pred_probs", p_thresh=0.99,
                               capacity_draws=100, mean_p_draws=129):
    """Draw reservations with replacement and total the arrivals among them.

    Rows are drawn uniformly at random (``random.randint``) until the chosen
    strategy's stopping rule is met. At least one row is always drawn.

    Columns are read by position: column 0 is summed into the return value and
    column 3 is accumulated for the ``"pred_probs"`` stopping rule. In this
    notebook those are ``arrived`` and ``cap_frac``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reservations to sample from; must not be empty.
    strategy : {"pred_probs", "mean_p", "capacity"}, default "pred_probs"
        * ``"capacity"`` -- stop after ``capacity_draws`` draws.
        * ``"mean_p"`` -- stop after ``mean_p_draws`` draws.
        * ``"pred_probs"`` -- stop once the running sum of column 3 exceeds
          ``p_thresh``.
    p_thresh : float, default 0.99
        Threshold for the ``"pred_probs"`` strategy.
    capacity_draws : int, default 100
        Number of draws for the ``"capacity"`` strategy.
    mean_p_draws : int, default 129
        Number of draws for the ``"mean_p"`` strategy. The original inline note
        read "number of rooms * 1/0.64".
        NOTE(review): 100 / 0.64 is about 156, whereas 129 matches the
        ``capacity_max`` sanity-check output for p = 0.7 -- confirm which
        value is intended.

    Returns
    -------
    float
        Sum of column 0 over all drawn rows.

    Raises
    ------
    ValueError
        If ``strategy`` is not recognised (previously this looped forever),
        if ``df`` is empty, or if ``"pred_probs"`` can never reach a
        non-negative ``p_thresh`` because column 3 has no positive entry.
    """
    fixed_draw_counts = {"capacity": capacity_draws, "mean_p": mean_p_draws}
    if strategy != "pred_probs" and strategy not in fixed_draw_counts:
        raise ValueError(
            f"unknown strategy {strategy!r}; "
            "expected 'pred_probs', 'mean_p' or 'capacity'"
        )

    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError("cannot resample from an empty reservations frame")

    # Extract the needed columns once instead of materialising a full row and
    # re-stacking / re-summing every previous draw on each iteration.
    arrived = df.iloc[:, 0].to_numpy(dtype=float)
    arrivals = 0.0

    if strategy in fixed_draw_counts:
        # max(1, ...) keeps the "at least one draw" behaviour of the loop.
        for _ in range(max(1, fixed_draw_counts[strategy])):
            arrivals += arrived[randint(0, n_rows - 1)]
        return arrivals

    cap_frac = df.iloc[:, 3].to_numpy(dtype=float)
    if p_thresh >= 0 and not (cap_frac > 0).any():
        raise ValueError(
            "column 3 has no positive entries, so p_thresh can never be exceeded"
        )

    filled = 0.0
    while True:
        pick = randint(0, n_rows - 1)
        arrivals += arrived[pick]
        filled += cap_frac[pick]
        if filled > p_thresh:
            return arrivals



In [26]:
import random

# resample_from_reservations draws with random.randint; seeding the shared
# generator makes the table below reproducible on Restart & Run All.
random.seed(0)

dates = []
pred_probs_arrivals = []
mean_p_arrivals = []
capacity_arrivals = []
# groupby(level=0, sort=False) yields each arrival date once, in order of first
# appearance, with that date's reservations. This replaces a full-frame boolean
# mask per date.
for date, temp in sim_df.groupby(level=0, sort=False):
    pred_probs_arrivals.append(resample_from_reservations(df=temp, strategy="pred_probs"))
    mean_p_arrivals.append(resample_from_reservations(df=temp, strategy="mean_p"))
    capacity_arrivals.append(resample_from_reservations(df=temp, strategy="capacity"))
    dates.append(date)

In [27]:
# One row per simulated arrival date, one column of simulated arrivals per
# overbooking strategy.
arrivals_by_strategy = {
    "pred_probs_arrivals": pred_probs_arrivals,
    "mean_p_arrivals": mean_p_arrivals,
    "capacity_arrivals": capacity_arrivals,
}
pd.DataFrame(arrivals_by_strategy, index=dates)

Unnamed: 0,pred_probs_arrivals,mean_p_arrivals,capacity_arrivals
2016-10-31,73.0,85.0,60.0
2016-11-04,98.0,98.0,77.0
2016-11-07,90.0,101.0,77.0
2016-11-14,94.0,90.0,67.0
2016-11-13,85.0,78.0,58.0
...,...,...,...
2017-08-16,109.0,122.0,95.0
2017-08-22,109.0,125.0,92.0
2017-08-27,120.0,129.0,100.0
2017-08-28,118.0,129.0,100.0
