In [47]:
import random

import numpy as np
import pandas as pd

In [48]:
# Load the bookings, using the first CSV column as a date-parsed index.
df = pd.read_csv("../data/hotel.csv", index_col=0, parse_dates=True)
# Store the target as float; bracket access is used because attribute-style
# assignment only works when the column already exists.
df["not_canceled"] = df["not_canceled"].astype(float)
# NOTE(review): an earlier version of this cell evaluated
# `df[df.hotel == "City Hotel"]` here without assigning the result, so no rows
# were ever filtered out. That no-op has been removed and the rows of every
# hotel are kept, exactly as before. If only City Hotel bookings are wanted,
# add `df = df[df["hotel"] == "City Hotel"]` before the drop below.
df = df.drop(columns=["hotel"])

In [49]:
# One-hot encode the frame with pandas' defaults, then replace every
# remaining NaN with 0.
df = pd.get_dummies(df).replace(np.nan, 0)

In [50]:
def train_test_splitter(df, pct=0.8):
    """Split a date-indexed frame chronologically into train and test parts.

    The cutoff date is the earliest index value plus ``pct`` of the whole-day
    span between the earliest and latest index values (rounded to whole days).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetimes.
    pct : float, default 0.8
        Fraction of the covered time span assigned to the training part.

    Returns
    -------
    list
        ``[train, test]``: rows on or before the cutoff, and rows strictly
        after it.
    """
    from datetime import timedelta

    first_day, last_day = df.index.min(), df.index.max()
    n_train_days = round((last_day - first_day).days * pct)
    cutoff = first_day + timedelta(days=n_train_days)

    train_part = df[df.index <= cutoff]
    test_part = df[df.index > cutoff]
    return [train_part, test_part]


# Chronological split: the first 80% of the covered time span is training data.
train, test = train_test_splitter(df, pct=0.8)

In [51]:
# Separate the predictors from the `not_canceled` target for each split.
X_train, y_train = train.drop(columns=["not_canceled"]), train["not_canceled"]
X_test, y_test = test.drop(columns=["not_canceled"]), test["not_canceled"]

In [52]:
from sklearn.linear_model import LogisticRegression
# Fit the classifier on the training split, then predict that same split so
# the in-sample fit can be inspected.
clf = LogisticRegression(random_state=0, max_iter=5000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)

In [53]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the training labels against the training predictions.
# The fit is poor; improving the classifier is the first thing to revisit.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[12250, 19086],
       [  987, 50082]])

In [54]:
# Keep only the second probability column (index 1) for the test rows.
# NOTE(review): presumably this is P(not_canceled == 1); confirm against
# clf.classes_.
p_arrival = clf.predict_proba(X_test)[:, 1]

In [55]:
from scipy.stats import binom

In [56]:
def capacity_max(p, rooms=100, loss=4, min_p=0.065):
    """Search upward from ``rooms`` for the booking count where the value stops rising.

    Each step evaluates
    ``bookings * F - loss * (bookings - rooms) * (1 - F)``, where ``F`` is
    ``binom.cdf(rooms, n, p)``, and compares it with the same expression one
    booking later. The search stops at the first booking count whose value is
    lower than its predecessor's, and that count is returned.

    Parameters
    ----------
    p : float
        Success probability handed to ``binom.cdf``.
    rooms : int, default 100
        Starting booking count; also the ``k`` argument of ``binom.cdf``.
    loss : float, default 4
        Multiplier applied to the overbooking term.
    min_p : float, default 0.065
        When ``p`` is below this value the search is skipped and
        ``rooms * 10`` is returned.

    Returns
    -------
    int
        The booking count at which the search stopped, or ``rooms * 10``.
    """
    # Guard: skip the search entirely for very small p.
    if p < min_p:
        return rooms * 10

    bookings = rooms
    cdf_value = 0
    overbook_prob = 0
    value_here = 0
    value_next = 0
    while value_next >= value_here:
        # Value at the current count, using the CDF from the previous step.
        # NOTE(review): that CDF was computed with one fewer trial than the
        # booking count it multiplies here; confirm this pairing is intended.
        value_here = bookings * cdf_value - loss * ((bookings - rooms) * overbook_prob)
        cdf_value = binom.cdf(rooms, bookings, p)
        overbook_prob = 1 - cdf_value
        bookings += 1
        # Value one booking further, using the freshly computed CDF.
        value_next = bookings * cdf_value - loss * ((bookings - rooms) * overbook_prob)
    return bookings

In [57]:
# Sanity check: booking limits for p = 0.0, 0.1, ..., 0.9.
list(map(capacity_max, np.arange(0, 1, 0.1)))

[1000, 797, 406, 276, 211, 172, 147, 129, 116, 106]

In [58]:
from tqdm import tqdm # progress bar
# Booking limit for every predicted probability; tqdm shows progress because
# each call runs its own search loop.
cap_max = [capacity_max(p) for p in tqdm(p_arrival)]

100%|██████████| 36985/36985 [01:14<00:00, 493.36it/s]


In [59]:
# Per-reservation inputs for the simulation. Column order matters: the
# resampling code reads column 0 (arrived) and column 3 (cap_frac) by position.
sim_df = pd.DataFrame(
    {"arrived": y_test, "p_arrival": p_arrival, "cap_max": cap_max},
    index=X_test.index,
).assign(cap_frac=lambda frame: 1 / frame["cap_max"])

In [60]:
from random import randint

In [61]:
# The p_thresh stopping rule is a heuristic; renewal theory would give a
# better-founded threshold (not implemented yet).
def resample_from_reservations(df, strategy="pred_probs", p_thresh=0.99,
                               rooms=100, mean_p_bookings=129):
    """Resample reservations with replacement and total the sampled arrivals.

    Rows of ``df`` are drawn uniformly at random, with replacement, until the
    stopping rule selected by ``strategy`` is met:

    * ``"capacity"``   -- exactly ``rooms`` draws.
    * ``"mean_p"``     -- exactly ``mean_p_bookings`` draws.
    * ``"pred_probs"`` -- keep drawing until the running total of column 3
      exceeds ``p_thresh`` (always at least one draw).

    Columns are read by position: column 0 is summed into the return value and
    column 3 supplies the per-row weight used by ``"pred_probs"``.

    Draws come from NumPy's global random state, so ``np.random.seed(...)``
    makes the result reproducible.

    Parameters
    ----------
    df : pandas.DataFrame
        Reservations to sample from; needs at least four numeric columns.
    strategy : {"pred_probs", "mean_p", "capacity"}, default "pred_probs"
        Stopping rule, see above.
    p_thresh : float, default 0.99
        Threshold on the accumulated column-3 total for ``"pred_probs"``.
    rooms : int, default 100
        Number of draws for ``"capacity"``.
    mean_p_bookings : int, default 129
        Number of draws for ``"mean_p"``. The default is kept from the
        original code, whose comment described it as rooms * 1/0.64.
        NOTE(review): 100 / 0.64 is about 156, not 129; confirm which value
        is intended.

    Returns
    -------
    float
        Sum of column 0 over all sampled rows.

    Raises
    ------
    ValueError
        If ``strategy`` is unknown, ``df`` has no rows, or the
        ``"pred_probs"`` rule could never be satisfied because no weight in
        column 3 is positive.
    """
    valid_strategies = ("capacity", "mean_p", "pred_probs")
    if strategy not in valid_strategies:
        # Previously an unknown strategy never set the stop flag and looped forever.
        raise ValueError(
            f"unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError("cannot resample from an empty DataFrame")

    arrived = df.iloc[:, 0].to_numpy(dtype=float)

    if strategy != "pred_probs":
        # Fixed number of draws: sample all row positions in one call.
        n_draws = rooms if strategy == "capacity" else mean_p_bookings
        picks = np.random.randint(0, n_rows, size=n_draws)
        return arrived[picks].sum()

    weights = df.iloc[:, 3].to_numpy(dtype=float)
    if p_thresh >= 0 and not np.any(weights > 0):
        raise ValueError(
            "no positive weight in column 3; the pred_probs rule would never stop"
        )

    # Running totals replace the growing stacked array that was re-summed on
    # every iteration.
    arrived_total = 0.0
    weight_total = 0.0
    while True:
        pick = np.random.randint(n_rows)
        arrived_total += arrived[pick]
        weight_total += weights[pick]
        if weight_total > p_thresh:
            return arrived_total

In [62]:
def simulate_arrivals(df, iteration=0):
    """Run one simulated trial: resample arrivals per date for each strategy.

    For every distinct index value of ``df`` (in order of first appearance)
    the rows of that date are handed to ``resample_from_reservations`` once
    per strategy, in the order pred_probs, mean_p, capacity.

    Parameters
    ----------
    df : pandas.DataFrame
        Reservation-level frame indexed by date.
    iteration : int, default 0
        Trial label written to the ``trial`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per distinct date with the three ``*_arrivals`` columns and
        ``trial``, indexed by date.
    """
    # Output column -> strategy name; dict order fixes both the call order
    # and the column order of the result.
    column_to_strategy = {
        "pred_probs_arrivals": "pred_probs",
        "mean_p_arrivals": "mean_p",
        "capacity_arrivals": "capacity",
    }
    results = {column: [] for column in column_to_strategy}
    unique_dates = list(df.index.unique())

    for day in unique_dates:
        bookings_that_day = df.loc[df.index == day]
        for column, strategy_name in column_to_strategy.items():
            results[column].append(
                resample_from_reservations(df=bookings_that_day, strategy=strategy_name)
            )

    results["trial"] = iteration
    return pd.DataFrame(results, index=unique_dates)

In [63]:
# Fixed seed so this preview run is repeatable. Both the standard-library and
# the NumPy global generators are seeded, because seeding only one of them
# leaves draws from the other unreproducible.
random.seed(2021)
np.random.seed(2021)
simulate_arrivals(df=sim_df)

Unnamed: 0,pred_probs_arrivals,mean_p_arrivals,capacity_arrivals,trial
2016-10-31,75.0,77.0,69.0,0
2016-11-04,100.0,94.0,78.0,0
2016-11-07,88.0,92.0,74.0,0
2016-11-14,86.0,94.0,76.0,0
2016-11-13,68.0,69.0,52.0,0
...,...,...,...,...
2017-08-16,113.0,113.0,90.0,0
2017-08-22,108.0,126.0,98.0,0
2017-08-27,120.0,129.0,100.0,0
2017-08-28,118.0,129.0,100.0,0


In [64]:
# Run the trials, seeding each one with its own trial number so that every
# trial is individually reproducible.
sim = []
for iteration in tqdm(range(5)):
    # Seed both global generators: seeding only NumPy does not affect draws
    # made through the standard-library `random` module, and vice versa.
    random.seed(iteration)
    np.random.seed(iteration)
    sim.append(simulate_arrivals(df=sim_df, iteration=iteration))

# Stack the trials, keep the date as a regular column, and write the result
# without the index.
simulation_df = pd.concat(sim)
simulation_df["date"] = simulation_df.index
simulation_df.to_csv("../data/simulation/first_sim.csv", index=False)

 40%|████      | 2/5 [00:33<00:50, 16.81s/it]

In [None]:
# Read the saved trials back from disk and display them.
simulation_results = pd.read_csv(
    filepath_or_buffer="../data/simulation/first_sim.csv",
)
simulation_results

Unnamed: 0,pred_probs_arrivals,mean_p_arrivals,capacity_arrivals,trial,date
0,81.0,80.0,62.0,0,2016-10-31
1,101.0,92.0,77.0,0,2016-11-04
2,92.0,103.0,66.0,0,2016-11-07
3,84.0,86.0,73.0,0,2016-11-14
4,79.0,72.0,53.0,0,2016-11-13
...,...,...,...,...,...
1525,116.0,119.0,92.0,4,2017-08-16
1526,111.0,123.0,98.0,4,2017-08-22
1527,120.0,129.0,100.0,4,2017-08-27
1528,118.0,129.0,100.0,4,2017-08-28
