In [18]:
import pandas as pd
import numpy as np

from agg import prep_demand_features
from demand_features import X_cols

pd.options.display.max_rows = 150
pd.options.display.max_columns = 250
pd.options.display.max_colwidth = None

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

In [2]:
h1_stats = pd.read_pickle("../data/h1_stats.pick")
h1_stats["WeekNum"] = h1_stats["WeekNum"].astype(int)

In [3]:
h1_stats["WeekNum"] = h1_stats.StayDate.dt.isocalendar().week

In [4]:
h1_stats[["StayDate", "WeekNum"]].sample(10)

Unnamed: 0,StayDate,WeekNum
5302,2017-02-03,5
10746,2017-07-27,30
9691,2017-06-25,25
4133,2016-12-12,50
4587,2017-01-01,52
10389,2017-07-11,28
2730,2016-11-03,44
4771,2016-12-30,52
4422,2016-12-22,51
1178,2016-10-01,39


In [5]:
h1_stats.describe()

Unnamed: 0,RoomsOTB,RevOTB,CxlForecast,TRN_RoomsOTB,TRN_RevOTB,TRN_CxlForecast,DaysUntilArrival,ADR_OTB,SellingPrice,RemSupply,Mon,Sat,Sun,Thu,Tue,Wed,NONTRN_RoomsOTB,NONTRN_RevOTB,NONTRN_ADR_OTB,NONTRN_CxlForecast,ACTUAL_RoomsSold,ACTUAL_ADR,ACTUAL_RoomRev,ACTUAL_NumCancels,WeekNum,TRN_ADR_OTB,TM30_RoomsPickup,TM30_RevPickup,TM30_ADR_Pickup,TM30_TRN_RoomsPickup,TM30_TRN_RevPickup,TM30_TRN_ADR_Pickup,TM30_NONTRN_RoomsPickup,TM30_NONTRN_RevPickup,TM30_NONTRN_ADR_Pickup,TM15_RoomsPickup,TM15_RevPickup,TM15_ADR_Pickup,TM15_TRN_RoomsPickup,TM15_TRN_RevPickup,TM15_TRN_ADR_Pickup,TM15_NONTRN_RoomsPickup,TM15_NONTRN_RevPickup,TM15_NONTRN_ADR_Pickup,TM05_RoomsPickup,TM05_RevPickup,TM05_ADR_Pickup,TM05_TRN_RoomsPickup,TM05_TRN_RevPickup,TM05_TRN_ADR_Pickup,TM05_NONTRN_RoomsPickup,TM05_NONTRN_RevPickup,TM05_NONTRN_ADR_Pickup,RoomsGapToLYA,RevGapToLYA,ADR_GapToLYA,TRN_RoomsGapToLYA,TRN_RevGapToLYA,TRN_ADR_GapToLYA,NONTRN_RoomsGapToLYA,NONTRN_RevGapToLYA,NONTRN_ADR_GapToLYA,DaysUntilArrival_STLY,RoomsOTB_Pace,ADR_OTB_Pace,RevOTB_Pace,CxlForecast_Pace,RemSupply_Pace,SellingPrice_Pace,TRN_RoomsOTB_Pace,TRN_ADR_OTB_Pace,TRN_RevOTB_Pace,TRN_CxlForecast_Pace,NONTRN_RoomsOTB_Pace,NONTRN_ADR_OTB_Pace,NONTRN_RevOTB_Pace,NONTRN_CxlForecast_Pace,TM30_RoomsPickup_Pace,TM30_ADR_Pickup_Pace,TM30_RevPickup_Pace,TM30_TRN_RoomsPickup_Pace,TM30_TRN_ADR_Pickup_Pace,TM30_TRN_RevPickup_Pace,TM30_NONTRN_RoomsPickup_Pace,TM30_NONTRN_ADR_Pickup_Pace,TM30_NONTRN_RevPickup_Pace,TM15_RoomsPickup_Pace,TM15_ADR_Pickup_Pace,TM15_RevPickup_Pace,TM15_TRN_RoomsPickup_Pace,TM15_TRN_ADR_Pickup_Pace,TM15_TRN_RevPickup_Pace,TM15_NONTRN_RoomsPickup_Pace,TM15_NONTRN_ADR_Pickup_Pace,TM15_NONTRN_RevPickup_Pace,TM05_RoomsPickup_Pace,TM05_ADR_Pickup_Pace,TM05_RevPickup_Pace,TM05_TRN_RoomsPickup_Pace,TM05_TRN_ADR_Pickup_Pace,TM05_TRN_RevPickup_Pace,TM05_NONTRN_RoomsPickup_Pace,TM05_NONTRN_ADR_Pickup_Pace,TM05_NONTRN_RevPickup_Pace
count,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0,12208.0
mean,139.693562,14401.099538,18.41055,89.930374,10275.610964,16.057585,15.27654,94.521588,101.740257,65.716989,0.142857,0.142038,0.142448,0.144004,0.143267,0.143676,49.763188,4125.488574,82.780837,2.352965,156.167595,94.948845,15719.184259,77.09805,26.820691,100.618447,9.912516,939.082892,0.91885,7.298984,703.86121,0.953703,2.613532,235.221682,1.081473,0.781045,17.742694,-0.264474,0.7577,36.606579,-0.334466,0.023345,-18.863885,-0.359994,-7.672592,-606.246786,-0.258184,-5.445773,-453.559906,-0.251895,-2.226818,-152.68688,-0.395004,4.550704,-1294.865487,-11.824486,10.010321,-469.331316,-14.292563,-5.459617,-825.534172,2.468077,15.27654,12.729112,12.460346,2544.981504,3.797919,-8.931193,15.078701,5.844856,14.970347,1603.077616,4.595675,6.884256,11.142054,941.903888,-0.797756,0.86001,-0.152889,150.932805,-0.548083,-0.247721,31.750894,1.408093,0.031976,119.18191,-0.503113,-0.008107,-35.850835,-0.317497,-0.008416,-0.5674,-0.185616,-0.373123,-35.283435,-1.155799,0.271393,-95.686034,0.264581,0.303965,-20.685684,-1.42038,0.407824,-75.00035
std,43.694905,9958.10562,13.534843,34.751367,8390.726846,12.852206,9.230766,48.589927,53.250189,38.343608,0.349941,0.349103,0.349523,0.351108,0.350359,0.350776,33.556251,2751.999242,38.639588,3.946746,34.447963,48.773683,9903.522332,37.291373,14.735611,51.946247,13.490788,1119.433687,2.236801,9.406587,868.65929,2.730732,7.572595,541.761362,4.193019,11.53552,881.577638,1.637557,7.610045,659.732404,1.992613,7.050058,450.632016,3.124845,13.838311,974.796704,2.03578,8.898958,739.938713,2.43515,9.212522,524.843452,4.394178,27.118418,2675.46465,12.146313,29.348901,2354.627595,12.838117,31.881203,2250.485497,5.817534,9.230766,31.914971,12.599505,2723.355731,14.075847,31.229994,13.024367,29.201282,13.090886,2206.224025,10.599064,33.671748,17.069277,2364.696265,8.117581,12.71684,3.054577,1048.118987,9.941674,3.587071,895.907824,9.022817,5.735596,643.540654,10.127571,2.192171,783.052803,7.291547,2.600998,653.638593,7.781312,4.448342,511.414979,13.058136,2.681185,949.975169,8.895244,3.092753,742.19495,10.53202,6.21102,648.368136
min,22.0,1005.93,0.0,11.0,858.7,0.0,0.0,42.98,44.68,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,44.48,2565.67,9.0,1.0,44.0,-20.0,-1783.57,-10.18,-12.0,-1683.88,-17.64,-18.0,-2045.5,-20.83,-58.0,-4434.57,-11.55,-36.0,-3517.45,-21.64,-35.0,-2635.5,-19.99,-105.0,-5304.12,-10.27,-48.0,-4992.68,-13.54,-99.0,-3963.31,-43.0,-101.0,-13952.8,-64.71,-97.0,-8395.67,-54.36,-130.0,-11336.94,-25.92,0.0,-124.0,-55.22,-14481.44,-101.0,-120.0,-39.32,-87.0,-48.0,-6292.78,-47.0,-118.0,-76.77,-11055.52,-79.0,-66.0,-15.33,-4227.05,-63.0,-17.91,-4734.9,-61.0,-43.22,-4212.13,-68.0,-12.87,-3944.04,-57.0,-21.0,-3521.67,-56.0,-27.86,-3814.13,-96.0,-16.87,-4457.05,-37.0,-13.83,-3650.58,-98.0,-43.0,-5039.88
25%,122.0,6973.3525,8.0,60.0,3823.6075,6.0,7.0,56.2,57.28,38.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,1489.4,56.0,0.0,147.0,56.47,8268.05,44.0,14.0,58.0,1.0,111.0,-0.17,1.0,87.0,-0.42,0.0,0.0,-0.04,-3.0,-344.665,-0.94,-3.0,-290.4925,-1.2,-1.0,-58.5,-0.16,-12.0,-1110.08,-1.11,-10.0,-863.7675,-1.3,-2.0,-180.36,-0.56,-4.0,-2895.59,-17.9,-4.0,-1948.9675,-21.95,-16.0,-2000.93,-0.72,7.0,-5.0,5.07,1042.6875,-2.0,-26.0,5.65,-12.0,6.0,258.005,-1.0,-6.0,0.81,-26.465,-2.0,-4.0,-1.49,-366.2025,-4.0,-2.08,-349.12,-1.0,-0.97,-82.29,-4.0,-1.0425,-376.49,-3.0,-1.4225,-303.365,-1.0,-0.52,-118.0,-5.0,-0.96,-510.7575,-4.0,-1.31,-385.415,-2.0,-0.82,-180.36
50%,158.0,11789.605,15.0,92.0,7182.45,12.0,15.0,76.84,87.29,55.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,4369.18,68.43,1.0,173.0,77.57,12879.74,77.0,28.0,84.0,6.0,602.005,0.52,5.0,491.89,0.59,0.0,-0.0,0.0,0.0,0.0,-0.11,0.0,0.0,-0.21,0.0,0.0,0.0,-4.0,-410.775,-0.09,-4.0,-361.06,-0.16,0.0,0.0,0.0,6.0,-1282.34,-10.65,13.0,-387.155,-13.38,-4.0,-745.28,1.97,15.0,5.0,11.03,2499.895,4.0,-4.0,13.93,0.0,14.0,1445.135,4.0,6.0,10.8,877.875,0.0,0.0,0.0,39.245,0.0,-0.07,9.15,0.0,0.0,0.0,0.0,-0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,-30.25,0.0,0.12,-7.585,0.0,0.0,0.0
75%,173.0,19710.445,25.0,121.0,14600.83,23.0,23.0,113.6825,123.39,84.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,5759.26,93.51,3.0,180.0,114.64,20582.77,109.0,39.0,124.0,14.0,1502.15,1.99,11.0,1121.5125,2.27,3.0,325.0,1.09,4.0,374.5425,0.46,3.0,335.335,0.59,1.0,45.5,0.16,0.0,0.0,0.59,0.0,0.0,0.83,0.0,0.0,0.21,17.0,192.1275,-5.32,25.0,1009.7675,-6.84,9.0,183.74,5.08,23.0,27.0,19.475,4255.2925,11.0,12.0,24.78,21.0,24.0,2937.095,10.0,18.0,19.7,2172.35,1.0,5.0,1.47,618.29,4.0,1.88,456.15,3.0,0.93,272.94,3.0,1.0,343.6325,3.0,1.38,292.725,1.0,0.4,85.0,4.0,1.36,380.4825,4.0,1.79,320.985,1.0,1.38,101.64
max,190.0,38376.69,99.0,159.0,32456.05,84.0,31.0,206.8,220.18,172.0,1.0,1.0,1.0,1.0,1.0,1.0,176.0,14756.01,203.91,87.0,185.0,205.41,37692.69,158.0,52.0,227.0,116.0,7062.04,11.07,72.0,6445.18,14.02,98.0,3920.11,29.6,117.0,5744.2,11.13,55.0,5127.34,10.35,102.0,4143.31,31.06,49.0,4105.08,11.38,37.0,3624.93,18.87,26.0,1821.0,20.11,116.0,14921.48,56.07,100.0,6462.99,48.99,105.0,11055.52,28.16,31.0,120.0,59.19,15696.73,97.0,101.0,50.82,108.0,54.0,9254.43,76.0,132.0,100.46,11521.94,87.0,71.0,14.64,5213.3,45.0,15.02,4375.45,94.0,36.47,5389.88,86.0,12.27,3506.69,40.0,11.52,4439.19,98.0,60.76,3946.71,67.0,15.09,4305.96,64.0,18.36,4059.77,48.0,68.19,3360.46


In [6]:
# mask = (h1_stats.Thu == 0) & (h1_stats.Wed == 0) & (h1_stats.Tue == 0) & (h1_stats.Mon == 0) & (h1_stats.Sun == 0)
# h1_stats["WE"] = mask


In [7]:
mask = (h1_stats["StayDate"] < '2017-08-01')
test_mask = (h1_stats['AsOfDate'] == '2017-08-01')
h1_train = h1_stats.loc[mask].copy()
h1_test = h1_stats.loc[test_mask].copy()

X1_train = h1_train[X_cols].copy()
X1_test = h1_test[X_cols].copy()
y1_train = h1_train['ACTUAL_RoomsSold'].copy()
y1_test = h1_test['ACTUAL_RoomsSold'].copy()

# not sure why I need to do this again (it's done in agg.py), but here we are. 
X1_train["WeekNum"] = X1_train.WeekNum.astype('float')
X1_test["WeekNum"] = X1_test.WeekNum.astype('float')

In [61]:
[c for c in h1_stats.columns]

['id',
 'DOW',
 'RoomsOTB',
 'RevOTB',
 'CxlForecast',
 'TRN_RoomsOTB',
 'TRN_RevOTB',
 'TRN_CxlForecast',
 'StayDate',
 'WE',
 'WD',
 'STLY_StayDate',
 'DaysUntilArrival',
 'ADR_OTB',
 'SellingPrice',
 'AsOfDate',
 'STLY_AsOfDate',
 'RemSupply',
 'Mon',
 'Sat',
 'Sun',
 'Thu',
 'Tue',
 'Wed',
 'NONTRN_RoomsOTB',
 'NONTRN_RevOTB',
 'NONTRN_ADR_OTB',
 'NONTRN_CxlForecast',
 'ACTUAL_RoomsSold',
 'ACTUAL_ADR',
 'ACTUAL_RoomRev',
 'ACTUAL_NumCancels',
 'WeekNum',
 'TRN_ADR_OTB',
 'TM30_RoomsPickup',
 'TM30_RevPickup',
 'TM30_ADR_Pickup',
 'TM30_TRN_RoomsPickup',
 'TM30_TRN_RevPickup',
 'TM30_TRN_ADR_Pickup',
 'TM30_NONTRN_RoomsPickup',
 'TM30_NONTRN_RevPickup',
 'TM30_NONTRN_ADR_Pickup',
 'TM15_RoomsPickup',
 'TM15_RevPickup',
 'TM15_ADR_Pickup',
 'TM15_TRN_RoomsPickup',
 'TM15_TRN_RevPickup',
 'TM15_TRN_ADR_Pickup',
 'TM15_NONTRN_RoomsPickup',
 'TM15_NONTRN_RevPickup',
 'TM15_NONTRN_ADR_Pickup',
 'TM05_RoomsPickup',
 'TM05_RevPickup',
 'TM05_ADR_Pickup',
 'TM05_TRN_RoomsPickup',
 'TM05_TR

In [None]:
X1_train.shape

In [8]:
X1_train.head()

Unnamed: 0,WeekNum,RoomsOTB,RevOTB,CxlForecast,TRN_RoomsOTB,WE,DaysUntilArrival,ADR_OTB,SellingPrice,RemSupply,Mon,Sat,Sun,Thu,Tue,Wed,NONTRN_RoomsOTB,TM30_RoomsPickup,TM30_TRN_RoomsPickup,TM30_NONTRN_RoomsPickup,TM15_RoomsPickup,TM15_TRN_RoomsPickup,TM15_NONTRN_RoomsPickup,TM05_RoomsPickup,TM05_TRN_RoomsPickup,TM05_NONTRN_RoomsPickup,RoomsGapToLYA,RevGapToLYA,TRN_RoomsGapToLYA,TRN_RevGapToLYA,NONTRN_RoomsGapToLYA,RoomsOTB_Pace,RemSupply_Pace,SellingPrice_Pace,TRN_RoomsOTB_Pace,NONTRN_RoomsOTB_Pace,TM30_RoomsPickup_Pace,TM30_TRN_RoomsPickup_Pace,TM30_NONTRN_RoomsPickup_Pace,TM15_RoomsPickup_Pace,TM15_TRN_RoomsPickup_Pace,TM15_NONTRN_RoomsPickup_Pace,TM05_RoomsPickup_Pace,TM05_TRN_RoomsPickup_Pace,TM05_NONTRN_RoomsPickup_Pace
0,30.0,170.0,28570.36,22.0,137.0,False,0,168.06,179.12,39,0,0,1,0,0,0,33.0,20.0,15.0,5.0,15.0,10.0,5.0,8.0,9.0,-1.0,-2.0,-4224.25,-8.0,-4009.26,6.0,2.0,-2,25.75,8.0,-6.0,13.0,5.0,8.0,8.0,1.0,7.0,2.0,3.0,-1.0
1,31.0,178.0,29525.52,28.0,148.0,False,1,165.87,176.01,37,1,0,0,0,0,0,30.0,3.0,3.0,0.0,3.0,3.0,0.0,7.0,7.0,0.0,0.0,-2945.92,-15.0,-3984.54,15.0,3.0,-1,15.65,18.0,-15.0,0.0,-3.0,3.0,-4.0,-6.0,2.0,5.0,5.0,0.0
2,31.0,182.0,30820.89,29.0,158.0,False,2,169.35,178.09,34,0,0,0,0,1,0,24.0,2.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,-3235.06,-26.0,-5813.27,26.0,4.0,-1,14.95,30.0,-26.0,1.0,0.0,1.0,-4.0,-4.0,0.0,-2.0,-2.0,0.0
3,31.0,174.0,30144.62,30.0,152.0,False,3,173.24,181.69,43,0,0,0,0,0,1,22.0,-5.0,-5.0,0.0,-2.0,-2.0,0.0,1.0,1.0,0.0,8.0,-2312.89,-15.0,-4577.58,23.0,-1.0,3,17.76,22.0,-23.0,-6.0,-7.0,1.0,-3.0,-4.0,1.0,0.0,0.0,0.0
4,31.0,179.0,32412.13,35.0,149.0,False,4,181.07,187.83,43,0,0,0,1,0,0,30.0,6.0,4.0,2.0,2.0,0.0,2.0,2.0,0.0,2.0,1.0,-4464.21,-12.0,-4743.03,13.0,3.0,2,22.86,16.0,-13.0,-1.0,-4.0,3.0,0.0,-4.0,4.0,2.0,0.0,2.0


In [9]:
lm = LinearRegression()
lr_model = lm.fit(X1_train, y1_train)
scores = cross_val_score(lm, X1_train, y1_train, scoring='r2', cv=5)
scores.mean()

0.7200482002998536

In [10]:
lr_model.score(X1_test, y1_test)

-0.3065913778547442

In [11]:
rfm = RandomForestRegressor(n_jobs=-1, random_state=20)
rf_model = rfm.fit(X1_train, y1_train)
scores = cross_val_score(rfm, X1_train, y1_train, scoring='r2', cv=5)
scores.mean()

0.6629948201445237

In [12]:
rf_model.score(X1_test, y1_test)

0.34271206291251166

In [16]:
xgbm = XGBRegressor(n_jobs=-1, random_state=20)
xgb_model = xgbm.fit(X1_train, y1_train)
scores = cross_val_score(xgbm, X1_train, y1_train, scoring='r2', cv=5)
scores.mean()

0.6891447167302646

In [17]:
xgbm.score(X1_test, y1_test)

0.10568133288851345

## Random Forest had the highest preliminary test score.

Let's use RandomizedSearchCV to tune the parameters.

Parameters of random grid search
```
random_grid = {
    "n_estimators": range(200, 2000, 100),
    "max_features": ["auto", "sqrt"],
    "max_depth": range(10, 110, 11),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rf = RandomForestRegressor()
rf_random = (RandomizedSearchCV(rf, random_grid, verbose=2, n_iter=50, random_state=42, n_jobs=-1))

rf_random.fit(X1_train, y1_train)
```

Results of random grid search:

```
{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 43,
 'bootstrap': True}
```

Score: 0.6519058137402494

In [58]:
rf_grid = {
    "n_estimators": [405, 410, 415, 420, 425, 430],
    "max_features": ['auto', 'log2', 70],
    "max_depth": [41, 42, 43, 44, 45],
    "bootstrap": [True]
}
rfm = RandomForestRegressor()

rf_grid = GridSearchCV(rfm, rf_grid, n_jobs=-1, verbose=10, cv=5, random_seed=20)
rf_grid.fit(X1_train, y1_train)

TypeError: __init__() got an unexpected keyword argument 'random_seed'

In [53]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 44,
 'max_features': 'auto',
 'n_estimators': 420}

In [54]:
rf_grid.best_score_

0.656012479002637

In [37]:
# from keras.models import Sequential
# from keras.layers import LSTM


# lstm_model = Sequential()
# model.add(LSTM(4, input_shape=(31, len)