In [1]:
import pandas as pd
import numpy as np

from agg import prep_demand_features
from demand_features import X_cols, rf_cols

# Widen pandas' display limits so the wide feature frames print in full.
pd.set_option("display.max_rows", 160)
pd.set_option("display.max_columns", 250)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell text

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

In [2]:
# Load the pre-computed stats frame for hotel 1 (one row per id; the columns
# are listed in the next cell).
# NOTE(review): unpickling executes arbitrary code, so only load this file
# from a trusted source.
h1_stats = pd.read_pickle("../data/h1_stats.pick")

In [3]:
# Show every column name available in the stats frame.
list(h1_stats.columns)

['id',
 'DOW',
 'RoomsOTB',
 'RevOTB',
 'CxlForecast',
 'TRN_RoomsOTB',
 'TRN_RevOTB',
 'TRN_CxlForecast',
 'TRNP_RoomsOTB',
 'GRP_RoomsOTB',
 'CNT_RoomsOTB',
 'StayDate',
 'STLY_StayDate',
 'DaysUntilArrival',
 'Realized_Cxls',
 'SellingPrice',
 'TM30_RoomsOTB',
 'TM30_TRN_RoomsOTB',
 'TM15_RoomsOTB',
 'TM15_TRN_RoomsOTB',
 'TM05_RoomsOTB',
 'TM05_TRN_RoomsOTB',
 'AsOfDate',
 'STLY_AsOfDate',
 'RemSupply',
 'NONTRN_RoomsOTB',
 'NONTRN_RevOTB',
 'NONTRN_ADR_OTB',
 'NONTRN_CxlForecast',
 'ADR_OTB',
 'TRN_ADR_OTB',
 'ACTUAL_RoomsSold',
 'ACTUAL_ADR',
 'ACTUAL_RoomRev',
 'ACTUAL_TRN_RoomsSold',
 'ACTUAL_TRN_ADR',
 'ACTUAL_TRN_RoomRev',
 'ACTUAL_NumCancels',
 'ACTUAL_RoomsPickup',
 'ACTUAL_ADR_Pickup',
 'ACTUAL_RevPickup',
 'ACTUAL_TRN_RoomsPickup',
 'ACTUAL_TRN_ADR_Pickup',
 'ACTUAL_TRN_RevPickup',
 'ACTUAL_NONTRN_RoomsPickup',
 'ACTUAL_NONTRN_ADR_Pickup',
 'ACTUAL_NONTRN_RevPickup',
 'MonthNum',
 'DayOfWeek',
 'Mon',
 'Sat',
 'Sun',
 'Thu',
 'Tue',
 'Wed',
 'WE',
 'WeekNum',
 'week_of_ye

In [4]:
# Store RemSupply as float so it is numeric like the other model features.
# NOTE(review): the original dtype is not visible here — confirm why the cast is needed.
h1_stats["RemSupply"] = h1_stats["RemSupply"].astype(float)

In [5]:
# Train on every row whose stay date is before 2017-08-01; test on the
# snapshot taken as of 2017-08-01.
mask = h1_stats["StayDate"] < '2017-08-01'
test_mask = h1_stats['AsOfDate'] == '2017-08-01'

# Independent copies, so later edits never write back into h1_stats.
h1_train, h1_test = (h1_stats.loc[row_mask].copy() for row_mask in (mask, test_mask))

# Features are the rf_cols subset; the target is transient rooms pickup.
X1_train, X1_test = (part[rf_cols].copy() for part in (h1_train, h1_test))
y1_train, y1_test = (
    part['ACTUAL_TRN_RoomsPickup'].copy() for part in (h1_train, h1_test)
)

In [6]:
X1_train.shape

(11216, 81)

In [7]:
X1_train.head()

Unnamed: 0,week_of_year,RoomsOTB,RoomsOTB_STLY,RevOTB,RevOTB_STLY,CxlForecast,TRN_RoomsOTB,TRN_RoomsOTB_STLY,TRN_RevOTB,TRN_RevOTB_STLY,TRN_CxlForecast,WE,DaysUntilArrival,ADR_OTB,SellingPrice,RemSupply,Mon,Sat,Sun,Thu,Tue,Wed,ACTUAL_RoomsPickup_STLY,ACTUAL_ADR_Pickup_STLY,ACTUAL_RevPickup_STLY,ACTUAL_TRN_RoomsPickup_STLY,ACTUAL_TRN_ADR_Pickup_STLY,ACTUAL_TRN_RevPickup_STLY,OTB_GapToLYA_RoomsSold,OTB_GapToLYA_ADR,OTB_GapToLYA_RoomRev,OTB_GapToLYA_NumCancels,OTB_GapToLYA_TRN_RoomsSold,OTB_GapToLYA_TRN_ADR,OTB_GapToLYA_TRN_RoomRev,TRN_ADR_OTB,TM30_RoomsPickup,TM30_RevPickup,TM30_ADR_Pickup,TM30_TRN_RoomsPickup,TM30_TRN_RevPickup,TM30_TRN_ADR_Pickup,TM15_RoomsPickup,TM15_RevPickup,TM15_ADR_Pickup,TM15_TRN_RoomsPickup,TM15_TRN_RevPickup,TM15_TRN_ADR_Pickup,TM05_RoomsPickup,TM05_RevPickup,TM05_ADR_Pickup,TM05_TRN_RoomsPickup,TM05_TRN_RevPickup,TM05_TRN_ADR_Pickup,Pace_RoomsOTB,Pace_ADR_OTB,Pace_RevOTB,Pace_CxlForecast,Pace_RemSupply,Pace_SellingPrice,Pace_TRN_RoomsOTB,Pace_TRN_ADR_OTB,Pace_TRN_RevOTB,Pace_TM30_RoomsPickup,Pace_TM30_ADR_Pickup,Pace_TM30_RevPickup,Pace_TM30_TRN_RoomsPickup,Pace_TM30_TRN_ADR_Pickup,Pace_TM30_TRN_RevPickup,Pace_TM15_RoomsPickup,Pace_TM15_ADR_Pickup,Pace_TM15_RevPickup,Pace_TM15_TRN_RoomsPickup,Pace_TM15_TRN_ADR_Pickup,Pace_TM15_TRN_RevPickup,Pace_TM05_RoomsPickup,Pace_TM05_ADR_Pickup,Pace_TM05_RevPickup,Pace_TM05_TRN_RoomsPickup,Pace_TM05_TRN_ADR_Pickup,Pace_TM05_TRN_RevPickup
0,30.0,170.0,168.0,28570.36,24346.11,25.0,137.0,129.0,23869.77,19860.51,23.0,False,0.0,168.06,179.12,42.0,False,False,1,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-23.14,-4224.25,2.0,-8.0,-20.27,-4009.26,174.23,20.0,4588.44,8.18,15.0,3554.44,7.71,15.0,3316.35,5.13,10.0,2282.35,4.25,8.0,1821.23,2.94,9.0,2003.23,3.4,2.0,23.14,4224.25,3.0,1.0,25.75,8.0,20.27,4009.26,13.0,4.02,2905.19,5.0,2.9,1443.21,8.0,2.59,1893.79,1.0,1.26,538.81,2.0,1.57,730.36,3.0,2.04,912.36
1,31.0,178.0,175.0,29525.52,26061.4,31.0,148.0,130.0,25429.03,20926.29,30.0,False,1.0,165.87,176.01,40.0,True,False,0,False,False,False,3.0,0.4,518.2,3.0,0.27,518.2,0.0,-16.55,-2945.92,-16.0,-15.0,-10.58,-3984.54,171.82,3.0,1121.52,3.56,3.0,1121.52,4.18,3.0,988.72,2.8,3.0,988.72,3.27,7.0,1592.18,2.52,7.0,1592.18,2.76,3.0,16.95,3464.12,5.0,2.0,15.65,18.0,10.85,4502.74,0.0,-0.02,59.0,-3.0,-0.18,-384.98,-4.0,0.15,-500.11,-6.0,0.29,-821.09,5.0,2.23,1243.09,5.0,2.55,1243.09
2,31.0,182.0,178.0,30820.89,26730.73,35.0,158.0,128.0,27539.05,20870.68,34.0,False,2.0,169.35,178.09,40.0,False,False,0,False,True,False,4.0,1.4,855.1,4.0,1.54,855.1,0.0,-17.78,-3235.06,-22.0,-26.0,-9.71,-5813.27,174.3,2.0,940.63,3.35,2.0,940.63,3.8,2.0,695.79,1.99,2.0,695.79,2.23,0.0,222.25,1.23,0.0,222.25,1.41,4.0,19.18,4090.16,9.0,5.0,14.95,30.0,11.25,6668.37,1.0,-0.48,111.51,0.0,-0.05,128.53,-4.0,-1.11,-738.24,-4.0,-0.36,-598.22,-2.0,1.06,-109.35,-2.0,1.37,-109.35
3,31.0,174.0,175.0,30144.62,26469.93,38.0,152.0,130.0,27062.88,21123.5,36.0,False,3.0,173.24,181.69,51.0,False,False,0,False,False,True,7.0,1.66,1361.8,7.0,1.64,1361.8,8.0,-20.32,-2312.89,-26.0,-15.0,-13.92,-4577.58,178.05,-5.0,-372.97,2.75,-5.0,-372.97,3.3,-2.0,-217.34,0.73,-2.0,-217.34,0.91,1.0,237.75,0.37,1.0,237.75,0.4,-1.0,21.98,3674.69,10.0,11.0,17.76,22.0,15.56,5939.38,-6.0,-0.21,-1039.09,-7.0,0.77,-1022.07,-3.0,0.32,-439.59,-4.0,1.08,-520.12,0.0,0.54,117.15,0.0,0.72,117.15
4,31.0,179.0,176.0,32412.13,27065.32,42.0,149.0,133.0,27503.67,21878.04,38.0,False,4.0,181.07,187.83,50.0,False,False,0,True,False,False,4.0,1.49,882.6,4.0,1.64,882.6,1.0,-25.8,-4464.21,-37.0,-12.0,-18.45,-4743.03,184.59,6.0,1790.17,4.06,4.0,1289.51,3.8,2.0,610.48,1.4,0.0,109.82,0.74,2.0,481.46,0.67,0.0,-19.2,-0.13,3.0,27.29,5346.81,12.0,9.0,22.86,16.0,20.09,5625.63,-1.0,-0.75,-99.26,-4.0,-0.65,-582.9,0.0,1.04,240.44,-4.0,-0.28,-679.77,2.0,0.67,481.46,0.0,-0.13,-19.2


In [8]:
%%time
lm = LinearRegression()
lr_model = lm.fit(X1_train, y1_train)
scores = cross_val_score(lm, X1_train, y1_train, scoring='r2', cv=5)
scores.mean()

CPU times: user 4.99 s, sys: 6.67 s, total: 11.7 s
Wall time: 993 ms


0.6487081378768551

In [9]:
# R^2 of the linear model on the held-out snapshot.
r2_score(y1_test, lr_model.predict(X1_test))

0.3871581817989218

In [10]:
%%time
rfm = RandomForestRegressor(n_jobs=-1, random_state=20)
rf_model = rfm.fit(X1_train, y1_train)
scores = cross_val_score(rfm, X1_train, y1_train, scoring='r2', cv=5)
scores.mean()

CPU times: user 1min 9s, sys: 1.25 s, total: 1min 10s
Wall time: 30.8 s


0.7740308754588255

In [11]:
# R^2 of the random forest on the held-out snapshot.
r2_score(y1_test, rf_model.predict(X1_test))

0.6819918877114461

In [12]:
%%time
xgbm = XGBRegressor(n_jobs=-1, random_state=20)
xgb_model = xgbm.fit(X1_train, y1_train)
scores = cross_val_score(xgbm, X1_train, y1_train, scoring='r2', cv=5)
scores.mean()

CPU times: user 1h 29min 43s, sys: 34.5 s, total: 1h 30min 18s
Wall time: 7min 37s


0.771774576609926

In [13]:
# R^2 of the XGBoost model on the held-out snapshot. Scored through the
# fitted-model handle (xgb_model), matching the lr_model / rf_model cells.
xgb_model.score(X1_test, y1_test)

0.6284803585279483

### Including STLY OTB Pickup (instead of just pace). 

I am debating whether or not to include these. They could be redundant.

CV / test R² scores without NONTRN (random forest clearly wins out):

lr: 0.65199443203582 / 0.38715818179913886

rf: ~0.78 (exact CV score was not recorded) / 0.6781696834068323

xgb: 0.772595022467872 / 0.5452480935516137

And with STLY OTB cols, here were the results (test):

lr: 0.3871581817989218

rf: 0.6819918877114461

xgb: 0.6284803585279483


## Random Forest had the highest preliminary test score.

Let's use RandomizedSearchCV to tune the parameters.

Parameters of random grid search
```
random_grid = {
    "n_estimators": range(200, 2000, 100),
    "max_features": ["auto", "sqrt"],
    "max_depth": range(10, 110, 11),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rf = RandomForestRegressor()
rf_random = (RandomizedSearchCV(rf, random_grid, verbose=2, n_iter=50, random_state=42, n_jobs=-1))

rf_random.fit(X1_train, y1_train)
```

Results of random grid search:

```
{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 43,
 'bootstrap': True}
```

Score: 0.6519058137402494

In [None]:
# Exhaustive grid search to refine the random forest around the best
# RandomizedSearchCV result (n_estimators=500, max_depth=43, bootstrap=True).
rf_param_grid = {
    "n_estimators": range(300, 800, 50),
    # 1.0 means "consider every feature at each split", which is what 'auto'
    # meant for regressors. 'auto' was deprecated in scikit-learn 1.1 and
    # later removed, so the float keeps this cell runnable on current versions.
    "max_features": [1.0],
    # NOTE(review): this is a list of three depths (30, 60 and 2), not
    # range(30, 60, 2) — confirm that a depth of 2 is really intended.
    "max_depth": [30, 60, 2],
    "bootstrap": [True],
    "min_samples_split": [2, 3, 4, 7, 8, 9],
}

# Seeded like the baseline forest so repeated searches pick the same winner.
rfm = RandomForestRegressor(random_state=20)

# The parameter dict has its own name so `rf_grid` only ever refers to the
# fitted search object that the following cells inspect.
rf_grid = GridSearchCV(rfm, rf_param_grid, n_jobs=-1, verbose=10, cv=5)
rf_grid.fit(X1_train, y1_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


In [None]:
# Hyperparameter combination with the best mean cross-validated score.
rf_grid.best_params_

In [None]:
# Mean cross-validated score achieved by the best combination above.
rf_grid.best_score_

In [None]:
# from keras.models import Sequential
# from keras.layers import LSTM


# lstm_model = Sequential()
# model.add(LSTM(4, input_shape=(31, len)