# Modeling Demand

**Target variable**: `Proj_TRN_RoomsPickup` — how many transient rooms will be booked for each stay date, from this point (8/1/17) forward, at current prices? The models below are fit against the observed `ACTUAL_TRN_RoomsPickup` column.

In [1]:
import pandas as pd
import numpy as np

from agg import prep_demand_features
from demand_features import rf_cols, rf2_cols

# Raise pandas' display limits so wide feature frames are shown without truncation.
pd.set_option("display.max_rows", 160)
pd.set_option("display.max_columns", 250)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell text

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

DATE_FMT = "%Y-%m-%d"
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV, HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier


print(__doc__)

Automatically created module for IPython interactive environment


In [2]:
# Duplicate check on the rf_cols feature list: the printed list length and the
# displayed set size must agree. The assertion makes the check fail loudly
# instead of relying on someone comparing the two numbers by eye.
print(len(rf_cols))
assert len(set(rf_cols)) == len(rf_cols), "rf_cols contains duplicate feature names"
len(set(rf_cols))

81


81

In [3]:
# Duplicate check on the rf2_cols feature list: the printed list length and the
# displayed set size must agree. The assertion makes the check fail loudly
# instead of relying on someone comparing the two numbers by eye.
print(len(rf2_cols))
assert len(set(rf2_cols)) == len(rf2_cols), "rf2_cols contains duplicate feature names"
len(set(rf2_cols))

89


89

In [4]:
# Load the pickled stats table that every later cell filters and models from.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (i.e. produced by this project's own pipeline).
df_stats = pd.read_pickle("../data/h2_stats.pick")

## Splitting Up Our Data for Train/Test

Our training set will contain all dates prior to as_of_date.

Our testing set will contain 31 stay dates starting on as_of_date. Our predictions will be used to provide price recommendations later on.

In [5]:
# Snapshot date separating history from the forecast window. It is named once
# so the train and test filters cannot drift apart.
AS_OF_DATE = "2017-08-01"
# Column the models are trained to predict.
TARGET_COL = "ACTUAL_TRN_RoomsPickup"

# Train: every row whose stay date is strictly before the as-of date.
mask = df_stats["StayDate"] < AS_OF_DATE
# Test: the rows recorded on the as-of date itself.
test_mask = df_stats["AsOfDate"] == AS_OF_DATE
df_train = df_stats.loc[mask].copy()
df_test = df_stats.loc[test_mask].copy()

# A date-format or dtype mismatch would silently produce an empty split and
# only surface later as a confusing estimator error, so fail here instead.
assert len(df_train) > 0, f"no training rows with StayDate before {AS_OF_DATE}"
assert len(df_test) > 0, f"no test rows with AsOfDate equal to {AS_OF_DATE}"

# Feature matrices use the rf2_cols feature list; .copy() keeps them
# independent of df_train / df_test.
X_train = df_train[rf2_cols].copy()
X_test = df_test[rf2_cols].copy()
y_train = df_train[TARGET_COL].copy()
y_test = df_test[TARGET_COL].copy()

In [6]:
# (rows, feature columns) of the training matrix.
X_train.shape

(11216, 89)

In [7]:
# Preview the first rows of the training features as a sanity check.
X_train.head()

Unnamed: 0,week_of_year,RoomsOTB,RoomsOTB_STLY,RevOTB,RevOTB_STLY,CxlForecast,TRN_RoomsOTB,TRN_ADR_OTB,TRN_ADR_OTB_STLY,TRN_RoomsOTB_STLY,TRN_RevOTB,TRN_RevOTB_STLY,TRN_CxlForecast,TRNP_RoomsOTB,TRNP_ADR_OTB,TRNP_ADR_OTB_STLY,TRNP_RoomsOTB_STLY,TRNP_RevOTB,TRNP_RevOTB_STLY,TRNP_CxlForecast,WE,DaysUntilArrival,ADR_OTB,SellingPrice,RemSupply,Mon,Sat,Sun,Thu,Tue,Wed,ACTUAL_RoomsPickup_STLY,ACTUAL_ADR_Pickup_STLY,ACTUAL_RevPickup_STLY,ACTUAL_TRN_RoomsPickup_STLY,ACTUAL_TRN_ADR_Pickup_STLY,ACTUAL_TRN_RevPickup_STLY,ACTUAL_TRNP_RoomsPickup_STLY,ACTUAL_TRNP_ADR_Pickup_STLY,ACTUAL_TRNP_RevPickup_STLY,OTB_GapToLYA_RoomsSold,OTB_GapToLYA_ADR,OTB_GapToLYA_RoomRev,OTB_GapToLYA_NumCancels,OTB_GapToLYA_TRN_RoomsSold,OTB_GapToLYA_TRN_ADR,OTB_GapToLYA_TRN_RoomRev,OTB_GapToLYA_TRNP_RoomsSold,OTB_GapToLYA_TRNP_ADR,OTB_GapToLYA_TRNP_RoomRev,Pace_RoomsOTB,Pace_ADR_OTB,Pace_RevOTB,Pace_CxlForecast,Pace_RemSupply,Pace_SellingPrice,Pace_TRN_RoomsOTB,Pace_TRN_ADR_OTB,Pace_TRN_RevOTB,Pace_TRNP_RoomsOTB,Pace_TRNP_ADR_OTB,Pace_TRNP_RevOTB,Pace_TM30_RoomsPickup,Pace_TM30_ADR_Pickup,Pace_TM30_RevPickup,Pace_TM30_TRN_RoomsPickup,Pace_TM30_TRN_ADR_Pickup,Pace_TM30_TRN_RevPickup,Pace_TM30_TRNP_RoomsPickup,Pace_TM30_TRNP_ADR_Pickup,Pace_TM30_TRNP_RevPickup,Pace_TM15_RoomsPickup,Pace_TM15_ADR_Pickup,Pace_TM15_RevPickup,Pace_TM15_TRN_RoomsPickup,Pace_TM15_TRN_ADR_Pickup,Pace_TM15_TRN_RevPickup,Pace_TM15_TRNP_RoomsPickup,Pace_TM15_TRNP_ADR_Pickup,Pace_TM15_TRNP_RevPickup,Pace_TM05_RoomsPickup,Pace_TM05_ADR_Pickup,Pace_TM05_RevPickup,Pace_TM05_TRN_RoomsPickup,Pace_TM05_TRN_ADR_Pickup,Pace_TM05_TRN_RevPickup,Pace_TM05_TRNP_RoomsPickup,Pace_TM05_TRNP_ADR_Pickup,Pace_TM05_TRNP_RevPickup
0,30.0,212.0,34.0,23157.67,2368.18,10.0,103.0,115.26,54.31,15.0,11871.6,814.72,8.0,109.0,103.54,81.76,19.0,11286.07,1553.46,2.0,False,0.0,109.23,115.27,24.0,False,False,True,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-178.0,-39.58,-20789.49,-41.0,-88.0,-60.95,-11056.88,-90.0,-21.78,-9732.61,178.0,39.58,20789.49,-9.0,-187.0,53.31,88.0,60.95,11056.88,90.0,21.78,9732.61,9.0,13.43,1093.47,1.0,16.1,393.22,8.0,6.06,700.25,9.0,13.92,1230.9,3.0,15.93,692.65,6.0,6.52,538.25,7.0,12.18,795.92,0.0,14.69,221.42,7.0,4.89,574.5
1,31.0,189.0,48.0,22065.26,3342.22,12.0,149.0,121.95,66.11,16.0,18169.89,1057.76,8.0,40.0,97.38,71.39,32.0,3895.37,2284.46,4.0,False,1.0,116.75,118.95,49.0,True,False,False,False,False,False,3.0,-2.82,65.0,2.0,-7.35,0.0,0.0,0.0,0.0,-138.0,-49.94,-18658.04,33.0,-131.0,-63.19,-17112.13,-8.0,-25.99,-1610.91,141.0,47.12,18723.04,-10.0,-151.0,61.49,133.0,55.84,17112.13,8.0,25.99,1610.91,35.0,5.02,3096.35,5.0,6.18,729.1,30.0,3.92,2367.25,41.0,4.4,3894.25,14.0,5.76,1827.0,27.0,2.7,2067.25,28.0,-1.38,2294.22,11.0,1.11,1249.72,17.0,-2.9,1044.5
2,31.0,210.0,30.0,24525.32,1764.67,16.0,172.0,120.62,59.85,14.0,20745.85,837.96,12.0,38.0,99.46,57.92,16.0,3779.47,926.71,4.0,False,2.0,116.79,119.48,32.0,False,False,False,False,True,False,8.0,1.7,535.0,5.0,0.57,310.0,0.0,0.0,0.0,-172.0,-56.27,-22225.65,1.0,-153.0,-60.2,-19597.89,-22.0,-41.54,-2852.76,180.0,57.97,22760.65,3.0,-177.0,64.05,158.0,60.77,19907.89,22.0,41.54,2852.76,30.0,5.49,2806.9,11.0,10.1,1513.5,19.0,4.45,1293.4,33.0,3.15,2948.35,15.0,8.53,1792.95,18.0,2.16,1155.4,28.0,2.12,2305.27,10.0,4.1,1149.87,18.0,2.16,1155.4
3,31.0,218.0,80.0,25384.31,4959.01,19.0,178.0,121.01,68.83,16.0,21539.09,1101.3,13.0,40.0,96.13,60.28,64.0,3845.22,3857.71,6.0,False,3.0,116.44,120.04,27.0,False,False,False,False,False,True,21.0,1.12,1415.0,13.0,-4.99,750.0,2.0,0.9,180.0,-117.0,-53.33,-19010.3,27.0,-149.0,-57.17,-19687.79,26.0,-34.95,192.49,138.0,54.45,20425.3,9.0,-129.0,60.36,162.0,52.18,20437.79,-24.0,35.85,-12.49,39.0,3.55,3296.06,5.0,7.62,1191.06,34.0,0.78,2105.0,4.0,1.84,578.86,3.0,5.17,526.86,1.0,0.32,52.0,18.0,0.29,1119.12,1.0,0.18,109.12,17.0,0.14,1010.0
4,31.0,213.0,80.0,25259.44,5041.04,20.0,181.0,121.98,73.75,10.0,22077.97,737.5,14.0,32.0,99.42,61.48,70.0,3181.47,4303.54,6.0,False,4.0,118.59,114.64,33.0,False,False,False,True,False,False,33.0,3.51,2476.0,23.0,-2.77,1605.0,2.0,0.79,180.0,-100.0,-52.07,-17742.4,-14.0,-148.0,-51.0,-19735.47,40.0,-37.15,1302.07,133.0,55.58,20218.4,12.0,-121.0,42.48,171.0,48.23,21340.47,-38.0,37.94,-1122.07,11.0,3.33,1639.21,7.0,3.16,1346.21,4.0,0.64,293.0,5.0,1.55,685.21,3.0,1.24,527.21,2.0,0.49,158.0,-1.0,-0.07,-132.88,-1.0,-0.06,-132.88,0.0,0.0,0.0


## LINEAR REGRESSION

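Failed to generalize: the cross-validated $R^2$ on the training data is about 0.65, but $R^2$ on the held-out test set is about -0.01. Our target variable is not a linear combination of the rate & revenue features that we know have an impact on demand.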

In [8]:
%%time
lm = LinearRegression()
lr_model = lm.fit(X_train, y_train)
scores = cross_val_score(lm, X_train, y_train, scoring='r2', cv=5)
scores.mean()

CPU times: user 6.64 s, sys: 8.45 s, total: 15.1 s
Wall time: 1.25 s


0.6514330666407834

In [9]:
# Score the fitted linear model on the held-out test split (regressor .score is R^2).
lr_model.score(X_test, y_test)

-0.012249378141351519

## RANDOM FOREST MODEL

I had high hopes for RF, and it came through before: it worked because of the amount and quality of the features I have engineered, despite the small training set.

That's just not the case for H2, even after adding back in TRNT: the cross-validated $R^2$ below is about 0.60 and the test $R^2$ is about 0.48.

In [10]:
%%time
rfm = RandomForestRegressor(n_jobs=-1, random_state=21)
rf_model = rfm.fit(X_train, y_train)
scores = cross_val_score(rfm, X_train, y_train, scoring='r2', cv=5)
scores.mean()

CPU times: user 1min 16s, sys: 1.55 s, total: 1min 18s
Wall time: 29 s


0.6033335851924635

In [11]:
# Score the fitted baseline forest on the held-out test split (regressor .score is R^2).
rf_model.score(X_test, y_test)

0.4773217312648691

In [12]:
# Number of feature columns used for X_train / X_test (both are selected with rf2_cols).
len(rf2_cols)

89

## XGBOOST MODEL (GRADIENT BOOSTING TREES)

XGBoost failed to generalize, likely due to the small training sample. 

In [13]:
# %%time
# xgbm = XGBRegressor(n_jobs=-1, random_state=21)
# xgb_model = xgbm.fit(X_train, y_train)
# scores = cross_val_score(xgbm, X_train, y_train, scoring='r2', cv=5)
# scores.mean()

In [14]:
# xgbm.score(X_test, y_test)

## MOVING FORWARD WITH RANDOM FOREST....

The H2 model is not as good (not even close). I am hoping it can be fixed with hyperparameter tuning, but the gap is likely due to the features not predicting city demand as well as resort demand. After all, resorts tend to have more seasonal demand than city hotels.


## Successive Halving Grid Search

In [None]:
# Search space: 47 n_estimators x 19 max_depth x 9 min_samples_split
# = 8,037 candidates, all evaluated in the first halving iteration.
random_grid = {
    "n_estimators": range(80, 1020, 20),
    "max_depth": range(55, 92, 2),
    "min_samples_split": range(2, 20, 2),
}

# Seed the forest as well as the search. The search's random_state only controls
# how training rows are subsampled per iteration; an unseeded forest would still
# make candidate scores differ from run to run.
rf = RandomForestRegressor(random_state=20)

# The resource being halved is the number of training samples, starting at 100.
# n_jobs=-1 uses every available core, matching the other cells, rather than a
# machine-specific worker count.
rf_hgs = HalvingGridSearchCV(
    rf,
    random_grid,
    min_resources=100,
    cv=5,
    random_state=20,
    n_jobs=-1,
    verbose=10,
)

rf_hgs.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 9
n_possible_iterations: 5
min_resources_: 100
max_resources_: 11216
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 8037
n_resources: 100
Fitting 5 folds for each of 8037 candidates, totalling 40185 fits


Randomized halving did not improve the score much. The resulting params were:

```
{'n_estimators': 740,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 80}
```

Trying `HalvingGridSearchCV` now; maybe it can tell me something.

In [None]:
# Score the halving search's selected model on the held-out test split.
rf_hgs.score(X_test, y_test)

In [None]:
# Hyperparameter combination the halving search selected.
rf_hgs.best_params_

In [None]:
# results.to_csv("halving_random_results_h2.csv")

Parameters of random grid search
```
random_grid = {
    "n_estimators": range(200, 2000, 100),
    "max_features": ["auto", "sqrt"],
    "max_depth": range(10, 110, 11),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rf = RandomForestRegressor()
rf_random = (RandomizedSearchCV(rf, random_grid, verbose=2, n_iter=50, random_state=42, n_jobs=-1))

rf_random.fit(X_train, y_train)
```

Results of random grid search:

```
{'n_estimators': 500,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 43,
 'bootstrap': True}
```

Score: 0.6519058137402494

## Brute Force Hyperparameter Tuning (GridSearchCV)

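Best params thus far.

Setup params (note that `max_depth` is the three-element list `[30, 56, 2]`, not `range(30, 56, 2)`, so only depths 30, 56 and 2 were searched):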
```
GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [30, 56, 2],
                         'max_features': ['auto'],
                         'min_samples_split': [2, 3, 4, 8],
                         'n_estimators': range(300, 800, 40)},
             verbose=10)
```
Best resulting params:
```
{'bootstrap': True,
 'max_depth': 56,
 'max_features': 'auto',
 'min_samples_split': 3,
 'n_estimators': 300}
```

 $R^2$ CV score: `0.7785714200550233`


### Round 2 (Best Results, **Final Model**)


Param grid:
```
rf_grid = {
    "n_estimators": range(150, 500, 50),
    "max_features": ['auto'],
    "max_depth": range(32,56,2),
    "bootstrap": [True],
    "min_samples_split": [2, 3, 4]
}
```

And the **results**:
```
{'bootstrap': True,
 'max_depth': 48,
 'min_samples_split': 2,
 'n_estimators': 150}
```
$R^2$ CV score: `0.779336423856766`
 
### Round 3 (Worse than Round 2)

Param grid:
```
rf_grid = {
    "n_estimators": range(75, 225, 25),
    "max_depth": [47, 48, 49],
    "bootstrap": [True],
    "min_samples_split": [2],
}
```

And the **results**:

Best params:
```
{'bootstrap': True,
 'max_depth': 47,
 'min_samples_split': 2,
 'n_estimators': 125}
```
$R^2$ CV score: `0.7775378755829061`

In [None]:
# rf_grid = {
#     "n_estimators": range(75, 200, 25),
#     "max_depth": [47, 48],
#     "bootstrap": [True],
#     "min_samples_split": [2],
# }
# rfm = RandomForestRegressor()

# rf_grid = GridSearchCV(rfm, rf_grid, n_jobs=-1, verbose=10, cv=5)
# rf_grid.fit(X1_train, y1_train)

In [None]:
# rf_grid.best_params_

In [None]:
# rf_grid.best_score_

In [None]:
# rf_grid.score(X1_test, y1_test)

## Final Model

In [None]:
# Final model: the Round 2 grid-search winners (n_estimators, max_depth,
# min_samples_split), seeded for reproducibility and fit on all cores.
final_rf_params = {
    "n_estimators": 150,
    "max_depth": 48,
    "min_samples_split": 2,
    "bootstrap": True,
    "n_jobs": -1,
    "random_state": 20,
}
rf = RandomForestRegressor(**final_rf_params)

rf.fit(X_train, y_train)

In [None]:
# Score the final forest on the held-out test split (regressor .score is R^2).
rf.score(X_test, y_test)

## Now that we have our model, let's get it in the simulation so we can evaluate our results.

Head over to `demand_model_evaluation.ipynb` for more.