In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [3]:
df = pd.read_csv('../data/master.csv', parse_dates = ['visit_date'])

In [4]:
df.drop('calendar_date', axis=1, inplace=True)

In [5]:
df = df.fillna(0)

In [6]:
df.head()

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,2016-01-13,25,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
1,air_ba937bf13d40fb24,2016-01-14,32,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
2,air_ba937bf13d40fb24,2016-01-15,29,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
3,air_ba937bf13d40fb24,2016-01-16,22,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
4,air_ba937bf13d40fb24,2016-01-18,6,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0


In [7]:
te  = ce.TargetEncoder(cols=['id', 'area'])
ore = ce.OrdinalEncoder()
mod = GradientBoostingRegressor(verbose=1, random_state=42)

pipe = make_pipeline(te, ore, mod)

In [8]:
df.sort_values(by=['id', 'visit_date'], inplace=True)

In [9]:
df.groupby('id').mean()

Unnamed: 0_level_0,visitors,holiday,latitude,longitude,reserve_visitors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
air_00a91d42b08b08d9,26.081897,0.004310,35.694003,139.753595,7.379310
air_0164b9927d20bcc3,9.248322,0.033557,35.658068,139.751599,10.872483
air_0241aa3964b7f861,9.896465,0.042929,35.712607,139.779996,6.078283
air_0328696196e46f18,7.939655,0.051724,34.701279,135.528090,8.646552
air_034a3d5b40d5b1b1,14.828685,0.059761,34.692337,135.472229,8.494024
...,...,...,...,...,...
air_fea5dc9594450608,14.485401,0.036496,34.710896,137.725940,8.664234
air_fee8dcf4d619598e,26.027778,0.059028,34.695124,135.197853,8.739583
air_fef9ccb3ba0da2f7,9.620408,0.061224,34.815149,134.685353,9.061224
air_ffcc2d5087e1b476,20.242798,0.020576,35.658068,139.751599,7.893004


In [10]:
unique_ids = df['id'].unique().tolist()

In [11]:
for rest in unique_ids:
    temp_data = df[df['id'] == rest]
    test_data = temp_data.iloc[-15:]

In [12]:
df[df['id'] == unique_ids[0]].iloc[-15:]

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
167048,air_00a91d42b08b08d9,2017-04-05,35,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0
167049,air_00a91d42b08b08d9,2017-04-06,29,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0
167050,air_00a91d42b08b08d9,2017-04-07,17,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0
167051,air_00a91d42b08b08d9,2017-04-08,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0
167052,air_00a91d42b08b08d9,2017-04-10,17,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0
167053,air_00a91d42b08b08d9,2017-04-11,43,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0
167054,air_00a91d42b08b08d9,2017-04-12,28,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0
167055,air_00a91d42b08b08d9,2017-04-13,34,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,7.0
167056,air_00a91d42b08b08d9,2017-04-14,39,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0
167057,air_00a91d42b08b08d9,2017-04-17,19,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0


In [13]:
test = df.groupby('id').apply(lambda x: x.iloc[-15:])
train = df.groupby('id').apply(lambda x: x.iloc[:-15])

In [14]:
train.reset_index(drop=True, inplace=True)

In [15]:
val = train.groupby('id').apply(lambda x: x.iloc[-15:])
train = train.groupby('id').apply(lambda x: x.iloc[:-15])

In [16]:
# define some functions that we can reuse
def create_val_splits(df, val_units=15, return_val=False):
    """Function that will take in a dataset and split it up into training, validation, and test sets"""
    # split into training, validation, and test sets
    df = df.drop('visit_date', axis=1)
    train = df.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
    test  = df.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
    
    if return_val:
        val   = train.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
        train = train.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
        return train, val, test
    else:
        return train, test

In [17]:
train, val, test = create_val_splits(df, return_val=True)

In [18]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']

In [19]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.5358           15.43s
         2         236.4692           14.45s
         3         222.2922           13.83s
         4         210.6249           13.75s
         5         201.0720           13.39s
         6         193.0328           13.28s
         7         186.5980           13.18s
         8         181.1435           13.02s
         9         176.7334           12.84s
        10         173.0381           12.72s
        20         155.6810           11.10s
        30         151.2830            9.74s
        40         149.5396            8.49s
        50         148.3023            7.25s
        60         147.1310            5.85s
        70         146.5496            4.38s
        80         145.9973            2.92s
        90         145.6742            1.45s
       100         145.2671            0.00s


Pipeline(steps=[('targetencoder', TargetEncoder(cols=['id', 'area'])),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['day_of_week', 'genre'],
                                mapping=[{'col': 'day_of_week',
                                          'data_type': dtype('O'),
                                          'mapping': Friday       1
Saturday     2
Monday       3
Tuesday      4
Wednesday    5
Thursday     6
Sunday       7
NaN         -2
dtype: int64},
                                         {'col': 'genre',
                                          'data_type': dtype('O'),
                                          'mapping': Italian/French                   1
Izakaya                          2
Dining bar                       3
Cafe/Sweets                      4
Japanese food                    5
Western food                     6
Okonomiyaki/Monja/Teppanyaki     7
Other                            8
Yakiniku/Korean food             9
Asian              

In [20]:
pipe[-1].estimators_

array([[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                              random_state=RandomState(MT19937) at 0x20518D4EA40)],
       [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                              random_state=RandomState(MT19937) at 0x20518D4EA40)],
       [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                              random_state=RandomState(MT19937) at 0x20518D4EA40)],
       [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                              random_state=RandomState(MT19937) at 0x20518D4EA40)],
       [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                              random_state=RandomState(MT19937) at 0x20518D4EA40)],
       [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                              random_state=RandomState(MT19937) at 0x20518D4EA40)],
       [DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
             

In [21]:
pipe.predict(X_val)

array([33.9144731 , 33.44882017, 21.87924511, ...,  4.90443237,
        5.12950757,  4.51168977])

In [22]:
pipe.score(X_val, y_val)

0.4934377820532636

In [23]:
df['year'] = df['visit_date'].dt.year
df['month'] = df['visit_date'].dt.month
df['day'] = df['visit_date'].dt.day
df['quarter'] = df['visit_date'].dt.quarter
df['time'] = (df['visit_date'] - df['visit_date'].min()).dt.days

In [24]:
train, val, test = create_val_splits(df, return_val=True)

In [25]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']

In [26]:
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.5358           22.87s
         2         236.4692           23.57s
         3         222.2922           23.51s
         4         210.6249           23.45s
         5         201.0720           23.01s
         6         193.0328           22.29s
         7         186.5980           22.11s
         8         181.1435           21.83s
         9         176.7334           21.57s
        10         173.0381           21.31s
        20         155.6810           19.57s
        30         151.0803           17.72s
        40         148.7138           15.66s
        50         147.2701           13.35s
        60         146.0362           10.63s
        70         145.0940            8.05s
        80         144.1010            5.45s
        90         143.5775            2.74s
       100         143.0504            0.00s


0.5012131202618897

In [27]:
pipe[-1].set_params(n_estimators=200, random_state=42)

GradientBoostingRegressor(n_estimators=200, random_state=42, verbose=1)

In [28]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.5358           50.54s
         2         236.4692           51.68s
         3         222.2922           51.03s
         4         210.6249           50.37s
         5         201.0720           49.22s
         6         193.0328           47.63s
         7         186.5980           47.20s
         8         181.1435           46.85s
         9         176.7334           46.94s
        10         173.0381           46.51s
        20         155.6810           43.27s
        30         151.0803           41.31s
        40         148.7138           39.65s
        50         147.2701           37.84s
        60         146.0362           35.18s
        70         145.0940           32.57s
        80         144.1010           30.01s
        90         143.5775           27.48s
       100         143.0504           25.07s
       200         140.4717            0.00s


Pipeline(steps=[('targetencoder', TargetEncoder(cols=['id', 'area'])),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['day_of_week', 'genre'],
                                mapping=[{'col': 'day_of_week',
                                          'data_type': dtype('O'),
                                          'mapping': Friday       1
Saturday     2
Monday       3
Tuesday      4
Wednesday    5
Thursday     6
Sunday       7
NaN         -2
dtype: int64},
                                         {'col': 'genre',
                                          'data_type': dtype('O'),
                                          'mapping': Italian/French                   1
Izakaya                          2
Dining bar                       3
Cafe/Sweets                      4
Japanese food                    5
Western food                     6
Okonomiyaki/Monja/Teppanyaki     7
Other                            8
Yakiniku/Korean food             9
Asian              

In [29]:
pipe.score(X_val, y_val)

0.5058816354339639

In [30]:
df

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,year,month,day,quarter,time
166836,air_00a91d42b08b08d9,2016-07-01,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,1,3,182
166837,air_00a91d42b08b08d9,2016-07-02,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0,2016,7,2,3,183
166838,air_00a91d42b08b08d9,2016-07-04,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,4,3,185
166839,air_00a91d42b08b08d9,2016-07-05,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,5,3,186
166840,air_00a91d42b08b08d9,2016-07-06,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,6,3,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216643,air_fff68b929994bfbd,2017-04-18,6,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,2017,4,18,2,473
216644,air_fff68b929994bfbd,2017-04-19,2,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,2017,4,19,2,474
216645,air_fff68b929994bfbd,2017-04-20,2,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,1.0,2017,4,20,2,475
216646,air_fff68b929994bfbd,2017-04-21,4,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,6.0,2017,4,21,2,476


In [31]:
df

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,year,month,day,quarter,time
166836,air_00a91d42b08b08d9,2016-07-01,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,1,3,182
166837,air_00a91d42b08b08d9,2016-07-02,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0,2016,7,2,3,183
166838,air_00a91d42b08b08d9,2016-07-04,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,4,3,185
166839,air_00a91d42b08b08d9,2016-07-05,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,5,3,186
166840,air_00a91d42b08b08d9,2016-07-06,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,2016,7,6,3,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216643,air_fff68b929994bfbd,2017-04-18,6,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,2017,4,18,2,473
216644,air_fff68b929994bfbd,2017-04-19,2,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0,2017,4,19,2,474
216645,air_fff68b929994bfbd,2017-04-20,2,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,1.0,2017,4,20,2,475
216646,air_fff68b929994bfbd,2017-04-21,4,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,6.0,2017,4,21,2,476


In [32]:
df['yesterday'] = df.groupby('id')['visitors'].shift(1)
df['last_week'] = df.groupby('id')['visitors'].shift(7)
df['seven_day_ma'] = df.groupby('id')['visitors'].rolling(7).mean().shift().values

In [33]:
df = df.bfill()

In [34]:
train, val, test = create_val_splits(df, return_val=True)

In [35]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']

In [36]:
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.1927            1.13m
         2         235.4788            1.15m
         3         220.7651            1.14m
         4         208.6898            1.13m
         5         198.6838            1.12m
         6         190.1258            1.11m
         7         183.0460            1.10m
         8         177.2780            1.09m
         9         172.4111            1.09m
        10         168.0901            1.08m
        20         148.1283            1.03m
        30         142.4784           58.37s
        40         140.1507           55.06s
        50         138.8598           52.31s
        60         137.5969           50.29s
        70         136.6072           48.05s
        80         135.8504           45.25s
        90         135.2389           41.89s
       100         134.8387           38.55s
       200         131.3272            0.00s


0.5279998593000268

In [37]:
df['visitors'].rolling(3).mean()

166836          NaN
166837          NaN
166838    21.333333
166839    18.000000
166840    24.666667
            ...    
216643     5.333333
216644     3.666667
216645     3.333333
216646     2.666667
216647     3.666667
Name: visitors, Length: 252108, dtype: float64

In [38]:
pipe[0].set_params(min_samples_leaf=30)

TargetEncoder(cols=['id', 'area'], min_samples_leaf=30)

In [39]:
pipe

Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'area'], min_samples_leaf=30)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['day_of_week', 'genre'],
                                mapping=[{'col': 'day_of_week',
                                          'data_type': dtype('O'),
                                          'mapping': Friday       1
Saturday     2
Monday       3
Tuesday      4
Wednesday    5
Thursday     6
Sunday       7
NaN         -2
dtype: int64},
                                         {'col': 'genre',
                                          'data_type': dtype('O'),
                                          'mapping': Italian/French                   1
Izakaya                          2
Dining bar                       3
Cafe/Sweets                      4
Japanese food                    5
Western food                     6
Okonomiyaki/Monja/Teppanyaki     7
Other                            8
Yakiniku/Korean 

In [40]:
max_features = [0.8, 0.7, 0.6]

for feature in max_features:
    pipe[-1].set_params(max_features=feature)
    pipe.fit(X_train, y_train)
    print(feature, pipe.score(X_train, y_train), pipe.score(X_val, y_val))

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.5359           51.55s
         2         235.9611           51.68s
         3         221.4186           52.01s
         4         209.2761           52.72s
         5         199.2333           52.03s
         6         190.7932           52.12s
         7         183.8676           51.70s
         8         178.1020           51.00s
         9         172.9892           50.68s
        10         168.8832           50.29s
        20         149.1903           48.26s
        30         143.2678           45.66s
        40         140.6663           43.31s
        50         139.1845           40.73s
        60         137.9940           38.11s
        70         137.2406           35.30s
        80         136.4320           32.59s
        90         135.8166           29.94s
       100         135.0330           27.30s
       200         131.3139            0.00s
0.8 0.5215435977229348 0.5143759000198023


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.4261           45.57s
         2         236.2229           46.73s
         3         221.5749           47.41s
         4         209.4419           47.48s
         5         199.3300           48.16s
         6         191.0521           48.50s
         7         183.9289           48.80s
         8         177.9130           48.79s
         9         173.0272           48.41s
        10         168.9230           47.84s
        20         149.1718           45.06s
        30         143.4339           43.75s
        40         140.5762           41.64s
        50         139.1375           38.99s
        60         137.5896           36.67s
        70         136.9449           34.06s
        80         136.1080           31.18s
        90         135.6201           28.40s
       100         135.1805           25.69s
       200         131.9783            0.00s
0.7 0.51912265095304 0.5320209959773362


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.2919           35.62s
         2         236.2726           38.51s
         3         221.9131           38.28s
         4         209.8009           38.12s
         5         199.7036           37.95s
         6         191.2424           38.74s
         7         185.0240           38.27s
         8         178.8343           38.26s
         9         173.8896           37.92s
        10         169.7068           37.72s
        20         149.5613           38.53s
        30         143.7274           36.93s
        40         141.2144           34.65s
        50         139.5877           32.41s
        60         138.5291           30.06s
        70         137.7149           27.89s
        80         137.2356           25.64s
        90         136.7091           23.55s
       100         135.9259           21.32s
       200         132.3504            0.00s
0.6 0.5177666979351339 0.5340691915987399


In [41]:
pipe[-1].set_params(max_features=0.8)

GradientBoostingRegressor(max_features=0.8, n_estimators=200, random_state=42,
                          verbose=1)

In [42]:
estimators = [150, 200]
tree_depth = [3, 4]
cv_scores  = []

for estimator in estimators:
    for depth in tree_depth:
        pipe[-1].set_params(n_estimators=estimator, max_depth=depth)
        print(f"Fitting for {estimator}, {depth}")
        pipe.fit(X_train, y_train)
        cv_scores.append((pipe.score(X_val, y_val), estimator, depth))

Fitting for 150, 3


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.5359           37.99s
         2         235.9611           38.26s
         3         221.4186           38.66s
         4         209.2761           38.51s
         5         199.2333           38.05s
         6         190.7932           38.14s
         7         183.8676           37.77s
         8         178.1020           37.42s
         9         172.9892           37.55s
        10         168.8832           37.39s
        20         149.1903           34.79s
        30         143.2678           32.38s
        40         140.6663           29.69s
        50         139.1845           26.84s
        60         137.9940           24.02s
        70         137.2406           21.32s
        80         136.4320           18.66s
        90         135.8166           16.00s
       100         135.0330           13.28s
Fitting for 150, 4


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         251.8314           54.39s
         2         233.2563           52.62s
         3         217.9332           52.48s
         4         205.5422           52.31s
         5         195.2023           51.68s
         6         186.5320           51.62s
         7         179.3134           52.17s
         8         173.3565           51.46s
         9         168.2083           51.01s
        10         164.0142           50.57s
        20         144.8123           46.54s
        30         139.4865           42.73s
        40         136.6229           39.81s
        50         134.7662           36.10s
        60         133.2651           32.41s
        70         132.6331           29.29s
        80         131.6709           25.83s
        90         130.8272           22.30s
       100         130.2053           18.62s
Fitting for 200, 3


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         253.5359           52.73s
         2         235.9611           52.57s
         3         221.4186           54.50s
         4         209.2761           56.08s
         5         199.2333           56.18s
         6         190.7932           57.28s
         7         183.8676           57.53s
         8         178.1020           56.80s
         9         172.9892           56.97s
        10         168.8832           56.74s
        20         149.1903           51.72s
        30         143.2678           48.39s
        40         140.6663           45.62s
        50         139.1845           44.06s
        60         137.9940           41.09s
        70         137.2406           38.05s
        80         136.4320           34.99s
        90         135.8166           32.21s
       100         135.0330           28.93s
       200         131.3139            0.00s
Fitting for 200, 4


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         251.8314            1.44m
         2         233.2563            1.32m
         3         217.9332            1.38m
         4         205.5422            1.39m
         5         195.2023            1.40m
         6         186.5320            1.38m
         7         179.3134            1.34m
         8         173.3565            1.32m
         9         168.2083            1.33m
        10         164.0142            1.34m
        20         144.8123            1.24m
        30         139.4865            1.16m
        40         136.6229            1.07m
        50         134.7662           58.95s
        60         133.2651           54.85s
        70         132.6331           50.82s
        80         131.6709           47.29s
        90         130.8272           43.08s
       100         130.2053           38.97s
       200         125.5299            0.00s


In [43]:
cv_scores

[(0.5398869304846574, 150, 3),
 (0.5459489278277129, 150, 4),
 (0.5143759000198023, 200, 3),
 (0.538377878520546, 200, 4)]

In [44]:
pipe[-1].set_params(n_estimators=200, max_depth=4)

GradientBoostingRegressor(max_depth=4, max_features=0.8, n_estimators=200,
                          random_state=42, verbose=1)

In [45]:
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])

In [46]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


      Iter       Train Loss   Remaining Time 
         1         254.3975            1.68m
         2         235.3831            1.52m
         3         219.8009            1.44m
         4         207.0906            1.39m
         5         196.5499            1.34m
         6         187.7968            1.35m
         7         180.3819            1.33m
         8         174.3791            1.29m
         9         169.0915            1.28m
        10         164.7840            1.27m
        20         145.4972            1.15m
        30         140.1917            1.08m
        40         137.3015            1.02m
        50         136.0072           57.17s
        60         134.7212           53.36s
        70         133.6475           49.88s
        80         132.4805           46.50s
        90         131.9315           42.60s
       100         131.2377           38.69s
       200         126.3152            0.00s


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'area'], min_samples_leaf=30)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['day_of_week', 'genre'],
                                mapping=[{'col': 'day_of_week',
                                          'data_type': dtype('O'),
                                          'mapping': Friday       1
Saturday     2
Monday       3
Tuesday      4
Wednesday    5
Thursday     6
Sunday       7
NaN         -2
dtype: int64},
                                         {'col': 'genre',
                                          'data_type': dtype('O'),
                                          'mapping': Italian/French                   1
Izakaya                          2
Dining bar                       3
Cafe/Sweets                      4
Japanese food                    5
Western food                     6
Okonomiyaki/Monja/Teppanyaki     7
Other                            8
Yakiniku/Korean 

In [47]:
X_test, y_test = test.drop('visitors', axis=1), test['visitors']
pipe.score(X_test, y_test)

0.5202367417033456

In [None]:
X_test, y_test = test.drop('visitors', axis=1), test['visitors']
pipe.score(X_test, y_test)