In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
df = pd.read_csv('../data/restaurants.csv', parse_dates=['visit_date'])

In [4]:
def denote_null_values(df):
    """Denotes whether or not there are null values or not"""
    empty_cols_query = df.isnull().sum() > 0
    empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()
    for col in empty_df_cols:
        col_name = f"{col}_missing"
        df[col_name] = pd.isnull(df[col])
    return df

In [3]:
df.head()

Unnamed: 0,id,visit_date,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,2016-01-13,25,2016-01-13,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
1,air_ba937bf13d40fb24,2016-01-14,32,2016-01-14,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
2,air_ba937bf13d40fb24,2016-01-15,29,2016-01-15,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
3,air_ba937bf13d40fb24,2016-01-16,22,2016-01-16,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
4,air_ba937bf13d40fb24,2016-01-18,6,2016-01-18,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,


In [5]:
df = denote_null_values(df)

In [6]:
df.head()

Unnamed: 0,id,visit_date,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,air_ba937bf13d40fb24,2016-01-13,25,2016-01-13,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,True
1,air_ba937bf13d40fb24,2016-01-14,32,2016-01-14,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,True
2,air_ba937bf13d40fb24,2016-01-15,29,2016-01-15,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,True
3,air_ba937bf13d40fb24,2016-01-16,22,2016-01-16,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,True
4,air_ba937bf13d40fb24,2016-01-18,6,2016-01-18,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,,True


In [7]:
df['reserve_visitors'] = df['reserve_visitors'].fillna(0)

In [8]:
df.isnull().sum()

id                          0
visit_date                  0
visitors                    0
calendar_date               0
day_of_week                 0
holiday                     0
genre                       0
area                        0
latitude                    0
longitude                   0
reserve_visitors            0
reserve_visitors_missing    0
dtype: int64

In [9]:
date_col = df['visit_date'].copy()
df.drop('visit_date', axis=1, inplace=True)

In [10]:
df.drop('calendar_date', axis=1, inplace=True)

In [11]:
# let's initialize three different encoders
ordinal_encoder = ce.OrdinalEncoder()
onehot_encoder  = ce.OneHotEncoder()
target_encoder  = ce.TargetEncoder()

In [12]:
ordinal_encoder.fit_transform(df)

Unnamed: 0,id,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,1,25,1,0,1,1,35.658068,139.751599,0.0,True
1,1,32,2,0,1,1,35.658068,139.751599,0.0,True
2,1,29,3,0,1,1,35.658068,139.751599,0.0,True
3,1,22,4,0,1,1,35.658068,139.751599,0.0,True
4,1,6,5,0,1,1,35.658068,139.751599,0.0,True
...,...,...,...,...,...,...,...,...,...,...
252103,829,49,3,0,4,10,34.695124,135.197852,6.0,False
252104,829,60,4,0,4,10,34.695124,135.197852,37.0,False
252105,829,69,7,0,4,10,34.695124,135.197852,35.0,False
252106,829,31,5,1,4,10,34.695124,135.197852,3.0,False


In [16]:
ordinal_encoder.category_mapping[0]['mapping']

air_ba937bf13d40fb24      1
air_25e9888d30b386df      2
air_fd6aac1043520e83      3
air_64d4491ad8cdb1c6      4
air_ee3a01f0c71a769f      5
                       ... 
air_cf5ab75a0afb8af9    826
air_1c0b150f9e696a5f    827
air_900d755ebd2f7bbd    828
air_a17f0778617c76e2    829
NaN                      -2
Length: 830, dtype: int64

In [21]:
onehot_encoder.fit_transform(df)

Unnamed: 0,id_1,id_2,id_3,id_4,id_5,id_6,id_7,id_8,id_9,id_10,...,area_98,area_99,area_100,area_101,area_102,area_103,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,35.658068,139.751599,0.0,True
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,35.658068,139.751599,0.0,True
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,35.658068,139.751599,0.0,True
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,35.658068,139.751599,0.0,True
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,35.658068,139.751599,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34.695124,135.197852,6.0,False
252104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34.695124,135.197852,37.0,False
252105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34.695124,135.197852,35.0,False
252106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34.695124,135.197852,3.0,False


In [23]:
target_encoder.fit_transform(df.drop('visitors', axis=1), df['visitors'])

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
0,22.782609,19.230121,0,18.723532,19.609418,35.658068,139.751599,0.0,True
1,22.782609,18.922702,0,18.723532,19.609418,35.658068,139.751599,0.0,True
2,22.782609,23.072737,0,18.723532,19.609418,35.658068,139.751599,0.0,True
3,22.782609,26.313688,0,18.723532,19.609418,35.658068,139.751599,0.0,True
4,22.782609,17.177009,0,18.723532,19.609418,35.658068,139.751599,0.0,True
...,...,...,...,...,...,...,...,...,...
252103,44.595745,23.072737,0,22.582953,20.466463,34.695124,135.197852,6.0,False
252104,44.595745,26.313688,0,22.582953,20.466463,34.695124,135.197852,37.0,False
252105,44.595745,23.873362,0,22.582953,20.466463,34.695124,135.197852,35.0,False
252106,44.595745,17.177009,1,22.582953,20.466463,34.695124,135.197852,3.0,False


In [24]:
df.groupby('day_of_week')['visitors'].mean()

day_of_week
Friday       23.072737
Monday       17.177009
Saturday     26.313688
Sunday       23.873362
Thursday     18.922702
Tuesday      17.672137
Wednesday    19.230121
Name: visitors, dtype: float64

In [26]:
df['visit_date'] = date_col
df.sort_values(by=['id', 'visit_date'], ascending=True, inplace=True)

In [29]:
train = df.groupby('id').apply(lambda x: x.iloc[:-15])
test  = df.groupby('id').apply(lambda x: x.iloc[-15:])

In [30]:
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing,visit_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
air_00a91d42b08b08d9,166836,air_00a91d42b08b08d9,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-01
air_00a91d42b08b08d9,166837,air_00a91d42b08b08d9,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0,False,2016-07-02
air_00a91d42b08b08d9,166838,air_00a91d42b08b08d9,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-04
air_00a91d42b08b08d9,166839,air_00a91d42b08b08d9,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-05
air_00a91d42b08b08d9,166840,air_00a91d42b08b08d9,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-06


In [33]:
train.groupby('day_of_week')['visitors'].mean()

day_of_week
Friday       23.021194
Monday       17.240350
Saturday     26.250407
Sunday       23.842540
Thursday     18.929925
Tuesday      17.720715
Wednesday    19.231336
Name: visitors, dtype: float64

In [34]:
test.groupby('day_of_week')['visitors'].mean()

day_of_week
Friday       23.975677
Monday       15.777697
Saturday     27.286489
Sunday       24.565285
Thursday     18.789123
Tuesday      16.623824
Wednesday    19.204450
Name: visitors, dtype: float64

In [35]:
target_encoder.fit_transform(train.drop('visitors', axis=1), train['visitors'])

Unnamed: 0_level_0,Unnamed: 1_level_0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing,visit_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
air_00a91d42b08b08d9,166836,25.889401,23.021194,0,22.553878,26.268049,35.694003,139.753595,0.0,True,2016-07-01
air_00a91d42b08b08d9,166837,25.889401,26.250407,0,22.553878,26.268049,35.694003,139.753595,4.0,False,2016-07-02
air_00a91d42b08b08d9,166838,25.889401,17.240350,0,22.553878,26.268049,35.694003,139.753595,0.0,True,2016-07-04
air_00a91d42b08b08d9,166839,25.889401,17.720715,0,22.553878,26.268049,35.694003,139.753595,0.0,True,2016-07-05
air_00a91d42b08b08d9,166840,25.889401,19.231336,0,22.553878,26.268049,35.694003,139.753595,0.0,True,2016-07-06
...,...,...,...,...,...,...,...,...,...,...,...
air_fff68b929994bfbd,216629,5.137795,17.240350,0,13.298205,15.905319,35.708146,139.666288,0.0,True,2017-04-03
air_fff68b929994bfbd,216630,5.137795,17.720715,0,13.298205,15.905319,35.708146,139.666288,0.0,True,2017-04-04
air_fff68b929994bfbd,216631,5.137795,19.231336,0,13.298205,15.905319,35.708146,139.666288,2.0,False,2017-04-05
air_fff68b929994bfbd,216632,5.137795,18.929925,0,13.298205,15.905319,35.708146,139.666288,8.0,False,2017-04-06


In [36]:
target_encoder.transform(test.drop('visitors', axis=1), test['visitors'])

Unnamed: 0_level_0,Unnamed: 1_level_0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing,visit_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
air_00a91d42b08b08d9,167048,25.889401,19.231336,0,22.553878,26.268049,35.694003,139.753595,2.0,False,2017-04-05
air_00a91d42b08b08d9,167049,25.889401,18.929925,0,22.553878,26.268049,35.694003,139.753595,8.0,False,2017-04-06
air_00a91d42b08b08d9,167050,25.889401,23.021194,0,22.553878,26.268049,35.694003,139.753595,1.0,False,2017-04-07
air_00a91d42b08b08d9,167051,25.889401,26.250407,0,22.553878,26.268049,35.694003,139.753595,33.0,False,2017-04-08
air_00a91d42b08b08d9,167052,25.889401,17.240350,0,22.553878,26.268049,35.694003,139.753595,0.0,True,2017-04-10
...,...,...,...,...,...,...,...,...,...,...,...
air_fff68b929994bfbd,216643,5.137795,17.720715,0,13.298205,15.905319,35.708146,139.666288,0.0,True,2017-04-18
air_fff68b929994bfbd,216644,5.137795,19.231336,0,13.298205,15.905319,35.708146,139.666288,0.0,True,2017-04-19
air_fff68b929994bfbd,216645,5.137795,18.929925,0,13.298205,15.905319,35.708146,139.666288,1.0,False,2017-04-20
air_fff68b929994bfbd,216646,5.137795,23.021194,0,13.298205,15.905319,35.708146,139.666288,6.0,False,2017-04-21


In [37]:
from sklearn.pipeline import make_pipeline

In [38]:
gbm  = GradientBoostingRegressor()
pipe = make_pipeline(target_encoder, gbm)

In [39]:
pipe

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [45]:
train.drop('visit_date', axis=1, inplace=True)
test.drop('visit_date', axis=1, inplace=True)

In [46]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_test, y_test   = test.drop('visitors', axis=1), test['visitors']

In [47]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [49]:
pipe.predict(X_test.head())

array([23.775115  , 23.53959019, 29.27020293, 32.45187038, 20.69317674])

In [50]:
X_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
air_00a91d42b08b08d9,167048,air_00a91d42b08b08d9,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0,False
air_00a91d42b08b08d9,167049,air_00a91d42b08b08d9,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0,False
air_00a91d42b08b08d9,167050,air_00a91d42b08b08d9,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0,False
air_00a91d42b08b08d9,167051,air_00a91d42b08b08d9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0,False
air_00a91d42b08b08d9,167052,air_00a91d42b08b08d9,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True


In [53]:
pipe[0].mapping

{'id': id
  1      25.889401
  2       9.552239
  3       9.902887
  4       7.920792
  5      14.161017
           ...    
  827     9.608696
  828    20.206140
  829     5.137795
 -1      20.952877
 -2      20.952877
 Length: 831, dtype: float64,
 'day_of_week': day_of_week
  1    23.021194
  2    26.250407
  3    17.240350
  4    17.720715
  5    19.231336
  6    18.929925
  7    23.842540
 -1    20.952877
 -2    20.952877
 dtype: float64,
 'genre': genre
  1     22.553878
  2     23.080782
  3     18.745333
  4     22.568583
  5     19.621273
  6     22.286880
  7     22.194222
  8     19.801428
  9     21.188772
  10    38.681188
  11    13.298205
  12    23.451130
  13    25.163743
  14    29.411523
 -1     20.952877
 -2     20.952877
 dtype: float64,
 'area': area
  1      26.268049
  2      19.621851
  3      22.866587
  4      19.264599
  5      13.857438
           ...    
  101     4.528634
  102    14.327731
  103    28.028571
 -1      20.952877
 -2      20.952877
 Length: 

In [54]:
pipe.score(X_test, y_test)

0.4664350862073289

In [55]:
pipe

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [58]:
ordinal_encoder.fit_transform(X_train)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
air_00a91d42b08b08d9,166836,472,3,0,4,32,35.694003,139.753595,0.0,True
air_00a91d42b08b08d9,166837,472,4,0,4,32,35.694003,139.753595,4.0,False
air_00a91d42b08b08d9,166838,472,5,0,4,32,35.694003,139.753595,0.0,True
air_00a91d42b08b08d9,166839,472,6,0,4,32,35.694003,139.753595,0.0,True
air_00a91d42b08b08d9,166840,472,1,0,4,32,35.694003,139.753595,0.0,True
...,...,...,...,...,...,...,...,...,...,...
air_fff68b929994bfbd,216629,668,5,0,9,63,35.708146,139.666288,0.0,True
air_fff68b929994bfbd,216630,668,6,0,9,63,35.708146,139.666288,0.0,True
air_fff68b929994bfbd,216631,668,1,0,9,63,35.708146,139.666288,2.0,False
air_fff68b929994bfbd,216632,668,2,0,9,63,35.708146,139.666288,8.0,False


In [59]:
target_encoder = ce.TargetEncoder(cols=['id'])
onehot_encoder = ce.OneHotEncoder()
pipe1 = make_pipeline(target_encoder, onehot_encoder, gbm)

In [61]:
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id'], drop_invariant=False,
                               handle_missing='value', handle_unknown='value',
                               min_samples_leaf=1, return_df=True,
                               smoothing=1.0, verbose=0)),
                ('onehotencoder',
                 OneHotEncoder(cols=['day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use...
                                           learning_rate=0.1, loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                 

In [62]:
pipe1.score(X_test, y_test)

0.4705601119449213

In [64]:
pipe1.score(X_train, y_train)

0.4758186379012257

In [65]:
pipe1.score(X_test, y_test)

0.4705601119449213

In [66]:
def create_val_splits(df, val_units=15, return_val=False):
    """Function that will take in a dataset and split it up into training, validation, and test sets"""
    # split into training, validation, and test sets
    train = df.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
    test  = df.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
    
    if return_val: 
        val   = train.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
        train = train.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
        return train, val, test
    else:
        return train, test

In [68]:
train, test = create_val_splits(df)

In [72]:
# add in our visit_date column again
df['visit_date'] = date_col

In [77]:
grouping = df.groupby('id').apply(lambda x: x['visitors'].shift())

In [81]:
df['yesterday'] = grouping.values

In [79]:
grouping.loc['air_fff68b929994bfbd']

216408    NaN
216409    3.0
216410    3.0
216411    7.0
216412    6.0
         ... 
216643    3.0
216644    6.0
216645    2.0
216646    2.0
216647    4.0
Name: visitors, Length: 269, dtype: float64

In [82]:
df.head()

Unnamed: 0,id,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing,visit_date,yesterday
166836,air_00a91d42b08b08d9,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-01,
166837,air_00a91d42b08b08d9,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0,False,2016-07-02,35.0
166838,air_00a91d42b08b08d9,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-04,9.0
166839,air_00a91d42b08b08d9,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-05,20.0
166840,air_00a91d42b08b08d9,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-06,25.0


In [83]:
df['month'] = df.visit_date.dt.month

In [84]:
df.head()

Unnamed: 0,id,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing,visit_date,yesterday,month
166836,air_00a91d42b08b08d9,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-01,,7
166837,air_00a91d42b08b08d9,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0,False,2016-07-02,35.0,7
166838,air_00a91d42b08b08d9,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-04,9.0,7
166839,air_00a91d42b08b08d9,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-05,20.0,7
166840,air_00a91d42b08b08d9,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2016-07-06,25.0,7


In [85]:
df = denote_null_values(df)

In [86]:
df.isnull().sum()

id                            0
visitors                      0
day_of_week                   0
holiday                       0
genre                         0
area                          0
latitude                      0
longitude                     0
reserve_visitors              0
reserve_visitors_missing      0
visit_date                    0
yesterday                   829
month                         0
yesterday_missing             0
dtype: int64

In [88]:
df = df.bfill()

In [90]:
??create_val_splits

In [98]:
train, val, test = create_val_splits(df, return_val=True)

In [94]:
test.head()

Unnamed: 0,id,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors,reserve_visitors_missing,visit_date,yesterday,month,yesterday_missing
0,air_00a91d42b08b08d9,35,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0,False,2017-04-05,17.0,4,False
1,air_00a91d42b08b08d9,29,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0,False,2017-04-06,35.0,4,False
2,air_00a91d42b08b08d9,17,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0,False,2017-04-07,29.0,4,False
3,air_00a91d42b08b08d9,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0,False,2017-04-08,17.0,4,False
4,air_00a91d42b08b08d9,17,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0,True,2017-04-10,9.0,4,False


In [99]:
train.drop('visit_date', axis=1, inplace=True)
val.drop('visit_date', axis=1, inplace=True)
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val     = val.drop('visitors', axis=1), val['visitors']

In [100]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [101]:
pipe.score(X_val, y_val)

0.5162597129609128