In [2]:
# first, let's fit our model, using our standard setup
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline

# read in data
df = pd.read_csv('../data/restaurants.csv')
df.drop(['calendar_date', 'visit_date'], axis=1, inplace=True)

# fill missing values
df = df.fillna(0)

# declare X & y
X = df.drop('visitors', axis=1)
y = df['visitors']

# make pipeline
pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())

# fit
pipe.fit(X, y)
# and score
pipe.score(X, y)

0.47122281492324647

In [13]:
np.log(X)

TypeError: loop of ufunc does not support argument 0 of type str which has no callable log method

In [15]:
our_model_error = np.sum((y - pipe.predict(X))**2)

In [16]:
naive_model_error = np.sum((y - y.mean())**2)

In [10]:
1 - (our_model_error / naive_model_error)

0.47122281492324647

In [17]:
# let's create our feature importance dataframe
feats = pd.DataFrame({
    'Columns': X.columns,
    'Importance': pipe[1].feature_importances_
}).sort_values(by='Importance', ascending=False)

# and here we go
feats

Unnamed: 0,Columns,Importance
0,id,0.875794
1,day_of_week,0.105272
2,holiday,0.006767
7,reserve_visitors,0.004605
6,longitude,0.004496
5,latitude,0.002085
4,area,0.000509
3,genre,0.000473


In [18]:
feats['Importance'].sum()

1.0000000000000002

In [19]:
# make a copy
X_copy = X.copy()

# and shuffle the id column
X_copy['id'] = X_copy['id'].sample(frac=1).values

In [20]:
X['id']

0         air_ba937bf13d40fb24
1         air_ba937bf13d40fb24
2         air_ba937bf13d40fb24
3         air_ba937bf13d40fb24
4         air_ba937bf13d40fb24
                  ...         
252103    air_a17f0778617c76e2
252104    air_a17f0778617c76e2
252105    air_a17f0778617c76e2
252106    air_a17f0778617c76e2
252107    air_a17f0778617c76e2
Name: id, Length: 252108, dtype: object

In [21]:
X_copy['id']

0         air_c8265ecc116f2284
1         air_ba937bf13d40fb24
2         air_9d452a881f7f2bb7
3         air_79f528087f49df06
4         air_e9ebf7fc520ac76a
                  ...         
252103    air_caf996ac27206301
252104    air_5fbda8e9302f7c13
252105    air_e483f5b3c4f310e0
252106    air_f88898cd09f40496
252107    air_4ce7b17062a1bf73
Name: id, Length: 252108, dtype: object

In [22]:
# let's do this for all of our columns
cols       = []
impact     = []

for column in X.columns:

    X_copy         = X.copy()
    X_copy[column] = X_copy[column].sample(frac=1).values
    total_impact   = np.abs(pipe.score(X_copy, y) - pipe.score(X, y))
    cols.append(column)
    impact.append(total_impact)
    
# and turn it into a dataframe
feats = pd.DataFrame({'Column': cols,
                      'Impact': impact}).sort_values(by='Impact', ascending=False)


In [26]:
feats = feats['Impact'] / feats['Impact'].sum()

In [28]:
feats['Impact']

KeyError: 'Impact'

In [34]:
feats.cumsum() < .99

0     True
1     True
2     True
6    False
7    False
5    False
3    False
4    False
Name: Impact, dtype: bool

In [35]:
feats.cumsum()

0    0.874793
1    0.980896
2    0.988457
6    0.993355
7    0.997820
5    0.999235
3    0.999627
4    1.000000
Name: Impact, dtype: float64

In [37]:
(2**5)*500

16000

In [38]:
# make a copy of X -- makes it easier
X_copy = X.copy()
# an empty dataframe
preds  = pd.DataFrame()

# we'll change EVERY single value of day_of_week to monday
X_copy['day_of_week'] = 'Monday'
preds['Monday'] = pipe.predict(X_copy)

In [39]:
X_copy.head()

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
1,air_ba937bf13d40fb24,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
2,air_ba937bf13d40fb24,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
3,air_ba937bf13d40fb24,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
4,air_ba937bf13d40fb24,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0


In [40]:
preds

Unnamed: 0,Monday
0,17.834932
1,17.834932
2,17.834932
3,17.834932
4,17.834932
...,...
252103,37.035296
252104,37.666372
252105,37.621192
252106,45.473085


In [41]:
# we'll do a loop and derive the same values for each unique day of the week
days_of_week = df['day_of_week'].unique()

# make a copy of X -- makes it easier
X_copy = X.copy()
# an empty dataframe
preds  = pd.DataFrame()

# loop through each unique value in the day_of_week column
for day in days_of_week:
    # set the value for the entire column during that day
    X_copy['day_of_week'] = day
    # look at our new predicted values with the adjusted column
    preds[day] = pipe.predict(X_copy)

In [42]:
preds

Unnamed: 0,Wednesday,Thursday,Friday,Saturday,Monday,Tuesday,Sunday
0,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
1,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
2,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
3,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
4,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
...,...,...,...,...,...,...,...
252103,40.867392,40.262585,46.111691,50.812908,37.035296,38.591633,51.376021
252104,42.290367,41.685560,47.534667,52.110450,37.666372,39.837884,52.673564
252105,42.245186,41.640379,47.489486,51.791164,37.621192,39.792704,52.354277
252106,44.888994,44.941017,49.723604,51.307395,45.473085,46.051193,52.818498
