In [1]:
# first, let's fit our model, using our standard setup
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline

# read in data
# NOTE: this is an absolute, machine-specific location. It is kept in a single
# named constant so it is the only line to edit when running elsewhere.
DATA_PATH = '/Users/cameronlefevre/Data Science/coding/GA-DS-Class/ClassMaterial/Unit3/data/restaurants.csv'
df = pd.read_csv(DATA_PATH)
# drop the two date columns by reassignment rather than mutating in place
df = df.drop(columns=['calendar_date', 'visit_date'])

# fill missing values (every missing entry, in every column, becomes 0)
df = df.fillna(0)

# declare X & y
X = df.drop(columns='visitors')
y = df['visitors']

# make pipeline: target-encode the features, then gradient boosting
# random_state pins the boosting model so the scores below are reproducible
pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor(random_state=42))

# fit
pipe.fit(X, y)
# and score -- note this scores the same rows the model was fit on (in-sample)
pipe.score(X, y)

  elif pd.api.types.is_categorical(cols):


0.47122281492324647

In [4]:
# sum of squared residuals of the fitted pipeline on X
our_model_error = np.sum(np.square(y - pipe.predict(X)))

In [6]:
# sum of squared errors of a "naive" model that always predicts the mean of y
naive_model_error = np.sum(np.square(y - y.mean()))

In [7]:
# fraction of the naive model's squared error that our model removes
error_ratio = our_model_error / naive_model_error
R = 1 - error_ratio

In [8]:
# show the hand-computed value (1 - model squared error / naive squared error)
R

0.47122281492324647

In [9]:
# Build the feature-importance table from the second pipeline step.
# Assumes the encoder step keeps the columns in the same order as X -- verify
# if the pipeline ever changes.
fitted_regressor = pipe[1]
importance_table = pd.DataFrame({
    'Columns': X.columns,
    'Importance': fitted_regressor.feature_importances_,
})
# most important feature first
feats = importance_table.sort_values('Importance', ascending=False)
feats

Unnamed: 0,Columns,Importance
0,id,0.8758
1,day_of_week,0.105272
2,holiday,0.006767
7,reserve_visitors,0.004605
6,longitude,0.00437
5,latitude,0.002163
4,area,0.000551
3,genre,0.000473


In [10]:
# make a copy so the original X is left untouched
X_copy = X.copy()

# and shuffle the id column
# .sample(frac=1) returns every value in a random order; .values drops the
# shuffled index so the values are assigned positionally instead of being
# re-aligned back to their original rows.
# random_state makes the shuffle (and the score computed from it) repeatable.
X_copy['id'] = X_copy['id'].sample(frac=1, random_state=42).values

In [11]:
# score the already-fitted pipeline on the copy whose id column was shuffled
pipe.score(X_copy, y)

-0.36415907222346466

In [12]:
# let's do this for all of our columns
# The score on the untouched data does not change between iterations, so it is
# computed once here instead of being re-scored for every column.
baseline_score = pipe.score(X, y)

cols       = []
impact     = []
for column in X.columns:
    # shuffle one column at a time, leaving every other column intact
    X_copy         = X.copy()
    # seeded so the impact table is the same on every run
    X_copy[column] = X_copy[column].sample(frac=1, random_state=42).values
    # absolute change in score caused by scrambling this column
    total_impact   = np.abs(pipe.score(X_copy, y) - baseline_score)
    cols.append(column)
    impact.append(total_impact)
# and turn it into a dataframe, largest impact first
feats = pd.DataFrame({'Column': cols,
                      'Impact': impact}).sort_values(by='Impact', ascending=False)

In [13]:
# show the per-column impact table, sorted with the largest impact first
feats

Unnamed: 0,Column,Impact
0,id,0.833339
1,day_of_week,0.100391
2,holiday,0.006825
6,longitude,0.004624
7,reserve_visitors,0.004202
5,latitude,0.001481
3,genre,0.000383
4,area,0.000363


In [16]:
# work on a copy of X so the original features stay intact
X_copy = X.copy()

# start from an empty dataframe; one column of predictions is added per day
preds = pd.DataFrame()

# For each day in turn, overwrite every row's day_of_week with that day and
# store the pipeline's predictions under a column named after the day.
for fixed_day in ('Monday', 'Tuesday'):
    X_copy['day_of_week'] = fixed_day
    preds[fixed_day] = pipe.predict(X_copy)


In [17]:
# peek at the modified copy: day_of_week holds the last value assigned to it
X_copy.head()

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,Tuesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
1,air_ba937bf13d40fb24,Tuesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
2,air_ba937bf13d40fb24,Tuesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
3,air_ba937bf13d40fb24,Tuesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0
4,air_ba937bf13d40fb24,Tuesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,0.0


In [18]:
# one row per observation, one column of predictions per forced day_of_week
preds

Unnamed: 0,Monday,Tuesday
0,17.834932,18.915649
1,17.834932,18.915649
2,17.834932,18.915649
3,17.834932,18.915649
4,17.834932,18.915649
...,...,...
252103,37.035296,38.591633
252104,37.666372,39.837884
252105,37.621192,39.792704
252106,45.473085,46.051193


In [19]:
# average prediction across all rows, per forced day_of_week
preds.mean()

Monday     16.688220
Tuesday    17.939777
dtype: float64

In [21]:
# This is the marginal change in the average prediction when going from Monday
# to Tuesday, i.e. the model expects Tuesday to have about 1.25 more visitors
# than Monday. The first entry is NaN because diff() has nothing before it.
preds.mean().diff()

Monday          NaN
Tuesday    1.251556
dtype: float64

In [22]:
# Repeat the "force one day for every row" experiment for each distinct
# day_of_week value found in the data (in order of first appearance).
days_of_week = df['day_of_week'].unique()

# a scratch copy of X whose day_of_week column is overwritten on each pass
X_copy = X.copy()

# collect one array of predictions per day, keyed by the day's name
predictions_by_day = {}
for day in days_of_week:
    # pretend every row happened on this day
    X_copy['day_of_week'] = day
    # and record what the fitted pipeline predicts under that assumption
    predictions_by_day[day] = pipe.predict(X_copy)

# one column per day, in the same order the days were visited
preds = pd.DataFrame(predictions_by_day)

In [23]:
# predictions for every row under each forced day_of_week (columns are in order of first appearance)
preds

Unnamed: 0,Wednesday,Thursday,Friday,Saturday,Monday,Tuesday,Sunday
0,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
1,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
2,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
3,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
4,20.948331,20.163744,25.052949,27.507854,17.834932,18.915649,24.671304
...,...,...,...,...,...,...,...
252103,40.867392,40.262585,46.111691,50.812908,37.035296,38.591633,51.376021
252104,42.290367,41.685560,47.534667,52.110450,37.666372,39.837884,52.673564
252105,42.245186,41.640379,47.489486,51.791164,37.621192,39.792704,52.354277
252106,44.888994,44.941017,49.723604,51.307395,45.473085,46.051193,52.818498


In [26]:
# put the columns in calendar order (Monday first) before summarising
calendar_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
preds = preds.loc[:, calendar_order]

# average prediction per forced day
preds.mean()

Monday       16.688220
Tuesday      17.939777
Wednesday    19.355143
Thursday     18.938121
Friday       23.171111
Saturday     26.148835
Sunday       23.806839
dtype: float64

In [31]:
# Every day is compared to the previous day in the column order set above;
# the first entry is NaN because there is no earlier day to subtract.
preds.mean().diff()

Monday            NaN
Tuesday      1.251556
Wednesday    1.415366
Thursday    -0.417022
Friday       4.232989
Saturday     2.977724
Sunday      -2.341996
dtype: float64

In [30]:
# route pandas' .plot() through plotly (a global setting that also affects later plots)
pd.set_option('plotting.backend', 'plotly')

# day-over-day change in the mean prediction; the leading NaN (no previous day) is shown as 0
day_over_day_change = preds.mean().diff().fillna(0)
day_over_day_change.plot(title = 'Expected Impact for Different Days of the Week on Attendance')

In [32]:
!pip install pdpbox

Collecting pdpbox
  Downloading PDPbox-0.2.0.tar.gz (57.7 MB)
[K     |████████████████████████████████| 57.7 MB 95.5 MB/s eta 0:00:01     |███████▎                        | 13.2 MB 11.4 MB/s eta 0:00:04     |█████████████████▉              | 32.2 MB 11.4 MB/s eta 0:00:03     |█████████████████████▍          | 38.6 MB 11.4 MB/s eta 0:00:02
Building wheels for collected packages: pdpbox
  Building wheel for pdpbox (setup.py) ... [?25ldone
[?25h  Created wheel for pdpbox: filename=PDPbox-0.2.0-py3-none-any.whl size=57690722 sha256=e0ba36db8f83862e68240c6a8ef8f06761e74aedaa17b113027a661ee3d4936e
  Stored in directory: /Users/cameronlefevre/Library/Caches/pip/wheels/7f/7d/f5/136844ad90a5cfb60b92f6de7afadd0574042b6012a6d16e2b
Successfully built pdpbox
Installing collected packages: pdpbox
Successfully installed pdpbox-0.2.0
