In [1]:
import pandas as pd
import numpy as np 
import category_encoders as ce

In [4]:
df = pd.read_csv('/Users/ethanalter/Dropbox (Personal)/GA-4K-DataScience/gazelle-4K/data_master/master.csv', parse_dates = ['visit_date'])

In [5]:
df.head()

Unnamed: 0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_ba937bf13d40fb24,2016-01-13,25,Wednesday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
1,air_ba937bf13d40fb24,2016-01-14,32,Thursday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
2,air_ba937bf13d40fb24,2016-01-15,29,Friday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
3,air_ba937bf13d40fb24,2016-01-16,22,Saturday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,
4,air_ba937bf13d40fb24,2016-01-18,6,Monday,0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,


In [6]:
df.describe()

Unnamed: 0,visitors,holiday,latitude,longitude,reserve_visitors
count,252108.0,252108.0,252108.0,252108.0,108394.0
mean,20.973761,0.050673,35.613121,137.357865,16.699808
std,16.757007,0.219329,2.044473,3.671577,17.388735
min,1.0,0.0,33.211967,130.195555,1.0
25%,9.0,0.0,34.692337,135.341564,4.0
50%,17.0,0.0,35.658068,139.670038,10.0
75%,29.0,0.0,35.694003,139.751599,24.0
max,877.0,1.0,44.020632,144.273398,96.0


In [7]:
ore = ce.OrdinalEncoder()

In [8]:
X = df.drop(['visit_date', 'visitors'], axis = 1)
y = df['visitors']

In [9]:
ore.fit_transform(X)
#fit creates the mappings 
#transform returns the modified version of X in a view - does NOT save in place 
#you can do these separately 
#once you fit, you can tranform new data (new samples) based on the existing mappings 
#important to understand that these are 2 different steps 

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,1,1,0,1,1,35.658068,139.751599,
1,1,2,0,1,1,35.658068,139.751599,
2,1,3,0,1,1,35.658068,139.751599,
3,1,4,0,1,1,35.658068,139.751599,
4,1,5,0,1,1,35.658068,139.751599,
...,...,...,...,...,...,...,...,...
252103,829,3,0,4,10,34.695124,135.197852,6.0
252104,829,4,0,4,10,34.695124,135.197852,37.0
252105,829,7,0,4,10,34.695124,135.197852,35.0
252106,829,5,1,4,10,34.695124,135.197852,3.0


#### One Hot Encoding - it's the default encoding method, but it has some issues 
* can be inefficient or difficult to interpret if you're working with high-cardinality data 
* paticular pathology for tree-based models: tree-based models split on individual values, so you'll be splitting on (or cleaving off) very small fractions of your dataset. Tree will likely not be able to split on high cardinality categorical variables. You could end up with a very important column that the tree will have a hard time actually splitting on. 
    * Another way of putting it: ID / restaurant could have a lot of predictive validity but not be pulled in to build the model because the tree wants to create splits with roughly equal proportions of remaining obvervations on each side 

Target encoding - similar to doing a groupby and then a transform 

In [10]:
te = ce.TargetEncoder()

In [11]:
te.fit_transform(X,y)
#for target encoding, you need to pass in y 

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,22.782609,19.230121,0,18.723532,19.609418,35.658068,139.751599,
1,22.782609,18.922702,0,18.723532,19.609418,35.658068,139.751599,
2,22.782609,23.072737,0,18.723532,19.609418,35.658068,139.751599,
3,22.782609,26.313688,0,18.723532,19.609418,35.658068,139.751599,
4,22.782609,17.177009,0,18.723532,19.609418,35.658068,139.751599,
...,...,...,...,...,...,...,...,...
252103,44.595745,23.072737,0,22.582953,20.466463,34.695124,135.197852,6.0
252104,44.595745,26.313688,0,22.582953,20.466463,34.695124,135.197852,37.0
252105,44.595745,23.873362,0,22.582953,20.466463,34.695124,135.197852,35.0
252106,44.595745,17.177009,1,22.582953,20.466463,34.695124,135.197852,3.0


Look at what day of week says: it's basically saying that the average number of visitors (the target) on the day of week in row 0 is 19.23. 

**Advantages of target encoding**: 
* Category values are coherent and actually mean something (as opposed to ordinal)
* You don't need to create a million columns (as opposed to one-hot) 

some fun terms: 
* Info gain 
* Entropy 

In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

Chain different encoding steps together into a modeling step 

Model fitting and encoding all in one function call 

This feel useful

In [15]:
tree = DecisionTreeRegressor(max_depth = 5)
pipe = make_pipeline(te, tree)

In [16]:
pipe

Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'])),
                ('decisiontreeregressor', DecisionTreeRegressor(max_depth=5))])

You can treat pipe like a list and index into it by using pipe[0], for example 

pipe.set_params() is a good method to understand

In [18]:
X = X.fillna(0)

In [19]:
pipe.fit(X,y)

Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'])),
                ('decisiontreeregressor', DecisionTreeRegressor(max_depth=5))])

In [20]:
pipe.score(X,y)

0.4534961233081768

In [21]:
#we're not changing X directly but rather saving the logic necessary that pertains to X inside the pipe
# it allows faster trial and iteration for different encoding methods and models 
#the methods available to the pipe are the methods available to the object that's the final one passed in 

### Creating training and test sets aka cross validation

Training / test makes the pipeline method even more useful 

In [22]:
df.sort_values(by = ['id', 'visit_date'], inplace = True)

In [24]:
#for time series, we want to 'hold back' the final 15 days per restaurant as our test set 
#we need a groupby and a lambda function 

#we're using this method because we have 829 different time series 

train = df.groupby(['id']).apply(lambda x: x.iloc[:-15])
test = df.groupby(['id']).apply(lambda x: x.iloc[-15:])

In [25]:
train

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
air_00a91d42b08b08d9,166836,air_00a91d42b08b08d9,2016-07-01,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,
air_00a91d42b08b08d9,166837,air_00a91d42b08b08d9,2016-07-02,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,4.0
air_00a91d42b08b08d9,166838,air_00a91d42b08b08d9,2016-07-04,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,
air_00a91d42b08b08d9,166839,air_00a91d42b08b08d9,2016-07-05,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,
air_00a91d42b08b08d9,166840,air_00a91d42b08b08d9,2016-07-06,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,
...,...,...,...,...,...,...,...,...,...,...,...
air_fff68b929994bfbd,216629,air_fff68b929994bfbd,2017-04-03,2,Monday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,
air_fff68b929994bfbd,216630,air_fff68b929994bfbd,2017-04-04,4,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,
air_fff68b929994bfbd,216631,air_fff68b929994bfbd,2017-04-05,6,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2.0
air_fff68b929994bfbd,216632,air_fff68b929994bfbd,2017-04-06,6,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,8.0


In [26]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visit_date,visitors,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
air_00a91d42b08b08d9,167048,air_00a91d42b08b08d9,2017-04-05,35,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0
air_00a91d42b08b08d9,167049,air_00a91d42b08b08d9,2017-04-06,29,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0
air_00a91d42b08b08d9,167050,air_00a91d42b08b08d9,2017-04-07,17,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0
air_00a91d42b08b08d9,167051,air_00a91d42b08b08d9,2017-04-08,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0
air_00a91d42b08b08d9,167052,air_00a91d42b08b08d9,2017-04-10,17,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,
...,...,...,...,...,...,...,...,...,...,...,...
air_fff68b929994bfbd,216643,air_fff68b929994bfbd,2017-04-18,6,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,
air_fff68b929994bfbd,216644,air_fff68b929994bfbd,2017-04-19,2,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,
air_fff68b929994bfbd,216645,air_fff68b929994bfbd,2017-04-20,2,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,1.0
air_fff68b929994bfbd,216646,air_fff68b929994bfbd,2017-04-21,4,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,6.0


Recommendation to split into three subsets: training, validation, test 
Validation set is to iteratively check your changes. 

So the idea is that if you use your test set to inform iterative changes to model params, that basically defeats the purpose of the test set because you’ll end up overfitting to training set as well as the test set 

In [27]:
housing_df = pd.read_csv('/Users/ethanalter/Dropbox (Personal)/GA-4K-DataScience/gazelle-4K/Homework/Unit2/data/housing.csv')

In [28]:
housing_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [29]:
from sklearn.model_selection import train_test_split

In [33]:
X = housing_df.drop('PRICE', axis=1)
y = housing_df['PRICE']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, shuffle = True)

In [35]:
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
251,0.21409,22.0,5.86,0,0.431,6.438,8.9,7.3967,7,330,19.1,377.07,3.59
339,0.05497,0.0,5.19,0,0.515,5.985,45.4,4.8122,5,224,20.2,396.90,9.74
28,0.77299,0.0,8.14,0,0.538,6.495,94.4,4.4547,4,307,21.0,387.94,12.80
366,3.69695,0.0,18.10,0,0.718,4.963,91.4,1.7523,24,666,20.2,316.03,14.00
357,3.84970,0.0,18.10,1,0.770,6.395,91.0,2.5052,24,666,20.2,391.34,13.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,0.20746,0.0,27.74,0,0.609,5.093,98.0,1.8226,4,711,20.1,318.43,29.68
360,4.54192,0.0,18.10,0,0.770,6.398,88.0,2.5182,24,666,20.2,374.56,7.79
151,1.49632,0.0,19.58,0,0.871,5.404,100.0,1.5916,5,403,14.7,341.60,13.28
342,0.02498,0.0,1.89,0,0.518,6.540,59.7,6.2669,1,422,15.9,389.96,8.65
