In [13]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
import category_encoders as ce

In [3]:
# read the CSV straight from GitHub; visit_date is parsed into datetimes on load
DATA_URL = "https://raw.githubusercontent.com/JonathanBechtel/dat-11-15/main/ClassMaterial/Unit1/data/master.csv"
df = pd.read_csv(DATA_URL, parse_dates=['visit_date'])

In [14]:
# list the names of the object-dtype columns
# 'object' is used instead of the np.object alias, which NumPy deprecated in
# 1.20 and removed in 1.24 (it was only ever an alias for the builtin object)
df.select_dtypes(include = 'object').columns.tolist()

['id', 'day_of_week', 'genre', 'area']

In [15]:
# Lag features: the visitor count 1 row and 7 rows earlier within the same id.
# GroupBy.shift() returns a Series on df's own index, so each lagged value lands
# on the row it belongs to however df is ordered. (Assigning the positional
# .values of a groupby().apply() result is only correct when df is already
# sorted by id, because apply() emits its groups in sorted-key order.)
# NOTE(review): assumes rows within each id are in chronological order, one row
# per day, so that shift(1)/shift(7) really mean yesterday/last week -- confirm.
visitors_by_id = df.groupby('id')['visitors']
df['yesterday'] = visitors_by_id.shift()
df['last_week'] = visitors_by_id.shift(7)

# fill in missing reservations
df['reserve_visitors'] = df['reserve_visitors'].fillna(0)

# drop the rows left incomplete by the shifts
# (dropna() without a subset removes a row if ANY of its columns is NaN)
df = df.dropna()

In [16]:
# columns fed to the model
feature_cols = ['id', 'yesterday', 'day_of_week']
X = df[feature_cols]

# regression target
y = df['visitors']

In [17]:
# treat these as the already-tuned hyper-parameters for the tree
# NOTE(review): no random_state is set and max_features < 1.0, so repeated fits
# may not produce identical trees -- confirm whether a fixed seed is wanted
tree_params = {
    'max_depth': 7,
    'max_features': 0.8,
    'min_samples_leaf': 10,
}
tree = DecisionTreeRegressor(**tree_params)

# target-encode the categorical columns, then hand the result to the tree
pipe = make_pipeline(ce.TargetEncoder(), tree)

In [18]:
# fit the whole pipeline (target encoder + tree) so that it can be exported
# this fits on ALL of X and y, not just a training split
pipe.fit(X, y)

Pipeline(steps=[('targetencoder', TargetEncoder(cols=['id', 'day_of_week'])),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(max_depth=7, max_features=0.8,
                                       min_samples_leaf=10))])

In [19]:
# the pickle module serializes Python objects, which lets us save models to disk
import pickle

# 'wb' -- open the file for WRITING in BYTES mode
with open('pipe.pkl', 'wb') as pkl_out:
    # writes the pipeline to pipe.pkl so that it can be loaded again later on
    pickle.dump(pipe, pkl_out)

In [27]:
# the exported pipeline can be read back in and re-used on new data

# 'rb' -- open the file for READING in BYTES mode
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust
with open('pipe.pkl', 'rb') as pkl_in:
    # rebuild the pickled object and bind it to a new name
    pipe2 = pickle.load(pkl_in)

In [28]:
# display the re-loaded pipeline
pipe2

Pipeline(steps=[('targetencoder', TargetEncoder(cols=['id', 'day_of_week'])),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(max_depth=7, max_features=0.8,
                                       min_samples_leaf=10))])

In [29]:
# and we can use the re-loaded pipeline to make predictions
# (X is the same data the pipeline was fit on, so this only shows that the loaded object works)
pipe2.predict(X)

array([26.02951065, 32.35852425, 21.71750433, ..., 51.37125749,
       42.82732865, 51.37125749])

### Making Predictions in Practice 

In [21]:
# raw values for one observation: id, yesterday's visitor count, day of the week
sample_id, yesterday, day_of_week = 'adsfaeiw384793ljlj', 55, 'Sunday'

# next step: turn this single sample into a 1-row dataframe

In [23]:
# when predicting through the pipeline, collect the values in a dict keyed by
# column name so they can be turned into a dataframe
# (without a pipeline, the input would need to be a numpy array instead)
sample = dict(id=sample_id, yesterday=yesterday, day_of_week=day_of_week)

# preview the dict as a one-row dataframe (index 0)
pd.DataFrame(sample, index = [0])

Unnamed: 0,id,yesterday,day_of_week
0,adsfaeiw384793ljlj,55,Sunday


In [33]:
# rebind `sample` from the dict to its one-row dataframe form
sample = pd.DataFrame(sample, index = [0])

In [37]:
# predict for the single-row sample using the re-loaded pipeline
pipe2.predict(sample)

array([22.75972006])