In [30]:
import pandas as pd
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import date, timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def eval_metrics(actual, pred):
    """Score predictions against ground-truth values.

    Args:
        actual: the true target values.
        pred: the predicted values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of ``mean_squared_error``,
        then ``mean_absolute_error``, then ``r2_score``, each evaluated as
        ``metric(actual, pred)``.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )


def objective(params):
    """Train one model with ``params`` and report its RMSE as the loss.

    ``params`` is unpacked as keyword arguments into ``train_model``. Of the
    ``(rmse, mae, r2)`` triple it returns, only ``rmse`` is used.

    Returns:
        A dict with keys ``'loss'`` (the rmse) and ``'status'`` (``STATUS_OK``).
    """
    rmse, _mae, _r2 = train_model(**params)
    return dict(loss=rmse, status=STATUS_OK)

def train_model(alpha, l1_ratio):
    """Fit an ElasticNet pipeline on the training split and score it on the test split.

    Relies on notebook globals that must be defined before this is called:
    ``num_features`` / ``cat_features`` (column-name lists) and
    ``train_x`` / ``train_y`` / ``test_x`` / ``test_y`` (the data split).

    Args:
        alpha: forwarded to ``ElasticNet(alpha=...)``.
        l1_ratio: forwarded to ``ElasticNet(l1_ratio=...)``.

    Returns:
        The ``(rmse, mae, r2)`` tuple produced by ``eval_metrics`` for the
        test split.
    """
    # Numeric columns: median imputation followed by standardisation.
    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ])
    # Column lists use the names this notebook defines (num_features /
    # cat_features).
    full_pipeline = ColumnTransformer([
        ("num", num_pipe, num_features),
        ("cat", OneHotEncoder(), cat_features),
    ])
    pipe_estimator = Pipeline([
        # Pass the transformer instance itself; a ColumnTransformer is not callable.
        ('transformer', full_pipeline),
        ('estimator', ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)),
    ])
    pipe_estimator.fit(train_x, train_y)
    # Predict with the pipeline that was just fitted.
    predictions = pipe_estimator.predict(test_x)
    return eval_metrics(test_y, predictions)


In [34]:
# List the column names available in features_by_date.
# NOTE(review): features_by_date is assigned by the load cell further down this
# file; on a fresh top-to-bottom run this cell would come first — confirm cell order.
features_by_date.columns

Index(['date', 'num_people_10_00', 'label_num_people_12_33',
       'label_num_menus_sold', 'label_difference_12_normalized',
       'label_difference_10_and_sold_normalized', 'zurich_vacation', 'weekday',
       'zuehlke_day', 'Temperature', 'Rain Duration'],
      dtype='object')

In [72]:
# Load the per-date feature table and assemble the preprocessing + ElasticNet pipeline.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
pickle_path = os.path.join('..', '..', 'data', 'features_by_date')
features_by_date = pd.read_pickle(pickle_path)

# Column roles used to select the model inputs and the target.
num_features=['num_people_10_00','Temperature','Rain Duration']
cat_features=['weekday']
bin_features=['zurich_vacation']
label=['label_num_menus_sold']
all_columns=num_features+cat_features+bin_features+label

# Numeric preprocessing: median imputation, then standardisation.
# Deliberately left unfitted here: ColumnTransformer fits its own clone of this
# pipeline when the full estimator is fitted, so fitting it on the whole table
# beforehand would not influence the model.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler())
])

# NOTE(review): bin_features is part of all_columns but is not listed below, so
# with remainder="drop" (the default, spelled out here) 'zurich_vacation' never
# reaches the estimator — confirm whether that is intended.
full_pipeline = ColumnTransformer(
    [
        ("num", num_pipe, num_features),
        ("cat", OneHotEncoder(), cat_features),
    ],
    remainder="drop",
)
alpha=0.5
l1_ratio=0.5
pipe_estimator=Pipeline([
    ('transformer',full_pipeline),
    ('estimator',ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42))
])


In [73]:
# Hold out a test split, then fit the pipeline on the training part.
# random_state pins the shuffle so the split, and every number derived from it,
# is reproducible across kernel restarts.
train, test = train_test_split(features_by_date[all_columns], random_state=42)
train_x = train.drop(columns=label)
test_x = test.drop(columns=label)
# label is a one-element list, so these stay single-column DataFrames.
train_y = train[label]
test_y = test[label]

pipe_estimator.fit(train_x, train_y)
predicted_qualities = pipe_estimator.predict(test_x)

In [74]:
# Coefficients of the ElasticNet step, looked up in the pipeline by step name.
clf = pipe_estimator['estimator'].coef_


array([23.05915794, -0.26921951,  0.07504832,  0.12404613,  0.71525202,
        2.30135732,  0.34622184, -6.48615102])

In [45]:
# Scratch check: iterating a dict directly yields its keys in insertion order.
for key in {'a':1,'b':2,'c':3}:
    print(key)

a
b
c


In [48]:
# Display the feature table (the rendered output elides the middle rows).
features_by_date


Unnamed: 0,date,num_people_10_00,label_num_people_12_33,label_num_menus_sold,label_difference_12_normalized,label_difference_10_and_sold_normalized,zurich_vacation,weekday,zuehlke_day,Temperature,Rain Duration
2022-03-09,2022-03-09,131,126,103.0,0.038168,0.213740,0,2,False,8.73,0.00
2022-03-10,2022-03-10,125,107,81.0,0.144000,0.352000,0,3,False,9.32,0.00
2022-03-11,2022-03-11,55,50,30.0,0.090909,0.454545,0,4,False,10.03,0.00
2022-03-14,2022-03-14,129,133,102.0,-0.031008,0.209302,0,0,False,11.88,0.00
2022-03-15,2022-03-15,144,139,114.0,0.034722,0.208333,0,1,False,7.99,0.00
...,...,...,...,...,...,...,...,...,...,...,...
2022-11-21,2022-11-21,191,190,154.0,0.005236,0.193717,0,0,False,9.08,0.00
2022-11-22,2022-11-22,235,222,187.0,0.055319,0.204255,0,1,False,7.79,0.00
2022-11-23,2022-11-23,245,222,176.0,0.093878,0.281633,0,2,False,7.84,0.00
2022-11-24,2022-11-24,219,213,174.0,0.027397,0.205479,0,3,False,9.11,33.63


In [49]:
# Show the column names of features_by_date.
features_by_date.columns


Index(['date', 'num_people_10_00', 'label_num_people_12_33',
       'label_num_menus_sold', 'label_difference_12_normalized',
       'label_difference_10_and_sold_normalized', 'zurich_vacation', 'weekday',
       'zuehlke_day', 'Temperature', 'Rain Duration'],
      dtype='object')

In [75]:
# Sanity check of MAE on a tiny example: (0 + 3 + 2) / 3.
mean_absolute_error(y_true=[0, 3, 2], y_pred=[0, 0, 0])

1.6666666666666667

In [76]:
# Scratch check: pair two equal-length lists into a dict.
dict(zip([0, 1, 2, 3], [4, 5, 6, 7]))

{0: 4, 1: 5, 2: 6, 3: 7}