# **Model**

In [296]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from category_encoders import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn import linear_model


In [308]:
# Load the raw training table from the repository-relative data directory.
train_csv_path = "data/train.csv"
df_train = pd.read_csv(train_csv_path)

In [309]:
# Remove the number_of_items column before modelling.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises a KeyError.
df_train = df_train.drop(columns=['number_of_items'], errors='ignore')

In [310]:
# Display the dtype of every column currently in df_train.
df_train.dtypes

order_value_gbp      float64
restaurant_id          int64
prep_time_seconds      int64
country               object
city                  object
type_of_food          object
day                    int64
hour                   int64
dtype: object

In [311]:
cat_columns = ["restaurant_id", "country", "city", "type_of_food"]
# df_train = OneHotEncoder(cols=cat_columns).fit(df_train).transform(df_train)
df_train = TargetEncoder(cols=cat_columns, smoothing=1.0).fit(df_train, df_train['prep_time_seconds']).transform(df_train)

# df_train[cat_columns] = df_train[cat_columns].apply(lambda x: pd.factorize(x)[0])

In [312]:
X = df_train.drop(columns=['prep_time_seconds'])
y = df_train['prep_time_seconds']

In [313]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [314]:
# Report how many rows ended up in each partition of the split.
n_train = len(X_train)
n_test = len(X_test)
print("Length of training data: ", n_train)
print("Length of test data: ", n_test)

Length of training data:  24295
Length of test data:  8099


## Linear Regression

In [315]:
# Baseline model: ordinary least squares fitted on the training partition.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# scikit-learn metrics are declared as metric(y_true, y_pred); both calls now
# follow that order (the MAE call previously passed the arguments reversed).
print("Mean Squared Error : " + str(mean_squared_error(y_test, y_pred)))
print("Mean Absolute Error : " + str(mean_absolute_error(y_test, y_pred)))

Mean Squared Error : 26628122.003275014
Mean Absolute Error : 1137.4103782747834


## XGBoost

In [316]:
# Hyper-parameters for a small gradient-boosted tree ensemble, collected in
# one place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 10,
    "max_depth": 3,
    "gamma": 0,
    "reg_lambda": 1,
}
regressor = xgb.XGBRegressor(**xgb_params)

# Last expression of the cell: fit on the training partition and display the
# fitted estimator.
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=3, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [317]:
# Show the fitted model's feature importances as a one-row table whose columns
# are the training feature names.
importances = regressor.feature_importances_
pd.DataFrame(importances[None, :], columns=X_train.columns)

Unnamed: 0,order_value_gbp,restaurant_id,country,city,type_of_food,day,hour
0,0.386506,0.326401,0.0,0.0,0.0,0.083473,0.20362


In [318]:
# Score the boosted model on the held-out partition.
y_pred = regressor.predict(X_test)

# scikit-learn metrics are declared as metric(y_true, y_pred); both calls now
# follow that order (the MAE call previously passed the arguments reversed).
print("Mean Squared Error : " + str(mean_squared_error(y_test, y_pred)))
print("Mean Absolute Error : " + str(mean_absolute_error(y_test, y_pred)))

Mean Squared Error : 24631808.785672437
Mean Absolute Error : 872.0579974363727
