# **Model**

In [32]:
import pandas as pd
import catboost as cb
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from category_encoders import TargetEncoder
from sklearn import linear_model
from category_encoders import CatBoostEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

In [33]:
# Load the raw training table; the path is relative to the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer="data/train.csv")

In [34]:
# Show the dtype of every column of the loaded frame (rich display of the last expression).
df_train.dtypes

order_value_gbp            float64
restaurant_id                int64
number_of_items              int64
prep_time_seconds            int64
city                        object
type_of_food                object
day                         object
hour                         int64
average_prep_resteraunt    float64
min_prep_restauraunt         int64
max_prep_restauraunt         int64
dtype: object

In [35]:
cat_columns = ["city", "type_of_food", "day"]

# df_train = TargetEncoder(cols=cat_columns, smoothing=1.0).fit(df_train, df_train['prep_time_seconds']).transform(df_train)
encoder = CatBoostEncoder(cols=cat_columns)
df_train = encoder.fit_transform(df_train, df_train["prep_time_seconds"])

In [36]:
X = df_train.drop(columns=['prep_time_seconds'])
y = df_train['prep_time_seconds']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [37]:
# Report how many rows ended up in each side of the train/test split.
print("Length of training data: ", len(X_train.index))
print("Length of test data: ", len(X_test.index))

Length of training data:  24181
Length of test data:  8061


## Linear Regression

In [38]:
# Baseline: ordinary least squares on the encoded features.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric, so
# the value is the same either way, but the call now matches the documented contract.
print("Mean Absolute Error : " + str(mean_absolute_error(y_test, y_pred)))

Mean Absolute Error : 537.8429039725107


## CatBoost

In [39]:
# Train a CatBoost regressor on the encoded training split, save it, evaluate it
# on the held-out split and collect a metadata record describing the run.
target = "prep_time_seconds"
loss = "MAE"
metric = "MAE"
iterations = 30

model_params = {
    "thread_count": -1,
    "task_type": "CPU",
    "loss_function": loss,
    "eval_metric": metric,
    "verbose": False,
    "depth": 16,
    "iterations": iterations,
    "learning_rate": 0.1,
    "l2_leaf_reg": 25,
    "border_count": 256,
    "grow_policy": "SymmetricTree",
    "random_seed": 42,
}

print("Started Training")
print("- Target: ", target)
print("- Input Size:", X_train.shape[0])
print("- Inputs:", ", ".join(X_train.columns))
print("- Categorical:", ", ".join(cat_columns))
print("- Model")
print("-- Loss:", loss)
print("-- Metric:", metric)
print("-- Parameters:", model_params)

train_pool = cb.Pool(X_train, label=y_train)

model = cb.CatBoost(model_params)
# NOTE: verbose=25 passed to fit() is what produces the progress log recorded
# below this cell, even though model_params carries "verbose": False.
model.fit(train_pool, plot=False, verbose=25)
model_name = "models/model.cbm"
model.save_model(model_name)

# Named so that the builtin `eval` is not shadowed in the kernel namespace.
eval_metric_names = ["RMSE", "MAPE", "MAE", "R2", "MedianAbsoluteError"]

test_pool = cb.Pool(X_test, y_test)

print("- Evaluation")
results = model.eval_metrics(test_pool, eval_metric_names)
# eval_metrics returns one value per iteration for each metric; the last entry
# is the score of the full model.
eval_scores = {}
for metric_name, per_iteration in results.items():
    eval_scores[metric_name] = round(per_iteration[-1], 4)
    print("--", metric_name, per_iteration[-1])

# X_train holds the encoded (numeric) versions of the categorical columns, so the
# original labels are read back from the raw CSV. The rows are restricted to the
# training split through the row index, which is assumed to be the default index
# of the loaded CSV carried through the split and the encoder.
raw_categories = pd.read_csv("data/train.csv", usecols=cat_columns).loc[X_train.index]
uniques = {c: list(raw_categories[c].unique()) for c in cat_columns}

importance_table = model.get_feature_importance(type="FeatureImportance", prettified=True)
importance = {
    feature: round(score, 2)
    for feature, score in zip(importance_table["Feature Id"], importance_table["Importances"])
}

# Run summary. It is only assembled here; nothing in this cell persists or displays it.
metadata = {
    "Date": datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
    "Training Data Size": len(X_train),
    "Evaluation": {"Metrics": eval_scores, "Size": len(X_test)},
    "Feature Importance": importance,
    "Model Parameters": model_params,
    "Target": target,
    "Input Columns": list(X_train.columns),
    "Categorical Columns": list(cat_columns),
    "Categorical Possible Values": uniques,
}


Started Training
- Target:  prep_time_seconds
- Input Size: 24181
- Inputs: order_value_gbp, restaurant_id, number_of_items, city, type_of_food, day, hour, average_prep_resteraunt, min_prep_restauraunt, max_prep_restauraunt
- Categorical: city, type_of_food, day
- Model
-- Loss: MAE
-- Metric: MAE
-- Parameters: {'thread_count': -1, 'task_type': 'CPU', 'loss_function': 'MAE', 'eval_metric': 'MAE', 'verbose': False, 'depth': 16, 'iterations': 30, 'learning_rate': 0.1, 'l2_leaf_reg': 25, 'border_count': 256, 'grow_policy': 'SymmetricTree', 'random_seed': 42}
0:	learn: 569.6100190	total: 3.95s	remaining: 1m 54s
25:	learn: 377.8250291	total: 1m 27s	remaining: 13.5s
29:	learn: 367.9230139	total: 1m 39s	remaining: 0us
- Evaluation
-- RMSE 923.7959299600234
-- MAPE 4.908900690690161
-- MAE 444.3025128998352
-- R2 0.26561566007970017
-- MedianAbsoluteError 240.51253294814455


In [40]:
# Display the feature -> importance mapping (values rounded to 2 decimals) built in the training cell.
importance

{'hour': 29.08,
 'order_value_gbp': 9.69,
 'max_prep_restauraunt': 9.35,
 'average_prep_resteraunt': 9.32,
 'min_prep_restauraunt': 8.09,
 'number_of_items': 7.18,
 'type_of_food': 7.15,
 'restaurant_id': 6.83,
 'day': 6.72,
 'city': 6.59}

## Hyperparameter Tuning

In [41]:
# NOTE(review): this grid search is disabled; the whole cell is commented out.
# Points to settle before re-enabling it:
#   - it builds a CatBoostClassifier, while the rest of the notebook models
#     prep_time_seconds with an MAE loss; a regressor is presumably intended (confirm).
#   - GridSearchCV is given no `scoring`, so candidates would be ranked by the
#     estimator's default score rather than by MAE.
# parameters = {
#     'depth'         : [4, 5, 6, 7, 8, 9, 10],
#     'learning_rate' : [0.1, 0.2, 0.3, 0.4],
#     'iterations'    : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
# }

# model = cb.CatBoostClassifier()

# Grid_CB = GridSearchCV(estimator=model, param_grid = parameters, cv = 2, n_jobs=-1)
# Grid_CB.fit(X_train, y_train)

# print(" Results from Grid Search " )
# print("\n The best estimator across ALL searched params:\n",Grid_CB.best_estimator_)
# print("\n The best score across ALL searched params:\n",Grid_CB.best_score_)
# print("\n The best parameters across ALL searched params:\n",Grid_CB.best_params_)