# Data Preparation for Machine Learning and Deep Learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
import duckdb as db

In [53]:
# Load the cleaned dataset from parquet; the path is relative to the notebook's working directory.
df = pd.read_parquet("../data/cleaned/dataset.parquet")

In [54]:
# Show the column labels of the loaded frame.
df.columns

Index(['market_id', 'created_at', 'actual_delivery_time', 'store_id',
       'store_primary_category', 'order_protocol', 'total_items', 'subtotal',
       'num_distinct_items', 'min_item_price', 'max_item_price',
       'total_onshift_partners', 'total_busy_partners',
       'total_outstanding_orders', 'created_at_year', 'created_at_month',
       'created_at_day', 'created_at_hour', 'created_at_minute',
       'created_at_second', 'actual_delivery_time_year',
       'actual_delivery_time_month', 'actual_delivery_time_day',
       'actual_delivery_time_hour', 'actual_delivery_time_minute',
       'actual_delivery_time_second', 'created_at_month_name',
       'actual_delivery_time_month_name', 'created_at_day_of_week',
       'actual_delivery_time_day_of_week', 'created_at_day_name',
       'actual_delivery_time_day_name', 'created_at_week_number',
       'actual_delivery_time_week_number', 'created_at_week_of_month',
       'actual_delivery_time_week_of_month', 'delivery_time_seconds',

In [55]:
# Columns removed before modelling, one label per line.
to_drop = [
    # raw timestamps and the store identifier
    "created_at",
    "actual_delivery_time",
    "store_id",
    # created_at parts that are not kept
    "created_at_year",
    "created_at_month",
    "created_at_second",
    # every numeric part derived from actual_delivery_time
    "actual_delivery_time_year",
    "actual_delivery_time_month",
    "actual_delivery_time_day",
    "actual_delivery_time_hour",
    "actual_delivery_time_minute",
    "actual_delivery_time_second",
    # month / day names and calendar positions
    "created_at_month_name",
    "actual_delivery_time_month_name",
    "actual_delivery_time_day_of_week",
    "created_at_day_name",
    "actual_delivery_time_day_name",
    "created_at_week_number",
    "actual_delivery_time_week_number",
    "actual_delivery_time_week_of_month",
    # duration in seconds
    "delivery_time_seconds",
]

In [56]:
# Remove every label listed in to_drop (raises KeyError if any label is absent,
# e.g. when this cell is run a second time), then show what is left.
df = df.drop(to_drop, axis="columns")
df.columns

Index(['market_id', 'store_primary_category', 'order_protocol', 'total_items',
       'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price',
       'total_onshift_partners', 'total_busy_partners',
       'total_outstanding_orders', 'created_at_day', 'created_at_hour',
       'created_at_minute', 'created_at_day_of_week',
       'created_at_week_of_month', 'delivery_time_minutes'],
      dtype='object')

In [57]:
# Discard rows that repeat an earlier row in every column; the first occurrence is kept.
df = df.drop_duplicates(keep="first")

In [58]:
# (rows, columns) of the current frame.
df.shape

(177544, 17)

In [59]:
# Keep only the rows that sit below every ceiling at once. A comparison against
# a missing value evaluates to False, so rows with a missing value in any of
# these columns are filtered out too.
df = df[
    (df["total_items"] < 100)
    & (df["delivery_time_minutes"] < 200)
    & (df["max_item_price"] < 10000)
    & (df["min_item_price"] < 10000)
    & (df["subtotal"] < 15000)
]

In [61]:
# (rows, columns) of the current frame.
df.shape

(177445, 17)

In [62]:
# Render the frame; pandas abbreviates a long frame to its first and last rows.
df

Unnamed: 0,market_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_partners,total_busy_partners,total_outstanding_orders,created_at_day,created_at_hour,created_at_minute,created_at_day_of_week,created_at_week_of_month,delivery_time_minutes
0,1.0,american,1,4,3441,4,557.0,1239.0,33.0,14.0,21.0,6,16,54,5,1,62
1,2.0,mexican,2,1,1900,1,1400.0,1400.0,1.0,2.0,2.0,10,16,19,2,2,67
2,3.0,other,1,1,1900,1,1900.0,1900.0,1.0,0.0,0.0,22,15,9,4,4,29
3,3.0,other,1,6,6900,5,600.0,1800.0,1.0,1.0,2.0,3,15,51,2,1,51
4,3.0,other,1,3,3900,3,1100.0,1600.0,6.0,6.0,9.0,14,21,10,6,2,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177539,1.0,fast,4,3,1389,3,345.0,649.0,17.0,17.0,23.0,16,18,49,1,3,65
177540,1.0,fast,4,6,3010,4,405.0,825.0,12.0,11.0,14.0,12,18,31,4,2,56
177541,1.0,fast,4,5,1836,3,300.0,399.0,39.0,41.0,40.0,23,23,16,5,4,50
177542,1.0,sandwich,1,1,1175,1,535.0,535.0,7.0,7.0,12.0,1,12,48,7,1,65


In [71]:
# Row count per store category as a two-column frame: "category" and "count".
# The index and the counts are named explicitly so the column labels do not
# depend on the pandas version: before pandas 2.0, value_counts().reset_index()
# produced the columns ["index", "store_primary_category"], so renaming
# "store_primary_category" would have relabelled the counts, not the categories.
cat_df = (
    df["store_primary_category"]
    .value_counts()
    .rename_axis("category")
    .reset_index(name="count")
)
cat_df

Unnamed: 0,category,count
0,american,18086
1,pizza,15693
2,mexican,15608
3,burger,9846
4,sandwich,8991
...,...,...
68,african,10
69,lebanese,9
70,belgian,2
71,chocolate,1


In [75]:
# Attach each row's category frequency: a left join keeps every row of df and
# adds the "category" and "count" columns from cat_df.
df = df.merge(
    cat_df,
    how="left",
    left_on="store_primary_category",
    right_on="category",
)

In [76]:
# Relabel categories that occur fewer than 20 times as "other"; all remaining
# rows keep their current label.
df["store_primary_category"] = np.where(
    df["count"] < 20,
    "other",
    df["store_primary_category"],
)

In [77]:
# Remove the two helper columns ("category", "count") now that they are no longer needed.
df = df.drop(["category", "count"], axis="columns")

In [79]:
# Number of rows per store_primary_category value, most frequent first.
df["store_primary_category"].value_counts()

store_primary_category
american        18086
pizza           15693
mexican         15608
burger           9846
sandwich         8991
                ...  
moroccan           24
cheese             24
european           22
comfort-food       21
spanish            20
Name: count, Length: 66, dtype: int64