In [19]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# read the historical delivery records from disk and preview the first five rows
df = pd.read_csv('data/historical_data.csv')
df.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


In [20]:
# understanding the data landscape: prints a summary of the frame
# (index range, column names, non-null counts and dtypes)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197428 entries, 0 to 197427
Data columns (total 16 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   market_id                                     196441 non-null  float64
 1   created_at                                    197428 non-null  object 
 2   actual_delivery_time                          197421 non-null  object 
 3   store_id                                      197428 non-null  int64  
 4   store_primary_category                        192668 non-null  object 
 5   order_protocol                                196433 non-null  float64
 6   total_items                                   197428 non-null  int64  
 7   subtotal                                      197428 non-null  int64  
 8   num_distinct_items                            197428 non-null  int64  
 9   min_item_price                                19

In [21]:
# count the missing values in each column

df.isna().sum()

market_id                                         987
created_at                                          0
actual_delivery_time                                7
store_id                                            0
store_primary_category                           4760
order_protocol                                    995
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      526
dtype: int64

In [22]:
# derive the target variable: the delivery duration in seconds, i.e. the time elapsed
# between order creation (created_at) and delivery (actual_delivery_time).
# both columns are loaded as object (string) dtype, so they are parsed to datetime first.
# NOTE: the actual_delivery_time column is overwritten and holds the duration afterwards.

df['created_at'] = pd.to_datetime(df['created_at'])

# guard so the cell is safe to re-run: once actual_delivery_time already holds the numeric
# duration, parsing it as a timestamp again would silently turn it into garbage values
if not pd.api.types.is_numeric_dtype(df['actual_delivery_time']):
    delivered_at = pd.to_datetime(df['actual_delivery_time'])
    df['actual_delivery_time'] = (delivered_at - df['created_at']).dt.total_seconds()

df.head(5)

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,3779.0,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,4024.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,1781.0,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,3075.0,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2390.0,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


In [23]:
# two estimated durations are available for every order and are combined into one feature:
#   - estimated_order_place_duration: presumably how long the order takes to reach the restaurant (confirm against the data dictionary)
#   - estimated_store_to_consumer_driving_duration: the estimated drive from the store to the consumer
# NOTE: despite its name, time_to_resto is the sum of both estimates, not only the time to the restaurant

place_duration = df['estimated_order_place_duration']
driving_duration = df['estimated_store_to_consumer_driving_duration']
df['time_to_resto'] = driving_duration.add(place_duration)
df.head(5)

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,time_to_resto
0,1.0,2015-02-06 22:24:17,3779.0,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0,1307.0
1,2.0,2015-02-10 21:49:25,4024.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0,1136.0
2,3.0,2015-01-22 20:39:28,1781.0,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0,1136.0
3,3.0,2015-02-03 21:21:45,3075.0,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0,735.0
4,3.0,2015-02-15 02:40:36,2390.0,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0,1096.0


In [24]:
# we have both the dashers on shift and the dashers that were busy, so we can check whether dasher capacity matters
# dasher_ratio is the share of on-shift dashers that were busy at the time of the order (busy / on shift):
# a higher value means less spare capacity (the raw counts are not always consistent, so values above 1 occur)

# a zero on-shift count would give inf (or NaN for 0/0); mask it so the ratio is NaN in both cases
onshift_dashers = df['total_onshift_dashers'].replace(0, np.nan)
df['dasher_ratio'] = df['total_busy_dashers'] / onshift_dashers

In [25]:
# number of distinct markets (rows with a missing market_id are not counted)

df['market_id'].nunique(dropna=True)

6

In [26]:
# number of distinct stores

df['store_id'].nunique(dropna=True)

6743

In [27]:
# number of distinct order protocols, i.e. the ways in which doordash can send an order
# (rows with a missing order_protocol are not counted)

df['order_protocol'].nunique(dropna=True)

7

In [28]:
# creating a dictionary with the most repeated (modal) category of each store, used to fill null rows

id_unique = df['store_id'].unique().tolist()

# creating the dictionary: store_id -> Series returned by .mode()
# (empty when the store never has a category, several entries when categories tie)
# grouping once avoids re-filtering the whole frame for every store;
# sort=False keeps the stores in order of first appearance, matching id_unique

id_category = {
    store_id: categories.mode()
    for store_id, categories in df.groupby('store_id', sort=False)['store_primary_category']
}

In [29]:
# creating a function that looks up the category recorded in the dictionary for a store_id
# some stores have more than one modal category (ties); the first value returned by .mode() is used

def fill_id(store_id):
    """Return the first modal category stored for ``store_id`` in ``id_category``.

    Returns ``np.nan`` when the store is missing from the dictionary (KeyError)
    or when its entry is empty because no category was recorded (IndexError).
    """
    try:
        return id_category[store_id].values[0]
    except (KeyError, IndexError):
        # only the two expected lookup failures map to NaN; any other error
        # (e.g. id_category not being defined yet) surfaces instead of being hidden
        return np.nan
    

# attach the store-level category to every row; rows that already have a
# store_primary_category receive the store's modal category as well
df['store_category_clean'] = df['store_id'].map(fill_id)

In [30]:
# preview the first ten rows to check the newly added columns
df.head(n=10)

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,time_to_resto,dasher_ratio,store_category_clean
0,1.0,2015-02-06 22:24:17,3779.0,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0,1307.0,0.424242,american
1,2.0,2015-02-10 21:49:25,4024.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0,1136.0,2.0,indian
2,3.0,2015-01-22 20:39:28,1781.0,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0,1136.0,0.0,indian
3,3.0,2015-02-03 21:21:45,3075.0,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0,735.0,1.0,indian
4,3.0,2015-02-15 02:40:36,2390.0,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0,1096.0,1.0,indian
5,3.0,2015-01-28 20:30:38,2300.0,5477,,1.0,3,5000,3,1500,1900,2.0,2.0,2.0,446,338.0,784.0,1.0,indian
6,3.0,2015-01-31 02:16:36,1584.0,5477,,1.0,2,3900,2,1200,2700,10.0,9.0,9.0,446,638.0,1084.0,0.9,indian
7,3.0,2015-02-12 03:03:35,1965.0,5477,,1.0,4,4850,4,750,1800,7.0,8.0,7.0,446,626.0,1072.0,1.142857,indian
8,2.0,2015-02-16 00:11:35,1586.0,5477,indian,3.0,4,4771,3,820,1604,8.0,6.0,18.0,446,289.0,735.0,0.75,indian
9,3.0,2015-02-18 01:15:45,3192.0,5477,,1.0,2,2100,2,700,1200,2.0,2.0,2.0,446,715.0,1161.0,1.0,indian
