In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the dataset after EDA
data = pd.read_csv("../data/processed/data_eda.csv")

In [3]:
# Handle missing values for 'store_primary_category'
data['store_primary_category'] = data['store_primary_category'].fillna('unknown')
data['store_primary_category'].value_counts(dropna=False)

store_primary_category
american             19399
pizza                17320
mexican              17098
burger               10958
sandwich             10059
                     ...  
lebanese                 9
belgian                  2
indonesian               2
chocolate                1
alcohol-plus-food        1
Name: count, Length: 75, dtype: int64

In [5]:
# Convert 'created_at' and 'actual_delivery_time' to datetime format, handling errors
data['created_at'] = pd.to_datetime(data['created_at'], errors='coerce', utc=True)
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'], errors='coerce', utc=True)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197421 entries, 0 to 197420
Data columns (total 39 columns):
 #   Column                                                   Non-Null Count   Dtype              
---  ------                                                   --------------   -----              
 0   market_id                                                197421 non-null  float64            
 1   created_at                                               197421 non-null  datetime64[ns, UTC]
 2   actual_delivery_time                                     197421 non-null  datetime64[ns, UTC]
 3   store_id                                                 197421 non-null  int64              
 4   store_primary_category                                   197421 non-null  object             
 5   order_protocol                                           197421 non-null  float64            
 6   total_items                                              197421 non-null  int64             

In [7]:
# Feature extraction from 'created_at'
data['hour_of_day'] = data['created_at'].dt.hour
data['day_of_week'] = data['created_at'].dt.dayofweek
data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)

In [8]:
# One-hot encoding for other categorical variables
data = pd.get_dummies(data, columns=['store_primary_category', 'order_protocol'], drop_first=True)

In [9]:
# Frequency encoding for 'store_id' and 'market_id'
store_id_counts = data['store_id'].value_counts()
data['store_id_freq'] = data['store_id'].map(store_id_counts)

market_id_counts = data['market_id'].value_counts()
data['market_id_freq'] = data['market_id'].map(market_id_counts)

In [10]:
# Use the corrected features where outliers have been handled
data['total_delivery_duration'] = data['total_delivery_duration_corrected']
data['busy_dashers_ratio'] = data['busy_dashers_ratio_corrected']
data['workload_metric'] = data['workload_metric_corrected']
data['avg_item_price'] = data['avg_item_price_corrected']

In [11]:
# Log transformation of skewed features (including corrected versions)
skewed_features = [
    'total_items', 'subtotal', 'num_distinct_items', 
    'min_item_price', 'max_item_price', 'store_id_freq',
    'busy_dashers_ratio', 'workload_metric', 'avg_item_price', 'market_id_freq'
]
for feature in skewed_features:
    data[feature] = np.log1p(data[feature])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
# Drop columns that are no longer needed
data.drop(columns=['total_delivery_duration_corrected', 'busy_dashers_ratio_corrected', 'workload_metric_corrected', 'avg_item_price_corrected', 'market_id'], inplace=True)

Comments:

- We used frequency encoding for both store_id and market_id to represent the relative occurrence of these identifiers.

- This approach helps retain information about the prevalence of different identifiers without creating a large number of dummy variables.

- Dropping the original market_id column ensures that only the transformed features are used for modeling.

Handling missing values

In [15]:
data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197421 entries, 0 to 197420
Data columns (total 117 columns):
 #    Column                                                   Non-Null Count   Dtype              
---   ------                                                   --------------   -----              
 0    created_at                                               197421 non-null  datetime64[ns, UTC]
 1    actual_delivery_time                                     197421 non-null  datetime64[ns, UTC]
 2    store_id                                                 197421 non-null  int64              
 3    total_items                                              197421 non-null  float64            
 4    subtotal                                                 197421 non-null  float64            
 5    num_distinct_items                                       197421 non-null  float64            
 6    min_item_price                                           197409 non-null  float64 

In [14]:
data['min_item_price'].isnull().sum()

12

In [16]:
data['estimated_store_to_consumer_driving_duration'].isnull().sum()

526

In [17]:
# Handle missing values specifically for the known columns with nulls
data['min_item_price'].fillna(data['min_item_price'].median(), inplace=True)
data['estimated_store_to_consumer_driving_duration'].fillna(
    data['estimated_store_to_consumer_driving_duration'].median(), inplace=True
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['min_item_price'].fillna(data['min_item_price'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['estimated_store_to_consumer_driving_duration'].fillna(


In [20]:
print(data.isnull().sum().sum())

0


In [21]:
# Save processed data
processed_data_path = "../data/processed/processed_data.csv"
data.to_csv(processed_data_path, index=False)