In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def plot_delivery_time_hist(df):
    """Display a 50-bin histogram of the ``duration_minutes`` column.

    Args:
        df: Table-like object that can be indexed with ``'duration_minutes'``.

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    durations = df['duration_minutes']
    # histplot draws on the current axes and hands them back, so the labels
    # can be set on that object directly.
    ax = sns.histplot(durations, bins=50)
    ax.set_title('Distribution of Delivery Times (minutes)')
    ax.set_xlabel('Minutes')
    plt.show()

def _scatter_locations(lng, lat, title):
    """Draw and show one longitude/latitude scatter plot with the given title."""
    # Low alpha keeps overlapping points distinguishable in dense regions.
    plt.scatter(lng, lat, alpha=0.1)
    plt.title(title)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()


def plot_geospatial(df):
    """Show two scatter plots: delivery locations first, then pickup locations.

    Args:
        df: Table-like object indexable by the columns ``delivery_lng``,
            ``delivery_lat``, ``pickup_lng`` and ``pickup_lat``.

    Returns:
        None. Each plot is rendered with its own ``plt.show()`` call.
    """
    _scatter_locations(df['delivery_lng'], df['delivery_lat'], 'Delivery Locations')
    _scatter_locations(df['pickup_lng'], df['pickup_lat'], 'Pickup Locations')

In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Seed both the stdlib and the NumPy global generators so that every run of
# this cell produces the same synthetic dataset.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Number of synthetic orders to generate.
n_rows = 100000

# IDs run from order_0001 to order_100000. The 4-digit zero padding is
# narrower than the largest ID, so ID strings are not all the same width.
order_ids = [f"order_{i+1:04d}" for i in range(n_rows)]

def random_date_2024():
    """Return a random calendar day of 2024 as a midnight ``datetime``.

    Uses a single ``np.random.randint`` call on NumPy's global random state,
    so results are reproducible once that state has been seeded.
    """
    first_day = datetime(2024, 1, 1)
    last_day = datetime(2024, 12, 31)
    # randint's upper bound is exclusive; the +1 makes Dec 31 reachable.
    n_days = (last_day - first_day).days + 1
    return first_day + timedelta(days=np.random.randint(0, n_days))

# Build one pickup timestamp per order: a random 2024 date plus a random time
# of day from 08:00:00 through 17:59:59 (the hour bound 18 is exclusive).
def _random_pickup_time():
    """Return one random pickup ``datetime``.

    The draw order (date, hour, minute, second) is kept fixed so that the
    seeded random stream yields the same timestamps on every run.
    """
    day = random_date_2024()
    hh = np.random.randint(8, 18)
    mm = np.random.randint(0, 60)
    ss = np.random.randint(0, 60)
    return day + timedelta(hours=hh, minutes=mm, seconds=ss)


pickup_times = [_random_pickup_time() for _ in range(n_rows)]

# Derive each delivery timestamp by adding a random transit duration to the
# pickup timestamp. The duration range depends on the pickup hour.
# NOTE(review): the afternoon range (3–5 h) has the same midpoint as the
# default range (2–6 h), so those deliveries are less variable rather than
# longer on average — confirm this matches the "longer durations" intent.
def _duration_range_minutes(pickup_hour):
    """Return the inclusive (low, high) transit duration in minutes for a pickup hour."""
    if 8 <= pickup_hour < 9:
        # Pickups between 8 and 9 AM: duration of 8–9 hours.
        return 8 * 60, 9 * 60
    if 15 <= pickup_hour < 17:
        # Pickups between 3 and 5 PM: duration of 3–5 hours.
        return 3 * 60, 5 * 60
    # Every other pickup hour: duration of 2–6 hours.
    return 2 * 60, 6 * 60


delivery_times = []
for picked_up_at in pickup_times:
    low, high = _duration_range_minutes(picked_up_at.hour)
    # randint excludes its upper bound, hence the +1 to keep `high` reachable.
    minutes_in_transit = np.random.randint(low, high + 1)
    delivery_times.append(picked_up_at + timedelta(minutes=minutes_in_transit))

def random_lat_lng(center_lat, center_lng, delta=0.05):
    """Return a random ``(lat, lng)`` pair near the given centre.

    Each coordinate is the centre value plus an independent offset drawn with
    ``np.random.uniform(-delta, delta)``.

    Args:
        center_lat: Latitude of the centre point.
        center_lng: Longitude of the centre point.
        delta: Maximum absolute offset applied to each coordinate.

    Returns:
        Tuple ``(lat, lng)``.
    """
    # The latitude offset is drawn before the longitude offset; keeping this
    # order preserves the values produced from a seeded random state.
    lat_offset = np.random.uniform(-delta, delta)
    lng_offset = np.random.uniform(-delta, delta)
    return center_lat + lat_offset, center_lng + lng_offset

# Pickup and delivery points are both scattered around the same centre.
# All pickup coordinates are drawn before any delivery coordinates, which
# fixes how the seeded random stream is consumed.
city_center = (41.8781, -87.6298)  # Example: Chicago
pickup_coords = [random_lat_lng(*city_center) for _ in range(n_rows)]
delivery_coords = [random_lat_lng(*city_center) for _ in range(n_rows)]

# Assemble the synthetic orders table; each coordinate pair is (lat, lng),
# so index 0 is the latitude and index 1 the longitude.
df = pd.DataFrame({
    'order_id': order_ids,
    'pickup_time': pickup_times,
    'delivery_time': delivery_times,
    'pickup_lat': [point[0] for point in pickup_coords],
    'pickup_lng': [point[1] for point in pickup_coords],
    'delivery_lat': [point[0] for point in delivery_coords],
    'delivery_lng': [point[1] for point in delivery_coords],
})

In [8]:
from src.data_prep import clean_data
from src.features import add_time_features, add_distance_feature
from src.models import train_models, predict_models
from src.evaluation import evaluate
from src.experiments import aa_test, ab_test
from sklearn.model_selection import train_test_split

# Run the preparation stages in order: cleaning, then the time features,
# then the distance feature.
# NOTE: this rebinds `df`, so re-running the cell feeds already-processed
# data back through the pipeline; re-run the data-generation cell first.
df = (
    df
    .pipe(clean_data)
    .pipe(add_time_features)
    .pipe(add_distance_feature)
)

# Optional visual sanity checks:
# plot_delivery_time_hist(df)
# plot_geospatial(df)

df.head()

Unnamed: 0,order_id,pickup_time,delivery_time,pickup_lat,pickup_lng,delivery_lat,delivery_lng,duration_minutes,hour,weekday,hour_sin,hour_cos,weekday_sin,weekday_cos,distance_km
0,order_0001,2024-04-12 11:28:14,2024-04-12 16:32:14,41.908623,-87.628885,41.838164,-87.612079,304.0,11,4,0.258819,-0.965926,-0.433884,-0.900969,7.957324
1,order_0002,2024-04-16 15:20:38,2024-04-16 19:25:38,41.855561,-87.584524,41.907309,-87.652123,245.0,15,1,-0.707107,-0.707107,0.781831,0.62349,8.026737
2,order_0003,2024-05-01 10:22:10,2024-05-01 14:20:10,41.904606,-87.581231,41.894924,-87.611134,238.0,10,2,0.5,-0.866025,0.974928,-0.222521,2.698911
3,order_0004,2024-03-28 12:35:39,2024-03-28 17:04:39,41.864547,-87.614878,41.898879,-87.590719,269.0,12,3,0.0,-1.0,0.433884,-0.900969,4.309769
4,order_0005,2024-05-31 10:21:52,2024-05-31 16:18:52,41.889259,-87.611044,41.867953,-87.655431,357.0,10,4,0.5,-0.866025,-0.433884,-0.900969,4.372336


In [10]:
# Two candidate feature sets were compared (both result listings are recorded
# below this cell):
#   - raw_features: hour and weekday as plain integer columns
#   - cyclical_features: hour and weekday as sin/cos column pairs
# Point `features` at whichever set should be trained on. Previously the raw
# list was assigned to `features` and immediately overwritten.
raw_features = ['distance_km', 'hour', 'weekday']
cyclical_features = ['distance_km', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos']
features = cyclical_features

X = df[features]
y = df['duration_minutes']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = train_models(X_train, y_train)
y_preds = predict_models(models, X_test)
print(features)

# Labels are paired with y_preds by position — presumably the order in which
# train_models returns its models; verify against src.models.
model_names = ['LinearReg', 'RandomForest', 'GBM']
evaluate(y_test, y_preds, model_names)

aa_test(y_test)
ab_test(y_test, y_preds[0], y_preds[1])  # Compare Linear vs RandomForest

['distance_km', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos']
LinearReg: MAE=67.29, R2=0.32
RandomForest: MAE=53.62, R2=0.56
GBM: MAE=51.21, R2=0.61
A/A Test p-value: 0.652 (should be > 0.05)
A/B Test p-value: 0.000 (significant difference if < 0.05)


['distance_km', 'hour', 'weekday']  
LinearReg: MAE=72.81, R2=0.16  
RandomForest: MAE=53.79, R2=0.55  
GBM: MAE=51.18, R2=0.61  
A/A Test p-value: 0.652 (should be > 0.05)  
A/B Test p-value: 0.000 (significant difference if < 0.05)  

---

['distance_km', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos']  
LinearReg: MAE=67.29, R2=0.32  
RandomForest: MAE=53.62, R2=0.56  
GBM: MAE=51.21, R2=0.61  
A/A Test p-value: 0.652 (should be > 0.05)  
A/B Test p-value: 0.000 (significant difference if < 0.05)  
