In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [None]:
# Constants for Delhi-NCR dataset
num_rows = 100000  # number of synthetic bookings to generate

# Seed both random sources used by the notebook (the stdlib `random` module and
# NumPy's legacy global RNG) so that Restart Kernel -> Run All reproduces the
# same dataset every time. Change `random_seed` to get a different sample.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

In [None]:
# Pickup/drop locations to sample from (one entry per line; order is significant
# only in that it fixes which element a seeded random.choice returns).
delhi_ncr_locations = [
    "Connaught Place",
    "Saket",
    "Lajpat Nagar",
    "Karol Bagh",
    "Pitampura",
    "Rohini",
    "Vasant Kunj",
    "Dwarka",
    "Janakpuri",
    "Chandni Chowk",
    "Greater Kailash",
    "Hauz Khas",
    "South Extension",
    "Defence Colony",
    "Green Park",
    "Rajouri Garden",
    "Uttam Nagar",
    "Paschim Vihar",
    "Jasola",
    "Sarita Vihar",
    "Mayur Vihar",
    "Yamuna Vihar",
    "Shahdara",
    "Dilshad Garden",
    "Preet Vihar",
    "Nehru Place",
    "Okhla",
    "Govindpuri",
    "Munirka",
    "Mehrauli",
    "Noida Sector 18",
    "Noida Sector 62",
    "Indirapuram",
    "Ghaziabad",
    "Vaishali",
    "Kaushambi",
    "Faridabad",
    "Gurgaon Cyber Hub",
    "MG Road Gurgaon",
    "Sohna Road",
    "Golf Course Road",
    "DLF Phase 1",
    "DLF Phase 2",
    "DLF Phase 3",
    "Manesar",
    "Greater Noida",
    "Raj Nagar Extension",
    "Badarpur",
    "Nangloi",
    "Najafgarh",
    "Palam",
]


In [None]:
# Categorical value pools: vehicle classes plus the reason strings attached to
# customer cancellations, driver cancellations and incomplete rides.
vehicle_types = [
    "Auto",
    "Prime Plus",
    "Prime Sedan",
    "Mini",
    "Bike",
    "eBike",
    "Prime SUV",
]

customer_cancel_reasons = [
    "Driver is not moving towards pickup location",
    "Driver asked to cancel",
    "AC is not working",
    "Change of plans",
    "Wrong Address",
]

driver_cancel_reasons = [
    "Personal & Car related issues",
    "Customer related issue",
    "The customer was coughing/sick",
    "More than permitted people in there",
]

incomplete_reasons = [
    "Customer Demand",
    "Vehicle Breakdown",
    "Other Issue",
]

In [None]:
# Probability of each booking outcome. The three explicit rates are set here;
# "Incomplete" takes whatever probability mass is left so the four sum to 1.
success_rate, customer_cancel_rate, driver_cancel_rate = 0.62, 0.07, 0.18
_explicit_outcome_mass = success_rate + customer_cancel_rate + driver_cancel_rate
incomplete_rate = 1 - _explicit_outcome_mass


In [None]:
# Share of successful rides falling into each fare band. Low and mid are set
# explicitly; the high band receives the remainder.
low_value_rate, mid_value_rate = 0.70, 0.28
_low_and_mid_mass = low_value_rate + mid_value_rate
high_value_rate = 1 - _low_and_mid_mass


In [None]:
# Generate one random booking timestamp per row, at minute resolution,
# between start_date 00:00 and end_date 23:59 inclusive.
start_date = datetime(2024, 6, 1)
end_date = datetime(2024, 6, 30)


def _random_timestamp():
    """Return start_date shifted by a random whole number of days, hours and minutes."""
    # Draw order (days, hours, minutes) is kept fixed so a seeded run is stable.
    day_offset = random.randint(0, (end_date - start_date).days)
    hour_offset = random.randint(0, 23)
    minute_offset = random.randint(0, 59)
    return start_date + timedelta(days=day_offset, hours=hour_offset, minutes=minute_offset)


random_dates = [_random_timestamp() for _ in range(num_rows)]


In [None]:
# Dates (as "YYYY-MM-DD" strings) treated as match days, and the weekday
# numbers treated as weekend days.
match_days = [
    "2024-06-07",
    "2024-06-15",
    "2024-06-23",
    "2024-06-30",
]
weekends = [
    5,  # Saturday
    6,  # Sunday
]

In [None]:
# Generate unique booking IDs
# Independent random.randint draws can repeat a value, which would produce
# duplicate "unique" IDs. random.sample draws without replacement from the same
# inclusive 10-digit range (1000000000..9999999999), so every ID is distinct.
booking_ids = [
    "CNR" + str(id_number)
    for id_number in random.sample(range(1000000000, 10000000000), num_rows)
]

In [None]:
# Draw a booking outcome for every row; each label is paired positionally
# with its probability.
status_labels = ["Success", "Cancelled by Customer", "Cancelled by Driver", "Incomplete"]
status_probabilities = [success_rate, customer_cancel_rate, driver_cancel_rate, incomplete_rate]
statuses = np.random.choice(status_labels, size=num_rows, p=status_probabilities)


In [None]:
# Generate data
# Builds one record per booking, in the same field order as the column list
# passed to pd.DataFrame. Only "Success" rows carry turnaround times, distance,
# fare and ratings; each failure status sets its own flag and reason, and all
# other status-specific fields are None / 0.
surge_multiplier = 1.2  # fare uplift applied on match days and weekends

data = []
for i in range(num_rows):
    booked_at = random_dates[i]
    date = booked_at.strftime("%Y-%m-%d")
    time = booked_at.strftime("%H:%M:%S")
    booking_id = booking_ids[i]
    status = statuses[i]
    is_success = status == "Success"

    customer_id = "CUST" + str(random.randint(100000, 999999))
    vehicle = random.choice(vehicle_types)
    pickup = random.choice(delhi_ncr_locations)
    drop = random.choice(delhi_ncr_locations)

    avg_vtat = random.uniform(2, 10) if is_success else None
    avg_ctat = random.uniform(5, 15) if is_success else None

    cancelled_by_customer = 1 if status == "Cancelled by Customer" else 0
    customer_cancel_reason = random.choice(customer_cancel_reasons) if cancelled_by_customer else None

    cancelled_by_driver = 1 if status == "Cancelled by Driver" else 0
    driver_cancel_reason = random.choice(driver_cancel_reasons) if cancelled_by_driver else None

    incomplete_rides = 1 if status == "Incomplete" else 0
    incomplete_reason = random.choice(incomplete_reasons) if incomplete_rides else None

    ride_distance = random.uniform(2, 25) if is_success else None

    # Booking value distribution: rand_val is drawn for every row but only
    # used to pick the fare band of successful rides.
    rand_val = random.random()
    if is_success:
        if rand_val < low_value_rate:
            booking_value = random.randint(50, 499)
        elif rand_val < (low_value_rate + mid_value_rate):
            booking_value = random.randint(500, 999)
        else:
            booking_value = random.randint(1000, 5000)
    else:
        booking_value = None

    driver_rating = round(random.uniform(3, 5), 1) if is_success else None
    customer_rating = round(random.uniform(3, 5), 1) if is_success else None

    # Increase booking value on match days and weekends.
    # The weekday comes straight from the datetime already in hand instead of
    # re-parsing the formatted string with pandas on every row, and the uplifted
    # fare is rounded to 2 decimals so float artefacts (e.g. 3 * 1.2 ->
    # 3.5999999999999996) do not end up in the output.
    if booking_value is not None and (date in match_days or booked_at.weekday() in weekends):
        booking_value = round(booking_value * surge_multiplier, 2)

    data.append([date, time, booking_id, status, customer_id, vehicle, pickup, drop, avg_vtat, avg_ctat,
                 cancelled_by_customer, customer_cancel_reason, cancelled_by_driver, driver_cancel_reason,
                 incomplete_rides, incomplete_reason, booking_value, ride_distance, driver_rating, customer_rating])

In [None]:
# Assemble the generated rows into a DataFrame. The names below must stay in
# the same order as the fields appended to each row of `data`.
column_names = [
    "Date",
    "Time",
    "Booking ID",
    "Booking Status",
    "Customer ID",
    "Vehicle Type",
    "Pickup Location",
    "Drop Location",
    "Avg VTAT",
    "Avg CTAT",
    "Cancelled Rides by Customer",
    "Reason for cancelling by Customer",
    "Cancelled Rides by Driver",
    "Reason for cancelling by Driver",
    "Incomplete Rides",
    "Incomplete Rides Reason",
    "Booking Value",
    "Ride Distance",
    "Driver Ratings",
    "Customer Rating",
]
df_delhi_ncr = pd.DataFrame(data, columns=column_names)


In [22]:
# Write the generated bookings to a CSV file, leaving out the DataFrame index.
file_path_delhi_ncr = "/mnt/data/Delhi_NCR_OLA_Data.csv"
df_delhi_ncr.to_csv(
    file_path_delhi_ncr,
    index=False,
)

# Last expression of the cell: displays the path the file was written to.
file_path_delhi_ncr