In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Detect environment: when ipykernel is loaded the project root is taken to be
# the parent of the current working directory; otherwise it is the directory
# containing this file.
if 'ipykernel' in sys.modules:
    ROOT_DIR = Path.cwd().parent
else:
    ROOT_DIR = Path(__file__).resolve().parent

DATA_PATH = ROOT_DIR / "data" / "raw" / "NYC.csv"

# Columns kept from the raw trip file; every other column is dropped.
BASE_COLUMNS = [
    'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
]

# Zone names that are assigned to trips at random (see prepare_base_trips).
zones = [
    "Indiranagar","Koramangala","MG Road","Whitefield","Bellandur",
    "Marathahalli","HSR Layout","BTM Layout","Electronic City","Jayanagar",
    "Banashankari","Rajajinagar","Hebbal","Yelahanka","Basavanagudi"
]

# Seed for the zone assignment. A dedicated generator is used so the zones are
# repeatable regardless of when (or whether) the global np.random seed is set.
ZONE_SEED = 42


def prepare_base_trips(raw, zone_names=None, seed=ZONE_SEED):
    """Build the base trip frame from the raw CSV contents.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the columns listed in ``BASE_COLUMNS``.
        It is not modified.
    zone_names : sequence of str, optional
        Labels to draw the ``zone`` column from. Defaults to ``zones``.
    seed : int or None, optional
        Seed for the zone draw. ``None`` gives a non-repeatable draw.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``BASE_COLUMNS`` (``pickup_datetime`` parsed to
        datetimes) plus ``hour``, ``day`` (``dt.dayofweek``), ``date`` and a
        randomly assigned ``zone``.

    Raises
    ------
    KeyError
        If any of ``BASE_COLUMNS`` is missing from ``raw``.
    """
    if zone_names is None:
        zone_names = zones

    # Explicit copy: the assignments below then write to an independent frame
    # instead of a slice of `raw`, which avoids pandas' SettingWithCopyWarning.
    trips = raw[BASE_COLUMNS].copy()

    trips['pickup_datetime'] = pd.to_datetime(trips['pickup_datetime'])
    pickup = trips['pickup_datetime'].dt
    trips['hour'] = pickup.hour
    trips['day'] = pickup.dayofweek
    trips['date'] = pickup.date

    rng = np.random.default_rng(seed)
    trips['zone'] = rng.choice(list(zone_names), len(trips))
    return trips


# Load dataset. A missing file is reported below rather than raised, and `df`
# is then left as an empty frame.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    df = pd.DataFrame()

if not df.empty:
    df = prepare_base_trips(df)
    print("Base cab data loaded and formatted.")
else:
    print(f"Could not load data from {DATA_PATH}")


Base cab data loaded and formatted.


In [7]:
# Seed NumPy's legacy global RNG so that np.random.* draws executed after this
# statement are repeatable.
# NOTE(review): this cell carries execution count 7 while the cells below it
# are numbered 2-6, so it appears to have been run last; the saved outputs were
# presumably not produced with this seed. Re-run from a fresh kernel to confirm.
np.random.seed(42)

In [2]:
# Probability of each simulated weather condition.
weather_probs = dict(
    Clear=0.40,
    Clouds=0.30,
    Mist=0.15,
    Rain=0.10,
    Fog=0.03,
    Thunderstorm=0.02,
)

n_rows = len(df)

# Draw one weather label per row according to the table above.
conditions = list(weather_probs)
likelihoods = [weather_probs[name] for name in conditions]
df['weather'] = np.random.choice(conditions, n_rows, p=likelihoods)

# Temperature: normal draw (mean 27, sd 4) limited to the 18-38 range.
raw_temperature = np.random.normal(27, 4, n_rows)
df['temperature'] = np.clip(raw_temperature, 18, 38)

print("Simulated weather and temperature added.")

Simulated weather and temperature added.


In [3]:
def generate_realistic_traffic(hour, base_random):
    """Return a traffic level for one hour of the day.

    Each hour belongs to a tier with its own baseline, noise scale and
    clipping bounds; hours not listed in any tier use the lowest tier.

    Parameters
    ----------
    hour : scalar
        Hour of day, compared by equality against the tier hour lists.
    base_random : number
        Noise term; it is multiplied by the tier's scale and added to the
        tier's baseline before clipping.

    Returns
    -------
    The result of ``np.clip`` on ``baseline + base_random * scale``.
    """
    # (hours, baseline, noise scale, lower bound, upper bound)
    tiers = (
        ((7, 8, 9, 18, 19, 20), 3.5, 0.8, 3.0, 5.0),
        ((10, 11, 17, 21), 2.5, 1.0, 2.0, 4.0),
        ((12, 13, 14, 15, 16), 2.0, 0.8, 1.5, 3.5),
    )
    for hours, baseline, spread, floor, ceiling in tiers:
        if hour in hours:
            return np.clip(baseline + base_random * spread, floor, ceiling)

    # Every remaining hour falls through to the lowest tier.
    return np.clip(1.0 + base_random * 0.5, 1.0, 2.5)

# One Gaussian noise value (mean 0, sd 0.5) per row.
base_random = np.random.normal(0, 0.5, len(df))

# Pass each row's own noise INTO generate_realistic_traffic. Calling it with 0
# and adding the noise afterwards bypassed the function's per-tier noise
# scaling and per-tier clip bounds, leaving only the global 1-5 clip.
df['traffic'] = [
    generate_realistic_traffic(hour, noise)
    for hour, noise in zip(df['hour'], base_random)
]

# Additive traffic boost per weather condition; a condition missing from this
# table maps to NaN and propagates into `traffic`.
weather_traffic_boost = df['weather'].map({
    'Clear': 0, 'Clouds': 0.2, 'Mist': 0.3,
    'Rain': 0.8, 'Fog': 0.5, 'Thunderstorm': 1.0
})

# Keep the boosted value on the 1-5 scale.
df['traffic'] = np.clip(df['traffic'] + weather_traffic_boost, 1, 5)

In [4]:
# Weight per pickup zone; zones not listed here fall back to 0.5.
_SURGE_ZONE_WEIGHTS = {
    "Indiranagar": 1.0, "Koramangala": 1.0, "MG Road": 1.0,
    "Whitefield": 0.9, "Bellandur": 0.9,
    "Marathahalli": 0.8, "HSR Layout": 0.8,
    "BTM Layout": 0.7, "Electronic City": 0.7, "Jayanagar": 0.7,
    "Banashankari": 0.6, "Rajajinagar": 0.6, "Hebbal": 0.6,
    "Yelahanka": 0.5, "Basavanagudi": 0.5,
}

# Weight per weather condition; conditions not listed here fall back to 0.2.
_SURGE_WEATHER_WEIGHTS = {
    "Clear": 0.1, "Clouds": 0.2, "Mist": 0.4,
    "Fog": 0.6, "Rain": 1.0, "Thunderstorm": 1.0,
}


def calculate_surge_label(row):
    """Classify one trip as "Low", "Medium" or "High" surge.

    ``row`` must support item access for "zone", "weather", "hour",
    "temperature" and "traffic". The score is a weighted sum of five factors
    (traffic 0.30, weather 0.25, hour 0.20, zone 0.15, temperature 0.10) and
    is bucketed at 0.35 and 0.60.
    """
    zone_weight = _SURGE_ZONE_WEIGHTS.get(row["zone"], 0.5)
    weather_weight = _SURGE_WEATHER_WEIGHTS.get(row["weather"], 0.2)

    # Hour factor: 7-10 and 17-21 score highest, 11-16 in between.
    hour = row["hour"]
    in_peak = (7 <= hour <= 10) or (17 <= hour <= 21)
    hour_factor = 1.0 if in_peak else (0.5 if 11 <= hour <= 16 else 0.2)

    # Temperature factor: above 32 or below 18 scores highest, 25-32 in between.
    temp = row["temperature"]
    is_extreme = temp > 32 or temp < 18
    temp_factor = 1.0 if is_extreme else (0.5 if 25 <= temp <= 32 else 0.2)

    # Traffic is divided by 5, the top of its scale.
    traffic_factor = row["traffic"] / 5.0

    surge_score = (
        0.30 * traffic_factor
        + 0.25 * weather_weight
        + 0.20 * hour_factor
        + 0.15 * zone_weight
        + 0.10 * temp_factor
    )

    if surge_score < 0.35:
        return "Low"
    if surge_score < 0.60:
        return "Medium"
    return "High"

# Score every row (axis=1 hands each row to the labelling function) and store
# the resulting labels as a new column.
surge_labels = df.apply(calculate_surge_label, axis=1)
df["surge_label"] = surge_labels


In [5]:
# Write the enriched frame to <ROOT_DIR>/data/processed/merged_data.csv,
# creating the folder first if it does not exist yet.
processed_dir = ROOT_DIR / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
output_path = processed_dir / "merged_data.csv"

df.to_csv(output_path, index=False)

# Short summary of what was written.
print(
    f"Final dataset saved at: {output_path}",
    f"Columns: {df.columns.tolist()}",
    f"Total records: {len(df)}",
    sep="\n",
)

Final dataset saved at: c:\Users\ASUS\OneDrive\Desktop\SurgeSense\data\processed\merged_data.csv
Columns: ['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'hour', 'day', 'date', 'zone', 'weather', 'temperature', 'traffic', 'surge_label']
Total records: 1458644


In [6]:
import sys
import pandas as pd
from pathlib import Path

def shrink_csv_in_place(csv_path, max_rows=5000, seed=42):
    """Replace a CSV file with a random sample of its rows when it is too big.

    Parameters
    ----------
    csv_path : str or pathlib.Path
        CSV file to read and, if needed, overwrite.
    max_rows : int, optional
        Largest row count that is left untouched; bigger files are sampled
        down to exactly this many rows.
    seed : int, optional
        ``random_state`` passed to ``DataFrame.sample``.

    Returns
    -------
    (full, sample) : tuple
        ``full`` is the frame as read from disk. ``sample`` is the frame that
        was written back, or ``None`` when the file was already small enough.
    """
    csv_path = Path(csv_path)

    print("Reading big file... (this might take 10 seconds)")
    full = pd.read_csv(csv_path)

    if len(full) <= max_rows:
        print("File is already small enough. No changes made.")
        return full, None

    print(f"Original size: {len(full)} rows. Shrinking to {max_rows:,}...")
    sample = full.sample(n=max_rows, random_state=seed)

    # Write to a sibling temp file and swap it in with a single rename, so an
    # interrupted write cannot leave a truncated CSV in place of the original.
    tmp_path = csv_path.with_name(csv_path.name + ".tmp")
    try:
        sample.to_csv(tmp_path, index=False)
        tmp_path.replace(csv_path)
    finally:
        # Only still present if the write or the swap failed.
        if tmp_path.exists():
            tmp_path.unlink()

    print("Success! File overwritten with small version.")
    return full, sample


if 'ipykernel' in sys.modules:
    current_path = Path.cwd()
    # First choice is unchanged: the parent when running from a 'notebooks'
    # folder, the working directory otherwise. The other location is tried as
    # a fallback, because the first cell of this notebook always resolves the
    # root as the parent of the working directory and the export cell writes
    # there.
    if current_path.name == 'notebooks':
        root_candidates = [current_path.parent, current_path]
    else:
        root_candidates = [current_path, current_path.parent]
    ROOT_DIR = next(
        (
            candidate for candidate in root_candidates
            if (candidate / "data" / "processed" / "merged_data.csv").exists()
        ),
        root_candidates[0],
    )
else:
    ROOT_DIR = Path(__file__).resolve().parent

file_path = ROOT_DIR / "data" / "processed" / "merged_data.csv"

print(f"Looking for file at: {file_path}")

if file_path.exists():
    # `df` is rebound to the frame as read from disk; `df_small` is only
    # defined when the file was actually shrunk.
    df, _shrunk = shrink_csv_in_place(file_path)
    if _shrunk is not None:
        df_small = _shrunk
else:
    print("Error: Could not find 'merged_data.csv'. Check your folder structure.")

Looking for file at: c:\Users\ASUS\OneDrive\Desktop\SurgeSense\data\processed\merged_data.csv
Reading big file... (this might take 10 seconds)
Original size: 1458644 rows. Shrinking to 5,000...
Success! File overwritten with small version.
