In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the rides_trips data
df = pd.read_csv('server/data/ride_trips.csv')

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

# Focus on pickup_hex_id9 and drop_hex_id9 columns
hex_columns = ['pickup_hex_id9', 'drop_hex_id9']

print("\n" + "="*50)
print("ANALYSIS OF HEX ID COLUMNS")
print("="*50)

for col in hex_columns:
  if col in df.columns:
    print(f"\n{col.upper()}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Missing values: {df[col].isnull().sum()}")
    print(f"  Data type: {df[col].dtype}")
    print(f"  Sample values: {df[col].dropna().head().tolist()}")
    print(f"  Most common values:")
    print(f"    {df[col].value_counts().head()}")
  else:
    print(f"\n{col} column not found in dataset")

# Display first few rows with hex columns
print("\n" + "="*50)
print("SAMPLE DATA")
print("="*50)
if all(col in df.columns for col in hex_columns):
  print(df[hex_columns].head(10))
else:
  print("Available columns:", [col for col in hex_columns if col in df.columns])

Dataset shape: (3000, 24)

Column names:
['ride_id', 'driver_id', 'rider_id', 'city_id', 'product', 'vehicle_type', 'is_ev', 'start_time', 'end_time', 'pickup_lat', 'pickup_lon', 'pickup_hex_id9', 'drop_lat', 'drop_lon', 'drop_hex_id9', 'distance_km', 'duration_mins', 'surge_multiplier', 'fare_amount', 'uber_fee', 'net_earnings', 'tips', 'payment_type', 'date']

ANALYSIS OF HEX ID COLUMNS

PICKUP_HEX_ID9:
  Unique values: 2970
  Missing values: 0
  Data type: object
  Sample values: ['89b5443252677be', '89fa132b2821409', '89df88c01c5d375', '8923a24a0c47657', '897df7e0ca6a6f7']
  Most common values:
    pickup_hex_id9
89b231f4d0144bb    2
8971e8676a25686    2
89a81df844d05f4    2
896cd847e7b5bd3    2
89ab997665e6fbb    2
Name: count, dtype: int64

DROP_HEX_ID9:
  Unique values: 2969
  Missing values: 0
  Data type: object
  Sample values: ['896326c3e14b5c2', '898e2394712bd2c', '8947d4823562561', '89cf8728790df33', '8900b3581cd6ec3']
  Most common values:
    drop_hex_id9
8999ccd5fffa82b

In [2]:
# Create visualizations for zone analysis

# 1. Top pickup zones
pickup_counts = df['pickup_hex_id9'].value_counts().head(20)
fig1 = px.bar(
    x=pickup_counts.index,
    y=pickup_counts.values,
    title="Top 20 Pickup Zones (by Hex ID)",
    labels={'x': 'Hex ID', 'y': 'Number of Pickups'}
)
fig1.update_layout(xaxis_tickangle=45)
fig1.show()

# 2. Top drop-off zones
dropoff_counts = df['drop_hex_id9'].value_counts().head(20)
fig2 = px.bar(
    x=dropoff_counts.index,
    y=dropoff_counts.values,
    title="Top 20 Drop-off Zones (by Hex ID)",
    labels={'x': 'Hex ID', 'y': 'Number of Drop-offs'}
)
fig2.update_layout(xaxis_tickangle=45)
fig2.show()

# 3. Distribution of unique zones
zone_stats = pd.DataFrame({
    'Zone Type': ['Pickup Zones', 'Drop-off Zones'],
    'Unique Count': [df['pickup_hex_id9'].nunique(), df['drop_hex_id9'].nunique()]
})

fig3 = px.bar(zone_stats, x='Zone Type', y='Unique Count', 
              title='Number of Unique Zones for Pickup vs Drop-off')
fig3.show()

print(f"Total unique pickup zones: {df['pickup_hex_id9'].nunique()}")
print(f"Total unique drop-off zones: {df['drop_hex_id9'].nunique()}")
print(f"Zones that appear in both pickup and drop-off: {len(set(df['pickup_hex_id9']).intersection(set(df['drop_hex_id9'])))}")

Total unique pickup zones: 2970
Total unique drop-off zones: 2969
Zones that appear in both pickup and drop-off: 55


In [3]:
# Geographic analysis of zones

# 4. Scatter plot of pickup and drop-off locations
sample_size = min(1000, len(df))  # Use a sample for better performance
df_sample = df.sample(n=sample_size, random_state=42)

fig4 = px.scatter_mapbox(
    df_sample,
    lat="pickup_lat",
    lon="pickup_lon",
    title=f"Pickup Locations (Sample of {sample_size} rides)",
    zoom=10,
    height=600,
    mapbox_style="open-street-map"
)
fig4.show()

# 5. Heat map style scatter plot
fig5 = px.density_mapbox(
    df_sample,
    lat="pickup_lat",
    lon="pickup_lon",
    radius=10,
    title=f"Pickup Density Heat Map (Sample of {sample_size} rides)",
    zoom=10,
    height=600,
    mapbox_style="open-street-map"
)
fig5.show()

# 6. Analysis of zone coverage
print("\n" + "="*50)
print("GEOGRAPHIC ANALYSIS")
print("="*50)
print(f"Pickup coordinates range:")
print(f"  Latitude: {df['pickup_lat'].min():.6f} to {df['pickup_lat'].max():.6f}")
print(f"  Longitude: {df['pickup_lon'].min():.6f} to {df['pickup_lon'].max():.6f}")
print(f"\nDrop-off coordinates range:")
print(f"  Latitude: {df['drop_lat'].min():.6f} to {df['drop_lat'].max():.6f}")
print(f"  Longitude: {df['drop_lon'].min():.6f} to {df['drop_lon'].max():.6f}")

# 7. Most active zones (combining pickup and dropoff)
all_zones = pd.concat([
    df['pickup_hex_id9'].value_counts().rename('pickup_count'),
    df['drop_hex_id9'].value_counts().rename('dropoff_count')
], axis=1).fillna(0)

all_zones['total_activity'] = all_zones['pickup_count'] + all_zones['dropoff_count']
most_active = all_zones.sort_values('total_activity', ascending=False).head(10)

print(f"\nTop 10 most active zones (pickup + dropoff):")
print(most_active)


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*density_mapbox* is deprecated! Use *density_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




GEOGRAPHIC ANALYSIS
Pickup coordinates range:
  Latitude: 51.403700 to 52.405810
  Longitude: 4.243310 to 5.523270

Drop-off coordinates range:
  Latitude: 51.403730 to 52.405940
  Longitude: 4.243270 to 5.523290

Top 10 most active zones (pickup + dropoff):
                 pickup_count  dropoff_count  total_activity
896388666ac90f6           2.0            1.0             3.0
89fc6124bcd5982           2.0            1.0             3.0
89b231f4d0144bb           2.0            0.0             2.0
89e00a119f1b4bc           1.0            1.0             2.0
89e5f9c7a165d87           1.0            1.0             2.0
89c14c893200a0f           0.0            2.0             2.0
893c5224fd80533           1.0            1.0             2.0
898733b7c5d91f8           1.0            1.0             2.0
89dc4d68524aecd           1.0            1.0             2.0
89b1a3fa59e6db2           1.0            1.0             2.0


In [4]:
# LATITUDE AND LONGITUDE ANALYSIS AND GROUPING
print("="*60)
print("COORDINATE ANALYSIS AND GROUPING STRATEGIES")
print("="*60)

# First, let's examine the coordinate data
print("Coordinate Data Overview:")
print(f"Pickup coordinates:")
print(f"  Latitude range: {df['pickup_lat'].min():.6f} to {df['pickup_lat'].max():.6f}")
print(f"  Longitude range: {df['pickup_lon'].min():.6f} to {df['pickup_lon'].max():.6f}")
print(f"  Latitude span: {df['pickup_lat'].max() - df['pickup_lat'].min():.6f} degrees")
print(f"  Longitude span: {df['pickup_lon'].max() - df['pickup_lon'].min():.6f} degrees")

print(f"\nDrop-off coordinates:")
print(f"  Latitude range: {df['drop_lat'].min():.6f} to {df['drop_lat'].max():.6f}")
print(f"  Longitude range: {df['drop_lon'].min():.6f} to {df['drop_lon'].max():.6f}")
print(f"  Latitude span: {df['drop_lat'].max() - df['drop_lat'].min():.6f} degrees")
print(f"  Longitude span: {df['drop_lon'].max() - df['drop_lon'].min():.6f} degrees")

# Method 1: Grid-based grouping (divide area into equal grid cells)
def create_coordinate_grid(lat, lon, grid_size=0.01):
    """Group coordinates into grid cells"""
    lat_grid = np.floor(lat / grid_size) * grid_size
    lon_grid = np.floor(lon / grid_size) * grid_size
    return lat_grid, lon_grid

# Apply grid grouping for pickups
grid_size = 0.01  # About 1km grid cells
df['pickup_lat_grid'], df['pickup_lon_grid'] = create_coordinate_grid(
    df['pickup_lat'], df['pickup_lon'], grid_size
)
df['pickup_grid_id'] = df['pickup_lat_grid'].astype(str) + '_' + df['pickup_lon_grid'].astype(str)

# Apply grid grouping for drop-offs
df['drop_lat_grid'], df['drop_lon_grid'] = create_coordinate_grid(
    df['drop_lat'], df['drop_lon'], grid_size
)
df['drop_grid_id'] = df['drop_lat_grid'].astype(str) + '_' + df['drop_lon_grid'].astype(str)

print(f"\nMethod 1: Grid-based grouping (grid size: {grid_size} degrees)")
print(f"Unique pickup grid cells: {df['pickup_grid_id'].nunique()}")
print(f"Unique drop-off grid cells: {df['drop_grid_id'].nunique()}")

# Method 2: Rounded coordinates (simpler grouping)
def round_coordinates(lat, lon, decimal_places=2):
    """Round coordinates to specified decimal places"""
    return np.round(lat, decimal_places), np.round(lon, decimal_places)

df['pickup_lat_rounded'], df['pickup_lon_rounded'] = round_coordinates(
    df['pickup_lat'], df['pickup_lon'], 2
)
df['pickup_rounded_id'] = df['pickup_lat_rounded'].astype(str) + '_' + df['pickup_lon_rounded'].astype(str)

df['drop_lat_rounded'], df['drop_lon_rounded'] = round_coordinates(
    df['drop_lat'], df['drop_lon'], 2
)
df['drop_rounded_id'] = df['drop_lat_rounded'].astype(str) + '_' + df['drop_lon_rounded'].astype(str)

print(f"\nMethod 2: Rounded coordinates (2 decimal places)")
print(f"Unique pickup rounded locations: {df['pickup_rounded_id'].nunique()}")
print(f"Unique drop-off rounded locations: {df['drop_rounded_id'].nunique()}")

# Method 3: Zone-based grouping (divide into geographic regions)
def create_geographic_zones(lat, lon, lat_zones=10, lon_zones=10):
    """Divide coordinates into geographic zones"""
    lat_min, lat_max = lat.min(), lat.max()
    lon_min, lon_max = lon.min(), lon.max()
    
    lat_bins = np.linspace(lat_min, lat_max, lat_zones + 1)
    lon_bins = np.linspace(lon_min, lon_max, lon_zones + 1)
    
    lat_zone = pd.cut(lat, bins=lat_bins, labels=False, include_lowest=True)
    lon_zone = pd.cut(lon, bins=lon_bins, labels=False, include_lowest=True)
    
    return lat_zone, lon_zone

# Apply zone grouping
lat_zones, lon_zones = 10, 10
df['pickup_lat_zone'], df['pickup_lon_zone'] = create_geographic_zones(
    df['pickup_lat'], df['pickup_lon'], lat_zones, lon_zones
)
df['pickup_zone_id'] = df['pickup_lat_zone'].astype(str) + '_' + df['pickup_lon_zone'].astype(str)

df['drop_lat_zone'], df['drop_lon_zone'] = create_geographic_zones(
    df['drop_lat'], df['drop_lon'], lat_zones, lon_zones
)
df['drop_zone_id'] = df['drop_lat_zone'].astype(str) + '_' + df['drop_lon_zone'].astype(str)

print(f"\nMethod 3: Zone-based grouping ({lat_zones}x{lon_zones} zones)")
print(f"Unique pickup zones: {df['pickup_zone_id'].nunique()}")
print(f"Unique drop-off zones: {df['drop_zone_id'].nunique()}")

# Display comparison of all methods
print(f"\n" + "="*60)
print("COMPARISON OF GROUPING METHODS")
print("="*60)
comparison_data = {
    'Method': ['Original Hex IDs', 'Grid (0.01°)', 'Rounded (2 dec)', 'Geographic Zones'],
    'Pickup Groups': [df['pickup_hex_id9'].nunique(), df['pickup_grid_id'].nunique(), 
                     df['pickup_rounded_id'].nunique(), df['pickup_zone_id'].nunique()],
    'Dropoff Groups': [df['drop_hex_id9'].nunique(), df['drop_grid_id'].nunique(),
                      df['drop_rounded_id'].nunique(), df['drop_zone_id'].nunique()]
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)

COORDINATE ANALYSIS AND GROUPING STRATEGIES
Coordinate Data Overview:
Pickup coordinates:
  Latitude range: 51.403700 to 52.405810
  Longitude range: 4.243310 to 5.523270
  Latitude span: 1.002110 degrees
  Longitude span: 1.279960 degrees

Drop-off coordinates:
  Latitude range: 51.403730 to 52.405940
  Longitude range: 4.243270 to 5.523290
  Latitude span: 1.002210 degrees
  Longitude span: 1.280020 degrees

Method 1: Grid-based grouping (grid size: 0.01 degrees)
Unique pickup grid cells: 84
Unique drop-off grid cells: 83

Method 2: Rounded coordinates (2 decimal places)
Unique pickup rounded locations: 88
Unique drop-off rounded locations: 86

Method 3: Zone-based grouping (10x10 zones)
Unique pickup zones: 10
Unique drop-off zones: 10

COMPARISON OF GROUPING METHODS
             Method  Pickup Groups  Dropoff Groups
0  Original Hex IDs           2970            2969
1      Grid (0.01°)             84              83
2   Rounded (2 dec)             88              86
3  Geographic Z

In [5]:
# VISUALIZATIONS FOR DIFFERENT GROUPING METHODS

# 1. Grid-based analysis
print("Creating visualizations for coordinate groupings...")

# Grid-based pickup analysis
grid_pickup_counts = df['pickup_grid_id'].value_counts().head(15)
fig_grid = px.bar(
    x=grid_pickup_counts.index,
    y=grid_pickup_counts.values,
    title="Top 15 Pickup Grid Cells (0.01° grid)",
    labels={'x': 'Grid Cell ID', 'y': 'Number of Pickups'}
)
fig_grid.update_layout(xaxis_tickangle=45, height=500)
fig_grid.show()

# 2. Geographic zones heatmap
zone_pickup_counts = df.groupby(['pickup_lat_zone', 'pickup_lon_zone']).size().reset_index(name='count')
fig_heatmap = px.density_heatmap(
    zone_pickup_counts,
    x='pickup_lon_zone',
    y='pickup_lat_zone',
    z='count',
    title='Pickup Density by Geographic Zones (10x10 grid)',
    labels={'pickup_lon_zone': 'Longitude Zone', 'pickup_lat_zone': 'Latitude Zone'}
)
fig_heatmap.show()

# 3. Scatter plot with different grouping overlays
sample_df = df.sample(n=500, random_state=42)

# Grid-based scatter
fig_scatter_grid = px.scatter(
    sample_df,
    x='pickup_lon',
    y='pickup_lat',
    color='pickup_grid_id',
    title='Pickup Locations Colored by Grid Cell (Sample of 500)',
    labels={'pickup_lon': 'Longitude', 'pickup_lat': 'Latitude'}
)
fig_scatter_grid.update_layout(showlegend=False)  # Too many categories for legend
fig_scatter_grid.show()

# Zone-based scatter
fig_scatter_zone = px.scatter(
    sample_df,
    x='pickup_lon',
    y='pickup_lat',
    color='pickup_zone_id',
    title='Pickup Locations Colored by Geographic Zone (Sample of 500)',
    labels={'pickup_lon': 'Longitude', 'pickup_lat': 'Latitude'}
)
fig_scatter_zone.show()

# 4. Flow analysis between zones
flow_data = df.groupby(['pickup_zone_id', 'drop_zone_id']).size().reset_index(name='trip_count')
flow_data = flow_data.sort_values('trip_count', ascending=False).head(20)

fig_flow = px.bar(
    flow_data,
    x='trip_count',
    y=flow_data['pickup_zone_id'] + ' → ' + flow_data['drop_zone_id'],
    orientation='h',
    title='Top 20 Zone-to-Zone Trip Flows',
    labels={'trip_count': 'Number of Trips', 'y': 'Pickup Zone → Drop Zone'}
)
fig_flow.update_layout(height=600)
fig_flow.show()

# 5. Activity summary by different grouping methods
print("\n" + "="*60)
print("ACTIVITY ANALYSIS BY GROUPING METHOD")
print("="*60)

# Grid method activity
grid_activity = pd.concat([
    df['pickup_grid_id'].value_counts().rename('pickup'),
    df['drop_grid_id'].value_counts().rename('dropoff')
], axis=1).fillna(0)
grid_activity['total'] = grid_activity['pickup'] + grid_activity['dropoff']
print(f"Grid method - Most active cell: {grid_activity['total'].max()} trips")
print(f"Grid method - Average trips per cell: {grid_activity['total'].mean():.1f}")

# Zone method activity  
zone_activity = pd.concat([
    df['pickup_zone_id'].value_counts().rename('pickup'),
    df['drop_zone_id'].value_counts().rename('dropoff')
], axis=1).fillna(0)
zone_activity['total'] = zone_activity['pickup'] + zone_activity['dropoff']
print(f"Zone method - Most active zone: {zone_activity['total'].max()} trips")
print(f"Zone method - Average trips per zone: {zone_activity['total'].mean():.1f}")

print(f"\nTop 5 most active grid cells:")
print(grid_activity.sort_values('total', ascending=False).head().round(1))
print(f"\nTop 5 most active zones:")
print(zone_activity.sort_values('total', ascending=False).head().round(1))

Creating visualizations for coordinate groupings...



ACTIVITY ANALYSIS BY GROUPING METHOD
Grid method - Most active cell: 342.0 trips
Grid method - Average trips per cell: 70.6
Zone method - Most active zone: 1252 trips
Zone method - Average trips per zone: 600.0

Top 5 most active grid cells:
                         pickup  dropoff  total
51.4_5.44                 174.0    168.0  342.0
52.1_5.13                 162.0    149.0  311.0
52.09_5.17                106.0    106.0  212.0
51.4_5.45                  94.0    103.0  197.0
52.050000000000004_4.32    92.0    105.0  197.0

Top 5 most active zones:
     pickup  dropoff  total
6_0     626      626   1252
0_9     557      557   1114
9_5     366      365    731
6_6     223      250    473
9_4     219      220    439


In [6]:
# PRACTICAL COORDINATE GROUPING RECOMMENDATIONS

print("="*60)
print("COORDINATE GROUPING RECOMMENDATIONS")
print("="*60)

# Create a summary of the different methods and their use cases
print("GROUPING METHOD COMPARISON:")
print("-" * 40)

methods = {
    'Original Hex IDs': {
        'groups': (df['pickup_hex_id9'].nunique(), df['drop_hex_id9'].nunique()),
        'pros': 'Most precise, preserves original data structure',
        'cons': 'Too granular for analysis (2970 groups)',
        'use_case': 'Exact location analysis, matching with external hex systems'
    },
    'Grid-based (0.01°)': {
        'groups': (df['pickup_grid_id'].nunique(), df['drop_grid_id'].nunique()),
        'pros': 'Good balance of precision and grouping (~1km cells)',
        'cons': 'Arbitrary grid boundaries',
        'use_case': 'Density analysis, hotspot identification'
    },
    'Rounded coordinates': {
        'groups': (df['pickup_rounded_id'].nunique(), df['drop_rounded_id'].nunique()),
        'pros': 'Simple to understand and implement',
        'cons': 'Less systematic than grid approach',
        'use_case': 'Quick analysis, approximate location grouping'
    },
    'Geographic zones': {
        'groups': (df['pickup_zone_id'].nunique(), df['drop_zone_id'].nunique()),
        'pros': 'Perfect for high-level analysis, manageable number of groups',
        'cons': 'May lose local patterns',
        'use_case': 'Regional analysis, city-wide patterns, strategic planning'
    }
}

for method, info in methods.items():
    print(f"\n{method}:")
    print(f"  Groups: {info['groups'][0]} pickup, {info['groups'][1]} dropoff")
    print(f"  Pros: {info['pros']}")
    print(f"  Cons: {info['cons']}")
    print(f"  Best for: {info['use_case']}")

# Demonstrate how to create different analyses based on your needs

print(f"\n" + "="*60)
print("EXAMPLE ANALYSES YOU CAN CREATE")
print("="*60)

# Example 1: Peak hour analysis by zone
df['hour'] = pd.to_datetime(df['start_time']).dt.hour
hourly_zone_data = df.groupby(['hour', 'pickup_zone_id']).size().reset_index(name='trips')

print("\n1. PEAK HOURS BY ZONE (using geographic zones):")
peak_hours = hourly_zone_data.groupby('pickup_zone_id')['trips'].idxmax()
for zone in sorted(df['pickup_zone_id'].unique()):
    if not pd.isna(zone):
        zone_data = hourly_zone_data[hourly_zone_data['pickup_zone_id'] == zone]
        if not zone_data.empty:
            peak_hour = zone_data.loc[zone_data['trips'].idxmax(), 'hour']
            peak_trips = zone_data['trips'].max()
            print(f"  Zone {zone}: Peak at {peak_hour}:00 with {peak_trips} trips")

# Example 2: Distance analysis by grouping method
print(f"\n2. AVERAGE TRIP DISTANCE BY GROUPING METHOD:")
for method_name, col_prefix in [('Grid', 'grid'), ('Zone', 'zone')]:
    pickup_col = f'pickup_{col_prefix}_id'
    drop_col = f'drop_{col_prefix}_id'
    
    # Same area trips (within same grid/zone)
    same_area = df[df[pickup_col] == df[drop_col]]
    cross_area = df[df[pickup_col] != df[drop_col]]
    
    print(f"  {method_name} method:")
    print(f"    Same-area trips: {len(same_area)} ({len(same_area)/len(df)*100:.1f}%)")
    print(f"    Cross-area trips: {len(cross_area)} ({len(cross_area)/len(df)*100:.1f}%)")
    print(f"    Avg distance same-area: {same_area['distance_km'].mean():.2f} km")
    print(f"    Avg distance cross-area: {cross_area['distance_km'].mean():.2f} km")

# Example 3: Create a practical grouping function
def create_optimal_coordinate_groups(df, method='grid', grid_size=0.01, zones=(10,10)):
    """
    Create coordinate groups based on specified method
    
    Parameters:
    - method: 'grid', 'rounded', or 'zones'
    - grid_size: size of grid cells in degrees (for grid method)
    - zones: tuple of (lat_zones, lon_zones) for zone method
    """
    
    result_df = df.copy()
    
    if method == 'grid':
        # Grid-based grouping
        result_df['pickup_lat_group'] = np.floor(df['pickup_lat'] / grid_size) * grid_size
        result_df['pickup_lon_group'] = np.floor(df['pickup_lon'] / grid_size) * grid_size
        result_df['pickup_group_id'] = (result_df['pickup_lat_group'].astype(str) + '_' + 
                                       result_df['pickup_lon_group'].astype(str))
        
        result_df['drop_lat_group'] = np.floor(df['drop_lat'] / grid_size) * grid_size
        result_df['drop_lon_group'] = np.floor(df['drop_lon'] / grid_size) * grid_size
        result_df['drop_group_id'] = (result_df['drop_lat_group'].astype(str) + '_' + 
                                     result_df['drop_lon_group'].astype(str))
        
    elif method == 'zones':
        # Zone-based grouping
        lat_zones, lon_zones = zones
        
        lat_bins = np.linspace(df['pickup_lat'].min(), df['pickup_lat'].max(), lat_zones + 1)
        lon_bins = np.linspace(df['pickup_lon'].min(), df['pickup_lon'].max(), lon_zones + 1)
        
        result_df['pickup_lat_group'] = pd.cut(df['pickup_lat'], bins=lat_bins, labels=False, include_lowest=True)
        result_df['pickup_lon_group'] = pd.cut(df['pickup_lon'], bins=lon_bins, labels=False, include_lowest=True)
        result_df['pickup_group_id'] = (result_df['pickup_lat_group'].astype(str) + '_' + 
                                       result_df['pickup_lon_group'].astype(str))
        
        lat_bins = np.linspace(df['drop_lat'].min(), df['drop_lat'].max(), lat_zones + 1)
        lon_bins = np.linspace(df['drop_lon'].min(), df['drop_lon'].max(), lon_zones + 1)
        
        result_df['drop_lat_group'] = pd.cut(df['drop_lat'], bins=lat_bins, labels=False, include_lowest=True)
        result_df['drop_lon_group'] = pd.cut(df['drop_lon'], bins=lon_bins, labels=False, include_lowest=True)
        result_df['drop_group_id'] = (result_df['drop_lat_group'].astype(str) + '_' + 
                                     result_df['drop_lon_group'].astype(str))
    
    return result_df

print(f"\n3. USING THE OPTIMAL GROUPING FUNCTION:")
print("You can now use create_optimal_coordinate_groups() to group coordinates")
print("Example: grouped_df = create_optimal_coordinate_groups(df, method='grid', grid_size=0.005)")

# Quick demonstration
demo_df = create_optimal_coordinate_groups(df.head(100), method='zones', zones=(5,5))
print(f"Demo: 100 rides grouped into {demo_df['pickup_group_id'].nunique()} pickup zones and {demo_df['drop_group_id'].nunique()} drop zones")

COORDINATE GROUPING RECOMMENDATIONS
GROUPING METHOD COMPARISON:
----------------------------------------

Original Hex IDs:
  Groups: 2970 pickup, 2969 dropoff
  Pros: Most precise, preserves original data structure
  Cons: Too granular for analysis (2970 groups)
  Best for: Exact location analysis, matching with external hex systems

Grid-based (0.01°):
  Groups: 84 pickup, 83 dropoff
  Pros: Good balance of precision and grouping (~1km cells)
  Cons: Arbitrary grid boundaries
  Best for: Density analysis, hotspot identification

Rounded coordinates:
  Groups: 88 pickup, 86 dropoff
  Pros: Simple to understand and implement
  Cons: Less systematic than grid approach
  Best for: Quick analysis, approximate location grouping

Geographic zones:
  Groups: 10 pickup, 10 dropoff
  Pros: Perfect for high-level analysis, manageable number of groups
  Cons: May lose local patterns
  Best for: Regional analysis, city-wide patterns, strategic planning

EXAMPLE ANALYSES YOU CAN CREATE

1. PEAK HO

In [7]:
# TESTING DIFFERENT GRID SIZES: 0.02 and 0.03 DEGREES

print("="*60)
print("COMPARING DIFFERENT GRID SIZES")
print("="*60)

# Test different grid sizes
grid_sizes = [0.01, 0.02, 0.03]
grid_results = []

for size in grid_sizes:
    print(f"\nTesting grid size: {size} degrees")
    print("-" * 30)
    
    # Create grid grouping for this size
    pickup_lat_grid = np.floor(df['pickup_lat'] / size) * size
    pickup_lon_grid = np.floor(df['pickup_lon'] / size) * size
    pickup_grid_id = pickup_lat_grid.astype(str) + '_' + pickup_lon_grid.astype(str)
    
    drop_lat_grid = np.floor(df['drop_lat'] / size) * size
    drop_lon_grid = np.floor(df['drop_lon'] / size) * size
    drop_grid_id = drop_lat_grid.astype(str) + '_' + drop_lon_grid.astype(str)
    
    # Calculate statistics
    pickup_groups = pickup_grid_id.nunique()
    dropoff_groups = drop_grid_id.nunique()
    
    # Calculate activity distribution
    pickup_activity = pickup_grid_id.value_counts()
    dropoff_activity = drop_grid_id.value_counts()
    
    # Combined activity
    combined_activity = pd.concat([
        pickup_activity.rename('pickup'),
        dropoff_activity.rename('dropoff')
    ], axis=1).fillna(0)
    combined_activity['total'] = combined_activity['pickup'] + combined_activity['dropoff']
    
    # Calculate metrics
    max_activity = combined_activity['total'].max()
    avg_activity = combined_activity['total'].mean()
    min_activity = combined_activity['total'].min()
    std_activity = combined_activity['total'].std()
    
    # Store results
    result = {
        'grid_size': size,
        'pickup_groups': pickup_groups,
        'dropoff_groups': dropoff_groups,
        'max_activity': max_activity,
        'avg_activity': avg_activity,
        'min_activity': min_activity,
        'std_activity': std_activity,
        'activity_data': combined_activity
    }
    grid_results.append(result)
    
    print(f"Pickup groups: {pickup_groups}")
    print(f"Dropoff groups: {dropoff_groups}")
    print(f"Most active cell: {max_activity:.0f} trips")
    print(f"Average activity: {avg_activity:.1f} trips per cell")
    print(f"Activity range: {min_activity:.0f} - {max_activity:.0f} trips")
    print(f"Activity std dev: {std_activity:.1f}")

# Create comparison DataFrame
comparison_grid_sizes = pd.DataFrame([
    {
        'Grid Size (degrees)': result['grid_size'],
        'Approx Cell Size': f"~{result['grid_size']*111:.0f}m x {result['grid_size']*111:.0f}m",
        'Pickup Groups': result['pickup_groups'],
        'Dropoff Groups': result['dropoff_groups'],
        'Max Activity': f"{result['max_activity']:.0f}",
        'Avg Activity': f"{result['avg_activity']:.1f}",
        'Activity Std Dev': f"{result['std_activity']:.1f}"
    }
    for result in grid_results
])

print(f"\n" + "="*60)
print("GRID SIZE COMPARISON SUMMARY")
print("="*60)
print(comparison_grid_sizes.to_string(index=False))

# Visualize the comparison
fig_comparison = px.bar(
    comparison_grid_sizes,
    x='Grid Size (degrees)',
    y=['Pickup Groups', 'Dropoff Groups'],
    title='Number of Groups by Grid Size',
    barmode='group'
)
fig_comparison.show()

# Activity distribution comparison
fig_activity = go.Figure()

for result in grid_results:
    size = result['grid_size']
    activity_counts = result['activity_data']['total'].value_counts().sort_index()
    
    fig_activity.add_trace(go.Scatter(
        x=activity_counts.index,
        y=activity_counts.values,
        mode='lines+markers',
        name=f'{size}° grid',
        line=dict(width=2)
    ))

fig_activity.update_layout(
    title='Activity Distribution by Grid Size',
    xaxis_title='Number of Trips per Cell',
    yaxis_title='Number of Cells',
    height=500
)
fig_activity.show()

print(f"\n" + "="*60)
print("RECOMMENDATIONS BY GRID SIZE")
print("="*60)

recommendations = {
    0.01: {
        'cell_size': '~1.1km x 1.1km',
        'best_for': 'Detailed local analysis, identifying specific hotspots',
        'pros': 'High precision, good for micro-level patterns',
        'cons': 'Many groups to manage, may be too granular'
    },
    0.02: {
        'cell_size': '~2.2km x 2.2km', 
        'best_for': 'Neighborhood-level analysis, balanced detail vs simplicity',
        'pros': 'Good balance of detail and manageability',
        'cons': 'May miss very local patterns'
    },
    0.03: {
        'cell_size': '~3.3km x 3.3km',
        'best_for': 'District-level analysis, city planning',
        'pros': 'Fewer groups, easier to analyze broad patterns',
        'cons': 'Lower resolution, may lose important local details'
    }
}

for size, info in recommendations.items():
    result = next(r for r in grid_results if r['grid_size'] == size)
    print(f"\n{size}° Grid ({info['cell_size']}):")
    print(f"  Groups: {result['pickup_groups']} pickup, {result['dropoff_groups']} dropoff")
    print(f"  Best for: {info['best_for']}")
    print(f"  Pros: {info['pros']}")
    print(f"  Cons: {info['cons']}")
    print(f"  Activity range: {result['min_activity']:.0f}-{result['max_activity']:.0f} trips per cell")

COMPARING DIFFERENT GRID SIZES

Testing grid size: 0.01 degrees
------------------------------
Pickup groups: 84
Dropoff groups: 83
Most active cell: 342 trips
Average activity: 70.6 trips per cell
Activity range: 1 - 342 trips
Activity std dev: 68.7

Testing grid size: 0.02 degrees
------------------------------
Pickup groups: 49
Dropoff groups: 47
Most active cell: 544 trips
Average activity: 122.4 trips per cell
Activity range: 1 - 544 trips
Activity std dev: 114.4

Testing grid size: 0.03 degrees
------------------------------
Pickup groups: 40
Dropoff groups: 40
Most active cell: 554 trips
Average activity: 150.0 trips per cell
Activity range: 2 - 554 trips
Activity std dev: 118.7

GRID SIZE COMPARISON SUMMARY
 Grid Size (degrees) Approx Cell Size  Pickup Groups  Dropoff Groups Max Activity Avg Activity Activity Std Dev
                0.01         ~1m x 1m             84              83          342         70.6             68.7
                0.02         ~2m x 2m             4


RECOMMENDATIONS BY GRID SIZE

0.01° Grid (~1.1km x 1.1km):
  Groups: 84 pickup, 83 dropoff
  Best for: Detailed local analysis, identifying specific hotspots
  Pros: High precision, good for micro-level patterns
  Cons: Many groups to manage, may be too granular
  Activity range: 1-342 trips per cell

0.02° Grid (~2.2km x 2.2km):
  Groups: 49 pickup, 47 dropoff
  Best for: Neighborhood-level analysis, balanced detail vs simplicity
  Pros: Good balance of detail and manageability
  Cons: May miss very local patterns
  Activity range: 1-544 trips per cell

0.03° Grid (~3.3km x 3.3km):
  Groups: 40 pickup, 40 dropoff
  Best for: District-level analysis, city planning
  Pros: Fewer groups, easier to analyze broad patterns
  Cons: Lower resolution, may lose important local details
  Activity range: 2-554 trips per cell


In [8]:
# VISUAL COMPARISON OF DIFFERENT GRID SIZES

print("Creating visual comparisons of different grid sizes...")

# Sample data for visualization
sample_data = df.sample(n=800, random_state=42)

# Create subplots for different grid sizes
fig_grids = make_subplots(
    rows=1, cols=3,
    subplot_titles=['0.01° Grid (~1.1km)', '0.02° Grid (~2.2km)', '0.03° Grid (~3.3km)'],
    specs=[[{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}]]
)

colors = ['red', 'blue', 'green']
grid_sizes_viz = [0.01, 0.02, 0.03]

for i, size in enumerate(grid_sizes_viz):
    # Create grid IDs for this size
    lat_grid = np.floor(sample_data['pickup_lat'] / size) * size
    lon_grid = np.floor(sample_data['pickup_lon'] / size) * size
    grid_id = lat_grid.astype(str) + '_' + lon_grid.astype(str)
    
    # Create scatter plot
    fig_grids.add_scatter(
        x=sample_data['pickup_lon'],
        y=sample_data['pickup_lat'],
        mode='markers',
        marker=dict(
            size=4,
            color=colors[i],
            opacity=0.6
        ),
        name=f'{size}° grid',
        row=1, col=i+1
    )
    
    # Add grid lines for visualization
    lat_min, lat_max = sample_data['pickup_lat'].min(), sample_data['pickup_lat'].max()
    lon_min, lon_max = sample_data['pickup_lon'].min(), sample_data['pickup_lon'].max()
    
    # Create grid lines
    lat_lines = np.arange(
        np.floor(lat_min / size) * size,
        np.ceil(lat_max / size) * size + size,
        size
    )
    lon_lines = np.arange(
        np.floor(lon_min / size) * size,
        np.ceil(lon_max / size) * size + size,
        size
    )
    
    # Add vertical lines (longitude)
    for lon in lon_lines[::2]:  # Show every other line to avoid clutter
        fig_grids.add_vline(
            x=lon, 
            line=dict(color="gray", width=1, dash="dot"),
            opacity=0.3,
            row=1, col=i+1
        )
    
    # Add horizontal lines (latitude)
    for lat in lat_lines[::2]:  # Show every other line to avoid clutter
        fig_grids.add_hline(
            y=lat,
            line=dict(color="gray", width=1, dash="dot"),
            opacity=0.3,
            row=1, col=i+1
        )

fig_grids.update_layout(
    title_text="Pickup Locations with Different Grid Overlays (Sample of 800 rides)",
    showlegend=False,
    height=500
)

# Update axes labels
for i in range(3):
    fig_grids.update_xaxes(title_text="Longitude", row=1, col=i+1)
    fig_grids.update_yaxes(title_text="Latitude", row=1, col=i+1)

fig_grids.show()

# Create activity heatmaps for each grid size  
for i, size in enumerate(grid_sizes_viz):
    print(f"\nCreating heatmap for {size}° grid...")
    
    # Create grid coordinates
    pickup_lat_grid = np.floor(df['pickup_lat'] / size) * size
    pickup_lon_grid = np.floor(df['pickup_lon'] / size) * size
    
    # Count activities by grid cell
    activity_by_grid = df.groupby([pickup_lat_grid, pickup_lon_grid]).size().reset_index(name='count')
    activity_by_grid.columns = ['lat_grid', 'lon_grid', 'count']
    
    # Create heatmap
    fig_heat = px.density_heatmap(
        activity_by_grid,
        x='lon_grid',
        y='lat_grid',
        z='count',
        title=f'Pickup Activity Heatmap - {size}° Grid (~{size*111:.0f}m cells)',
        labels={'lon_grid': 'Longitude Grid', 'lat_grid': 'Latitude Grid', 'count': 'Number of Trips'}
    )
    fig_heat.show()

# Summary recommendations based on analysis
print(f"\n" + "="*60)
print("GRID SIZE SELECTION GUIDE")
print("="*60)

print("""
CHOOSE YOUR GRID SIZE BASED ON YOUR ANALYSIS GOAL:

🔍 0.01° Grid (1.1km cells) - DETAILED ANALYSIS
   • 84 pickup groups, 83 dropoff groups  
   • Best for: Hotspot identification, local optimization
   • Use when: You need to find exact problem areas or optimize driver positioning
   • Example: "Where exactly should we position drivers in downtown?"

🏘️ 0.02° Grid (2.2km cells) - NEIGHBORHOOD ANALYSIS  
   • 49 pickup groups, 47 dropoff groups
   • Best for: Balanced detail vs simplicity, route optimization
   • Use when: You want neighborhood-level insights without too much complexity
   • Example: "Which neighborhoods have highest demand?"

🏙️ 0.03° Grid (3.3km cells) - DISTRICT ANALYSIS
   • 40 pickup groups, 40 dropoff groups  
   • Best for: City planning, high-level strategy, resource allocation
   • Use when: You need broad patterns for strategic decisions
   • Example: "Which districts need more drivers during peak hours?"

💡 RECOMMENDATION: Start with 0.02° grid for most analyses
   - Good balance of detail and manageability
   - ~47-49 groups are easy to work with
   - Can always drill down to 0.01° for specific areas of interest
""")

# Quick function to apply any grid size
def apply_grid_size(df, grid_size):
    """Apply a specific grid size to the dataframe"""
    df_result = df.copy()
    
    # Pickup grids
    df_result['pickup_lat_grid'] = np.floor(df['pickup_lat'] / grid_size) * grid_size
    df_result['pickup_lon_grid'] = np.floor(df['pickup_lon'] / grid_size) * grid_size
    df_result['pickup_grid_id'] = (df_result['pickup_lat_grid'].astype(str) + '_' + 
                                  df_result['pickup_lon_grid'].astype(str))
    
    # Dropoff grids  
    df_result['drop_lat_grid'] = np.floor(df['drop_lat'] / grid_size) * grid_size
    df_result['drop_lon_grid'] = np.floor(df['drop_lon'] / grid_size) * grid_size
    df_result['drop_grid_id'] = (df_result['drop_lat_grid'].astype(str) + '_' + 
                                df_result['drop_lon_grid'].astype(str))
    
    return df_result

print(f"\n📋 READY TO USE: apply_grid_size(df, 0.02) function is now available")
print(f"Example: df_with_02_grid = apply_grid_size(df, 0.02)")

# Demonstrate the function
demo_02 = apply_grid_size(df.head(10), 0.02)
print(f"\nDemo: Applied 0.02° grid to first 10 rows:")
print(demo_02[['pickup_lat', 'pickup_lon', 'pickup_grid_id']].head())

Creating visual comparisons of different grid sizes...



Creating heatmap for 0.01° grid...



Creating heatmap for 0.02° grid...



Creating heatmap for 0.03° grid...



GRID SIZE SELECTION GUIDE

CHOOSE YOUR GRID SIZE BASED ON YOUR ANALYSIS GOAL:

🔍 0.01° Grid (1.1km cells) - DETAILED ANALYSIS
   • 84 pickup groups, 83 dropoff groups  
   • Best for: Hotspot identification, local optimization
   • Use when: You need to find exact problem areas or optimize driver positioning
   • Example: "Where exactly should we position drivers in downtown?"

🏘️ 0.02° Grid (2.2km cells) - NEIGHBORHOOD ANALYSIS  
   • 49 pickup groups, 47 dropoff groups
   • Best for: Balanced detail vs simplicity, route optimization
   • Use when: You want neighborhood-level insights without too much complexity
   • Example: "Which neighborhoods have highest demand?"

🏙️ 0.03° Grid (3.3km cells) - DISTRICT ANALYSIS
   • 40 pickup groups, 40 dropoff groups  
   • Best for: City planning, high-level strategy, resource allocation
   • Use when: You need broad patterns for strategic decisions
   • Example: "Which districts need more drivers during peak hours?"

💡 RECOMMENDATION: Start w

In [9]:
# CITY ANALYSIS: INTER-CITY TRIPS AND OPTIMAL NODES PER CITY

print("="*60)
print("CITY-BASED ANALYSIS")
print("="*60)

# First, let's examine the cities in our dataset
print("Cities in the dataset:")
cities = df['city_id'].unique()
print(f"Number of cities: {len(cities)}")
print(f"City IDs: {sorted(cities)}")

# Analyze trips by city
print(f"\nTrips per city:")
city_trip_counts = df['city_id'].value_counts().sort_index()
for city in sorted(cities):
    count = city_trip_counts[city]
    percentage = (count / len(df)) * 100
    print(f"  City {city}: {count:,} trips ({percentage:.1f}%)")

# Check for inter-city trips
print(f"\n" + "="*50)
print("INTER-CITY TRIP ANALYSIS")
print("="*50)

# Create pickup and dropoff city analysis
# We'll need to determine city for each pickup/dropoff location
# Since we don't have direct city info for dropoffs, we'll use coordinates

def assign_city_by_coordinates(lat, lon, city_mapping=None):
    """
    Assign city based on coordinates
    This is a simplified approach - in reality you'd use proper geographic boundaries
    """
    if city_mapping is None:
        # We'll create a mapping based on the pickup locations and their known cities
        city_coords = df.groupby('city_id')[['pickup_lat', 'pickup_lon']].mean()
        city_mapping = {}
        for city_id in city_coords.index:
            city_mapping[city_id] = {
                'lat': city_coords.loc[city_id, 'pickup_lat'],
                'lon': city_coords.loc[city_id, 'pickup_lon']
            }
    
    # Find closest city for each coordinate
    min_distance = float('inf')
    closest_city = None
    
    for city_id, coords in city_mapping.items():
        # Simple Euclidean distance (not geographically accurate but good for estimation)
        distance = ((lat - coords['lat'])**2 + (lon - coords['lon'])**2)**0.5
        if distance < min_distance:
            min_distance = distance
            closest_city = city_id
    
    return closest_city

# Get city centers for reference
city_centers = df.groupby('city_id')[['pickup_lat', 'pickup_lon']].agg(['mean', 'std']).round(4)
print("City centers and spread (from pickup locations):")
print(city_centers)

# Create city mapping
city_mapping = {}
for city_id in cities:
    city_data = df[df['city_id'] == city_id]
    city_mapping[city_id] = {
        'lat': city_data['pickup_lat'].mean(),
        'lon': city_data['pickup_lon'].mean()
    }

print(f"\nCity coordinates:")
for city_id, coords in city_mapping.items():
    print(f"  City {city_id}: ({coords['lat']:.4f}, {coords['lon']:.4f})")

# Assign cities to dropoff locations
print(f"\nAssigning dropoff cities based on coordinates...")
df['dropoff_city_estimated'] = df.apply(
    lambda row: assign_city_by_coordinates(row['drop_lat'], row['drop_lon'], city_mapping), 
    axis=1
)

# Analyze inter-city trips
intercity_analysis = df.groupby(['city_id', 'dropoff_city_estimated']).size().reset_index(name='trip_count')
intercity_matrix = intercity_analysis.pivot(index='city_id', columns='dropoff_city_estimated', values='trip_count').fillna(0)

print(f"\nInter-city trip matrix:")
print("Rows = Pickup City, Columns = Dropoff City")
print(intercity_matrix.astype(int))

# Calculate inter-city vs intra-city trips
intra_city_trips = 0
inter_city_trips = 0

for pickup_city in cities:
    for dropoff_city in cities:
        trips = intercity_matrix.loc[pickup_city, dropoff_city] if dropoff_city in intercity_matrix.columns else 0
        if pickup_city == dropoff_city:
            intra_city_trips += trips
        else:
            inter_city_trips += trips

print(f"\n" + "="*50)
print("TRIP DISTRIBUTION SUMMARY")
print("="*50)
total_trips = intra_city_trips + inter_city_trips
print(f"Total trips analyzed: {total_trips:,}")
print(f"Intra-city trips (within same city): {intra_city_trips:,} ({intra_city_trips/total_trips*100:.1f}%)")
print(f"Inter-city trips (between cities): {inter_city_trips:,} ({inter_city_trips/total_trips*100:.1f}%)")

if inter_city_trips > 0:
    print(f"\n📊 INTER-CITY TRIP DETAILS:")
    intercity_details = []
    for pickup_city in cities:
        for dropoff_city in cities:
            if pickup_city != dropoff_city:
                trips = intercity_matrix.loc[pickup_city, dropoff_city] if dropoff_city in intercity_matrix.columns else 0
                if trips > 0:
                    intercity_details.append({
                        'route': f"City {pickup_city} → City {dropoff_city}",
                        'trips': int(trips),
                        'percentage': trips/total_trips*100
                    })
    
    intercity_df = pd.DataFrame(intercity_details).sort_values('trips', ascending=False)
    if len(intercity_df) > 0:
        print(intercity_df.to_string(index=False))
    else:
        print("No significant inter-city trips detected.")
else:
    print(f"\n✅ NO INTER-CITY TRIPS DETECTED")
    print("All trips appear to be within the same city boundaries.")
    print("This suggests each city operates as an independent market.")

CITY-BASED ANALYSIS
Cities in the dataset:
Number of cities: 5
City IDs: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]

Trips per city:
  City 1: 585 trips (19.5%)
  City 2: 589 trips (19.6%)
  City 3: 643 trips (21.4%)
  City 4: 557 trips (18.6%)
  City 5: 626 trips (20.9%)

INTER-CITY TRIP ANALYSIS
City centers and spread (from pickup locations):
        pickup_lat         pickup_lon        
              mean     std       mean     std
city_id                                      
1          52.3638  0.0246     4.8950  0.0370
2          51.9194  0.0251     4.4783  0.0362
3          52.0851  0.0215     5.1317  0.0337
4          51.4317  0.0305     5.4697  0.0303
5          52.0672  0.0237     4.2965  0.0291

City coordinates:
  City 3: (52.0851, 5.1317)
  City 4: (51.4317, 5.4697)
  City 2: (51.9194, 4.4783)
  City 1: (52.3638, 4.8950)
  City 5: (52.0672, 4.2965)

Assigning dropoff cities based on coordinates...

Inter-city trip matrix:
Rows = Pickup City, Columns

In [10]:
# OPTIMAL NODE ANALYSIS PER CITY

print(f"\n" + "="*60)
print("OPTIMAL NODES ANALYSIS PER CITY")
print("="*60)

# Since there are no inter-city trips, we can analyze each city independently
city_results = {}

print("Since all trips are intra-city, we'll analyze each city separately...")

# Test different grid sizes for each city
grid_sizes_test = [0.01, 0.02, 0.03]

for city_id in sorted(cities):
    print(f"\n" + "="*40)
    print(f"CITY {city_id} ANALYSIS")
    print("="*40)
    
    city_data = df[df['city_id'] == city_id]
    city_trips = len(city_data)
    
    print(f"Total trips: {city_trips:,}")
    print(f"Coordinate range:")
    print(f"  Pickup Lat: {city_data['pickup_lat'].min():.4f} to {city_data['pickup_lat'].max():.4f}")
    print(f"  Pickup Lon: {city_data['pickup_lon'].min():.4f} to {city_data['pickup_lon'].max():.4f}")
    print(f"  Dropoff Lat: {city_data['drop_lat'].min():.4f} to {city_data['drop_lat'].max():.4f}")
    print(f"  Dropoff Lon: {city_data['drop_lon'].min():.4f} to {city_data['drop_lon'].max():.4f}")
    
    # Calculate area coverage
    lat_span = max(city_data['pickup_lat'].max(), city_data['drop_lat'].max()) - min(city_data['pickup_lat'].min(), city_data['drop_lat'].min())
    lon_span = max(city_data['pickup_lon'].max(), city_data['drop_lon'].max()) - min(city_data['pickup_lon'].min(), city_data['drop_lon'].min())
    
    print(f"  Area span: {lat_span:.4f}° lat × {lon_span:.4f}° lon")
    print(f"  Approx area: {lat_span*111:.1f}km × {lon_span*111:.1f}km")
    
    city_grid_results = []
    
    # Test different grid sizes for this city
    for grid_size in grid_sizes_test:
        # Apply grid to pickup locations
        pickup_lat_grid = np.floor(city_data['pickup_lat'] / grid_size) * grid_size
        pickup_lon_grid = np.floor(city_data['pickup_lon'] / grid_size) * grid_size
        pickup_grid_id = pickup_lat_grid.astype(str) + '_' + pickup_lon_grid.astype(str)
        
        # Apply grid to dropoff locations
        drop_lat_grid = np.floor(city_data['drop_lat'] / grid_size) * grid_size
        drop_lon_grid = np.floor(city_data['drop_lon'] / grid_size) * grid_size
        drop_grid_id = drop_lat_grid.astype(str) + '_' + drop_lon_grid.astype(str)
        
        # Calculate statistics
        pickup_nodes = pickup_grid_id.nunique()
        dropoff_nodes = drop_grid_id.nunique()
        total_unique_nodes = len(set(pickup_grid_id.unique()).union(set(drop_grid_id.unique())))
        
        # Activity analysis
        pickup_activity = pickup_grid_id.value_counts()
        dropoff_activity = drop_grid_id.value_counts()
        
        combined_activity = pd.concat([
            pickup_activity.rename('pickup'),
            dropoff_activity.rename('dropoff')
        ], axis=1).fillna(0)
        combined_activity['total'] = combined_activity['pickup'] + combined_activity['dropoff']
        
        avg_activity = combined_activity['total'].mean()
        max_activity = combined_activity['total'].max()
        min_activity = combined_activity['total'].min()
        
        city_grid_results.append({
            'grid_size': grid_size,
            'pickup_nodes': pickup_nodes,
            'dropoff_nodes': dropoff_nodes,
            'total_nodes': total_unique_nodes,
            'avg_activity': avg_activity,
            'max_activity': max_activity,
            'min_activity': min_activity,
            'trips_per_node': city_trips / total_unique_nodes
        })
        
        print(f"\n  Grid {grid_size}° (~{grid_size*111:.0f}m cells):")
        print(f"    Pickup nodes: {pickup_nodes}")
        print(f"    Dropoff nodes: {dropoff_nodes}")
        print(f"    Total unique nodes: {total_unique_nodes}")
        print(f"    Avg trips per node: {city_trips / total_unique_nodes:.1f}")
        print(f"    Activity range: {min_activity:.0f} - {max_activity:.0f}")
    
    city_results[city_id] = {
        'trips': city_trips,
        'area_km2': lat_span * lon_span * 111 * 111,  # Very rough approximation
        'grid_results': city_grid_results
    }

# Create summary comparison
print(f"\n" + "="*60)
print("CITY COMPARISON SUMMARY")
print("="*60)

summary_data = []
for city_id in sorted(cities):
    city_info = city_results[city_id]
    for result in city_info['grid_results']:
        summary_data.append({
            'City': city_id,
            'Grid Size': f"{result['grid_size']}°",
            'Cell Size': f"~{result['grid_size']*111:.0f}m",
            'Total Nodes': result['total_nodes'],
            'Trips/Node': f"{result['trips_per_node']:.1f}",
            'Max Activity': f"{result['max_activity']:.0f}",
            'Total Trips': city_info['trips']
        })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Recommendations per city
print(f"\n" + "="*60)
print("RECOMMENDATIONS PER CITY")
print("="*60)

for city_id in sorted(cities):
    city_info = city_results[city_id]
    print(f"\n🏙️ CITY {city_id} ({city_info['trips']:,} trips):")
    
    # Find optimal grid size based on trips per node
    best_balance = None
    for result in city_info['grid_results']:
        trips_per_node = result['trips_per_node']
        total_nodes = result['total_nodes']
        
        # Good balance: 10-50 trips per node, not too many nodes
        if 10 <= trips_per_node <= 50 and total_nodes <= 50:
            if best_balance is None or abs(trips_per_node - 25) < abs(best_balance['trips_per_node'] - 25):
                best_balance = result
    
    if best_balance:
        print(f"  ⭐ RECOMMENDED: {best_balance['grid_size']}° grid")
        print(f"     • {best_balance['total_nodes']} nodes")
        print(f"     • {best_balance['trips_per_node']:.1f} trips per node")
        print(f"     • ~{best_balance['grid_size']*111:.0f}m cell size")
    else:
        # Fallback recommendation
        medium_grid = next(r for r in city_info['grid_results'] if r['grid_size'] == 0.02)
        print(f"  💡 SUGGESTED: 0.02° grid (balanced approach)")
        print(f"     • {medium_grid['total_nodes']} nodes")
        print(f"     • {medium_grid['trips_per_node']:.1f} trips per node")
    
    # Show all options
    print(f"  📊 All options:")
    for result in city_info['grid_results']:
        print(f"     {result['grid_size']}°: {result['total_nodes']} nodes, {result['trips_per_node']:.1f} trips/node")

print(f"\n" + "="*60)
print("FINAL RECOMMENDATIONS")
print("="*60)
print("""
🎯 OPTIMAL NODE STRATEGY:

Since there are NO inter-city trips, treat each city as an independent network:

• Each city should have its own separate node structure
• No need for inter-city connectivity in your graph
• Focus on optimizing within-city routing and demand patterns
• Consider different grid sizes per city based on trip density

💡 PRACTICAL IMPLEMENTATION:
1. Create separate graphs for each city
2. Use 0.02° grid for most cities (good balance)
3. Adjust grid size based on city size and trip density
4. Cities with more trips can handle finer grids (more nodes)
5. Smaller cities may work better with coarser grids (fewer nodes)
""")


OPTIMAL NODES ANALYSIS PER CITY
Since all trips are intra-city, we'll analyze each city separately...

CITY 1 ANALYSIS
Total trips: 585
Coordinate range:
  Pickup Lat: 52.3257 to 52.4058
  Pickup Lon: 4.8416 to 4.9512
  Dropoff Lat: 52.3258 to 52.4059
  Dropoff Lon: 4.8415 to 4.9511
  Area span: 0.0802° lat × 0.1097° lon
  Approx area: 8.9km × 12.2km

  Grid 0.01° (~1m cells):
    Pickup nodes: 19
    Dropoff nodes: 18
    Total unique nodes: 19
    Avg trips per node: 30.8
    Activity range: 1 - 170

  Grid 0.02° (~2m cells):
    Pickup nodes: 13
    Dropoff nodes: 12
    Total unique nodes: 13
    Avg trips per node: 45.0
    Activity range: 1 - 192

  Grid 0.03° (~3m cells):
    Pickup nodes: 8
    Dropoff nodes: 8
    Total unique nodes: 8
    Avg trips per node: 73.1
    Activity range: 21 - 235

CITY 2 ANALYSIS
Total trips: 589
Coordinate range:
  Pickup Lat: 51.8833 to 51.9588
  Pickup Lon: 4.4267 to 4.5254
  Dropoff Lat: 51.8831 to 51.9588
  Dropoff Lon: 4.4268 to 4.5255
  Ar

In [11]:
# CITY-SPECIFIC VISUALIZATIONS

print("Creating city-specific visualizations...")

# 1. City distribution visualization
fig_cities = px.bar(
    x=[f"City {i}" for i in sorted(cities)],
    y=[city_trip_counts[i] for i in sorted(cities)],
    title="Trip Distribution Across Cities",
    labels={'x': 'City', 'y': 'Number of Trips'},
    color=[city_trip_counts[i] for i in sorted(cities)],
    color_continuous_scale='viridis'
)
fig_cities.show()

# 2. Node count comparison by city and grid size
node_comparison_data = []
for city_id in sorted(cities):
    for result in city_results[city_id]['grid_results']:
        node_comparison_data.append({
            'City': f"City {city_id}",
            'Grid Size': f"{result['grid_size']}°",
            'Nodes': result['total_nodes'],
            'Trips per Node': result['trips_per_node']
        })

node_df = pd.DataFrame(node_comparison_data)

fig_nodes = px.bar(
    node_df,
    x='City',
    y='Nodes',
    color='Grid Size',
    title='Number of Nodes by City and Grid Size',
    barmode='group'
)
fig_nodes.show()

# 3. Trips per node analysis
fig_efficiency = px.bar(
    node_df,
    x='City',
    y='Trips per Node',
    color='Grid Size',
    title='Trips per Node by City and Grid Size (Higher = More Efficient)',
    barmode='group'
)
fig_efficiency.show()

# 4. Geographic distribution of cities
city_coords_df = pd.DataFrame([
    {'City': f"City {city_id}", 'Latitude': coords['lat'], 'Longitude': coords['lon'], 
     'Trips': city_trip_counts[city_id]}
    for city_id, coords in city_mapping.items()
])

fig_map = px.scatter(
    city_coords_df,
    x='Longitude',
    y='Latitude',
    size='Trips',
    color='City',
    title='Geographic Distribution of Cities',
    labels={'Longitude': 'Longitude', 'Latitude': 'Latitude'},
    size_max=30
)
fig_map.show()

# 5. Create a summary table for easy reference
print(f"\n" + "="*60)
print("QUICK REFERENCE: RECOMMENDED NODES PER CITY")
print("="*60)

quick_ref = []
for city_id in sorted(cities):
    city_info = city_results[city_id]
    # Use 0.02° as the recommended grid size
    rec_result = next(r for r in city_info['grid_results'] if r['grid_size'] == 0.02)
    quick_ref.append({
        'City': f"City {city_id}",
        'Trips': f"{city_info['trips']:,}",
        'Recommended Nodes': rec_result['total_nodes'],
        'Trips/Node': f"{rec_result['trips_per_node']:.1f}",
        'Grid Size': "0.02° (~2.2km)"
    })

quick_ref_df = pd.DataFrame(quick_ref)
print(quick_ref_df.to_string(index=False))

print(f"\n📋 IMPLEMENTATION SUMMARY:")
print(f"• Create 5 separate city networks (no inter-city connections needed)")
print(f"• Total recommended nodes across all cities: {sum(r['Recommended Nodes'] for r in quick_ref)}")
print(f"• Average nodes per city: {sum(r['Recommended Nodes'] for r in quick_ref) / len(quick_ref):.1f}")
print(f"• Use 0.02° grid size for balanced analysis")

# Create a function to get optimal nodes for any city
def get_optimal_nodes_for_city(city_id, grid_size=0.02):
    """Get optimal node configuration for a specific city"""
    if city_id not in city_results:
        return None
    
    result = next((r for r in city_results[city_id]['grid_results'] if r['grid_size'] == grid_size), None)
    if result:
        return {
            'city_id': city_id,
            'total_nodes': result['total_nodes'],
            'trips_per_node': result['trips_per_node'],
            'grid_size': grid_size,
            'total_trips': city_results[city_id]['trips']
        }
    return None

print(f"\n🔧 UTILITY FUNCTION AVAILABLE:")
print(f"Use get_optimal_nodes_for_city(city_id, grid_size) to get node info for any city")

# Demonstrate the function
print(f"\nExample: City 1 with 0.02° grid:")
example = get_optimal_nodes_for_city(1, 0.02)
if example:
    print(f"  Nodes: {example['total_nodes']}")
    print(f"  Trips per node: {example['trips_per_node']:.1f}")
    print(f"  Total trips: {example['total_trips']}")

Creating city-specific visualizations...



QUICK REFERENCE: RECOMMENDED NODES PER CITY
  City Trips  Recommended Nodes Trips/Node      Grid Size
City 1   585                 13       45.0 0.02° (~2.2km)
City 2   589                  8       73.6 0.02° (~2.2km)
City 3   643                 10       64.3 0.02° (~2.2km)
City 4   557                  8       69.6 0.02° (~2.2km)
City 5   626                 10       62.6 0.02° (~2.2km)

📋 IMPLEMENTATION SUMMARY:
• Create 5 separate city networks (no inter-city connections needed)
• Total recommended nodes across all cities: 49
• Average nodes per city: 9.8
• Use 0.02° grid size for balanced analysis

🔧 UTILITY FUNCTION AVAILABLE:
Use get_optimal_nodes_for_city(city_id, grid_size) to get node info for any city

Example: City 1 with 0.02° grid:
  Nodes: 13
  Trips per node: 45.0
  Total trips: 585
