<a href="https://colab.research.google.com/github/dmitrirepnikov/code_snippets/blob/master/la_expansion_clusters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [84]:
import pandas as pd
df = pd.read_csv('/content/Launch Merchants DTLA and Sawtell Nov 2024 - Sheet1.csv')

In [85]:
df.head()

Unnamed: 0,store_name,address,lat,lng,Eliglbe orders per day
0,Tuk Tuk Thai (LA),"1638 Sawtelle Boulevard , Los Angeles",34.046311,-118.448023,8.0
1,Mr Noodle,"936 Broxton Ave , Los Angeles",34.062977,-118.446823,6.0
2,Tacos Por Favor (Bundy),"11901 W Olympic Blvd , Los Angeles",34.033046,-118.45292,4.0
3,Greenview Thai Restaurant (Santa Monica),"11870 Santa Monica Blvd 107, Los Angeles",34.041677,-118.459342,4.0
4,Mr Rice,"1010 Broxton Ave , Los Angeles",34.062229,-118.44668,4.0


In [89]:
import folium
from folium import plugins
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from shapely.geometry import MultiPoint, Point
from shapely.ops import unary_union

def create_cluster_boundary(points, buffer_distance=0.001):
    """Create a boundary around points with a buffer"""
    if len(points) < 2:
        return None
    multi_point = MultiPoint(points)
    boundary = multi_point.convex_hull.buffer(buffer_distance)
    return boundary

def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate haversine distance between two points"""
    if any(pd.isna([lat1, lon1, lat2, lon2])):
        return None

    R = 6371  # Earth's radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def analyze_delivery_hotspots(df, min_restaurants_per_cluster=4, max_restaurants_per_cluster=20):
    """Analyze delivery hotspots and create clusters"""
    # Clean and prepare the data
    df['weight'] = df['Eliglbe orders per day']

    # Separate DTLA and non-DTLA areas based on longitude
    dtla_mask = df['lng'] > -118.3  # Approximate division between DTLA and West LA
    df_dtla = df[dtla_mask].copy()
    df_west = df[~dtla_mask].copy()

    def process_area(df_area, epsilon, min_samples):
        if len(df_area) == 0:
            return pd.DataFrame()

        coords = df_area[['lat', 'lng']].values
        scaler = StandardScaler()
        coords_scaled = scaler.fit_transform(coords)

        db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(coords_scaled)
        df_area['cluster'] = db.labels_

        # Split oversized clusters if needed
        next_cluster_id = db.labels_.max() + 1

        for cluster_id in range(next_cluster_id):
            cluster_mask = df_area['cluster'] == cluster_id
            cluster_size = cluster_mask.sum()

            if cluster_size > max_restaurants_per_cluster:
                cluster_data = df_area[cluster_mask].copy()
                sub_coords = cluster_data[['lat', 'lng']].values
                sub_scaler = StandardScaler()
                sub_coords_scaled = sub_scaler.fit_transform(sub_coords)

                sub_epsilon = epsilon * 0.6
                sub_db = DBSCAN(eps=sub_epsilon, min_samples=min_samples).fit(sub_coords_scaled)

                if len(set(sub_db.labels_[sub_db.labels_ >= 0])) > 0:
                    df_area.loc[cluster_mask, 'cluster'] = sub_db.labels_ + next_cluster_id
                    next_cluster_id += len(set(sub_db.labels_)) - (1 if -1 in sub_db.labels_ else 0)

        return df_area

    # Process each area
    df_dtla = process_area(df_dtla, epsilon=0.3, min_samples=min_restaurants_per_cluster)
    df_west = process_area(df_west, epsilon=0.25, min_samples=min_restaurants_per_cluster)

    # Combine results and adjust cluster numbers
    if not df_dtla.empty:
        max_west_cluster = df_west['cluster'].max() if not df_west.empty else -1
        df_dtla.loc[df_dtla['cluster'] >= 0, 'cluster'] += max_west_cluster + 1

    df_combined = pd.concat([df_west, df_dtla])

    # Create cluster statistics and visualization elements
    cluster_info = create_cluster_info(df_combined)

    # Create maps
    delivery_map = create_delivery_map(df_combined, cluster_info)
    connection_map = create_connection_map(df_combined, cluster_info)
    cluster_summary = create_cluster_summary(df_combined, cluster_info)

    return delivery_map, connection_map, df_combined, cluster_summary

def create_cluster_info(df_combined):
    """Create cluster information including boundaries and statistics"""
    cluster_info = []
    colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF',
              '#FFA500', '#800080', '#008000', '#FFC0CB', '#A52A2A', '#808080']

    for cluster_id in sorted(set(df_combined['cluster'].unique())):
        if cluster_id != -1:
            mask = df_combined['cluster'] == cluster_id
            cluster_data = df_combined[mask]

            cluster_coords = [(row['lng'], row['lat']) for _, row in cluster_data.iterrows()]

            total_weight = cluster_data['weight'].sum()
            weighted_lat = (cluster_data['lat'] * cluster_data['weight']).sum() / total_weight
            weighted_lng = (cluster_data['lng'] * cluster_data['weight']).sum() / total_weight

            boundary = create_cluster_boundary(cluster_coords)
            boundary_coords = list(boundary.exterior.coords) if boundary else None

            cluster_info.append({
                'cluster_id': cluster_id,
                'center_lat': weighted_lat,
                'center_lng': weighted_lng,
                'total_weight': total_weight,
                'num_restaurants': len(cluster_data),
                'boundary_coords': boundary_coords,
                'color': colors[cluster_id % len(colors)]
            })

    return cluster_info

def create_delivery_map(df_combined, cluster_info):
    """Create the main delivery heatmap with clusters"""
    center_lat = df_combined['lat'].mean()
    center_lng = df_combined['lng'].mean()
    m = folium.Map(location=[center_lat, center_lng], zoom_start=12)

    # Add heatmap
    heat_data = [[row['lat'], row['lng'], row['weight']] for idx, row in df_combined.iterrows()]
    plugins.HeatMap(heat_data).add_to(m)

    # Add visualization elements
    for info in cluster_info:
        # Add center marker
        folium.CircleMarker(
            location=[info['center_lat'], info['center_lng']],
            radius=20,
            popup=f"Cluster {info['cluster_id']}<br>Orders/day: {info['total_weight']:.1f}<br>Restaurants: {info['num_restaurants']}",
            color=info['color'],
            fill=True,
            fill_opacity=0.6
        ).add_to(m)

        if info['boundary_coords']:
            boundary_coords_folium = [[coord[1], coord[0]] for coord in info['boundary_coords']]
            folium.Polygon(
                locations=boundary_coords_folium,
                color=info['color'],
                weight=2,
                fill=True,
                fill_opacity=0.2
            ).add_to(m)

    # Add restaurant markers
    for idx, row in df_combined.iterrows():
        color = 'black' if row['cluster'] == -1 else [info['color'] for info in cluster_info if info['cluster_id'] == row['cluster']][0]
        folium.CircleMarker(
            location=[row['lat'], row['lng']],
            radius=3,
            color=color,
            fill=True,
            fill_opacity=0.7,
            popup=f"{row['store_name']}<br>Orders/day: {row['weight']}"
        ).add_to(m)

    return m

def create_connection_map(df_combined, cluster_info):
    """Create map showing connections between restaurants and cluster centers"""
    center_lat = df_combined['lat'].mean()
    center_lng = df_combined['lng'].mean()
    m = folium.Map(location=[center_lat, center_lng], zoom_start=12)

    # Create centroid lookup
    centroids = {info['cluster_id']: (info['center_lat'], info['center_lng']) for info in cluster_info}

    # Add connections and markers
    for _, row in df_combined.iterrows():
        if row['cluster'] != -1:
            centroid = centroids[row['cluster']]
            # Add restaurant marker
            folium.CircleMarker(
                location=[row['lat'], row['lng']],
                radius=5,
                color='blue',
                fill=True,
                popup=f"{row['store_name']}<br>Cluster {int(row['cluster'])}"
            ).add_to(m)

            # Add connection line
            points = [[row['lat'], row['lng']], [centroid[0], centroid[1]]]
            folium.PolyLine(points, weight=1, color='red', opacity=0.5).add_to(m)

    # Add centroid markers
    for info in cluster_info:
        folium.CircleMarker(
            location=[info['center_lat'], info['center_lng']],
            radius=8,
            color='red',
            fill=True,
            popup=f"Cluster {int(info['cluster_id'])} Center"
        ).add_to(m)

    return m

def create_cluster_summary(df_combined, cluster_info):
    """Create summary statistics for clusters"""
    summary_data = []
    for info in cluster_info:
        cluster_data = df_combined[df_combined['cluster'] == info['cluster_id']]
        city = 'DTLA' if info['center_lng'] > -118.3 else 'West LA'

        summary_data.append({
            'Cluster ID': info['cluster_id'],
            'Area': city,
            'Total Daily Orders': info['total_weight'],
            'Number of Restaurants': info['num_restaurants'],
            'Center Latitude': round(info['center_lat'], 6),
            'Center Longitude': round(info['center_lng'], 6)
        })

    return pd.DataFrame(summary_data).sort_values(['Area', 'Total Daily Orders'], ascending=[True, False])

def create_cluster_id_map(cluster_summary):
    """Create map with visible cluster IDs"""
    center_lat = cluster_summary['Center Latitude'].mean()
    center_lng = cluster_summary['Center Longitude'].mean()
    m = folium.Map(location=[center_lat, center_lng], zoom_start=12)

    # Add cluster centers with visible IDs
    for _, cluster in cluster_summary.iterrows():
        # Create popup content
        popup_content = f"""
            <b>Cluster {int(cluster['Cluster ID'])}</b><br>
            Area: {cluster['Area']}<br>
            Daily Orders: {cluster['Total Daily Orders']:.1f}<br>
            Restaurants: {cluster['Number of Restaurants']}
        """

        # Add CircleMarker with cluster ID visible
        folium.CircleMarker(
            location=[cluster['Center Latitude'], cluster['Center Longitude']],
            radius=15,
            popup=folium.Popup(popup_content, max_width=300),
            color='red' if cluster['Area'] == 'DTLA' else 'blue',
            fill=True,
            fill_opacity=0.7
        ).add_to(m)

        # Add Cluster ID as text
        folium.map.Marker(
            [cluster['Center Latitude'], cluster['Center Longitude']],
            icon=folium.DivIcon(
                html=f'<div style="font-size: 12pt; color: white; font-weight: bold;">{int(cluster["Cluster ID"])}</div>'
            )
        ).add_to(m)

    # Add legend
    legend_html = """
    <div style="position: fixed; bottom: 50px; left: 50px; z-index:1000; background-color: white;
         padding: 10px; border: 2px solid grey; border-radius: 5px;">
        <p><b>Cluster Centers</b></p>
        <p><i class="fa fa-circle" style="color:red"></i> DTLA</p>
        <p><i class="fa fa-circle" style="color:blue"></i> West LA</p>
    </div>
    """
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

def create_volume_map(cluster_summary):
    """Create map with volume-based circles"""
    center_lat = cluster_summary['Center Latitude'].mean()
    center_lng = cluster_summary['Center Longitude'].mean()
    m = folium.Map(location=[center_lat, center_lng], zoom_start=12)

    # Calculate size scaling factor
    max_volume = cluster_summary['Total Daily Orders'].max()
    min_radius = 10
    max_radius = 40

    for _, cluster in cluster_summary.iterrows():
        # Scale radius based on volume
        radius = (cluster['Total Daily Orders'] / max_volume) * max_radius + min_radius

        # Create popup content
        popup_content = f"""
            <b>Cluster {int(cluster['Cluster ID'])}</b><br>
            Area: {cluster['Area']}<br>
            Daily Orders: {cluster['Total Daily Orders']:.1f}<br>
            Restaurants: {cluster['Number of Restaurants']}
        """

        # Add CircleMarker with size based on volume
        folium.CircleMarker(
            location=[cluster['Center Latitude'], cluster['Center Longitude']],
            radius=radius,
            popup=folium.Popup(popup_content, max_width=300),
            color='red' if cluster['Area'] == 'DTLA' else 'blue',
            fill=True,
            fill_opacity=0.7,
            tooltip=f"Daily Orders: {cluster['Total Daily Orders']:.1f}"
        ).add_to(m)

    # Add legends
    legend_html = """
    <div style="position: fixed; bottom: 50px; left: 50px; z-index:1000; background-color: white;
         padding: 10px; border: 2px solid grey; border-radius: 5px;">
        <p><b>Cluster Centers</b></p>
        <p><i class="fa fa-circle" style="color:red"></i> DTLA</p>
        <p><i class="fa fa-circle" style="color:blue"></i> West LA</p>
    </div>
    """

    volume_legend_html = legend_html + """
    <div style="position: fixed; bottom: 50px; right: 50px; z-index:1000; background-color: white;
         padding: 10px; border: 2px solid grey; border-radius: 5px;">
        <p><b>Circle Size</b></p>
        <p>Proportional to daily order volume</p>
        <p>Hover over circles to see exact values</p>
    </div>
    """

    m.get_root().html.add_child(folium.Element(volume_legend_html))

    return m

def create_restaurant_details_csv(enriched_df, cluster_summary):
    """Create detailed CSV with restaurant and cluster information"""
    # Create a lookup dictionary for cluster centers and volumes
    cluster_info = cluster_summary.set_index('Cluster ID')[
        ['Center Latitude', 'Center Longitude', 'Total Daily Orders']
    ].to_dict('index')

    # Prepare the detailed data
    detailed_data = []
    for _, restaurant in enriched_df.iterrows():
        cluster_id = restaurant['cluster']
        cluster_data = cluster_info.get(cluster_id, {}) if cluster_id != -1 else {}

        detailed_data.append({
            'restaurant_name': restaurant['store_name'],
            'latitude': restaurant['lat'],
            'longitude': restaurant['lng'],
            'cluster_id': cluster_id,
            'daily_orders': restaurant['weight'],
            'cluster_total_orders': cluster_data.get('Total Daily Orders', 0) if cluster_id != -1 else 0,
            'cluster_center_lat': cluster_data.get('Center Latitude', None) if cluster_id != -1 else None,
            'cluster_center_lng': cluster_data.get('Center Longitude', None) if cluster_id != -1 else None
        })

    # Create and save DataFrame
    detailed_df = pd.DataFrame(detailed_data)
    detailed_df.to_csv('restaurant_details.csv', index=False)
    return detailed_df

def main():
    # Read data
    print("Loading data...")
    df = pd.read_csv('Launch Merchants DTLA and Sawtell Nov 2024 - Sheet1.csv')

    # Run analysis
    delivery_map, connection_map, enriched_df, cluster_summary = analyze_delivery_hotspots(
        df,
        min_restaurants_per_cluster=3,
        max_restaurants_per_cluster=30
    )

    # Create additional maps
    cluster_id_map = create_cluster_id_map(cluster_summary)
    volume_map = create_volume_map(cluster_summary)

    # Create and save detailed restaurant data
    detailed_df = create_restaurant_details_csv(enriched_df, cluster_summary)

    # Save cluster summary
    cluster_summary.to_csv('cluster_summary.csv', index=False)

    print(f"\nAnalysis complete. Files saved:")
    print("- restaurant_details.csv: Individual restaurant details with cluster information")
    print("- cluster_summary.csv: Cluster-level summary statistics")

    return {
        'delivery_map': delivery_map,
        'connection_map': connection_map,
        'cluster_id_map': cluster_id_map,
        'volume_map': volume_map,
        'cluster_summary': cluster_summary,
        'restaurant_details': detailed_df
    }

if __name__ == "__main__":
    # Get all results
    results = main()

    # Display all maps and results with clear spacing
    print("\n1. Delivery Heatmap")
    print("-" * 40)
    print("Shows order density and cluster boundaries")
    display(results['delivery_map'])
    print("\n" + "="*80 + "\n")

    print("2. Restaurant-Cluster Connections")
    print("-" * 40)
    print("Shows how restaurants connect to their cluster centers")
    display(results['connection_map'])
    print("\n" + "="*80 + "\n")

    print("3. Cluster Identification Map")
    print("-" * 40)
    print("Shows numbered clusters with DTLA/West LA distinction")
    display(results['cluster_id_map'])
    print("\n" + "="*80 + "\n")

    print("4. Order Volume Visualization")
    print("-" * 40)
    print("Shows relative order volumes by cluster size")
    display(results['volume_map'])
    print("\n" + "="*80)
    print("Analysis Complete".center(80))
    print("="*80 + "\n")

Loading data...

Analysis complete. Files saved:
- restaurant_details.csv: Individual restaurant details with cluster information
- cluster_summary.csv: Cluster-level summary statistics

1. Delivery Heatmap
----------------------------------------
Shows order density and cluster boundaries




2. Restaurant-Cluster Connections
----------------------------------------
Shows how restaurants connect to their cluster centers




3. Cluster Identification Map
----------------------------------------
Shows numbered clusters with DTLA/West LA distinction




4. Order Volume Visualization
----------------------------------------
Shows relative order volumes by cluster size



                               Analysis Complete                                



In [92]:
cluster_summary.head()

Unnamed: 0,Cluster ID,Area,Total Daily Orders,Number of Restaurants,Center Latitude,Center Longitude
36,38,DTLA,51.5,48,34.04547,-118.254768
28,30,DTLA,14.9,9,34.044577,-118.251939
23,25,DTLA,13.3,10,34.047007,-118.25064
18,19,DTLA,12.1,13,34.053525,-118.277324
22,24,DTLA,12.0,3,34.046664,-118.25598
