# Uber Ride Analysis Visualization
This notebook creates interactive maps showing spatial patterns in Uber ride data using Folium. It contains the following visualizations:
- Combined cluster map (all hours)
- Hourly cluster analysis map
- Combined and hourly heat maps (origins and destinations)

Key metrics visualized:
- Total Earnings per Duration (TED), i.e. trip earnings per minute
- Ride density clusters
- Temporal patterns

In [1]:
# Necessary libraries
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Data loading and preprocessing
df = pd.read_csv('./artifacts/df_rides.csv', index_col=0)
df.reset_index(drop=True, inplace=True)

# Ride duration expressed as fractional minutes
duration = pd.to_timedelta(df['duration_dt'])
df['duration_min_float'] = duration.dt.total_seconds() / 60

# Hour of day taken from the 'datetime' column (used by the hourly maps)
df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour

# Drop every row that still has a missing value in any column
df.dropna(inplace=True)


def _split_lat_lng(pairs):
    """Split a Series of '[lat, lng]' strings into (lat, lng) float Series.

    Brackets are removed as literal characters (``regex=False``) so the result
    does not depend on the pandas version's default for ``regex``. Both
    outputs are rounded to 6 decimal places.
    """
    parts = (
        pairs.str.replace('[', '', regex=False)
        .str.replace(']', '', regex=False)
        .str.split(',', expand=True)
    )
    return parts[0].astype(float).round(6), parts[1].astype(float).round(6)


# Parse each coordinate column once instead of once per output column
df['origin_lat'], df['origin_lng'] = _split_lat_lng(df['origin_lat_lng'])
df['dest_lat'], df['dest_lng'] = _split_lat_lng(df['destination_lat_lng'])

## Data Cleaning & Outlier Detection
Automated process that:
1. Identifies key features impacting earnings
2. Applies adaptive outlier removal (IQR/Z-score based on distribution skew)
3. Maintains business logic constraints

In [3]:
# Outlier removal based on top features
def earnings_aware_outlier_removal(df, 
                                   target_col='ted', 
                                   number_of_features=5, 
                                   n_estimators=100, 
                                   show_impact=False,
                                   show_report=False,
                                   random_state=None):
    """
    Remove outliers from the columns that matter most for ``target_col``.

    A random forest is fitted on the numeric columns of ``df`` (coordinate
    columns and 'hour' excluded) to rank them by importance for
    ``target_col``. The target itself plus the ``number_of_features`` most
    important columns are then cleaned one after another, each with IQR or
    Z-score bounds depending on its skewness.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Column whose drivers are ranked ('ted' by default).
        number_of_features: How many top-ranked features to clean in addition
            to the target.
        n_estimators: Number of trees in the random forest.
        show_impact: If True, plot before/after histograms for every cleaned
            column.
        show_report: If True, print a per-column table with the method used,
            skewness, bounds, rows removed and rows retained.
        random_state: Seed forwarded to the random forest. The default
            ``None`` keeps the forest unseeded, so the selected features may
            differ between runs; pass an int for reproducible results.

    Returns:
        The cleaned DataFrame only (the report is printed, not returned).
        Surviving rows keep their original index labels.
    """
    from sklearn.ensemble import RandomForestRegressor

    # 1. Rank numeric features by their importance for the target
    excluded_columns = [target_col,
                        'origin_lat_lng',
                        'destination_lat_lng',
                        'origin_lat',
                        'origin_lng',
                        'dest_lat',
                        'dest_lng',
                        'hour']
    X = df.select_dtypes(include='number').drop(columns=excluded_columns,
                                                errors='ignore')
    y = df[target_col]

    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    model.fit(X, y)
    feature_impact = pd.Series(model.feature_importances_, index=X.columns)
    # The target is cleaned first, followed by its most important features
    top_features = [target_col] + \
        feature_impact.nlargest(number_of_features).index.tolist()
    
    # 2. Adaptive outlier removal
    def adaptive_outlier_removal(df, 
                                 z_threshold=3, 
                                 show_impact=show_impact, 
                                 top_features=top_features,
                                 show_report=show_report):
        """
        Clean ``top_features`` column by column, choosing IQR bounds for
        strongly skewed columns (|skew| > 1) and mean +/- z_threshold * std
        bounds otherwise. Returns the cleaned DataFrame.
        """
        from scipy.stats import skew
        
        METHOD_REPORT = {}
        clean_df = df.copy()
        
        # Business constraints (adjust per domain knowledge)
        BUSINESS_LIMITS = {
            'duration_min_float': (1, 180),       # 1min to 3hrs
            #'distance_km': (0.1, 50),          # 100m to 50km  
            #'total_fare': (2.0, 200),
            #'surge_multiplier': (1.0, 5.0)
        }
        
        # Apply business rules first so they do not distort the statistics
        for col in clean_df.select_dtypes(include='number'):
            if col in BUSINESS_LIMITS:
                min_val, max_val = BUSINESS_LIMITS[col]
                mask = (clean_df[col] >= min_val) & (clean_df[col] <= max_val)
                clean_df = clean_df[mask]
        
        # Outlier removal; each column is evaluated on the rows that survived
        # the previous columns
        for col in top_features:
            col_skew = skew(clean_df[col].dropna())
            METHOD = 'IQR' if abs(col_skew) > 1 else 'Z-score'
            
            if METHOD == 'IQR':
                q1 = clean_df[col].quantile(0.25)
                q3 = clean_df[col].quantile(0.75)
                iqr = q3 - q1
                lower = q1 - 1.5*iqr
                upper = q3 + 1.5*iqr
            else: # Z-score, expressed as bounds on the column's own scale
                col_mean = clean_df[col].mean()
                col_std = clean_df[col].std()
                lower = col_mean - z_threshold*col_std
                upper = col_mean + z_threshold*col_std
            
            # Rows with a missing value in this column fail both comparisons
            # and are dropped together with the outliers
            mask = (clean_df[col] >= lower) & (clean_df[col] <= upper)
            outliers_removed = len(clean_df) - mask.sum()
            clean_df = clean_df[mask]
            clean_skew = skew(clean_df[col].dropna())
            
            # Store metrics
            METHOD_REPORT[col] = {
                'method': METHOD,
                'skewness': round(col_skew, 2),
                'outliers_removed': outliers_removed,
                'bounds': (round(lower,2), round(upper,2)),
                'retained': len(clean_df),
            }
            
            # Visual comparison: untouched input vs. current cleaned state
            if show_impact:
                fig, ax = plt.subplots(1,2, figsize=(14,5))
                sns.histplot(df[col], ax=ax[0], color='red', kde=True)
                ax[0].set_title(f'Original {col}\n(Skew: {skew(df[col]):.2f})')
                sns.histplot(clean_df[col], ax=ax[1], color='green', kde=True)
                ax[1].set_title(f'Cleaned {col}\n({METHOD} | Skew: {clean_skew:.2f})')
                plt.tight_layout()
                plt.show()
        
        # Print summary
        if show_report:
            print(pd.DataFrame(METHOD_REPORT).T)
        return clean_df
    
    clean_df = adaptive_outlier_removal(df)
    return clean_df

# Clean the rides and renumber the surviving rows from 0
clean_df = earnings_aware_outlier_removal(df)
clean_df = clean_df.reset_index(drop=True)

# Business Impact Analysis: how much data survived the cleaning step
n_original, n_cleaned = len(df), len(clean_df)
print(f"Original records: {n_original:,}")
print(f"Cleaned records: {n_cleaned:,}")
print(f"Data retention: {n_cleaned/n_original:.1%}")

Original records: 3,470
Cleaned records: 3,166
Data retention: 91.2%


In [4]:
# Clustering function
def get_dbscan_clusters(coords, radius, min_samples):
    """Cluster latitude/longitude points with DBSCAN on haversine distance.

    Args:
        coords: Array-like of ``[lat, lng]`` pairs in degrees.
        radius: Neighbourhood radius (DBSCAN ``eps``) in meters.
        min_samples: DBSCAN ``min_samples`` (neighbours needed for a core
            point).

    Returns:
        1-D array with one cluster label per input point; ``-1`` marks noise.
        Empty input yields an empty label array instead of raising.
    """
    coords = np.asarray(coords, dtype=float)
    if coords.size == 0:
        # DBSCAN refuses to fit zero samples; there is nothing to label.
        return np.empty(0, dtype=np.intp)

    earth_radius_km = 6371.0088
    # The haversine metric works in radians, so the radius is converted
    # meters -> kilometers -> angle.
    eps_radians = (radius / 1000) / earth_radius_km
    db = DBSCAN(eps=eps_radians,
                min_samples=min_samples,
                algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    return db.labels_

## Interactive Map Generation
### Layer Structure
- **Base Layers**  
  CartoDB Positron (default)  
  OpenStreetMap (toggleable)

- **Overlay Layers**  
  Hourly clusters (0-23h)  
    ├─ Origins (green clusters)  
    └─ Destinations (red clusters)  
  Combined clusters  
    ├─ Origin points  
    └─ Destination points

### Cluster Metrics
- Click markers for detailed metrics

In [5]:
# Combined map clustering

# Clustering parameters
radius = 250 # meters, adjust as needed
min_samples = 12


def _summarize_clusters(rides, cluster_col, lat_col, lng_col, count_col):
    """Return one row per cluster: mean 'ted', ride count and mean position."""
    clustered = rides[rides[cluster_col] != -1]  # Exclude noise points (-1)
    summary = clustered.groupby(cluster_col).agg({
        'ted': 'mean',
        'type': 'count',
        lat_col: 'mean',
        lng_col: 'mean'
    }).reset_index()
    return summary.rename(columns={'type': count_col})


def _add_point_markers(rides, cluster_col, lat_col, lng_col, label, color, layer):
    """Add one small marker per clustered (non-noise) ride to ``layer``."""
    points = rides.loc[rides[cluster_col] != -1, [lat_col, lng_col, cluster_col]]
    for lat, lng, cluster_id in points.itertuples(index=False, name=None):
        folium.CircleMarker(
            location=[lat, lng],
            radius=5, # Marker size in pixels
            fill=True,
            fill_color=color,
            color=color,
            fill_opacity=0.6,
            popup=f"{label}: {lat}, {lng}<br>Cluster: {cluster_id}"
        ).add_to(layer)


def _add_cluster_circles(clusters, cluster_col, lat_col, lng_col, count_col,
                         label, color, circle_radius, layer):
    """Add one circle per cluster summary row to ``layer``."""
    if clusters.empty:
        return
    columns = [cluster_col, lat_col, lng_col, count_col, 'ted']
    # itertuples keeps each column's dtype; iterrows would upcast the integer
    # cluster id and ride count to float and the popup would read "3.0".
    for cluster_id, lat, lng, ride_count, ted_mean in \
            clusters[columns].itertuples(index=False, name=None):
        folium.Circle(
            location=[lat, lng],
            radius=circle_radius, # Drawn with the clustering radius
            fill=True,
            fill_color=color,
            color=color,
            fill_opacity=0.6,
            popup=f"{label} Cluster {cluster_id}<br>Rides: {ride_count}<br>Avg TED: {ted_mean:.2f}"
        ).add_to(layer)


# Cluster Origins
origin_coords = clean_df[['origin_lat', 'origin_lng']].values
db_origin = get_dbscan_clusters(origin_coords, radius, min_samples)
clean_df['origin_cluster'] = db_origin
origin_clusters = _summarize_clusters(clean_df, 'origin_cluster',
                                      'origin_lat', 'origin_lng',
                                      'origin_ride_count')

# Cluster Destinations
dest_coords = clean_df[['dest_lat', 'dest_lng']].values
db_dest = get_dbscan_clusters(dest_coords, radius, min_samples)
clean_df['dest_cluster'] = db_dest
dest_clusters = _summarize_clusters(clean_df, 'dest_cluster',
                                    'dest_lat', 'dest_lng',
                                    'dest_ride_count')

# Combined Visualization with Layers
# Center on the origin clusters; fall back to all origins when no cluster was
# found, because the mean of an empty column is NaN and cannot center a map.
center_source = clean_df if origin_clusters.empty else origin_clusters
combined_map_center_lat = center_source['origin_lat'].mean()
combined_map_center_lng = center_source['origin_lng'].mean()

combined_map = folium.Map(location=[combined_map_center_lat, combined_map_center_lng],
                      zoom_start=12,
                      tiles='Cartodb Positron')

# Create Feature Groups for Layers (raw points are hidden until toggled on)
origin_points = folium.FeatureGroup(name='Origin Points', show=False)
dest_points = folium.FeatureGroup(name='Destination Points', show=False)
origin_layer = folium.FeatureGroup(name='Origin Clusters')
dest_layer = folium.FeatureGroup(name='Destination Clusters')

# Individual clustered rides
_add_point_markers(clean_df, 'origin_cluster', 'origin_lat', 'origin_lng',
                   'Origin', 'blue', origin_points)
_add_point_markers(clean_df, 'dest_cluster', 'dest_lat', 'dest_lng',
                   'Destination', 'orange', dest_points)

# Cluster summaries
_add_cluster_circles(origin_clusters, 'origin_cluster', 'origin_lat',
                     'origin_lng', 'origin_ride_count',
                     'Origin', 'green', radius, origin_layer)
_add_cluster_circles(dest_clusters, 'dest_cluster', 'dest_lat',
                     'dest_lng', 'dest_ride_count',
                     'Destination', 'red', radius, dest_layer)

# Add Layers to Map and Layer Control
origin_points.add_to(combined_map)
dest_points.add_to(combined_map)
origin_layer.add_to(combined_map)
dest_layer.add_to(combined_map)

folium.LayerControl().add_to(combined_map) # Layer control to toggle layers


combined_map.save("./artifacts/combined_cluster_map.html")
print("Combined Origin/Destination Cluster Map saved to combined_cluster_map.html")
combined_map

Combined Origin/Destination Cluster Map saved to combined_cluster_map.html


## Temporal Analysis

In [6]:
# Hourly Cluster Maps with Layers

# Clustering parameters
radius = 250 # meters
min_samples = 2


def _hourly_cluster_stats(rides, prefix, radius, min_samples):
    """Label ``rides`` with hourly clusters and summarize them.

    Adds the column '<prefix>_cluster_hourly' to ``rides`` and returns one row
    per non-noise cluster with mean 'ted', ride count
    ('<prefix>_ride_count') and mean position.
    """
    lat_col, lng_col = f'{prefix}_lat', f'{prefix}_lng'
    cluster_col = f'{prefix}_cluster_hourly'
    rides[cluster_col] = get_dbscan_clusters(rides[[lat_col, lng_col]].values,
                                             radius,
                                             min_samples)
    stats = rides[rides[cluster_col] != -1].groupby(cluster_col).agg({
        'ted': 'mean',
        'type': 'count',
        lat_col: 'mean',
        lng_col: 'mean'
    }).reset_index()
    return stats.rename(columns={'type': f'{prefix}_ride_count'})


def _add_hourly_cluster_markers(stats, prefix, label, color, hour, layer):
    """Add one marker per hourly cluster summary row to ``layer``."""
    if stats.empty:
        return
    columns = [f'{prefix}_cluster_hourly', f'{prefix}_lat', f'{prefix}_lng',
               f'{prefix}_ride_count', 'ted']
    # itertuples keeps each column's dtype; iterrows would upcast the integer
    # cluster id and ride count to float and the popup would read "3.0".
    for cluster_id, lat, lng, ride_count, ted_mean in \
            stats[columns].itertuples(index=False, name=None):
        folium.CircleMarker(
            location=[lat, lng],
            radius=10,
            fill=True,
            fill_color=color,
            color=color,
            fill_opacity=0.6,
            popup=f"Hour {hour} - {label} Cluster {cluster_id}<br>Rides: {ride_count}<br>Avg TED: {ted_mean:.2f}"
        ).add_to(layer)


# Map creation (centered like the combined cluster map)
hourly_map = folium.Map(location=[combined_map_center_lat, combined_map_center_lng],
                      zoom_start=12,
                      tiles='Cartodb Positron')

# Create a layer control upfront; it is attached after all hour layers exist
layer_control = folium.LayerControl(position='topright', collapsed=False)

# Create hourly clusters and layers
for hour in range(24):
    hourly_df = clean_df[clean_df['hour'] == hour].copy()

    if hourly_df.empty: # Skip hours with no data
        print(f"No data for hour {hour}, skipping.")
        continue

    # Cluster origins and destinations using only this hour's rides
    origin_clusters_hourly = _hourly_cluster_stats(hourly_df, 'origin',
                                                   radius, min_samples)
    dest_clusters_hourly = _hourly_cluster_stats(hourly_df, 'dest',
                                                 radius, min_samples)

    # One toggleable group per hour, hidden by default
    hour_group = folium.FeatureGroup(name=f'Hour {hour}', show=False)

    # Origin clusters for the hour
    origin_layer = folium.FeatureGroup(name='Origins',
                                       show=True).add_to(hour_group)
    _add_hourly_cluster_markers(origin_clusters_hourly, 'origin', 'Origin',
                                'green', hour, origin_layer)

    # Destination clusters for the hour
    dest_layer = folium.FeatureGroup(name='Destinations',
                                     show=True).add_to(hour_group)
    _add_hourly_cluster_markers(dest_clusters_hourly, 'dest', 'Destination',
                                'red', hour, dest_layer)

    # Add the hour group to the map
    hour_group.add_to(hourly_map)

# Layer control added to hourly map
layer_control.add_to(hourly_map)

# Hourly map save and display
hourly_map.save("./artifacts/hourly_cluster_map.html")
print("Hourly Cluster Map saved to hourly_cluster_map.html")
hourly_map

Hourly Cluster Map saved to hourly_cluster_map.html


In [7]:
# Save clusters to csv (combined-map origin and destination summaries)
for cluster_table, csv_path in (
    (origin_clusters, './artifacts/clusters_org.csv'),
    (dest_clusters, './artifacts/clusters_dest.csv'),
):
    cluster_table.to_csv(csv_path)

## Heatmap analysis

- **Overlay Layers**  
  Combined heat map  
    ├─ Origin Heatmap (shown by default)  
    └─ Destination Heatmap  
  Hourly heat map (one frame per hour)  
    ├─ Origin Heatmap  
    └─ Destination Heatmap

Heat intensity is weighted by the min-max scaled TED of each ride.

In [8]:
# Normalize ted data
# Min-max scale 'ted' (default MinMaxScaler range) into a 'ted_scaled' column
scaler = MinMaxScaler()
ted_scaled = scaler.fit_transform(clean_df[['ted']].values)
clean_df['ted_scaled'] = ted_scaled

In [9]:
# Combined Origin/Destination Heatmap

# Heatmap params
radius = 15

# Center the map on the mean origin coordinate of all cleaned rides
heat_location_lat = clean_df['origin_lat'].mean()
heat_location_lng = clean_df['origin_lng'].mean()
heat_map = folium.Map(
    location=[heat_location_lat, heat_location_lng],
    zoom_start=12,
    tiles='Cartodb Positron',
)

# Layer control is created now and attached once both layers are on the map
layer_control = folium.LayerControl(position='topright', collapsed=False)

# One toggleable group per heat layer; only origins are visible initially
origin_heat = folium.FeatureGroup(name='Origin Heatmap', show=True)
dest_heat = folium.FeatureGroup(name='Destination Heatmap', show=False)

# Origins: [lat, lng, weight] triples, weighted by scaled TED
origin_data = clean_df[['origin_lat', 'origin_lng', 'ted_scaled']].values.tolist()
hm_origin = HeatMap(origin_data, radius=radius, name='Origin Heatmap')
hm_origin.add_to(origin_heat)

# Destinations: same weighting, destination coordinates
dest_data = clean_df[['dest_lat', 'dest_lng', 'ted_scaled']].values.tolist()
hm_dest = HeatMap(dest_data, radius=radius, name='Destination Heatmap')
hm_dest.add_to(dest_heat)

# Attach the heat layers, then the control that toggles them
for heat_layer in (origin_heat, dest_heat):
    heat_layer.add_to(heat_map)
layer_control.add_to(heat_map)

# Heat map save and display
heat_map.save("./artifacts/heat_map.html")
print("Heat Map saved to heat_map.html")
heat_map

Heat Map saved to heat_map.html


In [10]:
# Hourly Origin/Destination Heatmap
origin_heat_cols = ['origin_lat', 'origin_lng', 'ted_scaled']
dest_heat_cols = ['dest_lat', 'dest_lng', 'ted_scaled']

# All-hours [lat, lng, weight] lists. The animated map below does not use
# them; they are kept (now built without a row-by-row loop) so the names stay
# available for inspection.
hourly_origin_data = clean_df[origin_heat_cols].values.tolist()
hourly_dest_data = clean_df[dest_heat_cols].values.tolist()

# Create the map (centered like the combined cluster map)
hourly_heat_map = folium.Map(location=[combined_map_center_lat,
                                        combined_map_center_lng],
                              zoom_start=12,
                              tiles='Cartodb Positron')

# Define radius for the heatmap
radius = 15  # You can adjust this value as needed

# Hours that actually occur in the data, as plain Python ints.
# NOTE(review): plain ints/floats avoid NumPy scalar reprs leaking into the
# generated page if the installed folium renders these lists via str() -
# confirm against the folium version in use.
unique_hours = [int(hour) for hour in sorted(clean_df['hour'].unique())]

# One [lat, lng, weight] list per hour, in the same order as unique_hours
origin_data_by_hour = [
    clean_df.loc[clean_df['hour'] == hour, origin_heat_cols].values.tolist()
    for hour in unique_hours
]
dest_data_by_hour = [
    clean_df.loc[clean_df['hour'] == hour, dest_heat_cols].values.tolist()
    for hour in unique_hours
]

# Visualize Heatmap for Origins
hm_h_origin = folium.plugins.HeatMapWithTime(
    data=origin_data_by_hour,
    radius=radius,
    index=unique_hours,  # List of hours as the index
    name='Origin Heatmap'
)
hm_h_origin.add_to(hourly_heat_map)

# Visualize Heatmap for Destinations
hm_h_dest = folium.plugins.HeatMapWithTime(
    data=dest_data_by_hour,
    radius=radius,
    index=unique_hours,  # List of hours as the index
    name='Destination Heatmap'
)
hm_h_dest.add_to(hourly_heat_map)

# Add Layer Control
folium.LayerControl(position='topright', collapsed=False).add_to(hourly_heat_map)

# Save and display the map
hourly_heat_map.save("./artifacts/hourly_heat_map.html")
print("Heat Map saved to hourly_heat_map.html")
hourly_heat_map

Heat Map saved to hourly_heat_map.html


## Output Files
- `combined_cluster_map.html`: Persistent demand patterns  
- `hourly_cluster_map.html`: Time-based fluctuations 
- `heat_map.html`: Origin and Destination Heatmap
- `hourly_heat_map.html`: Hour-by-hour Origin and Destination Heatmap
- `clusters_org.csv`, `clusters_dest.csv`: Exported origin and destination cluster metrics