# Data Vizualization

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('./artifacts/df_rides.csv', index_col=0)
df.reset_index(drop=True, inplace=True)
duration = pd.to_timedelta(df['duration_dt'])
df['duration_min_float'] = duration.dt.total_seconds() / 60
df['datetime'] = pd.to_datetime(df['datetime'])
# df['datetime'] = pd.to_datetime(df['datetime'])
# df['duration_dt'] = pd.to_timedelta(df['duration_dt'])

In [5]:
# Outlier removal based on top features
def earnings_aware_outlier_removal(df, 
                                   target_col='ted', 
                                   number_of_features=5, 
                                   n_estimators=100, 
                                   show_impact=False,
                                   show_report=False):
    """
    Prioritizes outlier removal based on feature impact on 'ted' (total earnings 
    per duration).
    Returns cleaned DataFrame and impact report.
    """
    from sklearn.ensemble import RandomForestRegressor

    # 1. Calculate feature earnings impact on 'ted'
    X = df.select_dtypes(include='number').drop(columns=[target_col], errors='ignore')
    y = df[target_col]

    model = RandomForestRegressor(n_estimators=n_estimators) # Increased estimators for robustness
    model.fit(X, y)
    feature_impact = pd.Series(model.feature_importances_, index=X.columns)
    top_features = feature_impact.nlargest(number_of_features).index.tolist() # Top 5 features as requested
    top_features.insert(0, target_col)
    
    # 2. Adaptive outlier removal
    def adaptive_outlier_removal(df, 
                                 z_threshold=3, 
                                 show_impact=show_impact, 
                                 top_features=top_features,
                                 show_report=show_report):
        """
        Automatically selects IQR or Z-score based on skewness.
        Returns cleaned DataFrame and method usage report.
        """
        from scipy.stats import skew, zscore
        
        METHOD_REPORT = {}
        clean_df = df.copy()
        
        # Business constraints (adjust per domain knowledge)
        BUSINESS_LIMITS = {
            'duration_min_float': (1, 180),       # 1min to 3hrs
            #'distance_km': (0.1, 50),          # 100m to 50km  
            #'total_fare': (2.0, 200),
            #'surge_multiplier': (1.0, 5.0)
        }
        
        # Apply business rules first
        for col in clean_df.select_dtypes(include='number'):
            if col in BUSINESS_LIMITS:
                min_val, max_val = BUSINESS_LIMITS[col]
                mask = (clean_df[col] >= min_val) & (clean_df[col] <= max_val)
                clean_df = clean_df[mask]
        
        # Outlier removal
        for col in clean_df[top_features].columns:        
            # Calculate skewness
            col_skew = skew(clean_df[col].dropna())
            METHOD = 'IQR' if abs(col_skew) > 1 else 'Z-score'
            
            # Apply selected method
            if METHOD == 'IQR':
                q1 = clean_df[col].quantile(0.25)
                q3 = clean_df[col].quantile(0.75)
                iqr = q3 - q1
                lower = q1 - 1.5*iqr
                upper = q3 + 1.5*iqr
            else: # Z-score
                z_scores = zscore(clean_df[col].dropna())
                lower = -z_threshold
                upper = z_threshold
                # Align z-scores with original scale
                lower = clean_df[col].mean() + lower*clean_df[col].std()
                upper = clean_df[col].mean() + upper*clean_df[col].std()
            
            # Apply bounds
            mask = (clean_df[col] >= lower) & (clean_df[col] <= upper)
            outliers_removed = len(clean_df) - mask.sum()
            clean_df = clean_df[mask]
            clean_skew = skew(clean_df[col].dropna())
            
            # Store metrics
            METHOD_REPORT[col] = {
                'method': METHOD,
                'skewness': round(col_skew, 2),
                'outliers_removed': outliers_removed,
                'bounds': (round(lower,2), round(upper,2)),
                'retained': len(clean_df),
            }
            
            # Visual comparison
            if show_impact:
                fig, ax = plt.subplots(1,2, figsize=(14,5))
                sns.histplot(df[col], ax=ax[0], color='red', kde=True)
                ax[0].set_title(f'Original {col}\n(Skew: {skew(df[col]):.2f})')
                sns.histplot(clean_df[col], ax=ax[1], color='green', kde=True)
                ax[1].set_title(f'Cleaned {col}\n({METHOD} | Skew: {clean_skew:.2f})')
                plt.tight_layout()
                plt.show()
        
        # Print summary
        if show_report:
            print(pd.DataFrame(METHOD_REPORT).T)
        return clean_df
    
    clean_df = adaptive_outlier_removal(df)
    return clean_df

clean_df = earnings_aware_outlier_removal(df)

# Business Impact Analysis:
print(f"Original records: {len(df):,}")
print(f"Cleaned records: {len(clean_df):,}")
print(f"Data retention: {len(clean_df)/len(df):.1%}")

Original records: 3,470
Cleaned records: 3,126
Data retention: 90.1%


In [6]:
import folium
from folium.plugins import HeatMap
from sklearn.cluster import DBSCAN
import geopandas as gpd

# Load the GeoDataFrame
geodf = gpd.read_parquet('./data/br_geo_gdf.parquet')

# Load data for geodf
scdf = pd.read_csv('./data/br_data.csv')
scdf =scdf[scdf['Nome_da_UF '] == 'São Paulo']

# Merge the GeoDataFrame with the DataFrame
geodf['Cod_setor'] = geodf['Cod_setor'].astype('str')
scdf['Cod_setor'] = scdf['Cod_setor'].astype('str')
gdf = pd.merge(scdf, geodf, how='left', on='Cod_setor')

# Filter data for family earnings and density
filtered_gdf = gdf[(gdf['renda_dom'] >= 8.418) & (gdf['densidade'] >= 150)]
filtered_gdf = gpd.GeoDataFrame(filtered_gdf, geometry='geometry')

borders_style = {
        'color': 'green',
        'weight': 0,
        'fillColor': 'green',
        'fillOpacity': 0.2,
    }

In [None]:
cleand

In [13]:
# --- 0. Parse lat long ---
clean_df['origin_lat'] = round(clean_df['origin_lat_lng'].str.replace('[', '').\
    str.replace(']', '').str.split(',', expand=True)[0].astype(float),6)
clean_df['origin_lng'] = round(clean_df['origin_lat_lng'].str.replace('[', '').\
    str.replace(']', '').str.split(',', expand=True)[1].astype(float),6)
clean_df['dest_lat'] = round(clean_df['destination_lat_lng'].str.replace('[', '').\
    str.replace(']', '').str.split(',', expand=True)[0].astype(float),6)
clean_df['dest_lng'] = round(clean_df['destination_lat_lng'].str.replace('[', '').\
    str.replace(']', '').str.split(',', expand=True)[1].astype(float),6)

# --- 1. Cluster Origins ---
radius = 250 # meters, adjust as needed
radius_in_degrees = radius / 111320
origin_coords = clean_df[['origin_lat', 'origin_lng']].values
db_origin = DBSCAN(eps=radius_in_degrees, min_samples=10).fit(origin_coords) # DBSCAN for origins
clean_df['origin_cluster'] = db_origin.labels_

origin_clusters = clean_df[clean_df['origin_cluster'] != -1].groupby('origin_cluster').agg({ # Exclude noise points (-1)
    'ted': 'mean',
    'type': 'count',
    'origin_lat': 'mean',
    'origin_lng': 'mean'
}).reset_index()
origin_clusters.rename(columns={'type': 'origin_ride_count'}, inplace=True)


# --- 2. Cluster Destinations ---
dest_coords = clean_df[['dest_lat', 'dest_lng']].values
db_dest = DBSCAN(eps=radius_in_degrees, min_samples=10).fit(dest_coords) # DBSCAN for destinations
clean_df['dest_cluster'] = db_dest.labels_

dest_clusters = clean_df[clean_df['dest_cluster'] != -1].groupby('dest_cluster').agg({ # Exclude noise points (-1)
    'ted': 'mean',
    'type': 'count',
    'dest_lat': 'mean',
    'dest_lng': 'mean'
}).reset_index()
dest_clusters.rename(columns={'type': 'dest_ride_count'}, inplace=True)

# --- 3. Combined Visualization with Layers ---
combined_map_center_lat = origin_clusters['origin_lat'].mean() # Center map (can adjust)
combined_map_center_lng = origin_clusters['origin_lng'].mean()

combined_map = folium.Map(location=[combined_map_center_lat, combined_map_center_lng],
                      zoom_start=12,
                      tiles='Cartodb Positron')

# --- Create Feature Groups for Layers ---
origin_layer = folium.FeatureGroup(name='Origin Clusters')
dest_layer = folium.FeatureGroup(name='Destination Clusters')


# --- Visualize Origin Clusters on Origin Layer ---
for index, row in origin_clusters.iterrows():
    cluster_lat = row['origin_lat']
    cluster_lng = row['origin_lng']
    ride_count = row['origin_ride_count']
    ted_mean = row['ted']

    folium.CircleMarker(
        location=[cluster_lat, cluster_lng],
        radius=10, # Adjust radius scale
        fill=True,
        fill_color='green', # Origin cluster color
        color='green',
        fill_opacity=0.6,
        popup=f"Origin Cluster {row['origin_cluster']}<br>Rides: {ride_count}<br>Avg TED: {ted_mean:.2f}"
    ).add_to(origin_layer) # Add to origin layer


# --- Visualize Destination Clusters on Destination Layer ---
for index, row in dest_clusters.iterrows():
    cluster_lat = row['dest_lat']
    cluster_lng = row['dest_lng']
    ride_count = row['dest_ride_count']
    ted_mean = row['ted']

    folium.CircleMarker(
        location=[cluster_lat, cluster_lng],
        radius=10, # Adjust radius scale
        fill=True,
        fill_color='red',   # Destination cluster color
        color='red',
        fill_opacity=0.6,
        popup=f"Destination Cluster {row['dest_cluster']}<br>Rides: {ride_count}<br>Avg TED: {ted_mean:.2f}"
    ).add_to(dest_layer) # Add to destination layer



# --- Add Layers to Map and Layer Control ---
origin_layer.add_to(combined_map)
dest_layer.add_to(combined_map)
folium.LayerControl().add_to(combined_map) # Layer control to toggle layers


combined_map.save("./artifacts/combined_cluster_map.html")
print("Combined Origin/Destination Cluster Map saved to combined_cluster_map.html")
combined_map


Combined Origin/Destination Cluster Map saved to combined_cluster_map.html


In [32]:
dest_clusters.describe()

Unnamed: 0,dest_cluster,ted,dest_ride_count,dest_lat,dest_lng
count,417.0,417.0,417.0,417.0,417.0
mean,208.0,1.000547,5.808153,-23.563737,-46.655565
std,120.521782,0.145907,12.007275,0.045838,0.040662
min,0.0,0.577676,2.0,-23.707078,-46.848121
25%,104.0,0.915064,2.0,-23.596295,-46.683944
50%,208.0,0.993505,3.0,-23.562339,-46.657271
75%,312.0,1.084353,5.0,-23.532984,-46.635527
max,416.0,1.525212,134.0,-23.426255,-46.471671


In [23]:
# Filter best clusters
thresh = clustered_location['ted'].mean()
clustered_location = clustered_location[clustered_location['ted'] >= thresh]


# 4. Create a Folium map centered around the average latitude and longitude
map_center_lat = clustered_location['origin_lat'].mean()
map_center_lng = clustered_location['origin_lng'].mean()


ride_map = folium.Map(location=[map_center_lat, map_center_lng], 
                      zoom_start=12,
                      tiles='Cartodb Positron') # Adjust zoom_start as needed


# 5. Create Heatmap layer to visualize 'ted' intensity
heat_data = [[row['origin_lat'], row['origin_lng'], row['type']] for index, row in clustered_location.iterrows()]

HeatMap(heat_data).add_to(ride_map)

# 5. Create population density and earnings layer
#folium.GeoJson(filtered_gdf, style_function=lambda x: borders_style).add_to(ride_map)


# Display the map (in a notebook environment, it will render directly)
ride_map.save("./artifacts/ride_efficiency_map.html") # Save to HTML file to view in browser
print("Map saved to ride_efficiency_map.html")

Map saved to ride_efficiency_map.html


In [24]:
ride_map

In [17]:
clustered_location.to_csv('./artifacts/cluster.csv', index=False)

In [33]:
clean_df.head().T

Unnamed: 0,0,1,2,3,4
type,UberX,UberX,UberX,UberX,UberX
date,2023-11-11,2023-11-11,2023-11-11,2023-11-11,2023-11-11
time,17:28:00,16:37:00,16:19:00,15:57:00,15:01:00
duration,11 min 32 sec,27 min 43 sec,16 min 18 sec,5 min 29 sec,2 min 54 sec
distance,4.15 km,13.42 km,4.78 km,1.38 km,0.96 km
origin,"Rua Antônio das Chagas, Santo Amaro - São Paul...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Alameda Gabriel Monteiro da Silva, Jardim Amer...","R. Turiassu, Perdizes - São Paulo - SP, 05005-...","Rua das Tabocas, Alto de Pinheiros - São Paulo..."
destination,"R. Geórgia, Brooklin - São Paulo - SP, 04559-0...","Rua Quipa, Campo Limpo - São Paulo - SP, 05756...","Rua Helena, Itaim Bibi - São Paulo - SP, 04552...","Rua Ministro Godói, Perdizes - São Paulo - SP,...","R. Delfina, Vila Madalena - São Paulo - SP, 05..."
total_earning,15.01,27.09,13.78,6.69,6.11
base_fare,11.01,27.09,13.78,6.69,6.11
customer_fare,19.97,34.95,22.96,10.96,9.98


In [32]:
clean_df['hour_of_day'] = clean_df['datetime'].dt.hour
clean_df['day_of_week'] = clean_df['datetime'].dt.day_name()

In [41]:
heat_data

[[[[-23.544494, -46.689933, 1.0],
   [-23.587079, -46.67627, 1.0],
   [-23.532273, -46.651858, 1.0],
   [-23.576731, -46.645414, 1.0],
   [-23.540903, -46.642864, 1.0],
   [-23.558262, -46.690596, 2.0],
   [-23.51999, -46.657016, 1.0],
   [-23.548968, -46.614979, 1.0],
   [-23.637816, -46.639402, 1.0],
   [-23.624536, -46.670554, 1.0],
   [-23.613082, -46.663552, 1.0],
   [-23.571209, -46.64943, 1.0],
   [-23.584407, -46.725074, 1.0],
   [-23.489809, -46.609847, 1.0],
   [-23.479406, -46.603416, 1.0],
   [-23.553395000000002, -46.660753, 2.0],
   [-23.546346, -46.63789, 1.0],
   [-23.548674, -46.645437, 1.0],
   [-23.567169333333336, -46.66568133333334, 3.0],
   [-23.586882, -46.627004, 1.0],
   [-23.533945, -46.693151, 1.0],
   [-23.52617, -46.67549, 1.0],
   [-23.556436, -46.653007, 1.0],
   [-23.579284, -46.712411, 1.0],
   [-23.469395, -46.61691, 1.0],
   [-23.570771999999998, -46.644915499999996, 2.0],
   [-23.554423, -46.655886, 1.0],
   [-23.570609, -46.687655, 1.0],
   [-23.596

In [62]:
heat_data = []

for hour in range(24):
    filtered_df = clean_df[clean_df['hour_of_day'] == hour].copy()
    
    radius = 250 # in meters  
    radius_in_degrees = radius / 111320  # 1 degree is approximately 111300 meters
    coords = filtered_df[['origin_lat', 'origin_lng']].values
    db = DBSCAN(eps=radius_in_degrees, min_samples=2).fit(coords) 
    filtered_df['cluster'] = db.labels_
    
    clustered_location = filtered_df.groupby('cluster').agg({
        'origin_lat': 'mean',
        'origin_lng': 'mean',
        'ted': 'mean',
        'type' : 'count'
    }).reset_index()

    # Filter best clusters
    thresh = clustered_location['ted'].mean()
    thresh_print = thresh*60
    print(f'{hour} of the day with mean {thresh_print}')
    clustered_location = clustered_location[clustered_location['ted'] >= thresh]
    
    # Prepare heatmap data for this hour
    heat_data.append([
        [row['origin_lat'], row['origin_lng'], row['type']]
        for index, row in clustered_location.iterrows()]
    )

m = folium.Map(location=[clean_df['origin_lat'].mean(), 
                         clean_df['origin_lng'].mean()
                         ],
               zoom_start=12)

# Add HeatMapWithTime
hm = folium.plugins.HeatMapWithTime(
        heat_data,
        index=[f"{hour}:00" for hour in range(24)],
        auto_play=True,
        radius=20
    )
hm.add_to(m)

m

0 of the day with mean 60.10986170939811
1 of the day with mean 72.1287722645612
2 of the day with mean 65.79095757770656
3 of the day with mean 62.79531881490018
4 of the day with mean 59.5531164249314
5 of the day with mean 54.20743408938497
6 of the day with mean 66.8801871150945
7 of the day with mean 69.01231740717397
8 of the day with mean 63.72557241821686
9 of the day with mean 60.9210711153501
10 of the day with mean 51.66260179568053
11 of the day with mean 50.91346839187577
12 of the day with mean 59.3907119062207
13 of the day with mean 60.53768770943571
14 of the day with mean 57.9948364915692
15 of the day with mean 57.32672330325129
16 of the day with mean 54.854932748150254
17 of the day with mean 57.84814035957751
18 of the day with mean 61.4922314479973
19 of the day with mean 64.11367551727153
20 of the day with mean 58.7640689262061
21 of the day with mean 60.422051251157605
22 of the day with mean 65.991869644178
23 of the day with mean 64.88200223750128


In [63]:
m.save('./artifacts/heat_with_time_rides.html')

In [48]:
heat_data[13]

[[-23.536642333333333, -46.65669233333333, 3.0],
 [-23.531216, -46.64377266666667, 3.0],
 [-23.5887635, -46.678560000000004, 2.0],
 [-23.505349250000002, -46.64211975, 4.0],
 [-23.529071000000002, -46.6407995, 2.0],
 [-23.48966233333333, -46.63758816666667, 6.0],
 [-23.5199105, -46.661929, 2.0],
 [-23.564052, -46.6523205, 2.0],
 [-23.497837, -46.632126, 2.0],
 [-23.490541333333336, -46.62625133333333, 3.0],
 [-23.4923338, -46.6327554, 5.0],
 [-23.6078465, -46.660134, 2.0],
 [-23.535862, -46.666777, 2.0],
 [-23.534772, -46.661855, 2.0]]

In [69]:
from branca.element import Element

def get_marker_color_for_hour(hour, ride_count):
    """Dynamically assigns marker color based on ride count for a given hour."""
    if ride_count < 10:
        return 'blue'
    elif 10 <= ride_count < 20:
        return 'green'
    else:
        return 'red'

def create_hourly_markers(hour, df):
    filtered_df = df[df['hour_of_day'] == hour].copy()

    radius = 250  # in meters
    radius_in_degrees = radius / 111320
    coords = filtered_df[['origin_lat', 'origin_lng']].values
    db = DBSCAN(eps=radius_in_degrees, min_samples=2).fit(coords)
    filtered_df['cluster'] = db.labels_

    clustered_location = filtered_df.groupby('cluster').agg({
        'origin_lat': 'mean',
        'origin_lng': 'mean',
        'ted': 'mean',
        'type': 'count'
    }).reset_index()

    thresh = clustered_location['ted'].mean()
    clustered_location = clustered_location[clustered_location['ted'] >= thresh]

    # Create a FeatureGroup for this hour with a unique ID
    hour_group = folium.FeatureGroup(name=f"Hour {hour:02d}:00", overlay=True, control=False)
    hour_group.layer_id = f"hour_{hour}" # Assign a unique ID

    for index, row in clustered_location.iterrows():
        marker_color = get_marker_color_for_hour(hour, row['type'])
        folium.CircleMarker(
            location=[row['origin_lat'], row['origin_lng']],
            radius=row['type'] * 1.5,
            popup=f"Hour: {hour:00}, Type: {row['type']}",
            fill=True,
            fill_color=marker_color,
            color=marker_color,
            fill_opacity=0.6
        ).add_to(hour_group)
    return hour_group

m = folium.Map(location=[clean_df['origin_lat'].mean(),
                         clean_df['origin_lng'].mean()
                         ],
               zoom_start=12)

hour_layers = {} # Dictionary to store hour layers

# Create and add marker layers for each hour
for hour in range(24):
    hour_markers = create_hourly_markers(hour, clean_df)
    hour_layers[f"Hour {hour:02d}:00"] = hour_markers # Store layer in dictionary
    m.add_child(hour_markers) # Add to map initially

# HTML Slider
html_slider = folium.Html(
    f"""
    <div style="position: fixed; bottom: 50px; left: 50px; z-index: 9999; background-color: white; padding: 10px; border-radius: 5px;">
        <input type="range" min="0" max="23" value="0" step="1" id="hourSlider" style="width: 200px;">
        <span id="hourValue">Hour: 00:00</span>
    </div>
    """,
    script=True
)

slider_element = Element(html_slider)
m.get_root().html.add_child(slider_element)


# JavaScript to control layer visibility based on slider
js_code = """
    var hourSlider = document.getElementById('hourSlider');
    var hourValueDisplay = document.getElementById('hourValue');
    var hourLayers = {};

    hourSlider.oninput = function() {
        var selectedHour = parseInt(hourSlider.value);
        var hourString = (selectedHour < 10 ? '0' : '') + selectedHour + ':00';
        hourValueDisplay.textContent = 'Hour: ' + hourString;

        // Hide all hour layers
        for (var hour in hourLayers) {
            if (hourLayers.hasOwnProperty(hour)) {
                map.removeLayer(hourLayers[hour]);
            }
        }

        // Show the selected hour layer
        var selectedLayerName = 'Hour ' + hourString;
        if (hourLayers.hasOwnProperty(selectedLayerName)) {
            map.addLayer(hourLayers[selectedLayerName]);
        }
    };

    // Initialize hour layers in JavaScript
    var layerControl = map._layersControl; // Access LayerControl
    layerControl._layers.forEach(function(layerData) {
        if (layerData.name.startsWith('Hour ')) {
            hourLayers[layerData.name] = layerData.layer;
        }
    });
"""

m.add_child(folium.Javascript(js_code))
folium.LayerControl().add_to(m) # Keep LayerControl for toggling all hours on/off


m

TypeError: Can't compile non template nodes