# E-Scooters vs Public Transport Analysis

This notebook analyzes whether people choose e-scooters for trips where public transport (PT) would be slower.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pickle
from scipy import stats
from scipy.stats import chi2_contingency
from adjustText import adjust_text

from tueplots import bundles
from tueplots.constants.color import rgb

import sys
sys.path.insert(0, '..')
from city_info import CITY_POPULATIONS, CITY_PT_RIDERSHIP
from noise_detection.non_ride_detection import filter_known_issues

# Hour-of-day buckets (labelled as CET by the notebook) that split the 24h
# clock into a night part and a day part.
NIGHT_HOURS = [*range(21, 24), *range(0, 6)]  # 21, 22, 23, then 0..5
DAY_HOURS = list(range(6, 21))                # 6..20

## Load PT Comparison Data

In [None]:
# Read every trip that has a PT comparison record.
pt_comparison = pd.read_parquet('pt_comparison_all_trips.parquet')

# Keep only the rows flagged pt_success; work on a copy so the raw frame
# stays untouched.
pt_ok_mask = pt_comparison['pt_success'].eq(True)
pt_ok_trips = pt_comparison.loc[pt_ok_mask].copy()

# Run the project's data quality filter, then add the two derived columns
# the later cells rely on: an alias of the city column and the hour of d_time.
valid = filter_known_issues(pt_ok_trips).assign(
    city_name=lambda trips: trips['city'],
    hour=lambda trips: pd.to_datetime(trips['d_time']).dt.hour,
)

print(f"Loaded {len(pt_comparison):,} trips, {len(valid):,} valid after filtering")

In [None]:
# Sanity check: the 15 cities with the most valid trips.
city_trip_counts = valid['city_name'].value_counts()
print(city_trip_counts.head(15))

## E-Scooter vs Public Transport: Scale Comparison

Before analyzing whether e-scooters compete with public transport, let's compare the scale of usage.

In [None]:
# Compare e-scooter trips vs PT ridership for selected cities
comparison_cities = ['Stuttgart', 'Zürich', 'Reutlingen/Tübingen']

# Average e-scooter trips per day, computed per city.
# The denominator is the number of distinct dates on which *that city* has
# trips; dividing by the number of distinct dates in the whole dataset would
# understate cities whose data covers fewer days.
# d_time is passed through pd.to_datetime, as the other cells do, so this
# also works if the column is not already a datetime dtype.
trip_dates = pd.to_datetime(valid['d_time']).dt.date
trips_per_city = valid.groupby('city_name').size()
days_per_city = (
    valid.assign(_trip_date=trip_dates)
    .groupby('city_name')['_trip_date']
    .nunique()
)
escooter_daily = trips_per_city / days_per_city

print("E-Scooter vs Public Transport - Daily Ridership")
print(f"{'City':<25} {'PT Daily':<15} {'E-Scooter':<15} {'%':<10}")
print("-" * 65)
for city in comparison_cities:
    # Cities without a PT ridership entry are skipped silently.
    if city in CITY_PT_RIDERSHIP:
        # NOTE(review): dividing by 365 presumes CITY_PT_RIDERSHIP holds
        # annual totals - confirm against city_info.
        pt_daily = CITY_PT_RIDERSHIP[city] / 365
        escooter = escooter_daily.get(city, 0)
        pct = escooter / pt_daily * 100 if pt_daily > 0 else 0
        print(f"{city:<25} {int(pt_daily):>12,}   {int(escooter):>10,}   {pct:>6.2f}%")

### Time of Day Analysis

Do people prefer e-scooters over PT more at certain times of day? PT service quality varies throughout the day (e.g., lower frequency late at night), so the time advantage of e-scooters may vary as well.

In [None]:
# Pick the duration columns that exist in this dataset, falling back to the
# alternative column name when the preferred one is absent.
available_columns = set(valid.columns)
escooter_col = 'delta_time' if 'delta_time' in available_columns else 'opt_route_min'
pt_col = 'pt_total_min' if 'pt_total_min' in available_columns else 'pt_duration_min'

# Per-hour summary: time-difference mean/std/count, share of trips where the
# e-scooter was faster, and mean duration for each mode.
hourly_stats = valid.groupby('hour').agg(
    mean_time_diff=('time_diff_min', 'mean'),
    std_time_diff=('time_diff_min', 'std'),
    n_trips=('time_diff_min', 'count'),
    pct_escooter_faster=('escooter_faster', 'mean'),
    mean_pt_min=(pt_col, 'mean'),
    mean_escooter_min=(escooter_col, 'mean'),
)
# Convert the share into a percentage.
hourly_stats['pct_escooter_faster'] = hourly_stats['pct_escooter_faster'] * 100

plt.rcParams.update(bundles.icml2024(column="full", nrows=1, ncols=1))

fig, ax = plt.subplots()

hours_of_day = hourly_stats.index
ax.plot(hours_of_day, hourly_stats['mean_pt_min'], 'o-',
        label='Public Transport (incl. wait)', color=rgb.tue_red)
ax.plot(hours_of_day, hourly_stats['mean_escooter_min'], 's-',
        label='E-Scooter', color=rgb.tue_blue)
ax.set_xlabel('Hour of Day (CET)')
ax.set_ylabel('Mean Trip Duration (min)')
ax.set_xticks(range(0, 24, 2))
ax.legend()

# Shade a one-hour-wide band centred on every night hour.
for night_hour in NIGHT_HOURS:
    ax.axvspan(night_hour - 0.5, night_hour + 0.5, alpha=0.1, color=rgb.tue_dark)

plt.savefig("hourly_duration_comparison.pdf")
plt.show()

# Night vs day comparison
in_night = valid['hour'].isin(NIGHT_HOURS)
in_day = valid['hour'].isin(DAY_HOURS)
night_trips = valid[in_night]
day_trips = valid[in_day]

print(f"Night: {night_trips['time_diff_min'].mean():.1f} min advantage, {night_trips['escooter_faster'].mean()*100:.1f}% e-scooter wins")
print(f"Day: {day_trips['time_diff_min'].mean():.1f} min advantage, {day_trips['escooter_faster'].mean()*100:.1f}% e-scooter wins")

## City-Level Analysis: Usage vs PT Time Disadvantage

In [None]:
# Prepare the per-trip columns needed for the city-level day/night statistics:
# the calendar date of each trip and a period label derived from its hour.
valid['date'] = pd.to_datetime(valid['d_time']).dt.date

# Start from 'transition', then overwrite with 'day' and 'night' in that
# order for hours listed in the respective constants.
valid['period'] = 'transition'
for period_label, period_hours in (('day', DAY_HOURS), ('night', NIGHT_HOURS)):
    valid.loc[valid['hour'].isin(period_hours), 'period'] = period_label

def calc_time_diff_stats(group):
    """Summarise the time difference of one group of trips.

    Parameters
    ----------
    group : pandas.DataFrame
        Trips with a ``time_diff_min`` column and an ``escooter_faster``
        column.

    Returns
    -------
    pandas.Series
        ``n_trips`` (number of rows in the group), ``mean_time_diff``,
        ``ci_lower`` / ``ci_upper`` (normal-approximation 95% interval around
        the mean) and ``pct_escooter_faster`` (mean of ``escooter_faster``
        times 100).
    """
    diffs = group['time_diff_min']
    n = len(group)
    mean = diffs.mean()
    std = diffs.std()

    # mean() and std() skip missing values, so the standard error must be
    # based on the number of non-missing observations rather than on the row
    # count; otherwise the interval comes out too narrow when values are
    # missing.
    n_obs = diffs.count()

    # A sample standard deviation needs at least two observations; below that
    # the interval is undefined and reported as NaN.
    ci_95 = 1.96 * std / np.sqrt(n_obs) if n_obs > 1 else np.nan

    return pd.Series({
        'n_trips': n,
        'mean_time_diff': mean,
        'ci_lower': mean - ci_95,
        'ci_upper': mean + ci_95,
        'pct_escooter_faster': (group['escooter_faster'].mean() * 100),
    })

def _summarise_period(period_trips, period_label):
    """Build the per-city tables for one period of the day.

    Parameters
    ----------
    period_trips : pandas.DataFrame
        Trips of a single period; must carry ``city_name`` and ``date`` plus
        the columns read by ``calc_time_diff_stats``.
    period_label : str
        Value written to the ``period`` column of the outputs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(city_stats, daily, rides, merged, plot_df)``:
        per-city time-difference statistics; ride counts per city and date;
        mean of those daily counts per city; the two joined with population
        and rides per 1,000 inhabitants; and the joined table restricted to
        cities that have a population entry.
    """
    # Time-difference statistics per city.
    city_stats = period_trips.groupby('city_name', group_keys=False).apply(
        calc_time_diff_stats, include_groups=False
    ).reset_index()
    city_stats['period'] = period_label

    # Rides per city and date, then the mean over the dates present per city.
    daily = period_trips.groupby(['city_name', 'date']).size().reset_index(name='rides')
    rides = daily.groupby('city_name')['rides'].mean().reset_index()
    rides.columns = ['city_name', 'mean_daily']
    rides['period'] = period_label

    # Join usage with the statistics and normalise by population.
    merged = rides.merge(city_stats, on=['city_name', 'period'])
    merged['population'] = merged['city_name'].map(CITY_POPULATIONS)
    merged['rides_per_1k_inhabitants'] = (merged['mean_daily'] / merged['population']) * 1000

    # Cities missing from CITY_POPULATIONS map to NaN and are dropped here.
    plot_df = merged[merged['population'].notna()].copy()
    return city_stats, daily, rides, merged, plot_df


day_valid = valid[valid['period'] == 'day']
night_valid = valid[valid['period'] == 'night']

# Same pipeline for both periods; all intermediate tables stay available
# under their usual names.
(day_city_stats, day_daily, day_rides,
 day_merged, day_plot) = _summarise_period(day_valid, 'day')
(night_city_stats, night_daily, night_rides,
 night_merged, night_plot) = _summarise_period(night_valid, 'night')

print(f"Cities with day data: {len(day_plot)}, night data: {len(night_plot)}")

In [None]:
plt.rcParams.update(bundles.icml2024(column="half", nrows=1, ncols=1))

fig, ax = plt.subplots()

x_col = 'mean_time_diff'
y_col = 'rides_per_1k_inhabitants'


def _linear_trend(city_table):
    """Fit a degree-1 polynomial of y_col on x_col and sample it at 100 points
    between the smallest and largest x value of the table."""
    coeffs = np.polyfit(city_table[x_col], city_table[y_col], 1)
    poly = np.poly1d(coeffs)
    xs = np.linspace(city_table[x_col].min(), city_table[x_col].max(), 100)
    return coeffs, poly, xs, poly(xs)


# Marker area grows with the number of trips, relative to the largest
# city/period combination, with a floor of 10.
max_trips = max(day_plot['n_trips'].max(), night_plot['n_trips'].max())
day_sizes = day_plot['n_trips'] / max_trips * 100 + 10
night_sizes = night_plot['n_trips'] / max_trips * 100 + 10

ax.scatter(day_plot[x_col], day_plot[y_col],
           s=day_sizes, c=[rgb.tue_gold], label='Day (06:00-21:00)', marker='o')
ax.scatter(night_plot[x_col], night_plot[y_col],
           s=night_sizes, c=[rgb.tue_blue], label='Night (21:00-06:00)', marker='s')

# Regression lines (day first, then night, so the legend order is stable)
z_day, p_day, x_line_day, y_line_day = _linear_trend(day_plot)
ax.plot(x_line_day, y_line_day, '--', color=rgb.tue_gold, label='Day trend')

z_night, p_night, x_line_night, y_line_night = _linear_trend(night_plot)
ax.plot(x_line_night, y_line_night, '--', color=rgb.tue_blue, label='Night trend')

ax.set_xlabel('Average PT Time Disadvantage (min)')
ax.set_ylabel('Daily Rides per 1,000 Inhabitants')
ax.legend()
ax.set_xlim(0, 80)

# Label top cities
# Every plotted point and trend-line sample is passed to adjust_text so the
# labels are pushed away from them.
all_x = [*day_plot[x_col], *night_plot[x_col], *x_line_day, *x_line_night]
all_y = [*day_plot[y_col], *night_plot[y_col], *y_line_day, *y_line_night]

# The five cities with the most daytime trips get a label in both periods.
top_cities = day_plot.nlargest(5, 'n_trips')['city_name'].tolist()
top_day = day_plot[day_plot['city_name'].isin(top_cities)]
top_night = night_plot[night_plot['city_name'].isin(top_cities)]

texts = []
for labelled_subset in (top_day, top_night):
    for _, row in labelled_subset.iterrows():
        label_artist = ax.text(row[x_col], row[y_col], row['city_name'], fontsize='small')
        texts.append(label_artist)

adjust_text(texts, x=all_x, y=all_y, ax=ax,
            force_points=(2, 2), force_text=(1, 1),
            expand_points=(3, 3), expand_text=(1.5, 1.5), lim=500,
            arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

plt.savefig("day_night_scatter.pdf")
plt.show()

# Correlation statistics
day_spearman, day_p = stats.spearmanr(day_plot[x_col], day_plot[y_col])
night_spearman, night_p = stats.spearmanr(night_plot[x_col], night_plot[y_col])

print(f"Day: Spearman rho = {day_spearman:.3f} (p = {day_p:.4f})")
print(f"Night: Spearman rho = {night_spearman:.3f} (p = {night_p:.4f})")