# Taxi Data Analysis

準備工作：在 BigQuery 中建立帶有地理欄位 (GEOGRAPHY) 的資料表，並為上車與下車地點預先計算以 WKT 文字表示的網格 ID (cluster id)。

In [None]:
%%bigquery

-- Build the working table used by every later cell: all 2023+ Chicago taxi
-- trips that have complete pickup AND dropoff coordinates, enriched with
-- GEOGRAPHY points and a grid-cell id for both ends of the trip.
--
-- The grid id is the WKT text of the point snapped to a 0.005-degree
-- longitude/latitude grid, so it can be grouped/joined on as a plain string
-- and turned back into coordinates with ST_GEOGFROMTEXT.
CREATE OR REPLACE TABLE `du-hast-mich.taxi.chicago_taxi` AS
WITH trips AS (
    SELECT
        *,  -- ASCII comma here: the original full-width comma is a BigQuery syntax error
        -- 1. Pickup point (demand source)
        ST_GEOGPOINT(pickup_longitude, pickup_latitude) AS pickup_geom,
        -- 2. Dropoff point (supply source)
        ST_GEOGPOINT(dropoff_longitude, dropoff_latitude) AS dropoff_geom
    FROM
        `bigquery-public-data.chicago_taxi_trips.taxi_trips`
    WHERE
        -- Both ends must be geolocated, otherwise ST_GEOGPOINT yields NULL
        -- and the trip cannot be assigned to a grid cell.
        pickup_latitude IS NOT NULL
        AND pickup_longitude IS NOT NULL
        AND dropoff_latitude IS NOT NULL
        AND dropoff_longitude IS NOT NULL
        AND trip_start_timestamp >= '2023-01-01'
)
SELECT
    *,
    -- Precomputed pickup grid id (WKT text)
    ST_ASTEXT(ST_SNAPTOGRID(pickup_geom, 0.005)) AS pickup_cluster_id,
    -- Precomputed dropoff grid id (WKT text); non-NULL thanks to the filter above
    ST_ASTEXT(ST_SNAPTOGRID(dropoff_geom, 0.005)) AS dropoff_cluster_id
FROM
    trips;

識別「黃金網格」 (Top Grids Identification)
我們先找出哪些網格是需求最高的「核心區域」。這能幫助我們過濾掉雜訊，專注於優化那 20% 帶來 80% 訂單的區域。

In [None]:
%%bigquery df_top_grids

-- Top 500 pickup grid cells by trip volume, with their average trip total
-- and the cell coordinates decoded from the WKT grid id so the Python
-- plotting cells can use lat/lng directly.
WITH grid_stats AS (
    SELECT
        pickup_cluster_id,
        COUNT(*) AS total_trips,
        AVG(trip_total) AS avg_revenue  -- average revenue per trip
    FROM
        `du-hast-mich.taxi.chicago_taxi`
    GROUP BY
        pickup_cluster_id
)
SELECT
    pickup_cluster_id,
    total_trips,
    avg_revenue,
    -- The grid id is a WKT point, so X is the longitude and Y the latitude.
    ST_X(ST_GEOGFROMTEXT(pickup_cluster_id)) AS lng,
    ST_Y(ST_GEOGFROMTEXT(pickup_cluster_id)) AS lat
FROM
    grid_stats
ORDER BY
    total_trips DESC
LIMIT 500;

視覺化 - 熱點分佈圖
將上述 SQL 的結果存為 DataFrame (例如 df_top_grids)，然後用 Folium 畫出來。這張圖能讓您直觀地看到「錢在哪裡」。

In [None]:
import folium
from folium.plugins import HeatMap

# Static demand heat map of the top pickup grids on a dark base map.

# Base map centred on Chicago.
chi_map = folium.Map(
    location=[41.8781, -87.6298],
    zoom_start=11,
    tiles='CartoDB dark_matter',
)

# Heat-map rows are [lat, lng, weight]; the weight is total_trips, so hotter
# colours mean more pickups in that grid cell.
heat_columns = ['lat', 'lng', 'total_trips']
heat_data = df_top_grids[heat_columns].values.tolist()

heat_layer = HeatMap(heat_data, radius=10, blur=15, max_zoom=13)
heat_layer.add_to(chi_map)

# Optional: mark the first 10 rows (df_top_grids arrives sorted by trips,
# descending) with a popup showing trip count and average fare. The rank is
# derived from the DataFrame index, which is assumed to be the default 0..n-1.
for idx, grid in df_top_grids.head(10).iterrows():
    popup_html = f"Rank: {idx+1}<br>Trips: {grid['total_trips']}<br>Avg Fare: ${grid['avg_revenue']:.2f}"
    marker = folium.CircleMarker(
        location=[grid['lat'], grid['lng']],
        radius=5,
        color='yellow',
        fill=True,
        popup=popup_html,
    )
    marker.add_to(chi_map)

# Last expression of the cell: renders the map in the notebook.
chi_map

In [None]:
import folium
from folium.plugins import HeatMap

# Same static heat map as above, drawn on Google map tiles.

# NOTE(review): the active tile URL uses lyrs=y, which by the commonly used
# (unofficial) layer codes is the hybrid satellite+labels layer, not the plain
# roadmap - TODO confirm. For the cleaner roadmap background use
# "https://mt1.google.com/vt/lyrs=m&x={x}&y={y}&z={z}" instead.
google_tiles = "https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}"
m_static = folium.Map(
    location=[41.8781, -87.6298],
    zoom_start=11,
    tiles=google_tiles,
    attr="Google",
)

# Heat-map rows are [lat, lng, weight], taken from df_top_grids (produced by
# the earlier %%bigquery cell).
heat_data = df_top_grids[['lat', 'lng', 'total_trips']].values.tolist()

# radius / blur / min_opacity are purely visual and can be tuned to taste.
heat_layer = HeatMap(heat_data, radius=15, blur=20, min_opacity=0.5)
heat_layer.add_to(m_static)

# Optional: pin the first 5 rows with a detail popup. The rank is derived from
# the DataFrame index, which is assumed to be the default 0..n-1.
for idx, grid in df_top_grids.head(5).iterrows():
    popup_html = f"<b>Rank #{idx+1}</b><br>Trips: {grid['total_trips']}<br>Avg Fare: ${grid['avg_revenue']:.2f}"
    pin = folium.Marker(
        location=[grid['lat'], grid['lng']],
        popup=popup_html,
        icon=folium.Icon(color='red', icon='info-sign'),
    )
    pin.add_to(m_static)

# Last expression of the cell: renders the map in the notebook.
m_static

獲取時序數據 (Hourly Data)
靜態圖看不出「早晚高峰」的流動。我們同樣使用 %%bigquery 技巧，將每個小時需求最高的前 50 個網格及其流量存入 df_hourly。

In [None]:
%%bigquery df_hourly

-- Hourly pickup counts per grid cell, keeping only the 50 busiest cells
-- of each hour of day (0-23).
WITH snapped AS (
    SELECT
        -- Pickup point snapped to the 0.005-degree grid, computed once per trip.
        ST_SNAPTOGRID(ST_GEOGPOINT(pickup_longitude, pickup_latitude), 0.005) AS grid_point,
        trip_start_timestamp
    FROM
        `du-hast-mich.taxi.chicago_taxi`
    WHERE
        trip_start_timestamp IS NOT NULL
)
SELECT
    -- 1. Grid id and its coordinates
    ST_ASTEXT(grid_point) AS pickup_cluster_id,
    ST_Y(ST_CENTROID(grid_point)) AS lat,
    ST_X(ST_CENTROID(grid_point)) AS lng,

    -- 2. Time dimension
    EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day,

    -- 3. Metric
    COUNT(*) AS trip_count
FROM
    snapped
GROUP BY
    1, 2, 3, 4

-- Rank the cells independently within each hour and keep that hour's top 50,
-- so quiet hours are not crowded out by the busy ones.
QUALIFY ROW_NUMBER() OVER(PARTITION BY hour_of_day ORDER BY trip_count DESC) <= 50

ORDER BY
    hour_of_day, trip_count DESC;

視覺化動態熱點 (Time-Lapse Heatmap)
有了 df_hourly，我們就可以製作那個「會動的」地圖，這對展示調度邏輯非常有說服力。

In [None]:
import folium
from folium.plugins import HeatMapWithTime

# Animated (time-lapse) heat map: one frame per hour of day.

# One frame per distinct hour, in ascending order. df_hourly already holds
# only each hour's top 50 grid cells, so every frame has at most 50 points.
# Each frame is a list of [lat, lng, trip_count] rows.
# (If per-hour contrast ever needs to be forced manually, trip_count could be
# divided by that hour's maximum before being used as the weight.)
time_index = sorted(df_hourly['hour_of_day'].unique())
frame_columns = ['lat', 'lng', 'trip_count']
data_by_hour = [
    df_hourly.loc[df_hourly['hour_of_day'] == hr, frame_columns].values.tolist()
    for hr in time_index
]

# Light base map centred on Chicago.
m_time = folium.Map(
    location=[41.8781, -87.6298],
    zoom_start=11,
    tiles='CartoDB Positron',
)

frame_labels = [f"{h}:00" for h in time_index]
time_layer = HeatMapWithTime(
    data_by_hour,
    index=frame_labels,
    auto_play=True,
    radius=20,  # a larger radius works because each frame has few points
    max_opacity=0.9,
    use_local_extrema=True,  # colour against local rather than global extrema
)
time_layer.add_to(m_time)

# Last expression of the cell: renders the map in the notebook.
m_time

In [None]:
import folium
from folium.plugins import HeatMapWithTime

# Time-lapse heat map again, this time over Google tiles (lyrs=m in the URL).
# Reuses data_by_hour and time_index built by the previous cell and rebinds
# m_time to the new map.

# `tiles` takes the URL template for the tile images; a custom template
# is passed together with an attribution string.
roadmap_tiles = "https://mt1.google.com/vt/lyrs=m&x={x}&y={y}&z={z}"
m_time = folium.Map(
    location=[41.8781, -87.6298],
    zoom_start=11,
    tiles=roadmap_tiles,
    attr="Google",
)

# One animation frame per hour, labelled "H:00".
frame_labels = [f"{h}:00" for h in time_index]
time_layer = HeatMapWithTime(
    data_by_hour,
    index=frame_labels,
    auto_play=True,
    radius=20,
    max_opacity=0.9,
    use_local_extrema=True,
)
time_layer.add_to(m_time)

# Last expression of the cell: renders the map in the notebook.
m_time

供需缺口分析
最後，如果您想計算「哪裡缺車」，可以比較每個網格在每個小時的需求與供給。

由於沒有實際的空車資料，這裡以上車數 (pickups) 模擬需求，以下車數 (drop-offs) 模擬供給。

In [None]:
%%bigquery df_gap

-- Supply/demand gap per grid cell and hour of day.
--   demand = pickups starting in the cell during that hour
--   supply = dropoffs ending in the cell during that hour (a dropoff is used
--            as a proxy for a taxi becoming available there)
--   gap    = demand - supply; positive means a shortage, negative a surplus
--
-- Every (cell, hour) combination is returned, sorted with the largest
-- shortages first. The previous LEFT JOIN + LIMIT 1000 kept only the biggest
-- shortages, so surplus cells and dropoff-only cells never reached df_gap,
-- which skewed the system-wide hourly totals and the surplus map layer
-- computed from it. Use df_gap.head(1000) for the old "top shortages" view.
WITH Demand AS (
    SELECT
        pickup_cluster_id AS grid_id,
        EXTRACT(HOUR FROM trip_start_timestamp) AS hour,
        COUNT(*) AS pickup_count
    FROM `du-hast-mich.taxi.chicago_taxi`
    GROUP BY 1, 2
),
Supply AS (
    -- Reuse the precomputed dropoff grid id, mirroring Demand.
    SELECT
        dropoff_cluster_id AS grid_id,
        EXTRACT(HOUR FROM trip_end_timestamp) AS hour,
        COUNT(*) AS dropoff_count
    FROM `du-hast-mich.taxi.chicago_taxi`
    -- Trips without an end time have no hour bucket to be counted in.
    WHERE trip_end_timestamp IS NOT NULL
    GROUP BY 1, 2
),
Combined AS (
    -- FULL OUTER JOIN keeps cells that only have pickups or only dropoffs.
    SELECT
        COALESCE(d.grid_id, s.grid_id) AS grid_id,
        COALESCE(d.hour, s.hour) AS hour,
        COALESCE(d.pickup_count, 0) AS demand,
        COALESCE(s.dropoff_count, 0) AS supply
    FROM Demand d
    FULL OUTER JOIN Supply s
        ON d.grid_id = s.grid_id AND d.hour = s.hour
)
SELECT
    grid_id,
    hour,
    demand,
    supply,
    demand - supply AS gap,  -- positive = shortage of taxis
    ST_Y(ST_GEOGFROMTEXT(grid_id)) AS lat,
    ST_X(ST_GEOGFROMTEXT(grid_id)) AS lng
FROM Combined
ORDER BY gap DESC;

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Line chart of hourly demand vs. supply with shortage/surplus shading.

# Sum demand and supply over all grid cells in df_gap, per hour of day.
# df_gap is the result of the earlier %%bigquery cell.
hourly_agg = (
    df_gap
    .groupby('hour')[['demand', 'supply']]
    .sum()
    .reset_index()
)

# Create the figure first, then apply the seaborn style so the axes that are
# created next pick it up.
plt.figure(figsize=(12, 6))
sns.set_style("whitegrid")
ax = plt.gca()

# One line per series: demand (pickups) in orange, supply (dropoffs) in blue.
line_specs = (
    ('demand', 'Total Demand', 'orange', 'o'),
    ('supply', 'Total Supply', '#3498db', 'X'),
)
for column, legend_label, line_color, marker_style in line_specs:
    sns.lineplot(data=hourly_agg, x='hour', y=column,
                 label=legend_label, color=line_color,
                 marker=marker_style, linewidth=2.5, ax=ax)

# Shade the area between the two lines: red where demand exceeds supply
# (shortage), green where supply exceeds demand (surplus).
hours = hourly_agg['hour']
demand = hourly_agg['demand']
supply = hourly_agg['supply']
shading_specs = (
    (demand > supply, 'red', 'Shortage Gap'),
    (demand < supply, 'green', 'Surplus Gap'),
)
for mask, shade_color, legend_label in shading_specs:
    ax.fill_between(hours, demand, supply, where=mask,
                    interpolate=True, color=shade_color, alpha=0.1,
                    label=legend_label)

# Titles, axis labels, one tick per hour (0-23), legend.
ax.set_title('System-wide Hourly Demand vs. Supply (Chicago Taxi)',
             fontsize=16, fontweight='bold')
ax.set_xlabel('Hour of Day (0-23)', fontsize=12)
ax.set_ylabel('Total Trips / Available Taxis', fontsize=12)
ax.set_xticks(range(0, 24))
ax.legend(loc='upper left', frameon=True)
plt.tight_layout()

plt.show()

In [None]:
import folium
import numpy as np

# Bubble map of the supply/demand gap for a single hour of day.
# Expects df_gap to be available with columns lat, lng, gap and hour.

# Hour to inspect (18 = the 18:00 evening peak).
target_hour = 18
df_slice = df_gap[df_gap['hour'] == target_hour].copy()

# Light base map centred on Chicago.
m_gap = folium.Map(
    location=[41.8781, -87.6298],
    zoom_start=11,
    tiles='CartoDB Positron',
)

# Legend box pinned to the bottom-left corner of the page.
title_html = f'''
     <div style="position: fixed;
     bottom: 50px; left: 50px; width: 300px; height: 120px;
     border:2px solid grey; z-index:9999; font-size:14px;
     background-color:white; opacity:0.9; padding: 10px;">
     <b>Hour: {target_hour}:00 Gap Analysis</b><br>
     <i class="fa fa-circle" style="color:#e74c3c"></i> Shortage (Need Taxis)<br>
     <i class="fa fa-circle" style="color:#3498db"></i> Oversupply (Too Many Taxis)<br>
     Circle size indicates magnitude.
     </div>
     '''

# One circle per grid cell: red for a shortage (gap > 0), blue for a surplus.
for _, cell in df_slice.iterrows():
    delta = cell['gap']
    magnitude = abs(delta)

    # Gaps smaller than 5 in either direction are treated as balanced noise.
    if magnitude < 5:
        continue

    if delta > 0:
        tone, note = '#e74c3c', f"缺車 (Shortage): {int(delta)}"
    else:
        tone, note = '#3498db', f"車多 (Surplus): {int(magnitude)}"

    # Circle size grows with the gap, capped at 20 so big cells do not hide
    # their neighbours.
    bubble = folium.CircleMarker(
        location=[cell['lat'], cell['lng']],
        radius=min(magnitude / 2, 20),
        color=tone,
        fill=True,
        fill_color=tone,
        fill_opacity=0.7,
        popup=note,
    )
    bubble.add_to(m_gap)

# Attach the legend as a raw HTML overlay on the rendered page.
m_gap.get_root().html.add_child(folium.Element(title_html))

# Last expression of the cell: renders the map in the notebook.
m_gap