In [1]:
import pandas as pd
import requests
from tqdm import tqdm

# Step 1: Download data
import os

file_name = "green_tripdata_2024-03.parquet"
url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file_name}"
save_path = f"data/{file_name}"

# Create data folder if not exists
os.makedirs("data", exist_ok=True)

# Download.
# - The `with` block releases the streamed connection once the body is consumed.
# - The (connect, read) timeout keeps a stalled server from hanging the kernel.
# - raise_for_status() aborts on an HTTP error instead of writing the error
#   body to disk, where it would only fail later inside read_parquet.
with requests.get(url, stream=True, timeout=(10, 60)) as resp:
    resp.raise_for_status()
    with open(save_path, "wb") as f:
        for chunk in tqdm(resp.iter_content(chunk_size=8192), desc="Downloading"):
            f.write(chunk)

# Step 2: Load and print shape
df = pd.read_parquet(save_path)
print("Q1 Answer - Raw Data Shape:", df.shape)

Downloading: 168it [00:06, 26.57it/s]

Q1 Answer - Raw Data Shape: (57457, 20)





In [3]:
from evidently import Report
from evidently.metrics import QuantileValue, RowCount

# Report with a single metric: the 0.5 quantile (median) of fare_amount
median_fare_metric = QuantileValue(column="fare_amount", quantile=0.5)
report = Report(metrics=[median_fare_metric])

# Only current data is passed (no reference dataset); the result is saved as HTML
repo = report.run(current_data=df)
repo.save_html("report_q2.html")

  infinite=StatCountValue(infinite_count, infinite_count / data.count()),


In [4]:
from datetime import datetime, timedelta

# Median fare_amount for each calendar day of March 2024, in date order.
# Days with no pickups contribute nothing to the list.
quantiles = []

for day_start in pd.date_range("2024-03-01", "2024-03-31", freq="D"):
    day_end = day_start + pd.Timedelta(days=1)
    # Half-open window [day_start, day_end): midnight of the next day is excluded
    in_day = df.lpep_pickup_datetime.between(day_start, day_end, inclusive="left")
    daily_data = df[in_day]

    if not daily_data.empty:
        daily_report = Report(metrics=[QuantileValue(column="fare_amount", quantile=0.5)])
        repo = daily_report.run(current_data=daily_data)
        # The report holds one metric, so its value sits at index 0
        val = repo.dict()['metrics'][0]['value']
        print(f"{day_start.date()}: {val}")
        quantiles.append(val)

print("Q3 Answer - Max daily median fare_amount:", max(quantiles))



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



2024-03-01: 13.5
2024-03-02: 13.5
2024-03-03: 14.2
2024-03-04: 12.8
2024-03-05: 13.5
2024-03-06: 12.8



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



2024-03-07: 13.5
2024-03-08: 13.5
2024-03-09: 13.5
2024-03-10: 14.2
2024-03-11: 12.8
2024-03-12: 13.5
2024-03-13: 13.5



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



2024-03-14: 14.2
2024-03-15: 13.5
2024-03-16: 14.2
2024-03-17: 13.5
2024-03-18: 13.5
2024-03-19: 13.5
2024-03-20: 12.8
2024-03-21: 13.5



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



2024-03-22: 13.5
2024-03-23: 12.8
2024-03-24: 14.2
2024-03-25: 13.5
2024-03-26: 13.5
2024-03-27: 13.5
2024-03-28: 13.5
2024-03-29: 13.5
2024-03-30: 14.2
2024-03-31: 13.5
Q3 Answer - Max daily median fare_amount: 14.2



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [8]:
from evidently.ui.workspace import Workspace
from evidently.sdk.panels import text_panel, line_plot_panel, PanelMetric

# Workspace and project that the dashboard panels are attached to
ws = Workspace("workspace")
project = ws.create_project("nyc-taxi-monitoring")
project.description = "Monitoring fare amount median and row counts"

# Panel definitions, listed in the order they are added to the dashboard

# 1. Title
title_panel = text_panel(title="🚕 NYC Taxi Dashboard")

# 2. Fare amount median (quantile 0.5)
median_fare_series = PanelMetric(
    metric="QuantileValue",  # must match metric name
    metric_labels={"column": "fare_amount", "quantile": "0.5"},
    legend="Median Fare",
)
median_fare_panel = line_plot_panel(
    title="fare_amount median (q=0.5)",
    values=[median_fare_series],
    size="half",
)

# 3. Row count
row_count_series = PanelMetric(metric="RowCount", legend="Total rows")
row_count_panel = line_plot_panel(
    title="Row Count",
    values=[row_count_series],
    size="half",
)

for panel in (title_panel, median_fare_panel, row_count_panel):
    project.dashboard.add_panel(panel)

# Save project/dashboard
project.save()

In [9]:
# Log one run per calendar day of March 2024, timestamped at that day's midnight.
for start in pd.date_range("2024-03-01", "2024-03-31", freq="D"):
    end = start + pd.Timedelta(days=1)
    # Half-open window [start, end) so each pickup lands in exactly one day
    picked_up_today = df.lpep_pickup_datetime.between(start, end, inclusive="left")
    daily_df = df[picked_up_today]

    if daily_df.empty:
        continue  # skip empty days

    daily_metrics = [
        RowCount(),
        QuantileValue(column="fare_amount", quantile=0.5),
    ]
    daily_report = Report(metrics=daily_metrics)
    snapshot = daily_report.run(current_data=daily_df, timestamp=start)

    ws.add_run(project.id, snapshot)
    print(f"📅 Logged run for {start.date()}, rows: {len(daily_df)}")


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



📅 Logged run for 2024-03-01, rows: 2095
📅 Logged run for 2024-03-02, rows: 1638
📅 Logged run for 2024-03-03, rows: 1460
📅 Logged run for 2024-03-04, rows: 1913
📅 Logged run for 2024-03-05, rows: 1987



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



📅 Logged run for 2024-03-06, rows: 2286
📅 Logged run for 2024-03-07, rows: 2150
📅 Logged run for 2024-03-08, rows: 2085
📅 Logged run for 2024-03-09, rows: 1769
📅 Logged run for 2024-03-10, rows: 1430
📅 Logged run for 2024-03-11, rows: 1839



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



📅 Logged run for 2024-03-12, rows: 1907
📅 Logged run for 2024-03-13, rows: 2086
📅 Logged run for 2024-03-14, rows: 2108
📅 Logged run for 2024-03-15, rows: 2036
📅 Logged run for 2024-03-16, rows: 1684
📅 Logged run for 2024-03-17, rows: 1426



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



📅 Logged run for 2024-03-18, rows: 1880
📅 Logged run for 2024-03-19, rows: 1985
📅 Logged run for 2024-03-20, rows: 2052
📅 Logged run for 2024-03-21, rows: 2119
📅 Logged run for 2024-03-22, rows: 1953
📅 Logged run for 2024-03-23, rows: 1389



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



📅 Logged run for 2024-03-24, rows: 1383
📅 Logged run for 2024-03-25, rows: 1801
📅 Logged run for 2024-03-26, rows: 1947
📅 Logged run for 2024-03-27, rows: 2035
📅 Logged run for 2024-03-28, rows: 2144
📅 Logged run for 2024-03-29, rows: 1836



invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



📅 Logged run for 2024-03-30, rows: 1549
📅 Logged run for 2024-03-31, rows: 1475
