# NYC Taxi ML Pipeline - Interactive Analysis

## Using Spark Connect with Pandas API on Spark

This notebook demonstrates:
1. Connecting to Spark via Spark Connect
2. Loading NYC TLC taxi data from MinIO
3. Feature engineering with the Spark DataFrame API (`pyspark.sql.functions`)
4. Training CatBoost models per borough
5. Generating 7-day revenue/trip predictions

## 1. Setup & Connection

In [None]:
# Install dependencies if needed (%pip installs into the running kernel's environment)
# %pip install --quiet catboost plotly kafka-python boto3

In [None]:
import os
import sys
from datetime import datetime, timedelta

# Spark imports
from pyspark.sql import SparkSession
import pyspark.pandas as ps
from pyspark.sql import functions as F

# ML imports
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Visualization
import plotly.express as px
import plotly.graph_objects as go

# Report the client library version straight from the package. Shelling out to
# the `pyspark` launcher is slow, depends on PATH, leaves the pipe unclosed and
# prints the whole start-up banner instead of just the version.
import pyspark

print(f"PySpark version: {pyspark.__version__}")

### Connect to Spark Connect (Standalone backend)

In [None]:
# Spark Connect endpoint; each part can be overridden through the environment
CONNECT_HOST = os.getenv("SPARK_CONNECT_HOST", "scenario1-spark-35-connect")
CONNECT_PORT = os.getenv("SPARK_CONNECT_PORT", "15002")
CONNECT_URL = f"sc://{CONNECT_HOST}:{CONNECT_PORT}"

# MinIO (S3-compatible) endpoint and credentials, also read from the environment.
# NOTE(review): the fallback credentials are hard-coded below; rely on the
# environment variables for anything other than a local/demo setup.
MINIO_ENDPOINT = os.getenv("S3_ENDPOINT", "http://minio.spark-infra.svc.cluster.local:9000")
MINIO_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID", "minioadmin")
MINIO_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "minioadmin")

# Echo the resolved endpoints (never the secrets) so the reader can see the targets
print(f"Connecting to: {CONNECT_URL}")
print(f"MinIO: {MINIO_ENDPOINT}")

In [None]:
# Create Spark session via Spark Connect
# The S3A settings point the session at the MinIO endpoint configured above.
spark = (
    SparkSession.builder
    .remote(CONNECT_URL)
    .appName("nyc-taxi-exploration")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# A Spark Connect (remote) session has no client-side SparkContext, so
# `spark.sparkContext.applicationId` raises here. Report the server's Spark
# version instead, which also proves the connection round-trips.
print(f"Spark session created (Spark {spark.version})")
spark

In [None]:
# Configure Pandas API on Spark
# Sets the default index type that pandas-on-Spark attaches to frames it creates.
# NOTE(review): presumably "distributed" is chosen to avoid a global ordering
# pass when building the index — confirm against the pandas-on-Spark options docs.
# None of the cells shown here use `ps` after this call.
ps.set_option("compute.default_index_type", "distributed")

## 2. Data Exploration

In [None]:
# Read every raw TLC parquet file stored under the MinIO bucket prefix
raw_path = "s3a://nyc-taxi/raw/*.parquet"
print(f"Loading data from {raw_path}...")

df_raw = spark.read.format("parquet").load(raw_path)

# Show the column names and types Spark inferred from the parquet metadata
print("Schema:")
df_raw.printSchema()

In [None]:
# Row count plus a small untruncated sample of the raw data
total_records = df_raw.count()
print(f"Total records: {total_records:,}")

print("\nSample data:")
df_raw.show(n=5, truncate=False)

In [None]:
# Data quality check: keep trips with positive distance/fare/total whose pickup
# falls in [2023-01-01, 2023-04-01)
has_positive_amounts = (
    (F.col("trip_distance") > 0)
    & (F.col("fare_amount") > 0)
    & (F.col("total_amount") > 0)
)
picked_up_in_window = (
    (F.col("tpep_pickup_datetime") >= "2023-01-01")
    & (F.col("tpep_pickup_datetime") < "2023-04-01")
)

df_clean = df_raw.where(has_positive_amounts & picked_up_in_window)

clean_count = df_clean.count()
print(f"Clean records: {clean_count:,}")

In [None]:
# Daily trip statistics: one row per pickup date, ordered chronologically
trips_with_date = df_clean.withColumn("pickup_date", F.to_date("tpep_pickup_datetime"))

daily_metrics = [
    F.count("*").alias("trip_count"),
    F.sum("total_amount").alias("total_revenue"),
    F.avg("trip_distance").alias("avg_distance"),
]

daily_stats = (
    trips_with_date
    .groupBy("pickup_date")
    .agg(*daily_metrics)
    .orderBy("pickup_date")
)

# The aggregate is small (one row per day), so pulling it to pandas is cheap
daily_stats_pd = daily_stats.toPandas()
daily_stats_pd.head(10)

In [None]:
# Visualize daily trends: a single line of trip counts over pickup date
daily_trips_trace = go.Scatter(
    x=daily_stats_pd['pickup_date'],
    y=daily_stats_pd['trip_count'],
    mode='lines',
    name='Daily Trips',
)

fig = go.Figure(data=[daily_trips_trace])
fig.update_layout(
    title='NYC Taxi Daily Trip Count (Jan-Mar 2023)',
    xaxis_title='Date',
    yaxis_title='Trip Count',
)
fig.show()

## 3. Feature Engineering

In [None]:
def add_temporal_features(df, holidays=None):
    """Add calendar-derived feature columns to a trip-level Spark DataFrame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain a ``tpep_pickup_datetime`` column.
    holidays : iterable of str or datetime.date, optional
        Dates (``"YYYY-MM-DD"``) flagged by ``is_holiday``. When omitted, the
        built-in 2023 US holiday list is used, so existing callers are
        unaffected. Pass an explicit calendar when the data spans other years.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` plus ``pickup_date``, ``hour_of_day``, ``day_of_week``,
        ``month``, ``is_weekend``, ``is_rush_hour`` and ``is_holiday``.
    """
    if holidays is None:
        # US Holidays 2023
        holidays = (
            "2023-01-01", "2023-01-16", "2023-02-20", "2023-05-29", "2023-06-19",
            "2023-07-04", "2023-09-04", "2023-10-09", "2023-11-11", "2023-11-23",
            "2023-12-25",
        )
    # str() leaves ISO strings untouched and renders datetime.date as YYYY-MM-DD
    holiday_dates = [str(day) for day in holidays]

    hour = F.col("hour_of_day")
    in_rush_window = hour.between(7, 9) | hour.between(17, 19)

    df = (
        df
        .withColumn("pickup_date", F.to_date("tpep_pickup_datetime"))
        .withColumn("hour_of_day", F.hour("tpep_pickup_datetime"))
        .withColumn("day_of_week", F.dayofweek("tpep_pickup_datetime"))
        .withColumn("month", F.month("tpep_pickup_datetime"))
        # Spark's dayofweek numbers Sunday as 1 and Saturday as 7
        .withColumn("is_weekend", F.when(F.col("day_of_week").isin([1, 7]), 1).otherwise(0))
        .withColumn("is_rush_hour", F.when(in_rush_window, 1).otherwise(0))
        .withColumn("is_holiday", F.when(F.col("pickup_date").isin(holiday_dates), 1).otherwise(0))
    )

    return df

def add_geospatial_features(df):
    """Attach a pickup borough label, trip duration and average speed.

    Adds three columns to ``df``:
    ``pickup_borough`` (a simplified, range-based lookup on ``PULocationID``,
    ``"Unknown"`` when no range matches), ``trip_duration`` (dropoff minus
    pickup, in minutes) and ``avg_speed`` (``trip_distance`` per hour of
    ``trip_duration``, or 0 when the duration is not positive).
    """
    # Simplified borough mapping: inclusive PULocationID ranges, checked in order
    borough_ranges = [
        ((1, 10), "Staten Island"),
        ((11, 50), "Brooklyn"),
        ((51, 150), "Queens"),
        ((151, 220), "Manhattan"),
        ((221, 265), "Bronx"),
    ]
    location_id = F.col("PULocationID")
    borough_expr = None
    for (low, high), borough_name in borough_ranges:
        in_range = location_id.between(low, high)
        if borough_expr is None:
            borough_expr = F.when(in_range, borough_name)
        else:
            borough_expr = borough_expr.when(in_range, borough_name)
    df = df.withColumn("pickup_borough", borough_expr.otherwise("Unknown"))

    # Trip duration in minutes (timestamps are converted to epoch seconds first)
    elapsed_seconds = (
        F.unix_timestamp("tpep_dropoff_datetime") - F.unix_timestamp("tpep_pickup_datetime")
    )
    df = df.withColumn("trip_duration", elapsed_seconds / 60)

    # Average speed: distance divided by duration in hours, guarded against
    # non-positive durations
    duration_hours = F.col("trip_duration") / 60
    speed = F.when(
        F.col("trip_duration") > 0, F.col("trip_distance") / duration_hours
    ).otherwise(0)
    df = df.withColumn("avg_speed", speed)

    return df

In [None]:
# Run both feature builders over the cleaned trips (temporal first, then geospatial)
df_features = add_geospatial_features(add_temporal_features(df_clean))

# Preview a handful of the derived columns
preview_columns = [
    "pickup_date", "hour_of_day", "day_of_week", "is_weekend", "is_rush_hour",
    "pickup_borough", "trip_distance", "trip_duration", "avg_speed",
]
print("Features added:")
df_features.select(*preview_columns).show(5)

In [None]:
# Aggregate to daily by borough for ML
daily_borough = (
    df_features
    .groupBy("pickup_date", "pickup_borough")
    .agg(
        F.sum("total_amount").alias("daily_revenue"),
        F.count("*").alias("daily_trips"),
        F.avg("trip_distance").alias("avg_distance"),
        F.avg("trip_duration").alias("avg_duration"),
        # is_weekend / is_holiday are constant within a day. max() keeps them
        # as 0/1 flags, whereas sum() would turn them into a trip count that
        # simply duplicates daily_trips on flagged days.
        F.max("is_weekend").alias("is_weekend"),
        F.max("is_holiday").alias("is_holiday"),
    )
    .orderBy("pickup_date", "pickup_borough")
)

# One row per (day, borough): small enough to collect into pandas for training
daily_borough_pd = daily_borough.toPandas()
print(f"Daily borough aggregates: {len(daily_borough_pd)} rows")
daily_borough_pd.head(10)

## 4. Model Training (CatBoost)

In [None]:
# Prepare features for training
# Create 7-day ahead targets: for each borough and day t, the target is the
# total over days t+1 .. t+7.
FORECAST_HORIZON_DAYS = 7
target_sources = {
    'revenue_7d_ahead': 'daily_revenue',
    'trips_7d_ahead': 'daily_trips',
}

daily_borough_pd = daily_borough_pd.sort_values(['pickup_borough', 'pickup_date'])


def _sum_of_next_days(series, horizon=FORECAST_HORIZON_DAYS):
    """Return, for each row, the sum of the following `horizon` rows.

    Rolling first (full windows only) and shifting afterwards leaves NaN
    wherever fewer than `horizon` future rows exist. Shifting first and rolling
    with min_periods=1 would instead emit truncated partial sums for the first
    rows and for the rows just before the end, which are wrong targets.
    Assumes one row per consecutive day within the series.
    """
    return series.rolling(horizon).sum().shift(-horizon)


by_borough = daily_borough_pd.groupby('pickup_borough', sort=False)
for target_col, source_col in target_sources.items():
    daily_borough_pd[target_col] = by_borough[source_col].transform(_sum_of_next_days)

# Drop rows with NaN targets (the last 7 days of every borough have no full
# look-ahead window). NOTE: this cell rebinds daily_borough_pd, so re-running
# it without re-running the aggregation cell trims another 7 days per borough.
daily_borough_pd = daily_borough_pd.dropna()

print(f"Training data: {len(daily_borough_pd)} rows")
daily_borough_pd.head()

In [None]:
# Feature columns
feature_cols = ['daily_revenue', 'daily_trips', 'avg_distance', 'avg_duration', 'is_weekend', 'is_holiday']

# Split / training settings
TEST_FRACTION = 0.2        # most recent share of days, used only for the reported metrics
VALIDATION_FRACTION = 0.2  # most recent share of the remaining days, used for early stopping
MIN_ROWS_PER_BOROUGH = 20
RANDOM_SEED = 42


def _fit_catboost(X_fit, y_fit, X_val, y_val):
    """Fit one CatBoost RMSE regressor, early-stopped on the validation rows."""
    model = CatBoostRegressor(
        iterations=500,
        depth=6,
        learning_rate=0.05,
        loss_function='RMSE',
        random_seed=RANDOM_SEED,  # make repeated runs reproducible
        verbose=False
    )
    model.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)
    return model


# Train models for each borough
models = {}
results = []

for borough in ['Manhattan', 'Brooklyn', 'Queens']:
    print(f"\nTraining models for {borough}...")

    # Sort chronologically so the positional splits below follow time order
    borough_data = (
        daily_borough_pd[daily_borough_pd['pickup_borough'] == borough]
        .sort_values('pickup_date')
    )

    if len(borough_data) < MIN_ROWS_PER_BOROUGH:
        print(f"  Skipping {borough} - insufficient data")
        continue

    X = borough_data[feature_cols]
    y = borough_data[['revenue_7d_ahead', 'trips_7d_ahead']]

    # Chronological hold-out: the rows are a time series with overlapping
    # 7-day targets, so a shuffled split would let the model train on days
    # that come after the days it is evaluated on.
    X_dev, X_test, y_dev, y_test = train_test_split(
        X, y, test_size=TEST_FRACTION, shuffle=False
    )
    # Early stopping picks the best iteration on its own validation slice;
    # reusing the test rows for that would bias the metrics reported below.
    X_train, X_val, y_train, y_val = train_test_split(
        X_dev, y_dev, test_size=VALIDATION_FRACTION, shuffle=False
    )

    y_rev_test = y_test['revenue_7d_ahead']
    y_trip_test = y_test['trips_7d_ahead']

    # Train revenue and trips models with identical settings
    model_revenue = _fit_catboost(
        X_train, y_train['revenue_7d_ahead'], X_val, y_val['revenue_7d_ahead']
    )
    model_trips = _fit_catboost(
        X_train, y_train['trips_7d_ahead'], X_val, y_val['trips_7d_ahead']
    )

    # Evaluate on the untouched, most recent rows
    pred_revenue = model_revenue.predict(X_test)
    pred_trips = model_trips.predict(X_test)

    mape_rev = mean_absolute_percentage_error(y_rev_test, pred_revenue)
    mape_trips = mean_absolute_percentage_error(y_trip_test, pred_trips)
    rmse_rev = np.sqrt(mean_squared_error(y_rev_test, pred_revenue))
    rmse_trips = np.sqrt(mean_squared_error(y_trip_test, pred_trips))

    print(f"  Revenue - MAPE: {mape_rev:.4f}, RMSE: {rmse_rev:.2f}")
    print(f"  Trips   - MAPE: {mape_trips:.4f}, RMSE: {rmse_trips:.2f}")

    models[borough] = {
        'revenue': model_revenue,
        'trips': model_trips
    }

    results.append({
        'borough': borough,
        'mape_revenue': mape_rev,
        'mape_trips': mape_trips,
        'rmse_revenue': rmse_rev,
        'rmse_trips': rmse_trips
    })

results_df = pd.DataFrame(results)
results_df

In [None]:
# Feature importance: compare the two Manhattan models side by side
# (nothing is drawn when no Manhattan model was trained)
manhattan_models = models.get('Manhattan')
if manhattan_models is not None:
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance_revenue': manhattan_models['revenue'].feature_importances_,
        'importance_trips': manhattan_models['trips'].feature_importances_,
    })

    fig = go.Figure()
    for trace_name, importance_col in (
        ('Revenue Model', 'importance_revenue'),
        ('Trips Model', 'importance_trips'),
    ):
        fig.add_trace(
            go.Bar(name=trace_name, x=importance_df['feature'], y=importance_df[importance_col])
        )
    fig.update_layout(title='Feature Importance (Manhattan)', barmode='group')
    fig.show()

## 5. Generate 7-Day Forecast

In [None]:
# Generate forecast for the 7 days that follow the latest available feature row
FORECAST_DAYS = 7           # must match the horizon the targets were built with
REVENUE_DAILY_TREND = 0.02  # simple linear day-over-day growth for revenue
TRIPS_DAILY_TREND = 0.01    # simple linear day-over-day growth for trips


def _daily_shares(n_days, daily_trend):
    """Return n_days weights summing to 1 that grow linearly by daily_trend per day."""
    weights = 1 + daily_trend * np.arange(n_days)
    return weights / weights.sum()


forecasts = []
for borough, borough_models in models.items():
    borough_rows = (
        daily_borough_pd[daily_borough_pd['pickup_borough'] == borough]
        .sort_values('pickup_date')
    )
    if borough_rows.empty:
        continue

    # Use the latest day's features, kept as a one-row DataFrame so the numeric
    # dtypes and column names match what the models were trained on.
    latest = borough_rows.iloc[[-1]]
    X_forecast = latest[feature_cols]

    # The models predict 7-day totals; predict once, outside the per-day loop
    total_revenue = float(borough_models['revenue'].predict(X_forecast)[0])
    total_trips = float(borough_models['trips'].predict(X_forecast)[0])

    # The totals cover the days right after the feature row, so date the
    # forecast from the data rather than from the wall clock.
    first_day = pd.Timestamp(latest['pickup_date'].iloc[0]) + pd.Timedelta(days=1)
    forecast_dates = pd.date_range(start=first_day, periods=FORECAST_DAYS, freq='D')

    # Spread each total across the days with the simple trend, so the daily
    # values add back up to the model's prediction instead of each day
    # carrying the whole 7-day total.
    revenue_shares = _daily_shares(FORECAST_DAYS, REVENUE_DAILY_TREND)
    trips_shares = _daily_shares(FORECAST_DAYS, TRIPS_DAILY_TREND)

    for forecast_date, revenue_share, trips_share in zip(forecast_dates, revenue_shares, trips_shares):
        forecasts.append({
            'forecast_date': forecast_date.strftime('%Y-%m-%d'),
            'borough': borough,
            'predicted_revenue': total_revenue * revenue_share,
            'predicted_trips': total_trips * trips_share
        })

# Explicit columns keep the frame usable downstream even when no model was trained
forecasts_df = pd.DataFrame(
    forecasts,
    columns=['forecast_date', 'borough', 'predicted_revenue', 'predicted_trips']
)
forecasts_df.head(15)

In [None]:
# Visualize forecast: grouped bars, one bar per borough for each forecast date
forecast_bar_options = dict(
    x='forecast_date',
    y='predicted_revenue',
    color='borough',
    barmode='group',
    title='7-Day Revenue Forecast by Borough',
)
fig = px.bar(forecasts_df, **forecast_bar_options)
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Predicted Revenue ($)',
)
fig.show()

## 6. Save Results to MinIO

In [None]:
# Save forecasts to MinIO
from io import BytesIO
import boto3

# S3 client pointed at the MinIO endpoint; later cells reuse `s3`
minio_client_options = {
    'endpoint_url': MINIO_ENDPOINT,
    'aws_access_key_id': MINIO_ACCESS_KEY,
    'aws_secret_access_key': MINIO_SECRET_KEY,
}
s3 = boto3.client('s3', **minio_client_options)

# Serialise the forecast table to CSV text (no index column) and upload it
# as a single object
csv_buffer = forecasts_df.to_csv(index=False)
forecast_object = {
    'Bucket': 'nyc-taxi',
    'Key': 'predictions/notebook_forecast.csv',
    'Body': csv_buffer,
}
s3.put_object(**forecast_object)
print("Saved forecasts to s3a://nyc-taxi/predictions/notebook_forecast.csv")

In [None]:
# Save models
import pickle

MODEL_BUCKET = 'ml-models'

for borough, borough_models in models.items():
    # Build the object key once so the upload and the log line cannot disagree
    # (the original message skipped the space-to-underscore replacement).
    model_key = f'taxi-predictor/notebook/{borough.lower().replace(" ", "_")}.pkl'

    # Each object holds the borough's {'revenue': ..., 'trips': ...} model pair.
    # NOTE: unpickling runs arbitrary code; only load these from a trusted bucket.
    model_bytes = pickle.dumps(borough_models)
    s3.put_object(
        Bucket=MODEL_BUCKET,
        Key=model_key,
        Body=model_bytes
    )
    print(f"Saved {borough} model to s3a://{MODEL_BUCKET}/{model_key}")

## Summary

This notebook demonstrated:
1. ✅ Connected to Spark via Spark Connect
2. ✅ Loaded NYC TLC taxi data from MinIO (3 months)
3. ✅ Created temporal and geospatial features
4. ✅ Trained CatBoost models per borough
5. ✅ Generated 7-day revenue/trip forecasts
6. ✅ Saved predictions and models to MinIO

### Next Steps
- Load more data (full 2 years)
- Add more features (weather, events)
- Deploy models via Airflow DAG
- Set up monitoring and retraining