# 04 - ML Pipeline: Exploration, Feature Engineering & Classification

**Complete Machine Learning Workflow** using Silver_ML data:

1. **SQL Exploration**: Analyze flight patterns and statistics
2. **Feature Engineering**: Create ML-ready features with window functions
3. **Classification**: Random Forest for flight phase prediction

**Note**: Silver_ML data is generated by `02_Unified_Pipeline`. Make sure that pipeline is running and has already written Silver_ML data before executing this notebook.

In [None]:
import os
import sys
import logging
from typing import List, Optional

import pandas as pd
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, lag, avg, stddev, row_number, when, sqrt, pow, lit, min as spark_min,
    sum as spark_sum, count, broadcast
)
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sys.path.insert(0, '/home/jovyan/work')
from config import get_s3_path, create_spark_session
from dotenv import load_dotenv

# Notebook-wide logging: INFO level, "<time> - <level> - <message>" layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# load_dotenv() is invoked before the storage paths below are resolved.
load_dotenv()

# Storage locations, each resolved through the project's get_s3_path helper.
SILVER_PATH: str = get_s3_path("silver", "flights")
SILVER_ML_PATH: str = get_s3_path("silver", "flights_ml")
GOLD_MODEL_PATH: str = get_s3_path("gold", "models", "rf_flight_phase")
GOLD_PREDICTIONS_PATH: str = get_s3_path("gold", "predictions", "flight_phase")
# Local CSV location; not referenced by any other cell visible in this notebook.
AIRPORTS_CSV: str = "./data/airports.csv"

# Echo every resolved location so a misconfigured path is visible up front.
for _label, _location in (
    ("Silver", SILVER_PATH),
    ("Silver_ML", SILVER_ML_PATH),
    ("Model", GOLD_MODEL_PATH),
    ("Predictions", GOLD_PREDICTIONS_PATH),
):
    logger.info(f"{_label}: {_location}")

In [None]:
# Build the session through the project helper (create_spark_session lives in
# the config module, which is not shown here); arguments are passed as-is.
_mllib_packages: List[str] = ["org.apache.spark:spark-mllib_2.12:3.5.3"]
spark: SparkSession = create_spark_session(
    "MLPipeline",
    extra_packages=_mllib_packages,
    shuffle_partitions=6,
)

logger.info("Spark session initialized")

## Part 1: SQL Exploration

Analyze flight patterns using SparkSQL

In [None]:
# Read the Silver Delta table and expose it to SparkSQL as the "flights" view
# used by the exploration queries that follow.
silver_reader = spark.read.format("delta")
df_silver: DataFrame = silver_reader.load(SILVER_PATH)
df_silver.createOrReplaceTempView("flights")

# count() is a Spark action, so this line runs a job against the table.
silver_row_count = df_silver.count()
logger.info(f"Loaded {silver_row_count:,} records from Silver layer")

In [None]:
# Print the Silver DataFrame's schema so the columns referenced by the
# queries below can be checked against what is actually present.
df_silver.printSchema()

In [None]:
# Per-country summary: observation and distinct-aircraft counts plus
# altitude/velocity statistics, for the 15 countries with the most rows.
country_stats_query = """
    SELECT 
        origin_country,
        COUNT(*) AS observations,
        COUNT(DISTINCT icao24) AS unique_aircraft,
        ROUND(AVG(altitude_meters), 2) AS avg_altitude_m,
        ROUND(AVG(velocity_kmh), 2) AS avg_velocity_kmh,
        ROUND(MIN(velocity_kmh), 2) AS min_velocity,
        ROUND(MAX(velocity_kmh), 2) AS max_velocity
    FROM flights
    WHERE origin_country IS NOT NULL
    GROUP BY origin_country
    ORDER BY observations DESC
    LIMIT 15
"""
country_stats = spark.sql(country_stats_query)
country_stats.show(truncate=False)

In [None]:
# Ground vs. in-flight breakdown of the Silver observations.
#
# The grouping key is the derived status itself (the CASE expression is
# repeated in GROUP BY rather than referenced by alias, so it cannot be
# shadowed by a same-named table column). Grouping on the raw on_ground
# column would put NULL values in their own group while the ELSE branch
# labelled them 'IN_FLIGHT', producing two rows with the same status label.
# NULLs are reported in an explicit 'UNKNOWN' bucket instead.
spark.sql("""
    SELECT
        CASE
            WHEN on_ground IS NULL THEN 'UNKNOWN'
            WHEN on_ground = true THEN 'ON_GROUND'
            ELSE 'IN_FLIGHT'
        END AS status,
        COUNT(*) AS count,
        COUNT(DISTINCT icao24) AS unique_aircraft,
        ROUND(AVG(velocity_kmh), 2) AS avg_velocity,
        ROUND(AVG(altitude_meters), 2) AS avg_altitude
    FROM flights
    GROUP BY
        CASE
            WHEN on_ground IS NULL THEN 'UNKNOWN'
            WHEN on_ground = true THEN 'ON_GROUND'
            ELSE 'IN_FLIGHT'
        END
""").show(truncate=False)

In [None]:
# Altitude histogram: bucket every observation into a fixed altitude range
# (NULL altitudes get their own bucket) and report each bucket's share of
# all rows via a window SUM over the grouped counts.
altitude_distribution_query = """
    SELECT 
        CASE 
            WHEN altitude_meters IS NULL THEN 'NULL'
            WHEN altitude_meters < 1000 THEN '0-1000m'
            WHEN altitude_meters < 5000 THEN '1000-5000m'
            WHEN altitude_meters < 10000 THEN '5000-10000m'
            WHEN altitude_meters < 15000 THEN '10000-15000m'
            ELSE '15000m+'
        END AS altitude_range,
        COUNT(*) AS count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) AS percentage
    FROM flights
    GROUP BY altitude_range
    ORDER BY count DESC
"""
altitude_distribution = spark.sql(altitude_distribution_query)
altitude_distribution.show(truncate=False)

In [None]:
# Trajectory view built with window functions, per aircraft (icao24) ordered
# by event time: previous altitude and its delta, a rolling altitude average
# over the current row and the 5 preceding ones, and a running observation
# number. velocity_rank_in_country is ranked within origin_country instead.
trajectory_query = """
    SELECT
        icao24,
        callsign,
        origin_country,
        CAST(event_timestamp AS STRING) AS timestamp,
        altitude_meters,
        LAG(altitude_meters) OVER (PARTITION BY icao24 ORDER BY event_timestamp) AS prev_altitude,
        CAST(altitude_meters - LAG(altitude_meters) OVER (PARTITION BY icao24 ORDER BY event_timestamp) AS INT) AS altitude_change,
        velocity_kmh,
        ROUND(AVG(altitude_meters) OVER (PARTITION BY icao24 ORDER BY event_timestamp ROWS BETWEEN 5 PRECEDING AND CURRENT ROW), 2) AS rolling_avg_altitude,
        RANK() OVER (PARTITION BY origin_country ORDER BY velocity_kmh DESC) AS velocity_rank_in_country,
        ROW_NUMBER() OVER (PARTITION BY icao24 ORDER BY event_timestamp) AS observation_num
    FROM flights
    WHERE icao24 IS NOT NULL AND altitude_meters IS NOT NULL AND event_timestamp IS NOT NULL
    ORDER BY icao24, event_timestamp
    LIMIT 25
"""
trajectory_sample = spark.sql(trajectory_query)
trajectory_sample.show(truncate=False)

In [None]:
# Show ten raw Silver rows restricted to the columns used in this notebook.
preview_columns = [
    "icao24", "callsign", "origin_country",
    "latitude", "longitude",
    "altitude_meters", "velocity_kmh",
    "on_ground", "event_timestamp",
]
silver_preview = df_silver.select(*preview_columns).limit(10)
silver_preview.show(truncate=False)

## Part 2: Check Silver_ML Availability

Verify that the unified pipeline has generated Silver_ML features

In [None]:
# Try to load the ML-enriched Delta table. Any failure while loading or
# counting marks the data as unavailable (has_ml_data = False) so that the
# later cells, which all check this flag, skip their work.
try:
    ml_reader = spark.read.format("delta")
    df_ml: DataFrame = ml_reader.load(SILVER_ML_PATH)
    has_ml_data: bool = True
    ml_row_count = df_ml.count()
    logger.info(f"Loaded {ml_row_count:,} ML-enriched records from Silver_ML")
except Exception as load_error:
    has_ml_data: bool = False
    logger.warning(f"Silver_ML not available: {load_error}")

In [None]:
if has_ml_data:
    # The query below reads from a view named "flights_ml". No other cell
    # registers it (only "flights" is registered for the Silver table), so
    # it is created here; without it the query fails with a
    # table-or-view-not-found error.
    df_ml.createOrReplaceTempView("flights_ml")

    # Class balance of the target: rows, distinct aircraft and share per phase.
    spark.sql("""
        SELECT 
            flight_phase,
            COUNT(*) AS count,
            COUNT(DISTINCT icao24) AS unique_aircraft,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) AS percentage
        FROM flights_ml
        WHERE flight_phase IS NOT NULL
        GROUP BY flight_phase
        ORDER BY count DESC
    """).show(truncate=False)

    # Peek at ten labelled rows with the engineered feature columns.
    df_ml.select(
        "icao24", "flight_phase", "altitude_meters", "velocity_kmh",
        "altitude_change", "velocity_change", "rolling_avg_altitude",
        "rolling_std_altitude", "rolling_avg_velocity"
    ).filter(col("flight_phase").isNotNull()).limit(10).show(truncate=False)
else:
    logger.warning("Skipping ML training (Silver_ML not available)")

## Part 3: Feature Engineering & Preparation

Prepare features for Random Forest classification

In [None]:
if has_ml_data:
    # Numeric input columns handed to the VectorAssembler in Part 4.
    FEATURE_COLUMNS: List[str] = [
        "altitude_meters", "velocity_kmh",
        "altitude_change", "velocity_change",
        "observation_rank",
        "rolling_avg_altitude", "rolling_std_altitude", "rolling_avg_velocity",
    ]

    logger.info(f"Using {len(FEATURE_COLUMNS)} features for classification")

    # Keep only rows that have a label plus the two base measurements.
    rows_are_usable = col("flight_phase").isNotNull()
    for _required in ("altitude_meters", "velocity_kmh"):
        rows_are_usable = rows_are_usable & col(_required).isNotNull()

    # Cached because the following cells call count() on it repeatedly.
    df_train_raw = df_ml.filter(rows_are_usable).cache()

    training_row_count = df_train_raw.count()
    logger.info(f"Prepared {training_row_count:,} training records")

## Part 4: Random Forest Classification

Train and evaluate Random Forest model for flight phase prediction

In [None]:
if has_ml_data and df_train_raw.count() > 0:
    # Stage 0 - label encoding. The indexer is fitted exactly once, and the
    # resulting model is used both as the first pipeline stage and as the
    # source of labels for IndexToString. If the pipeline re-fitted the
    # indexer on the train split, its frequency-based label ordering could
    # differ from the ordering handed to IndexToString, and predicted_label
    # would then decode to the wrong flight phase.
    label_indexer = StringIndexer(
        inputCol="flight_phase",
        outputCol="label",
        handleInvalid="skip"
    )
    label_indexer_model = label_indexer.fit(df_train_raw)

    # Stage 1 - assemble the numeric feature columns into one vector; rows
    # with invalid (e.g. null) feature values are dropped.
    vector_assembler = VectorAssembler(
        inputCols=FEATURE_COLUMNS,
        outputCol="features_raw",
        handleInvalid="skip"
    )

    # Stage 2 - scale to unit standard deviation without centering.
    scaler = StandardScaler(
        inputCol="features_raw",
        outputCol="features",
        withStd=True,
        withMean=False
    )

    # Stage 3 - the classifier. RandomForestClassifier has no numPartitions
    # parameter; passing one makes the constructor raise TypeError, so it is
    # not set here.
    rf_classifier = RandomForestClassifier(
        labelCol="label",
        featuresCol="features",
        numTrees=100,
        maxDepth=12,
        maxBins=32,
        seed=42
    )

    # Stage 4 - map numeric predictions back to phase names using the same
    # label ordering as stage 0.
    label_converter = IndexToString(
        inputCol="prediction",
        outputCol="predicted_label",
        labels=label_indexer_model.labels
    )

    # Stage order is relied on later: model.stages[3] is the random forest.
    pipeline = Pipeline(stages=[
        label_indexer_model,
        vector_assembler,
        scaler,
        rf_classifier,
        label_converter
    ])

    logger.info("ML pipeline constructed")

In [None]:
if has_ml_data and df_train_raw.count() > 0:
    # Seeded 80/20 split of the prepared rows into train and test sets.
    split_frames = df_train_raw.randomSplit([0.8, 0.2], seed=42)
    train_df, test_df = split_frames

    train_rows = train_df.count()
    test_rows = test_df.count()
    logger.info(f"Train: {train_rows:,} | Test: {test_rows:,}")

    # Fit every pipeline stage on the training split only.
    model = pipeline.fit(train_df)
    logger.info("Random Forest trained successfully")

In [None]:
if has_ml_data and df_train_raw.count() > 0:
    # Score the held-out split; `predictions` is reused when saving in Part 5.
    predictions = model.transform(test_df)

    # Both evaluators compare the same label/prediction columns.
    shared_eval_args = {"labelCol": "label", "predictionCol": "prediction"}
    evaluator_accuracy = MulticlassClassificationEvaluator(
        metricName="accuracy", **shared_eval_args
    )
    evaluator_f1 = MulticlassClassificationEvaluator(
        metricName="f1", **shared_eval_args
    )

    accuracy = evaluator_accuracy.evaluate(predictions)
    f1_score = evaluator_f1.evaluate(predictions)

    logger.info(f"Model Accuracy: {accuracy:.4f}")
    logger.info(f"Model F1 Score: {f1_score:.4f}")

    # Show 20 rows of true phase, decoded prediction and class probabilities.
    sample_columns = ["flight_phase", "predicted_label", "probability"]
    predictions.select(*sample_columns).show(20, truncate=False)

In [None]:
if has_ml_data and df_train_raw.count() > 0:
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # Re-score the test split for this cell's own metric table.
    predictions_eval = model.transform(test_df)

    # One evaluator per metric, keyed by the metric's name.
    metric_names = ("accuracy", "f1", "weightedPrecision", "weightedRecall")
    evaluators: dict = {
        metric: MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName=metric
        )
        for metric in metric_names
    }

    # Compute every metric first, then log them in a second pass.
    metrics_dict: dict = {}
    for name, evaluator in evaluators.items():
        metrics_dict[name] = evaluator.evaluate(predictions_eval)

    for metric_name in metrics_dict:
        metric_value = metrics_dict[metric_name]
        logger.info(f"{metric_name}: {metric_value:.4f}")

In [None]:
if has_ml_data and df_train_raw.count() > 0:
    # Index 3 is the position of the random forest in the pipeline's stage list.
    rf_model = model.stages[3]
    importances = rf_model.featureImportances.toArray()

    # Pair each feature name with its importance, highest importance first.
    feature_importance_df = (
        pd.DataFrame({"feature": FEATURE_COLUMNS, "importance": importances})
        .sort_values("importance", ascending=False)
    )

    logger.info("Feature Importance (Top 10):")
    top_features = feature_importance_df.head(10)
    for feature_name, score in zip(top_features["feature"], top_features["importance"]):
        logger.info(f"  {feature_name:30} {score:.6f}")

## Part 5: Save Model & Predictions

Persist model and predictions to Gold layer

In [None]:
if has_ml_data and df_train_raw.count() > 0:
    # Persist the fitted pipeline; a failure is logged and does not stop the
    # cell, so the predictions are still written below.
    try:
        model_writer = model.write().overwrite()
        model_writer.save(GOLD_MODEL_PATH)
        logger.info(f"Model persisted to {GOLD_MODEL_PATH}")
    except Exception as model_error:
        logger.error(f"Model save failed: {model_error}")

    # Output columns; the probability vector is stored as a string column
    # named "confidence".
    confidence_column = col("probability").cast("string").alias("confidence")
    output_columns = [
        "event_timestamp",
        "icao24",
        "callsign",
        "origin_country",
        "flight_phase",
        "predicted_label",
        confidence_column,
        "altitude_meters",
        "velocity_kmh",
    ]
    predictions_to_save = predictions.select(*output_columns)

    # Overwrite the Gold predictions table; failures are logged, not raised.
    try:
        predictions_writer = predictions_to_save.write.format("delta").mode("overwrite")
        predictions_writer.save(GOLD_PREDICTIONS_PATH)
        logger.info(f"Predictions persisted to {GOLD_PREDICTIONS_PATH}")
    except Exception as predictions_error:
        logger.error(f"Predictions save failed: {predictions_error}")
else:
    logger.info("ML training skipped (insufficient data or Silver_ML unavailable)")

## Summary

This notebook completed three ML tasks:

1. **SQL Exploration**: Analyzed flight patterns by country, altitude, status, and trajectory
2. **Feature Engineering**: Verified Silver_ML features (temporal, rolling statistics, flight phases)
3. **Classification**: Trained Random Forest model to predict flight phases

**Output artifacts:**
- `GOLD_MODEL_PATH`: Trained model (PipelineModel)
- `GOLD_PREDICTIONS_PATH`: Test set predictions with confidence scores

**Key insights:**
- Feature importance shows which flight characteristics best predict phase
- Weighted precision and recall summarize classification strengths/weaknesses (a confusion matrix is not computed in this notebook)
- Model accuracy indicates quality of flight phase predictions