# NASA GCN Data Pipeline

Lakeflow Declarative Pipeline that ingests events from NASA's General Coordinates Network (GCN, formerly the Gamma-ray Coordinates Network).

**Architecture (Multiplex):**
- **Bronze**: `gcn_raw` - All raw Kafka messages
- **Silver**: Separate tables per topic category
  - `gcn_classic_text` - Classic text format alerts
  - `gcn_classic_voevent` - VOEvent XML format
  - `gcn_classic_binary` - Binary format
  - `gcn_notices` - JSON notices (new format)
  - `gcn_circulars` - GCN Circulars
  - `igwn_gwalert` - Gravitational wave alerts
  - `gcn_heartbeat` - Test/heartbeat messages

In [0]:
import dlt
import sys

# Make local modules (such as the nasa_gcn package imported below) importable:
# append the directory named by the "bundle.sourcePath" Spark conf entry to
# sys.path, using "." as the default when that entry is not set.
sys.path.append(spark.conf.get("bundle.sourcePath", "."))

from pyspark.sql.functions import col, decode, split, current_timestamp
from nasa_gcn.config import get_kafka_options

## Bronze Layer: Raw Kafka Stream

In [0]:
@dlt.table(
    name="gcn_raw",
    comment="Raw NASA GCN Kafka messages (all topics)",
    table_properties={"quality": "bronze"},
)
def gcn_raw():
    """Stream the Kafka source into the bronze table.

    Connection/subscription options are supplied by ``get_kafka_options()``.
    The Kafka key is cast to a string (``message_key``), the payload
    (``value``) is passed through undecoded, and the Kafka timestamp is
    renamed to ``kafka_timestamp``. ``ingestion_timestamp`` records
    ``current_timestamp()`` at processing time.
    """
    reader_options = get_kafka_options()
    kafka_stream = (
        spark.readStream.format("kafka").options(**reader_options).load()
    )

    # Columns exposed to the downstream silver tables.
    bronze_columns = [
        col("key").cast("string").alias("message_key"),
        col("value"),
        col("topic"),
        col("partition"),
        col("offset"),
        col("timestamp").alias("kafka_timestamp"),
        current_timestamp().alias("ingestion_timestamp"),
    ]
    return kafka_stream.select(*bronze_columns)

## Silver Layer: Multiplex Tables by Topic Category

In [0]:
@dlt.table(
    name="gcn_classic_text",
    comment="Classic text format GCN alerts",
    table_properties={"quality": "silver"},
)
def gcn_classic_text():
    """Keep ``gcn.classic.text.*`` messages and decode the payload as UTF-8.

    ``event_type`` is the fourth dot-separated segment of the topic name
    (index 3), i.e. the part that follows the ``gcn.classic.text.`` prefix.
    """
    bronze = dlt.read_stream("gcn_raw")
    topic = col("topic")
    text_alerts = bronze.filter(topic.startswith("gcn.classic.text."))

    message_text = decode(col("value"), "UTF-8").alias("message_text")
    event_type = split(topic, r"\.").getItem(3).alias("event_type")

    return text_alerts.select(
        col("message_key"),
        message_text,
        topic,
        event_type,
        col("kafka_timestamp"),
        col("ingestion_timestamp"),
        col("partition"),
        col("offset"),
    )

In [0]:
@dlt.table(
    name="gcn_classic_voevent",
    comment="Classic VoEvent XML format GCN alerts",
    table_properties={"quality": "silver"},
)
def gcn_classic_voevent():
    """Keep ``gcn.classic.voevent.*`` messages and decode the XML as UTF-8.

    The decoded document is exposed as ``voevent_xml``; ``event_type`` is
    the topic segment at index 3 after splitting the topic name on dots.
    """
    voevent_rows = dlt.read_stream("gcn_raw").where(
        col("topic").startswith("gcn.classic.voevent.")
    )

    # Topic segment at index 3, i.e. the part after "gcn.classic.voevent.".
    topic_segments = split(col("topic"), r"\.")

    selected = [
        col("message_key"),
        decode(col("value"), "UTF-8").alias("voevent_xml"),
        col("topic"),
        topic_segments.getItem(3).alias("event_type"),
        col("kafka_timestamp"),
        col("ingestion_timestamp"),
        col("partition"),
        col("offset"),
    ]
    return voevent_rows.select(*selected)

In [0]:
@dlt.table(
    name="gcn_classic_binary",
    comment="Classic binary format GCN alerts",
    table_properties={"quality": "silver"},
)
def gcn_classic_binary():
    """Keep ``gcn.classic.binary.*`` messages without decoding the payload.

    The payload column is only renamed to ``binary_data``; ``event_type``
    is the topic segment at index 3 after splitting the topic name on dots.
    """
    bronze = dlt.read_stream("gcn_raw")
    binary_only = bronze.filter(
        col("topic").startswith("gcn.classic.binary.")
    )

    # No decode() here: the payload is passed through under a new name.
    payload = col("value").alias("binary_data")
    event_type = split(col("topic"), r"\.").getItem(3).alias("event_type")

    return binary_only.select(
        col("message_key"),
        payload,
        col("topic"),
        event_type,
        col("kafka_timestamp"),
        col("ingestion_timestamp"),
        col("partition"),
        col("offset"),
    )

In [0]:
@dlt.table(
    name="gcn_notices",
    comment="New JSON format GCN notices",
    table_properties={"quality": "silver"},
)
def gcn_notices():
    """Keep ``gcn.notices.*`` messages and decode the payload as UTF-8.

    The decoded payload is exposed as ``notice_json`` (it is not parsed
    here). ``mission`` is the topic segment at index 2.
    """
    notices = dlt.read_stream("gcn_raw").where(
        col("topic").startswith("gcn.notices.")
    )

    # Topic layout: gcn.notices.{mission}.{type} -> index 2 is the mission.
    mission = split(col("topic"), r"\.").getItem(2).alias("mission")
    notice_json = decode(col("value"), "UTF-8").alias("notice_json")

    return notices.select(
        col("message_key"),
        notice_json,
        col("topic"),
        mission,
        col("kafka_timestamp"),
        col("ingestion_timestamp"),
        col("partition"),
        col("offset"),
    )

In [0]:
@dlt.table(
    name="gcn_circulars",
    comment="GCN Circulars (astronomer reports)",
    table_properties={"quality": "silver"},
)
def gcn_circulars():
    """Keep messages whose topic is exactly ``gcn.circulars``.

    The payload is decoded as UTF-8 into ``circular_json``; the remaining
    bronze columns are passed through unchanged.
    """
    bronze = dlt.read_stream("gcn_raw")
    circular_rows = bronze.filter(col("topic") == "gcn.circulars")

    output_columns = [
        col("message_key"),
        decode(col("value"), "UTF-8").alias("circular_json"),
        col("topic"),
        col("kafka_timestamp"),
        col("ingestion_timestamp"),
        col("partition"),
        col("offset"),
    ]
    return circular_rows.select(*output_columns)

In [0]:
@dlt.table(
    name="igwn_gwalert",
    comment="IGWN Gravitational Wave Alerts",
    table_properties={"quality": "silver"},
)
def igwn_gwalert():
    """Keep messages whose topic is exactly ``igwn.gwalert``.

    The payload is decoded as UTF-8 into ``gwalert_json``; the remaining
    bronze columns are passed through unchanged.
    """
    is_gwalert = col("topic") == "igwn.gwalert"
    gwalert_json = decode(col("value"), "UTF-8").alias("gwalert_json")

    return (
        dlt.read_stream("gcn_raw")
        .where(is_gwalert)
        .select(
            col("message_key"),
            gwalert_json,
            col("topic"),
            col("kafka_timestamp"),
            col("ingestion_timestamp"),
            col("partition"),
            col("offset"),
        )
    )

In [0]:
@dlt.table(
    name="gcn_heartbeat",
    comment="GCN Heartbeat/test messages",
    table_properties={"quality": "silver"},
)
def gcn_heartbeat():
    """Keep messages whose topic is exactly ``gcn.heartbeat``.

    The payload is decoded as UTF-8 into ``heartbeat_json``; the remaining
    bronze columns are passed through unchanged.
    """
    bronze = dlt.read_stream("gcn_raw")
    heartbeats = bronze.filter(col("topic") == "gcn.heartbeat")

    heartbeat_json = decode(col("value"), "UTF-8").alias("heartbeat_json")
    return heartbeats.select(
        col("message_key"),
        heartbeat_json,
        col("topic"),
        col("kafka_timestamp"),
        col("ingestion_timestamp"),
        col("partition"),
        col("offset"),
    )