In [0]:
from datetime import datetime
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType, LongType

In [0]:
# Explicit read schema for the JSON files picked up by the streaming reader.
# Every field (top level and inside each multimedia element) is declared
# nullable. `ingestion_date` is listed last; `ingestion()` replaces that column
# with the processing date before writing.
stream_schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("abstract", StringType()),
            ("byline", StringType()),
            ("created_date", StringType()),
            ("des_facet", ArrayType(StringType(), True)),
            ("geo_facet", ArrayType(StringType(), True)),
            ("item_type", StringType()),
            ("kicker", StringType()),
            ("material_type_facet", StringType()),
            (
                "multimedia",
                # Array of image/media descriptors; elements may be null.
                ArrayType(
                    StructType(
                        [
                            StructField(media_name, media_type, True)
                            for media_name, media_type in (
                                ("caption", StringType()),
                                ("copyright", StringType()),
                                ("format", StringType()),
                                ("height", LongType()),
                                ("subtype", StringType()),
                                ("type", StringType()),
                                ("url", StringType()),
                                ("width", LongType()),
                            )
                        ]
                    ),
                    True,
                ),
            ),
            ("org_facet", ArrayType(StringType(), True)),
            ("per_facet", ArrayType(StringType(), True)),
            ("published_date", StringType()),
            ("section", StringType()),
            ("short_url", StringType()),
            ("subsection", StringType()),
            ("title", StringType()),
            ("updated_date", StringType()),
            ("uri", StringType()),
            ("url", StringType()),
            ("ingestion_date", StringType()),
        )
    ]
)

In [0]:
def ingestion(df):
    """Stamp a micro-batch with today's date and append it to the bronze table.

    Intended to be called from a ``foreachBatch`` sink, once per micro-batch.

    Args:
        df: Batch DataFrame to persist. An existing ``ingestion_date`` column,
            if any, is replaced by the processing date.

    Returns:
        None. The only effect is the write to ``bronze.nytimes.top_stories``,
        partitioned by ``ingestion_date``.
    """
    # Computed once per call in Python, so every row of the batch carries the
    # same 'YYYY-MM-DD' value.
    today = datetime.now().strftime('%Y-%m-%d')
    stamped = df.withColumn('ingestion_date', lit(today))

    # Append rather than overwrite: the stream is checkpointed, so each
    # micro-batch holds only newly discovered files. Overwriting would discard
    # rows written by earlier batches and earlier runs.
    # NOTE(review): a retried micro-batch would append its rows again; if that
    # matters, consider an idempotent-write mechanism keyed on the batch id.
    (stamped.write
        .partitionBy('ingestion_date')
        .mode('append')
        .saveAsTable('bronze.nytimes.top_stories'))


In [0]:
# Streaming reader over the raw top_stories volume using the `cloudFiles`
# source in JSON mode, with the explicit `stream_schema` applied.
df_stream = (
    spark.readStream.format("cloudFiles")
    .schema(stream_schema)
    .option("cloudFiles.format", "json")
    .load("/Volumes/raw/nytimes/top_stories")
)

In [0]:
# Configured (not yet started) stream writer: each micro-batch is handed to
# `ingestion`, progress is tracked in the checkpoint directory, and the
# `availableNow` trigger is used. The batch id passed by foreachBatch is unused.
stream = (
    df_stream.writeStream
    .trigger(availableNow=True)
    .foreachBatch(lambda batch_df, _batch_id: ingestion(batch_df))
    .option("checkpointLocation", "/Volumes/raw/nytimes/top_stories_checkpoint/")
)

In [0]:
# Launch the query and block until it finishes. `start()` returns immediately,
# so without waiting the verification queries further down could run before
# any batch has been written. `awaitTermination()` also re-raises a failure of
# the streaming query instead of letting it pass silently.
start = stream.start()
start.awaitTermination()

In [0]:
%sql

-- List the partitions of the bronze table written by the ingestion stream.
SHOW PARTITIONS bronze.nytimes.top_stories;

In [0]:
%sql

-- Row count per ingestion_date; ordered so the output is stable across runs.
SELECT
  ingestion_date,
  COUNT(*) AS entries
FROM bronze.nytimes.top_stories
GROUP BY ingestion_date
ORDER BY ingestion_date
;