In [None]:
from pyspark.sql.functions import * 
from pyspark.sql.types import * 
import dlt 



In [None]:
# Define the schema for the incoming files; this example loads JSON files.
# The schema is handed to the streaming reader explicitly via
# `.schema(json_schema)`. Every field is declared nullable (third argument
# True).

json_schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    # pyspark.sql.types has no `TimeStamp`; the timestamp type is `TimestampType`.
    StructField("timestamp", TimestampType(), True),
])

In [None]:
# Define the raw table using Delta Live Tables

@dlt.table(
    name="raw_json_data",  # does not need to be in Unity Catalog
    comment="The raw data ingested from json files",
)
@dlt.expect("Valid ID", "id IS NOT NULL and id !=''")
@dlt.expect("Valid Age", "age IS NOT NULL AND age > 0")
@dlt.expect("Valid Timestamp", "timestamp IS NOT NULL")
def raw_data():
    """Stream JSON files from cloud storage into the raw table.

    The files are read incrementally with Auto Loader (the `cloudFiles`
    source) using the explicit `json_schema`. The three expectations declared
    above check `id`, `age` and `timestamp` on every incoming record.

    Returns:
        A streaming DataFrame over the JSON files under the source path.
    """
    source_path = "s3://mybucket/json_files"

    # No `checkpointLocation` is set here: it is a write-side option, and a
    # Delta Live Tables pipeline manages Auto Loader checkpoints itself.
    return (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "json")
            .schema(json_schema)
            .load(source_path)
    )



In [None]:
# Define a transformed table using Delta Live Tables

@dlt.table(
    name="catalog.schema.transformed_data",
    comment="Transformed data with necessary changes"
)
@dlt.expect("valid Processed", "processed_at IS NOT NULL")
def transform_data():
    """Build the transformed table from the raw JSON stream.

    Reads the `raw_json_data` table as a stream and adds a `processed_at`
    column, which the expectation declared above requires to be non-null.

    Returns:
        The streaming DataFrame with the extra `processed_at` column.
    """
    raw_df = dlt.read_stream("raw_json_data")

    # `withColumn` needs both a column name and a column expression; the
    # current timestamp is used as the processing time.
    # NOTE(review): confirm that processing time is the intended value.
    transformed_data = raw_df.withColumn("processed_at", current_timestamp())

    # A DLT table function must return the DataFrame that defines the table.
    return transformed_data