In [0]:
df = spark.read.parquet("/FileStore/tables/Streaming/Stream_readStream/csv/multiline_nested_parquet.parquet")
display(df)

source,description,input_timestamp,last_update_timestamp,country,user,Location,Zipcode
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103
Blob,exchange,1424256609,1724256609,US,Sharath,Kochin,120103
SQL,Stock,1524256609,1664256609,SWEDEN,Sheetal,Delhi,875103
datawarehouse,azure,1624256609,1874256609,UK,Raj,Mumbai,123403
oracle,ADF,1779256609,188256609,Norway,Synapse,Nasik,456103


**Create folder in DBFS**

In [0]:
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_checkpoint/csv")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_checkpoint/json")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_checkpoint/parquet")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_checkpoint/orc")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_checkpoint/avro")

dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_readStream/csv/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_readStream/json/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_readStream/parquet/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_readStream/orc/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_readStream/avro/")

dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_writeStream/csv/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_writeStream/json/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_writeStream/parquet/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_writeStream/orc/")
dbutils.fs.mkdirs("/FileStore/tables/Streaming/Stream_writeStream/avro/")

Out[4]: True

**Delete folder in DBFS**

In [0]:
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_checkpoint/csv", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_checkpoint/json", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_checkpoint/parquet", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_checkpoint/orc", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_checkpoint/avro", True)

dbutils.fs.rm("/FileStore/tables/Streaming/Stream_readStream/csv", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_readStream/json", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_readStream/parquet", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_readStream/orc", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_readStream/avro", True)

dbutils.fs.rm("/FileStore/tables/Streaming/Stream_writeStream/csv", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_writeStream/json", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_writeStream/parquet", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_writeStream/orc", True)
dbutils.fs.rm("/FileStore/tables/Streaming/Stream_writeStream/avro", True)

Out[1]: False

**Define schema for input JSON file**
- schema must be specified when creating a streaming source dataframe, otherwise it will through error.

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, LongType

schema_parquet = StructType([StructField('source',StringType(),False),
                             StructField('description',StringType(),False),
                             StructField('input_timestamp',LongType(),False),
                             StructField('last_update_timestamp',LongType(),False),
                             StructField('country',StringType(),False),
                             StructField('user',StringType(),False),
                             StructField('Location',StringType(),False),
                             StructField('Zipcode',StringType(),False)]
                           )

#### **1) readStream**

In [0]:
stream_parquet = spark.readStream\
                      .format("parquet")\
                      .schema(schema_parquet)\
                      .parquet("/FileStore/tables/Streaming/Stream_readStream/parquet/")

print(stream_parquet.isStreaming)
print(stream_parquet.printSchema())

display(stream_parquet)

# stream_parquet.awaitTermination()

source,description,input_timestamp,last_update_timestamp,country,user,Location,Zipcode
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103
Blob,exchange,1424256609,1724256609,US,Sharath,Kochin,120103
SQL,Stock,1524256609,1664256609,SWEDEN,Sheetal,Delhi,875103
datawarehouse,azure,1624256609,1874256609,UK,Raj,Mumbai,123403
oracle,ADF,1779256609,188256609,Norway,Synapse,Nasik,456103
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103


#### **2) writeStream**

**format('parquet')**

In [0]:
check_point = "/FileStore/tables/Streaming/Stream_checkpoint/parquet"

stream_parquet.writeStream\
              .format('parquet')\
              .outputMode('append')\
              .option("path", "/FileStore/tables/Streaming/Stream_writeStream/parquet/")\
              .option("checkpointLocation", check_point)\
              .start()

display(stream_parquet)

source,description,input_timestamp,last_update_timestamp,country,user,Location,Zipcode
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103
Blob,exchange,1424256609,1724256609,US,Sharath,Kochin,120103
SQL,Stock,1524256609,1664256609,SWEDEN,Sheetal,Delhi,875103
datawarehouse,azure,1624256609,1874256609,UK,Raj,Mumbai,123403
oracle,ADF,1779256609,188256609,Norway,Synapse,Nasik,456103
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103


**verify the written stream data**

In [0]:
display(spark.read.format("parquet").load("/FileStore/tables/Streaming/Stream_writeStream/parquet/*.parquet"))

source,description,input_timestamp,last_update_timestamp,country,user,Location,Zipcode
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103
Blob,exchange,1424256609,1724256609,US,Sharath,Kochin,120103
SQL,Stock,1524256609,1664256609,SWEDEN,Sheetal,Delhi,875103
datawarehouse,azure,1624256609,1874256609,UK,Raj,Mumbai,123403
oracle,ADF,1779256609,188256609,Norway,Synapse,Nasik,456103
catalog,bravia,1124256609,1524256609,IND,Hari,Bangalore,560103
SAP,sony,1224256609,1424256609,US,Rajesh,Chennai,860103
ADLS,bse,1324256609,1524256609,CANADA,Lokesh,Hyderabad,755103
