## Store data to MongoDB

This notebook demonstrates uploading data to MongoDB with checkpointing.

The data can be read from captured events in the data lake or from a Delta table.

### Option - 1
##### Structured Streaming Upload
This option allows you to upload the data as and when it is stored in the data lake.

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema for the claims CSV files. Every column is declared nullable.
claims_schema = (
    StructType()
    .add("id", StringType(), True)
    .add("customer_name", StringType(), True)
    .add("phone_number", StringType(), True)
    .add("country", StringType(), True)
    .add("claim_amount", IntegerType(), True)
    .add("type_id", StringType(), True)
    .add("status", StringType(), True)
    .add("processed", TimestampType(), True)
    .add("type", StringType(), True)
)


In [0]:
# Directory the stream watches for incoming claims files.
upload_path = "dbfs:/mnt/stream-data/claims"
# Directory reserved for the stream's checkpoint data.
checkpoint_path = "dbfs:/FileStore/events/_checkpoints/lake_events"

# Streaming source: read CSV files (with a header row) from upload_path
# through the 'cloudFiles' format, using the explicit claims schema.
events_datalake_df = (
    spark.readStream.format('cloudFiles')
    .option('cloudFiles.format', 'csv')
    .option('header', 'true')
    .schema(claims_schema)
    .load(upload_path)
)

In [0]:
def write_row(batch_df, batch_id):
    """Append one micro-batch to the MongoDB ``events_db.claims_events`` collection.

    Written as a callback for ``DataStreamWriter.foreachBatch``.

    Args:
        batch_df: Batch DataFrame containing the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; not used by this function.

    Returns:
        None.
    """
    # No 'checkpointLocation' here: checkpointing belongs to the streaming query
    # and has to be configured on the writeStream that invokes this callback.
    # An option with that name on a batch writer has no effect.
    #
    # NOTE(review): the connection string embeds a username and password.
    # Move it to a secret store (e.g. a Databricks secret scope) and rotate
    # the exposed password.
    (
        batch_df.write
        .format("mongo")
        .mode("append")
        .option("uri", "mongodb+srv://admin:demo%40PSL@cluster0.s5tuvb0.mongodb.net/events_db.claims_events?retryWrites=true&w=majority")
        .save()
    )

In [0]:
# Start the streaming query; each micro-batch is handed to write_row.
# The checkpoint location is set on the streaming writer so the query records
# its progress and can resume after a restart instead of starting over.
(
    events_datalake_df.writeStream
    .option('checkpointLocation', checkpoint_path)
    .foreachBatch(write_row)
    .start()
)

In [0]:
# Load the claims_events collection of events_db back from MongoDB and show it.
# NOTE(review): the connection string embeds a username and password; it
# should come from a secret store rather than the notebook source.
mongo_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .options(
        uri="mongodb+srv://admin:demo%40PSL@cluster0.s5tuvb0.mongodb.net/events_db?retryWrites=true&w=majority",
        database="events_db",
        collection="claims_events",
    )
    .load()
)

display(mongo_df)

_id,claim_amount,country,customer_name,id,phone_number,processed,status,type,type_id
List(62f033e2ae5050723f30d961),8745,VCT,Kim Warner,04da5e7b-4ccb-44b4-8f27-bec9d7e0330d,6134594862,2022-08-07T21:47:00.004+0000,Approved,Commercial Property Insurance,CP
List(62f033e2ae5050723f30d972),9779,TUV,Bradley Peterson,723e06bb-beb9-40cd-8f59-f80be00b1968,(302)180-3008x31407,2022-08-07T21:47:00.004+0000,Hold,Personal Home Insurance,PH
List(62f033e2ae5050723f30d978),8190,AZE,Cory Taylor,f533549e-3bfb-4bf2-a02a-5d53a40e95f4,(136)818-9633x739,2022-08-07T21:47:00.004+0000,Approved,Personal Life Insurance,PL
List(62f033e2ae5050723f30d963),2577,VCT,Nicholas Wilson,c978f452-bf7e-416f-9cb9-09ba67975711,+1-046-154-6348x8754,2022-08-07T21:47:00.004+0000,Hold,Commercial Health Insurance,CH
List(62f033e2ae5050723f30d95b),7296,USA,Jeremy Campbell,ff0c84c9-79f8-4436-9397-2f99f4aadb5e,(213)076-7154,2022-08-07T21:47:00.004+0000,Rejected,Commercial Property Insurance,CP
List(62f033e2ae5050723f30d95f),9382,GUY,Jeffrey Stevenson,5c51fe81-a8db-4f98-ac94-e5f1820fc324,(549)214-1162x50711,2022-08-07T21:47:00.004+0000,Hold,Personal Life Insurance,PL
List(62f033e2ae5050723f30d96b),6332,BMU,Kathleen Hensley,2c3255f4-d393-465e-815c-c3d82e44388e,+1-115-701-2916x9613,2022-08-07T21:47:00.004+0000,Approved,Personal Travel Insurance,PT
List(62f033e2ae5050723f30d96d),5629,KAZ,Mackenzie Holmes,29341a55-c63d-4661-a90a-5e1093da8c0c,001-532-255-9009x99015,2022-08-07T21:47:00.004+0000,Hold,Commercial Accident Insurance,CA
List(62f033e2ae5050723f30d976),7086,DZA,Crystal Robinson,54e01168-fd5a-43dd-8592-5a5113e9501a,841.472.8342,2022-08-07T21:47:00.004+0000,Approved,Commercial Accident Insurance,CA
List(62f033e2ae5050723f30d965),2964,MDV,Jorge Rodgers,d86b7783-3543-4760-a305-7cbf7f39cabb,(971)754-1171,2022-08-07T21:47:00.004+0000,Hold,Personal Motor Insurance,PM


### Option - 2
##### Batch Upload
This option allows you to upload the complete data in one go.

In [0]:
# Batch-read all claims CSV files currently under the data lake path.
# 'header' is enabled to match the streaming read of this same location;
# without it the header line of each file would be parsed as a data row.
# No 'checkpointLocation' is set: it is a Structured Streaming option and
# does nothing on a batch reader.
events_datalake_df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(claims_schema)
    .load("dbfs:/mnt/stream-data/claims")
)

In [0]:
# Show the DataFrame read from the claims CSV files.
display(events_datalake_df)

id,customer_name,phone_number,country,claim_amount,type_id,status,processed,type
b0273763-7440-4a87-a04f-15e55efabc39,Laura Stokes,001-768-178-8156,GUY,4207,PL,Approved,2022-08-07T21:45:00.004+0000,Personal Life Insurance
20f3c1ac-4bba-4b56-a55a-3ece94608a3e,Joseph Pierce,613.311.8381,CAN,3149,PL,Rejected,2022-08-07T21:45:00.004+0000,Personal Life Insurance
3c456a06-2467-42c4-b44d-c990549f1ddb,Mark Riley,(895)354-4972x3382,IRQ,5663,CP,Hold,2022-08-07T21:45:00.004+0000,Commercial Property Insurance
e185ac27-0b42-4857-aa9b-881a886f6046,Miss Sabrina Wright,(471)809-6274,IRL,2599,PM,Hold,2022-08-07T21:45:00.004+0000,Personal Motor Insurance
86e358e7-5b70-416e-b674-6dbaea73c215,Keith Henderson,496-516-3092,WSM,7895,CA,Rejected,2022-08-07T21:45:00.004+0000,Commercial Accident Insurance
61102aa2-4d2d-4f2c-8ae1-362bca63674d,Douglas Johnson,370-657-4530x887,MYT,4511,PT,Rejected,2022-08-07T21:45:00.004+0000,Personal Travel Insurance
7a6a5cbc-660d-4aa7-b8d6-e91d564ecacd,Micheal Schneider,001-104-553-9956x5350,USA,4820,PL,Hold,2022-08-07T21:45:00.004+0000,Personal Life Insurance
67759d82-d53d-4202-bff8-42ca4e80f5e3,Pamela Martinez,+1-909-598-6732,PAN,3063,PT,Approved,2022-08-07T21:45:00.004+0000,Personal Travel Insurance
16c84fda-a3d3-4637-adf5-d31a75a78979,Danny Rivera,001-654-507-0564x851,USA,3377,PT,Hold,2022-08-07T21:45:00.004+0000,Personal Travel Insurance
a26989f9-224a-4b2a-b20f-c9b4d4597773,John Brown,(983)602-8831x29708,ERI,8686,PH,Approved,2022-08-07T21:45:00.004+0000,Personal Home Insurance


In [0]:
# Write the full batch DataFrame to the MongoDB events_db.claims_events
# collection. Mode "overwrite" replaces whatever the collection held before.
# No 'checkpointLocation' is set: it is a Structured Streaming option and
# does nothing on a batch writer.
# NOTE(review): the connection string embeds a username and password; move
# it to a secret store and rotate the exposed password.
(
    events_datalake_df.write
    .format('com.mongodb.spark.sql.DefaultSource')
    .mode("overwrite")
    .option("uri", "mongodb+srv://admin:demo%40PSL@cluster0.s5tuvb0.mongodb.net/events_db.claims_events?retryWrites=true&w=majority")
    .save()
)

In [0]:
# Load the claims_events collection of events_db back from MongoDB and show it.
# The "<Endpoint URL>" placeholder has to be replaced with a real connection string.
mongo_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .options(
        uri="<Endpoint URL>",
        database="events_db",
        collection="claims_events",
    )
    .load()
)

display(mongo_df)

_id,claim_amount,country,customer_name,id,phone_number,processed,status,type,type_id
List(62f033e2ae5050723f30d95b),7296,USA,Jeremy Campbell,ff0c84c9-79f8-4436-9397-2f99f4aadb5e,(213)076-7154,2022-08-07T21:47:00.004+0000,Rejected,Commercial Property Insurance,CP
List(62f033e2ae5050723f30d95d),5409,USA,Mary Ramirez,f48ac830-a899-4dbe-bb51-8066f00d1a89,261-165-9985,2022-08-07T21:47:00.004+0000,Rejected,Commercial Health Insurance,CH
List(62f033e2ae5050723f30d95f),9382,GUY,Jeffrey Stevenson,5c51fe81-a8db-4f98-ac94-e5f1820fc324,(549)214-1162x50711,2022-08-07T21:47:00.004+0000,Hold,Personal Life Insurance,PL
List(62f033e2ae5050723f30d961),8745,VCT,Kim Warner,04da5e7b-4ccb-44b4-8f27-bec9d7e0330d,6134594862,2022-08-07T21:47:00.004+0000,Approved,Commercial Property Insurance,CP
List(62f033e2ae5050723f30d963),2577,VCT,Nicholas Wilson,c978f452-bf7e-416f-9cb9-09ba67975711,+1-046-154-6348x8754,2022-08-07T21:47:00.004+0000,Hold,Commercial Health Insurance,CH
List(62f033e2ae5050723f30d965),2964,MDV,Jorge Rodgers,d86b7783-3543-4760-a305-7cbf7f39cabb,(971)754-1171,2022-08-07T21:47:00.004+0000,Hold,Personal Motor Insurance,PM
List(62f033e2ae5050723f30d967),8878,COG,Nancy Rowland,13328f5c-86d4-48e0-9449-2cdd25e57567,+1-808-299-5010x7734,2022-08-07T21:47:00.004+0000,Rejected,Personal Home Insurance,PH
List(62f033e2ae5050723f30d969),5496,GGY,Angela Kane,2c7c41b9-0afd-4cad-8ed5-b52a29723c2d,(224)030-3261x8590,2022-08-07T21:47:00.004+0000,Rejected,Commercial Health Insurance,CH
List(62f033e2ae5050723f30d96b),6332,BMU,Kathleen Hensley,2c3255f4-d393-465e-815c-c3d82e44388e,+1-115-701-2916x9613,2022-08-07T21:47:00.004+0000,Approved,Personal Travel Insurance,PT
List(62f033e2ae5050723f30d96d),5629,KAZ,Mackenzie Holmes,29341a55-c63d-4661-a90a-5e1093da8c0c,001-532-255-9009x99015,2022-08-07T21:47:00.004+0000,Hold,Commercial Accident Insurance,CA


##### ==== end of notebook ====