In [None]:
# Import Necessary Classes
import urllib
import urllib.parse

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Location of the Delta table that stores the AWS credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the table's contents into a Spark DataFrame
aws_keys_df = spark.read.load(delta_table_path, format="delta")

In [None]:
# Get the AWS access key and secret key from the Spark DataFrame.
# Both columns are fetched in one job instead of running a separate collect() per column.
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message
    # instead of an opaque IndexError.
    raise ValueError("The credentials Delta table contains no rows")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key; safe="" means every reserved character (including "/") is escaped
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
%sql
-- Disable the Delta format check (spark.databricks.delta.formatCheck.enabled) for the reads of Delta tables in this notebook
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


## Task 4: Read data from Kinesis streams in Databricks
Step 1:

Create a new Notebook in Databricks and read in your credentials from the Delta table, located at dbfs:/user/hive/warehouse/authentication_credentials, to retrieve the Access Key and Secret Access Key. Follow the same process for this, as you have followed for your batch data.

Step 2:

Run your preferred method to ingest data into Kinesis Data Streams. In the Kinesis console, check your data streams are receiving the data.

Step 3:

Read the data from the three streams you have created in your Databricks Notebook.


## ETL (“Extract, Transform, Load”) Pipeline

### Define the names of the Kinesis streams
stream_name_geo = "streaming-0af0031518e7-geo"

stream_name_pin = "streaming-0af0031518e7-pin"

stream_name_user = "streaming-0af0031518e7-user"

### Extract starts here: read in all three streams

In [None]:
def create_dataframes(stream_name, region='us-east-1', initial_position='earliest'):
    """Open a Kinesis stream as a streaming DataFrame.

    Relies on the notebook-level names ``spark``, ``ACCESS_KEY`` and
    ``SECRET_KEY`` being defined before the call.

    Args:
        stream_name: Name of the Kinesis stream to read.
        region: AWS region passed to the reader. Defaults to 'us-east-1',
            the value that was previously hard-coded.
        initial_position: Value for the reader's 'initialPosition' option.
            Defaults to 'earliest', the value that was previously hard-coded.

    Returns:
        The object returned by ``spark.readStream...load()``: the raw,
        unparsed stream (the record payload is in its ``data`` column).
    """
    return (
        spark.readStream
        .format('kinesis')
        .option('streamName', stream_name)
        .option('initialPosition', initial_position)
        .option('region', region)
        .option('awsAccessKey', ACCESS_KEY)
        .option('awsSecretKey', SECRET_KEY)
        .load()
    )

In [None]:
# Kinesis stream names, one per record type
stream_name_geo = "streaming-0af0031518e7-geo"
stream_name_pin = "streaming-0af0031518e7-pin"
stream_name_user = "streaming-0af0031518e7-user"

# Open the three raw streams with the shared helper (same order as before: pin, geo, user)
df_pin_raw, df_geo_raw, df_user_raw = (
    create_dataframes(name)
    for name in (stream_name_pin, stream_name_geo, stream_name_user)
)

In [None]:
# Extract: read the geo Kinesis stream without applying a schema.
# The reader options live in create_dataframes(), so they are not repeated here.
stream_name_geo = "streaming-0af0031518e7-geo"

df_geo_raw = create_dataframes(stream_name_geo)

# Preview the raw records; the JSON payload is in the `data` column
display(df_geo_raw)

partitionKey,data,stream,shardId,sequenceNumber,approximateArrivalTimestamp
desired-name,eyJpbmQiOjEwMCwidGltZXN0YW1wIjoiMjAxOC0wOS0xNCAxMTozMDoxMyIsImxhdGl0dWRlIjotODQuNjQ0NiwibG9uZ2l0dWRlIjotMTczLjA1OCwiY291bnRyeSI6IkFuZ3VpbGxhIn0=,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892847819668898954898727057227810,2024-03-13T15:11:32.433+0000
desired-name,eyJpbmQiOjUzNjAsInRpbWVzdGFtcCI6IjIwMTctMTEtMDQgMTc6MDM6MjMiLCJsYXRpdHVkZSI6LTg3LjIyNDUsImxvbmdpdHVkZSI6LTEwMC4yMTMsImNvdW50cnkiOiJBcmdlbnRpbmEifQ==,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892853911446103993015413279555618,2024-03-13T15:11:36.665+0000
desired-name,eyJpbmQiOjYzMTEsInRpbWVzdGFtcCI6IjIwMjAtMTAtMTIgMDk6MTc6MDciLCJsYXRpdHVkZSI6LTg5LjYzLCJsb25naXR1ZGUiOi0xNzkuMDIyLCJjb3VudHJ5IjoiQXJnZW50aW5hIn0=,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892859350403366439232345160548386,2024-03-13T15:11:40.396+0000
desired-name,eyJpbmQiOjY5NywidGltZXN0YW1wIjoiMjAyMC0wNy0xOSAyMTo0Njo1OCIsImxhdGl0dWRlIjotODguODI5OCwibG9uZ2l0dWRlIjotMTcwLjE4OCwiY291bnRyeSI6IkFsYmFuaWEifQ==,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892865228200701405559667459883042,2024-03-13T15:11:44.764+0000
desired-name,eyJpbmQiOjIyOCwidGltZXN0YW1wIjoiMjAyMS0wMS0xNSAxNzo0NzozNCIsImxhdGl0dWRlIjotMzIuMzUwOCwibG9uZ2l0dWRlIjotMTE0LjY1NiwiY291bnRyeSI6IkJvc25pYSBhbmQgSGVyemVnb3ZpbmEifQ==,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892870709470367538288620455591970,2024-03-13T15:11:48.609+0000
desired-name,eyJpbmQiOjEwOTk5LCJ0aW1lc3RhbXAiOiIyMDE5LTA0LTE5IDA3OjQ0OjA4IiwibGF0aXR1ZGUiOi01Mi4yMTcyLCJsb25naXR1ZGUiOjIxLjY3MDQsImNvdW50cnkiOiJTYWludCBNYXJ0aW4ifQ==,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892877375487336893354233382830114,2024-03-13T15:11:53.392+0000
desired-name,eyJpbmQiOjY4NTYsInRpbWVzdGFtcCI6IjIwMjItMDYtMTcgMjE6MjY6MzEiLCJsYXRpdHVkZSI6MTEuMzA4OCwibG9uZ2l0dWRlIjotNTMuNTU1NCwiY291bnRyeSI6IkFybWVuaWEifQ==,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892883000619175560223989449097250,2024-03-13T15:11:57.298+0000
desired-name,eyJpbmQiOjEwMjMsInRpbWVzdGFtcCI6IjIwMTgtMDItMTUgMDc6MDA6MTIiLCJsYXRpdHVkZSI6NDkuOTMzNywibG9uZ2l0dWRlIjoxMTAuODU5LCJjb3VudHJ5IjoiUGFuYW1hIn0=,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892888982384131013409489492639778,2024-03-13T15:12:01.521+0000
desired-name,eyJpbmQiOjMwNjEsInRpbWVzdGFtcCI6IjIwMTktMDQtMjcgMTA6NTY6NTYiLCJsYXRpdHVkZSI6LTc2Ljk1MSwibG9uZ2l0dWRlIjotMTA2LjI2NSwiY291bnRyeSI6IkNhbWJvZGlhIn0=,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892895347378571284432369198563362,2024-03-13T15:12:05.797+0000
desired-name,eyJpbmQiOjkwNSwidGltZXN0YW1wIjoiMjAxOC0wNi0yMSAwNjo1MTozMSIsImxhdGl0dWRlIjotODUuMjkyNywibG9uZ2l0dWRlIjotMTQyLjg3OSwiY291bnRyeSI6Ik5pZ2VyIn0=,streaming-0af0031518e7-geo,shardId-000000000002,49649671496703673780746892901246936571003823016642609186,2024-03-13T15:12:10.047+0000


In [None]:
# Extract: read the pin Kinesis stream without applying a schema.
# The reader options live in create_dataframes(), so they are not repeated here.
stream_name_pin = "streaming-0af0031518e7-pin"

df_pin_raw = create_dataframes(stream_name_pin)

# Preview the raw records; the JSON payload is in the `data` column
display(df_pin_raw)

partitionKey,data,stream,shardId,sequenceNumber,approximateArrivalTimestamp
desired-name,eyJpbmRleCI6NDY3MCwidW5pcXVlX2lkIjoiMTYxODk3NzQtZDY5Ni00NjBhLWFmZmUtZDUzZTk1ZWI5ZjBkIiwidGl0bGUiOiJBZnRlciBJIEFtIEdvbmUgUGxhbm5lciAtIEJpbmRlciIsImRlc2NyaXB0aW9uIjoiJDEyLjA= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952515438572548345601364190036002,2024-03-13T15:11:31.553+0000
desired-name,eyJpbmRleCI6ODMwMiwidW5pcXVlX2lkIjoiZTUxNzVlODctZTM5Ni00MjY1LWI5M2UtNTViNzcyMTcxY2IwIiwidGl0bGUiOiI1MCBCZXN0IE5pa2l0YSBHaWxsIFF1b3RlcyArIEFuIEV4Y2x1c2l2ZSBJbnRlcnZpZXcgT24= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952518813893436709646294847586338,2024-03-13T15:11:35.291+0000
desired-name,eyJpbmRleCI6MTA1MjgsInVuaXF1ZV9pZCI6IjU3M2MyNjQyLWFiNjUtNDYwMy1hMzdhLTQ0ODQ0YTYwZjZiMiIsInRpdGxlIjoiTWVyY2VkZXMtQmVuenwgTWVyY2VkZXMgY2FycyB1bmNvbW1vbiBjb2xvdXJ8IGhpZ2ggc3A= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952522943584036513219899241267234,2024-03-13T15:11:39.545+0000
desired-name,eyJpbmRleCI6MTQ3MywidW5pcXVlX2lkIjoiMGE4YTQ3MjMtMGIxYS00MTBmLWFmZDUtMTVhODYzMmIzODZjIiwidGl0bGUiOiJESVkgTWFrZXVwIFByaW1lcnMiLCJkZXNjcmlwdGlvbiI6IkluIERJWSBtYWtldXAgcHJpbWU= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952527049096119924500851421347874,2024-03-13T15:11:43.925+0000
desired-name,eyJpbmRleCI6MTA5NTYsInVuaXF1ZV9pZCI6ImNlMzdhZDU1LWI0MzMtNDI0Ny05ZTM3LTE0OTlmOWQ1ZTc2ZSIsInRpdGxlIjoiU3BvcnRzIENhcnM6IFByaWNlcywgTVBHICYgRmVhdHVyZXMiLCJkZXNjcmlwdGlvbiI6Im0= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952530628725471803418112604241954,2024-03-13T15:11:47.654+0000
desired-name,eyJpbmRleCI6Mzk0LCJ1bmlxdWVfaWQiOiJiM2U2NDVlZi04MzRmLTQ0NDAtYmQ4ZC04NzhhZmNkYzc4MDEiLCJ0aXRsZSI6IkhvdyBUbyBQYWludCBDbG91ZHMgV2l0aCBBY3J5bGljIFBhaW50IEZvciBCZWdpbm5lcnMgKEU= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952534919203205615737397233844258,2024-03-13T15:11:52.429+0000
desired-name,eyJpbmRleCI6NDA0NSwidW5pcXVlX2lkIjoiZjBjNjA1ZDctNzViYy00YjYwLWE5ZWMtZTJkNjRlYjI5Njg3IiwidGl0bGUiOiJLJlPihKIgUm91bmQgQnJhc3MgVHViZTogM21tIE9EIHggMC4yMjVtbSBXYWxsIHggMzAwbW0= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952538572577032491147038073815074,2024-03-13T15:11:56.436+0000
desired-name,eyJpbmRleCI6NjA2MiwidW5pcXVlX2lkIjoiODgwYTg3NmMtZDI0Zi00NGQxLTkwMWMtMzVkYjFmNjFhNGU3IiwidGl0bGUiOiJGYWxsIEJhdGhyb29tIHN0eWxpbmcgSW5zcG8iLCJkZXNjcmlwdGlvbiI6Ik5vIGRlc2NyaXA= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952542099013648307020546850160674,2024-03-13T15:12:00.261+0000
desired-name,eyJpbmRleCI6MjY2MCwidW5pcXVlX2lkIjoiODA0NDM0M2QtNzQwMS00NmYxLTljNWUtMDRkNjk2ZWM1MjBlIiwidGl0bGUiOiJDcmVhdGluZyBPcmRlcnMgUGVyc29uYWxpemVkICBXaXRoIFRoZSBOYW1lIEluc2lkZSIsImQ= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952546168257957129862692508532770,2024-03-13T15:12:04.796+0000
desired-name,eyJpbmRleCI6MjA3MywidW5pcXVlX2lkIjoiOTAwMzNhNzMtNDQwMC00MzZhLTliN2MtNTVjZWMwYzU1ZTEwIiwidGl0bGUiOiIxMDAgQ2hyaXN0bWFzIE91dGRvb3IgRGVjb3IgSWRlYXMgdGhhdCdsbCBtYWtlIHlvdSBzYXk= (truncated),streaming-0af0031518e7-pin,shardId-000000000002,49649684810426963265115952550557867608150581500744564770,2024-03-13T15:12:09.077+0000


In [None]:
# Extract: read the user Kinesis stream without applying a schema.
# The reader options live in create_dataframes(), so they are not repeated here.
stream_name_user = "streaming-0af0031518e7-user"

df_user_raw = create_dataframes(stream_name_user)

# Preview the raw records; the JSON payload is in the `data` column
display(df_user_raw)
# Extract ends here

partitionKey,data,stream,shardId,sequenceNumber,approximateArrivalTimestamp
desired-name,eyJpbmQiOjQ4MzAsImZpcnN0X25hbWUiOiJBbmRyZXciLCJsYXN0X25hbWUiOiJCZW50bGV5IiwiYWdlIjozOCwiZGF0ZV9qb2luZWQiOiIyMDE2LTAyLTI0IDIyOjQ2OjQyIn0=,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416390866092035319857381612978210,2024-03-13T15:11:33.283+0000
desired-name,eyJpbmQiOjMyNTYsImZpcnN0X25hbWUiOiJCcmlhbiIsImxhc3RfbmFtZSI6Ik1vcnJpcyIsImFnZSI6NDIsImRhdGVfam9pbmVkIjoiMjAxNS0xMS0xMCAxMzoyODoxNyJ9,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416395782793343692554510020902946,2024-03-13T15:11:37.522+0000
desired-name,eyJpbmQiOjg4NjIsImZpcnN0X25hbWUiOiJKYXNvbiIsImxhc3RfbmFtZSI6Ik1vcmFsZXMiLCJhZ2UiOjQxLCJkYXRlX2pvaW5lZCI6IjIwMTYtMTItMDkgMTI6MDA6MDQifQ==,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416400347697238557394479869853730,2024-03-13T15:11:41.577+0000
desired-name,eyJpbmQiOjQ0NDAsImZpcnN0X25hbWUiOiJCcm9va2UiLCJsYXN0X25hbWUiOiJCcm93biIsImFnZSI6MzUsImRhdGVfam9pbmVkIjoiMjAxNi0wMy0yNCAxMjoyOTo1MiJ9,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416405141088113329399501177225250,2024-03-13T15:11:45.635+0000
desired-name,eyJpbmQiOjEwNDQ3LCJmaXJzdF9uYW1lIjoiS2FyZW4iLCJsYXN0X25hbWUiOiJTb2xpcyIsImFnZSI6MzcsImRhdGVfam9pbmVkIjoiMjAxNy0wNC0xMSAwMzoxMzoxOSJ9,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416409573010168036630330527973410,2024-03-13T15:11:49.478+0000
desired-name,eyJpbmQiOjExMDUwLCJmaXJzdF9uYW1lIjoiSmFtZXMiLCJsYXN0X25hbWUiOiJTbWl0aCIsImFnZSI6MjksImRhdGVfam9pbmVkIjoiMjAxNi0xMS0wOSAwMDowMzo0MiJ9,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416415275513259158836491214389282,2024-03-13T15:11:54.413+0000
desired-name,eyJpbmQiOjM1NTksImZpcnN0X25hbWUiOiJBbm5lIiwibGFzdF9uYW1lIjoiQXRraW5zIiwiYWdlIjoyMywiZGF0ZV9qb2luZWQiOiIyMDE2LTA0LTAxIDAzOjE5OjA1In0=,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416419425755597895858722858598434,2024-03-13T15:11:58.234+0000
desired-name,eyJpbmQiOjc5MDYsImZpcnN0X25hbWUiOiJEb25hbGQiLCJsYXN0X25hbWUiOiJDaGF2ZXoiLCJhZ2UiOjI1LCJkYXRlX2pvaW5lZCI6IjIwMTUtMTItMjYgMDk6MDQ6MjkifQ==,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416424329158722252794999064231970,2024-03-13T15:12:02.602+0000
desired-name,eyJpbmQiOjQ2MDYsImZpcnN0X25hbWUiOiJCcnlhbiIsImxhc3RfbmFtZSI6IkNyYWlnIiwiYWdlIjoyMSwiZGF0ZV9qb2luZWQiOiIyMDE1LTExLTEwIDAzOjIzOjE4In0=,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416429563807521184139462980927522,2024-03-13T15:12:06.759+0000
desired-name,eyJpbmQiOjQ5MDIsImZpcnN0X25hbWUiOiJIYXlsZXkiLCJsYXN0X25hbWUiOiJGcmFuY2lzIiwiYWdlIjoyMCwiZGF0ZV9qb2luZWQiOiIyMDE1LTExLTAzIDAyOjE0OjU5In0=,streaming-0af0031518e7-user,shardId-000000000002,49649638893371005452152416434741636806593596561844863010,2024-03-13T15:12:11.026+0000


Extract End

## Task 5: Transform Kinesis streams in Databricks
Clean the streaming data in the same way you have previously cleaned the batch data.

## Transform Starts

In [None]:
# Define schema for df_user.
# Field names must match the keys of the JSON records on the user stream:
# ind, first_name, last_name, age, date_joined. The previous fields user_name,
# age_category and age_group are not present in the payload, so they could
# only ever parse to null; any such columns have to be derived during cleaning.
schema_df_user = StructType([
    StructField("ind", LongType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("date_joined", StringType(), True)
])

In [None]:
# Transform: schema definitions
# Schema used to parse the JSON payload of the geo stream.
# latitude and longitude are declared as non-nullable floats; the other fields are nullable.
schema_df_geo = (
    StructType()
    .add("ind", LongType(), True)
    .add("country", StringType(), True)
    .add("latitude", FloatType(), False)
    .add("longitude", FloatType(), False)
    .add("timestamp", TimestampType(), True)
)

In [None]:
# Define schema for df_pin.
# The raw pin records carry their row number under the key "index" (it is
# renamed to "ind" during cleaning), so the schema must use that key.
# follower_count is read as a string because the cleaning step rewrites a
# "k" suffix before casting to int; parsing it as an integer here would turn
# such values into null before they can be cleaned.
schema_df_pin = StructType([
    StructField("index", IntegerType(), True),
    StructField("unique_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("follower_count", StringType(), True),
    StructField("poster_name", StringType(), True),
    StructField("tag_list", StringType(), True),
    StructField("is_image_or_video", StringType(), True),
    StructField("image_src", StringType(), True),
    StructField("save_location", StringType(), True),
    StructField("category", StringType(), True)
])

In [None]:
# Decode the geo payload: cast the raw `data` column to a string, parse it as
# JSON with schema_df_geo, then expand the resulting struct into top-level columns.
geo_payload = from_json(col("data").cast("string"), schema_df_geo)
df_geo = df_geo_raw.select(geo_payload.alias("data")).select("data.*")

# Preview the parsed geo stream
display(df_geo)

ind,country,latitude,longitude,timestamp
100,Anguilla,-84.6446,-173.058,2018-09-14T11:30:13.000+0000
5360,Argentina,-87.2245,-100.213,2017-11-04T17:03:23.000+0000
6311,Argentina,-89.63,-179.022,2020-10-12T09:17:07.000+0000
697,Albania,-88.8298,-170.188,2020-07-19T21:46:58.000+0000
228,Bosnia and Herzegovina,-32.3508,-114.656,2021-01-15T17:47:34.000+0000
10999,Saint Martin,-52.2172,21.6704,2019-04-19T07:44:08.000+0000
6856,Armenia,11.3088,-53.5554,2022-06-17T21:26:31.000+0000
1023,Panama,49.9337,110.859,2018-02-15T07:00:12.000+0000
3061,Cambodia,-76.951,-106.265,2019-04-27T10:56:56.000+0000
905,Niger,-85.2927,-142.879,2018-06-21T06:51:31.000+0000


In [None]:
# Parse JSON data and apply schema to df_pin.
# The parsed struct is stored in the column named `data`, so that is the
# column to expand. Selecting "value.*" fails because no `value` column exists.
df_pin = (
    df_pin_raw
    .selectExpr("CAST(data AS STRING)")
    .withColumn("data", from_json("data", schema_df_pin))
    .select("data.*")
)

# Preview the parsed pin stream
display(df_pin)

unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
4a609972-2386-43d5-962e-ffbfed0948ff,Powerful Affirmation Quotes for Self Love and Success,No description available Story format,,Karibikfruechtchen,"Now Quotes,Self Love Quotes,Words Quotes,Quotes To Live By,Life Quotes,Wise Words,Being Happy Quotes,Its Okay Quotes,Qoutes",multi-video(story page format),Image src error.,Local save in /data/quotes,quotes
ad01b4d1-dcd5-473e-8582-0dde129dc884,20 Christmas Trees For Small Spaces (Christmas Tree Alternatives) • Mama and More,"Christmas trees are beautiful and festive, but they take up a lot of space. Check out these alternative Christmas trees for small spaces!",,Mama & More - Love Your Home With DIY,"Scandinavian Christmas Decorations,Christmas Tree Design,Wooden Christmas Trees,Farmhouse Christmas Decor,Noel Christmas,Modern Christmas,Rustic Christmas,Christmas Projects,Simple Christmas",image,https://i.pinimg.com/originals/27/30/51/273051db01648351f013da36b2327971.jpg,Local save in /data/christmas,christmas
7d12f92d-f35a-4104-85ed-1f66c7affe9c,BMW M235i getting a Rainbow Chrome Wrap,"If you want a flashy car, nothing gets closer to that than this BMW M235i. Featuring a rainbow chrome wrap.",,BMWBLOG,"Luxury Sports Cars,Best Luxury Cars,Sport Cars,Fancy Cars,Cute Cars,Ford Gt,Bmw M235i,Dream Cars,Carros Bmw",image,https://i.pinimg.com/originals/9d/69/d8/9d69d86233096c6377460b1b7737f9a5.jpg,Local save in /data/vehicles,vehicles
8a9954cc-ccd2-4e42-8575-0db50ef6db23,Nestfair 7.5 ft. Pre-Lit Christmas Tree with 350 LED Lights,"This lush christmas tree create a vibrant atomasphere, adorning your room, when the tree without LED lights. And Warm yellow lights in the room to create a loving and comfort at…",,Wear24-7,"Frosted Christmas Tree,Pre Lit Christmas Tree,Christmas Store,Christmas Lights,Elegant Christmas,Holiday Tree,Fraser Fir,Artificial Tree,Christmas Wonderland",image,https://i.pinimg.com/originals/11/ad/71/11ad71ed3af810f9cf6e0a06061ba741.jpg,Local save in /data/christmas,christmas
364bc376-7841-4939-a27a-3ca23efc3644,Weihnachtsgeschenk Oma,Geschenk zu Weihnachten für Oma. Weihnachtsgeschenke Oma. Geschenk für Großmutter. Frohe Weihnachten Geschenkbox. Perlenarmband Omi. Lieblingsoma Armband. #oma #weihnachten #wei…,,SR Jewelry,"Gifts For Mum,Mother Gifts,Bridal Gifts,Wedding Gifts,Lace Wedding,Wedding Dresses,Dream Wedding,Wedding Ideas,Mother Of The Bride Bracelets",image,https://i.pinimg.com/originals/8d/92/d2/8d92d2c0f1b7dd3b35609829d3828f6b.jpg,Local save in /data/event-planning,event-planning
474598e3-dfac-4f7f-b80d-f21e2fd1c4ab,Vauxhall,Vauxhall,,Dana Dawes,"Classic Cars British,Classic Car Restoration,Cars Uk,Classic Chevy Trucks,Classic Motors,Vintage Trucks,Retro Cars,Old Cars,Dream Cars",image,https://i.pinimg.com/originals/0d/ce/ea/0dceeaa623cbacd03c11ac831e4406b9.jpg,Local save in /data/vehicles,vehicles
48619244-57dd-4e76-ae0c-6caba6d283d2,41 Best Small Flower Tattoos For Women,"90% people believe small flower tattoos symbolize loss and sadness, it meaning of flower tattoo is love, joy, freshness, unity, and vitality. read hear 41 ideas",,"Beautyholo | Latest Hairstyles, Nail Design Ideas, Home Décor DIY, Women Fashion Ideas","Dainty Tattoos,Cute Tattoos,Body Art Tattoos,Small Tattoos,Sleeve Tattoos,Pretty Tattoos,Tatoos,Delicate Feminine Tattoos,Sexy Tattoos",image,https://i.pinimg.com/originals/e6/36/e3/e636e36ebb8fc379825a82c0290f8b9b.jpg,Local save in /data/tattoos,tattoos
fa6c606c-278a-421b-b50d-645f3dad53ee,DIY Photo Ornaments with a Snow Globe - Busy Kids Happy Mom,Are you looking for the perfect Keepsake Christmas gift? Maybe you’re a teacher and you want to send home something special to the student’s parents for Christmas. I love this D…,,Busy Kids Happy Mom,"Diy Photo Ornaments,Christmas Ornament Crafts,Christmas Gifts For Mom,Xmas Crafts,Christmas Projects,Christmas Holidays,Christmas Decorations,Globe Ornament,Ornaments Ideas",image,https://i.pinimg.com/originals/2b/94/fd/2b94fd0ba711325bd513e6a2d922c6a9.jpg,Local save in /data/christmas,christmas
0086d23f-3538-48bc-9d00-ba9315469ff8,Tattling Troubles: Fostering Kindness in the Classroom,Do you have tattling troubles? No matter how many ways I have tried to cut down on tattling it always seems to creep it's way back into my classroom. This year is no different.…,,Learning with Mrs. Langley,"Social Emotional Learning,Social Skills,Teaching Kindness,Teaching Empathy,Character Education,Physical Education,Teaching Character,Special Education,Education Jobs",image,https://i.pinimg.com/originals/e2/d3/1c/e2d31c25969e6a89332f05d9135a10a7.jpg,Local save in /data/education,education
208d836a-58bf-4e9b-8cec-94f2c063287c,Scented Christmas Bowlies,"8 little bowl fillers scented with cloves to herald in the Christmas season! They are all different in size measuring from 4 3/4"" x 6"" to 5"" x 8"". The embroidery patterns and th…",,Kathy Schmitz,"N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",image,https://i.pinimg.com/originals/4d/d9/cc/4dd9cc88bfe2b91469e5bcd02853c05f.jpg,Local save in /data/christmas,christmas


In [None]:
# Parse JSON data and apply schema to df_user.
# The DDL lists the keys actually present in the user records
# (ind, first_name, last_name, age, date_joined). The former user_name,
# age_category and age_group fields do not exist in the payload and therefore
# always parsed to null; derive them during cleaning if they are needed.
df_user = (
    df_user_raw
    .selectExpr("CAST(data AS STRING)")
    .selectExpr(
        "from_json(data, 'ind bigint, first_name string, last_name string, "
        "age bigint, date_joined string') AS value"
    )
    .select("value.*")
)

# Preview the parsed user stream
display(df_user)
# Transform (parsing) stops here

ind,user_name,age,date_joined,age_category,age_group
4830,,38,2016-02-24 22:46:42,,
3256,,42,2015-11-10 13:28:17,,
8862,,41,2016-12-09 12:00:04,,
4440,,35,2016-03-24 12:29:52,,
10447,,37,2017-04-11 03:13:19,,
11050,,29,2016-11-09 00:03:42,,
3559,,23,2016-04-01 03:19:05,,
7906,,25,2015-12-26 09:04:29,,
4606,,21,2015-11-10 03:23:18,,
4902,,20,2015-11-03 02:14:59,,


### To clean the df_pin DataFrame you should perform the following transformations:
Replace empty entries and entries with no relevant data in each column with Nones
Perform the necessary transformations on the follower_count to ensure every entry is a number. Make sure the data type of this column is an int.
Ensure that each column containing numeric data has a numeric data type
Clean the data in the save_location column to include only the save location path
Rename the index column to ind.
Reorder the DataFrame columns to have the following column order:
- ind
- unique_id
- title
- description
- follower_count
- poster_name
- tag_list
- is_image_or_video
- image_src
- save_location
- category

In [None]:
from pyspark.sql.functions import col, regexp_replace, trim, when
from pyspark.sql.types import IntegerType

# Placeholder strings that stand for "no data", keyed by column.
# NOTE(review): only the placeholders visible in the sample output are listed;
# extend this mapping if other columns use their own placeholder text.
pin_placeholders = {
    "description": ["No description available Story format"],
    "image_src": ["Image src error."],
    "tag_list": ["N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"],
}

# Replace empty entries and placeholder entries with real nulls (None) in every
# string column. fillna("None") did the opposite: it replaced nulls with the
# literal text "None".
for column_name, column_type in df_pin.dtypes:
    if column_type != "string":
        continue
    junk_values = [""] + pin_placeholders.get(column_name, [])
    df_pin = df_pin.withColumn(
        column_name,
        when(trim(col(column_name)).isin(junk_values), None).otherwise(col(column_name)),
    )

# Expand the "k" suffix to "000" and cast to integer; anything that is still
# not a number after the rewrite becomes null through the cast.
follower_text = regexp_replace(col("follower_count").cast("string"), "k", "000")
# NOTE(review): presumably the feed also abbreviates millions with "M" — confirm against the data.
follower_text = regexp_replace(follower_text, "M", "000000")
df_pin = df_pin.withColumn("follower_count", follower_text.cast(IntegerType()))

# Keep only the path in save_location (drop the "Local save in " prefix)
df_pin = df_pin.withColumn(
    "save_location", regexp_replace(col("save_location"), "Local save in ", "")
)

# Rename index column to ind (no-op if the column is already called ind)
df_pin = df_pin.withColumnRenamed("index", "ind")

# Reorder the DataFrame columns, with ind first as the task requires
df_pin = df_pin.select("ind", "unique_id", "title", "description", "follower_count",
                       "poster_name", "tag_list", "is_image_or_video", "image_src",
                       "save_location", "category")

# Show the DataFrame after cleaning and reordering
display(df_pin)

unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
4a609972-2386-43d5-962e-ffbfed0948ff,Powerful Affirmation Quotes for Self Love and Success,No description available Story format,,Karibikfruechtchen,"Now Quotes,Self Love Quotes,Words Quotes,Quotes To Live By,Life Quotes,Wise Words,Being Happy Quotes,Its Okay Quotes,Qoutes",multi-video(story page format),Image src error.,Local save in /data/quotes,quotes
ad01b4d1-dcd5-473e-8582-0dde129dc884,20 Christmas Trees For Small Spaces (Christmas Tree Alternatives) • Mama and More,"Christmas trees are beautiful and festive, but they take up a lot of space. Check out these alternative Christmas trees for small spaces!",,Mama & More - Love Your Home With DIY,"Scandinavian Christmas Decorations,Christmas Tree Design,Wooden Christmas Trees,Farmhouse Christmas Decor,Noel Christmas,Modern Christmas,Rustic Christmas,Christmas Projects,Simple Christmas",image,https://i.pinimg.com/originals/27/30/51/273051db01648351f013da36b2327971.jpg,Local save in /data/christmas,christmas
7d12f92d-f35a-4104-85ed-1f66c7affe9c,BMW M235i getting a Rainbow Chrome Wrap,"If you want a flashy car, nothing gets closer to that than this BMW M235i. Featuring a rainbow chrome wrap.",,BMWBLOG,"Luxury Sports Cars,Best Luxury Cars,Sport Cars,Fancy Cars,Cute Cars,Ford Gt,Bmw M235i,Dream Cars,Carros Bmw",image,https://i.pinimg.com/originals/9d/69/d8/9d69d86233096c6377460b1b7737f9a5.jpg,Local save in /data/vehicles,vehicles
8a9954cc-ccd2-4e42-8575-0db50ef6db23,Nestfair 7.5 ft. Pre-Lit Christmas Tree with 350 LED Lights,"This lush christmas tree create a vibrant atomasphere, adorning your room, when the tree without LED lights. And Warm yellow lights in the room to create a loving and comfort at…",,Wear24-7,"Frosted Christmas Tree,Pre Lit Christmas Tree,Christmas Store,Christmas Lights,Elegant Christmas,Holiday Tree,Fraser Fir,Artificial Tree,Christmas Wonderland",image,https://i.pinimg.com/originals/11/ad/71/11ad71ed3af810f9cf6e0a06061ba741.jpg,Local save in /data/christmas,christmas
364bc376-7841-4939-a27a-3ca23efc3644,Weihnachtsgeschenk Oma,Geschenk zu Weihnachten für Oma. Weihnachtsgeschenke Oma. Geschenk für Großmutter. Frohe Weihnachten Geschenkbox. Perlenarmband Omi. Lieblingsoma Armband. #oma #weihnachten #wei…,,SR Jewelry,"Gifts For Mum,Mother Gifts,Bridal Gifts,Wedding Gifts,Lace Wedding,Wedding Dresses,Dream Wedding,Wedding Ideas,Mother Of The Bride Bracelets",image,https://i.pinimg.com/originals/8d/92/d2/8d92d2c0f1b7dd3b35609829d3828f6b.jpg,Local save in /data/event-planning,event-planning
474598e3-dfac-4f7f-b80d-f21e2fd1c4ab,Vauxhall,Vauxhall,,Dana Dawes,"Classic Cars British,Classic Car Restoration,Cars Uk,Classic Chevy Trucks,Classic Motors,Vintage Trucks,Retro Cars,Old Cars,Dream Cars",image,https://i.pinimg.com/originals/0d/ce/ea/0dceeaa623cbacd03c11ac831e4406b9.jpg,Local save in /data/vehicles,vehicles
48619244-57dd-4e76-ae0c-6caba6d283d2,41 Best Small Flower Tattoos For Women,"90% people believe small flower tattoos symbolize loss and sadness, it meaning of flower tattoo is love, joy, freshness, unity, and vitality. read hear 41 ideas",,"Beautyholo | Latest Hairstyles, Nail Design Ideas, Home Décor DIY, Women Fashion Ideas","Dainty Tattoos,Cute Tattoos,Body Art Tattoos,Small Tattoos,Sleeve Tattoos,Pretty Tattoos,Tatoos,Delicate Feminine Tattoos,Sexy Tattoos",image,https://i.pinimg.com/originals/e6/36/e3/e636e36ebb8fc379825a82c0290f8b9b.jpg,Local save in /data/tattoos,tattoos
fa6c606c-278a-421b-b50d-645f3dad53ee,DIY Photo Ornaments with a Snow Globe - Busy Kids Happy Mom,Are you looking for the perfect Keepsake Christmas gift? Maybe you’re a teacher and you want to send home something special to the student’s parents for Christmas. I love this D…,,Busy Kids Happy Mom,"Diy Photo Ornaments,Christmas Ornament Crafts,Christmas Gifts For Mom,Xmas Crafts,Christmas Projects,Christmas Holidays,Christmas Decorations,Globe Ornament,Ornaments Ideas",image,https://i.pinimg.com/originals/2b/94/fd/2b94fd0ba711325bd513e6a2d922c6a9.jpg,Local save in /data/christmas,christmas
0086d23f-3538-48bc-9d00-ba9315469ff8,Tattling Troubles: Fostering Kindness in the Classroom,Do you have tattling troubles? No matter how many ways I have tried to cut down on tattling it always seems to creep it's way back into my classroom. This year is no different.…,,Learning with Mrs. Langley,"Social Emotional Learning,Social Skills,Teaching Kindness,Teaching Empathy,Character Education,Physical Education,Teaching Character,Special Education,Education Jobs",image,https://i.pinimg.com/originals/e2/d3/1c/e2d31c25969e6a89332f05d9135a10a7.jpg,Local save in /data/education,education
208d836a-58bf-4e9b-8cec-94f2c063287c,Scented Christmas Bowlies,"8 little bowl fillers scented with cloves to herald in the Christmas season! They are all different in size measuring from 4 3/4"" x 6"" to 5"" x 8"". The embroidery patterns and th…",,Kathy Schmitz,"N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",image,https://i.pinimg.com/originals/4d/d9/cc/4dd9cc88bfe2b91469e5bcd02853c05f.jpg,Local save in /data/christmas,christmas


In [None]:
# Clean geo df

In [None]:
# Clean user df

Transform Ends

## Task 6: Write the streaming data to Delta Tables
Once the streaming data has been cleaned, you should save each stream in a Delta Table. You should save the following tables: <your_UserId>_pin_table, <your_UserId>_geo_table and <your_UserId>_user_table.

## Load Starts

In [None]:
# Load starts here

# Append the geo stream to its Delta table. The result of the write call is
# kept in geo_query so the running stream can be controlled later.
# NOTE(review): presumably .table() returns the StreamingQuery handle, as the
# open-source DataStreamWriter.toTable() does — confirm on this runtime.
geo_query = (
    df_geo.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/_checkpoints/geo")
    .table("0af0031518e7_geo_table")
)

# To end the stream once the load is complete, call geo_query.stop()
# (optionally after geo_query.awaitTermination(10), where the timeout is in
# seconds). These are StreamingQuery methods; the df_geo DataFrame itself has
# no stop()/awaitTermination.
display(df_geo)

ind,country,latitude,longitude,timestamp
10974,Barbados,-73.8461,-167.426,2021-08-11T16:32:29.000+0000
5859,Liechtenstein,-10.6711,-116.164,2022-07-21T09:16:22.000+0000
8520,American Samoa,-81.8896,-153.897,2022-03-01T08:09:17.000+0000
7046,Bosnia and Herzegovina,-1.46457,28.1291,2021-03-23T06:49:43.000+0000
9414,Bermuda,-88.054,-162.366,2020-03-28T04:19:44.000+0000
1122,Gibraltar,-54.9169,-93.8569,2021-07-29T01:12:55.000+0000
2022,Netherlands,-25.0311,-2.28113,2020-07-25T22:35:38.000+0000
1025,Burundi,-65.2093,-163.873,2021-12-26T18:34:28.000+0000
5705,Belize,-26.5916,113.806,2019-11-23T14:35:32.000+0000
10709,Belarus,-9.23648,-142.205,2019-04-03T05:48:31.000+0000


In [None]:
# Append the pin stream to its Delta table. The result of the write call is
# kept in pin_query so the running stream can be controlled later.
# NOTE(review): presumably .table() returns the StreamingQuery handle, as the
# open-source DataStreamWriter.toTable() does — confirm on this runtime.
pin_query = (
    df_pin.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/_checkpoints/pin")
    .table("0af0031518e7_pin_table")
)

# To end the stream once the load is complete, call pin_query.stop()
# (optionally after pin_query.awaitTermination(10), where the timeout is in
# seconds). These are StreamingQuery methods; the df_pin DataFrame itself has
# no stop()/awaitTermination.
display(df_pin)

unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
4a609972-2386-43d5-962e-ffbfed0948ff,Powerful Affirmation Quotes for Self Love and Success,No description available Story format,,Karibikfruechtchen,"Now Quotes,Self Love Quotes,Words Quotes,Quotes To Live By,Life Quotes,Wise Words,Being Happy Quotes,Its Okay Quotes,Qoutes",multi-video(story page format),Image src error.,Local save in /data/quotes,quotes
ad01b4d1-dcd5-473e-8582-0dde129dc884,20 Christmas Trees For Small Spaces (Christmas Tree Alternatives) • Mama and More,"Christmas trees are beautiful and festive, but they take up a lot of space. Check out these alternative Christmas trees for small spaces!",,Mama & More - Love Your Home With DIY,"Scandinavian Christmas Decorations,Christmas Tree Design,Wooden Christmas Trees,Farmhouse Christmas Decor,Noel Christmas,Modern Christmas,Rustic Christmas,Christmas Projects,Simple Christmas",image,https://i.pinimg.com/originals/27/30/51/273051db01648351f013da36b2327971.jpg,Local save in /data/christmas,christmas
7d12f92d-f35a-4104-85ed-1f66c7affe9c,BMW M235i getting a Rainbow Chrome Wrap,"If you want a flashy car, nothing gets closer to that than this BMW M235i. Featuring a rainbow chrome wrap.",,BMWBLOG,"Luxury Sports Cars,Best Luxury Cars,Sport Cars,Fancy Cars,Cute Cars,Ford Gt,Bmw M235i,Dream Cars,Carros Bmw",image,https://i.pinimg.com/originals/9d/69/d8/9d69d86233096c6377460b1b7737f9a5.jpg,Local save in /data/vehicles,vehicles
8a9954cc-ccd2-4e42-8575-0db50ef6db23,Nestfair 7.5 ft. Pre-Lit Christmas Tree with 350 LED Lights,"This lush christmas tree create a vibrant atomasphere, adorning your room, when the tree without LED lights. And Warm yellow lights in the room to create a loving and comfort at…",,Wear24-7,"Frosted Christmas Tree,Pre Lit Christmas Tree,Christmas Store,Christmas Lights,Elegant Christmas,Holiday Tree,Fraser Fir,Artificial Tree,Christmas Wonderland",image,https://i.pinimg.com/originals/11/ad/71/11ad71ed3af810f9cf6e0a06061ba741.jpg,Local save in /data/christmas,christmas
364bc376-7841-4939-a27a-3ca23efc3644,Weihnachtsgeschenk Oma,Geschenk zu Weihnachten für Oma. Weihnachtsgeschenke Oma. Geschenk für Großmutter. Frohe Weihnachten Geschenkbox. Perlenarmband Omi. Lieblingsoma Armband. #oma #weihnachten #wei…,,SR Jewelry,"Gifts For Mum,Mother Gifts,Bridal Gifts,Wedding Gifts,Lace Wedding,Wedding Dresses,Dream Wedding,Wedding Ideas,Mother Of The Bride Bracelets",image,https://i.pinimg.com/originals/8d/92/d2/8d92d2c0f1b7dd3b35609829d3828f6b.jpg,Local save in /data/event-planning,event-planning
474598e3-dfac-4f7f-b80d-f21e2fd1c4ab,Vauxhall,Vauxhall,,Dana Dawes,"Classic Cars British,Classic Car Restoration,Cars Uk,Classic Chevy Trucks,Classic Motors,Vintage Trucks,Retro Cars,Old Cars,Dream Cars",image,https://i.pinimg.com/originals/0d/ce/ea/0dceeaa623cbacd03c11ac831e4406b9.jpg,Local save in /data/vehicles,vehicles
48619244-57dd-4e76-ae0c-6caba6d283d2,41 Best Small Flower Tattoos For Women,"90% people believe small flower tattoos symbolize loss and sadness, it meaning of flower tattoo is love, joy, freshness, unity, and vitality. read hear 41 ideas",,"Beautyholo | Latest Hairstyles, Nail Design Ideas, Home Décor DIY, Women Fashion Ideas","Dainty Tattoos,Cute Tattoos,Body Art Tattoos,Small Tattoos,Sleeve Tattoos,Pretty Tattoos,Tatoos,Delicate Feminine Tattoos,Sexy Tattoos",image,https://i.pinimg.com/originals/e6/36/e3/e636e36ebb8fc379825a82c0290f8b9b.jpg,Local save in /data/tattoos,tattoos
fa6c606c-278a-421b-b50d-645f3dad53ee,DIY Photo Ornaments with a Snow Globe - Busy Kids Happy Mom,Are you looking for the perfect Keepsake Christmas gift? Maybe you’re a teacher and you want to send home something special to the student’s parents for Christmas. I love this D…,,Busy Kids Happy Mom,"Diy Photo Ornaments,Christmas Ornament Crafts,Christmas Gifts For Mom,Xmas Crafts,Christmas Projects,Christmas Holidays,Christmas Decorations,Globe Ornament,Ornaments Ideas",image,https://i.pinimg.com/originals/2b/94/fd/2b94fd0ba711325bd513e6a2d922c6a9.jpg,Local save in /data/christmas,christmas
0086d23f-3538-48bc-9d00-ba9315469ff8,Tattling Troubles: Fostering Kindness in the Classroom,Do you have tattling troubles? No matter how many ways I have tried to cut down on tattling it always seems to creep it's way back into my classroom. This year is no different.…,,Learning with Mrs. Langley,"Social Emotional Learning,Social Skills,Teaching Kindness,Teaching Empathy,Character Education,Physical Education,Teaching Character,Special Education,Education Jobs",image,https://i.pinimg.com/originals/e2/d3/1c/e2d31c25969e6a89332f05d9135a10a7.jpg,Local save in /data/education,education
208d836a-58bf-4e9b-8cec-94f2c063287c,Scented Christmas Bowlies,"8 little bowl fillers scented with cloves to herald in the Christmas season! They are all different in size measuring from 4 3/4"" x 6"" to 5"" x 8"". The embroidery patterns and th…",,Kathy Schmitz,"N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",image,https://i.pinimg.com/originals/4d/d9/cc/4dd9cc88bfe2b91469e5bcd02853c05f.jpg,Local save in /data/christmas,christmas


In [None]:
# Writing the DataFrames to Delta tables

# Stream the cleaned user DataFrame into its Delta table in append mode.
# The handle returned by the write call is kept in `user_query`: it is the
# object through which this stream can later be awaited or stopped.
user_query = df_user.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", "/tmp/kinesis/_checkpoints/user") \
    .table("0af0031518e7_user_table")

# To stop the user stream after 10 seconds, use the query handle rather than
# the DataFrame (a DataFrame has neither method; the PySpark timeout is in seconds):
# user_query.awaitTermination(10)
# user_query.stop()
display(df_user)
# Load ends here

ind,user_name,age,date_joined,age_category,age_group
2376,,21,2016-02-16 12:29:18,,
9374,,27,2015-11-12 15:08:12,,
1268,,32,2016-06-08 22:10:13,,
7774,,42,2016-01-26 07:11:54,,
5778,,20,2015-10-26 23:04:52,,
10161,,22,2017-07-07 20:23:09,,
7652,,39,2015-10-24 03:55:03,,
9253,,39,2017-08-10 12:26:13,,
3741,,43,2016-06-14 12:18:24,,
2771,,37,2016-03-31 03:27:47,,


In [None]:
# Deleting the checkpoint folder
# The command below is intentionally left commented out. It targets the parent
# directory of the geo, pin and user checkpoint locations used by the
# writeStream cells in this notebook; the second argument (True) presumably
# requests a recursive delete — confirm against the dbutils.fs.rm documentation.
# NOTE(review): removing checkpoints discards the saved progress of those
# streams, so uncomment and run it manually only when a full reset is intended.
# dbutils.fs.rm("/tmp/kinesis/_checkpoints/", True)