In [0]:
import urllib
import urllib.parse

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reader settings for the credentials file: CSV format, first row is the
# header, and fields are separated by commas.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Load the credentials CSV into a Spark DataFrame.
aws_keys_df = (
    spark.read
    .format(file_type)
    .options(header=first_row_is_header, sep=delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

In [0]:
# Fetch both AWS keys for 'databricks-user' with a single Spark job instead
# of filtering and collecting the same row twice.
credentials_row = (
    aws_keys_df
    .where(col("User name") == "databricks-user")
    .select("Access key ID", "Secret access key")
    .first()  # returns None when no row matches
)
if credentials_row is None:
    # Fail with an explicit message rather than an opaque IndexError.
    raise ValueError(
        "No row with User name 'databricks-user' found in the credentials file"
    )

ACCESS_KEY = credentials_row["Access key ID"]
SECRET_KEY = credentials_row["Secret access key"]

# URL-encode the secret key; safe="" means "/" is percent-encoded as well.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [0]:
# Schema used to parse the JSON payload of each record. Every field is
# nullable; "index" is a long and all remaining fields are read as strings.
schema = StructType([
    StructField("index", LongType()),
    StructField("unique_id", StringType()),
    StructField("title", StringType()),
    StructField("description", StringType()),
    StructField("follower_count", StringType()),
    StructField("poster_name", StringType()),
    StructField("tag_list", StringType()),
    StructField("is_image_or_video", StringType()),
    StructField("image_src", StringType()),
    StructField("save_location", StringType()),
    StructField("category", StringType()),
])

In [0]:
# Connection settings for the Kinesis source, authenticated with the keys
# loaded from the credentials file.
kinesis_options = {
    'streamName': 'streaming-12256357c821-pin',
    'initialPosition': 'earliest',
    'region': 'us-east-1',
    'awsAccessKey': ACCESS_KEY,
    'awsSecretKey': SECRET_KEY,
}

# Open the stream as a streaming DataFrame.
df = spark.readStream.format('kinesis').options(**kinesis_options).load()

In [0]:
# Decode the "data" column as a string, parse it as JSON using `schema`,
# and flatten the parsed struct into one top-level column per field.
payload_json = col("data").cast("string")
parsed_payload = from_json(payload_json, schema).alias("parsed_data")
df_pin = df.select(parsed_payload).select("parsed_data.*")


In [0]:
## transform data
# Placeholder texts that stand for "no data", keyed by the column they occur
# in. Each value is a regex: any cell in which it matches is replaced by null.
placeholder_patterns = {
    "poster_name": "User Info Error",
    "title": "No Title Data",
    "follower_count": "User",
    "description": "No description av",
    "tag_list": "N,o, ,T,a,g,s, ,A",
}

for column_name, pattern in placeholder_patterns.items():
    df_pin = df_pin.withColumn(
        column_name,
        when(col(column_name).rlike(pattern), None).otherwise(col(column_name)),
    )

In [0]:
# Convert follower_count from text such as "27k", "1.5k" or "3M" to an integer.
#
# The numeric part is parsed as a decimal and scaled BEFORE the final cast to
# int: casting "1.5" straight to int would drop the fraction, turning "1.5k"
# into 1000 instead of 1500. Decimal (not double) arithmetic keeps the scaling
# exact. Only a trailing k/M suffix is stripped.
follower_digits = regexp_replace(
    col("follower_count"), "[kM]$", ""
).cast("decimal(18,3)")

# Scale factor implied by the suffix; values without a suffix are kept as-is.
follower_multiplier = (
    when(col("follower_count").endswith("k"), 1000)
    .when(col("follower_count").endswith("M"), 1000000)
    .otherwise(1)
)

df_pin = df_pin.withColumn(
    "follower_count",
    (follower_digits * follower_multiplier).cast("int"),
)

In [0]:
# Keep only the path in save_location. The whitespace after the
# "Local save in" prefix is removed too, so the path has no leading space.
df_pin = df_pin.withColumn(
    "save_location",
    regexp_replace(col("save_location"), r"Local save in\s*", ""),
)

In [0]:
# Rename the "index" column to "ind".
df_pin = df_pin.withColumnRenamed(existing="index", new="ind")

In [0]:
# Final column order for the cleaned pin data.
pin_columns = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
]
df_pin = df_pin.select(*pin_columns)


In [0]:
# Render the cleaned pin DataFrame in the notebook output.
# NOTE(review): df_pin derives from a readStream source, so this presumably
# starts a live streaming query — confirm whether it should be stopped before
# running the write cell.
display(df_pin)

In [0]:
# Checkpoint directory used by the streaming write.
checkpoint_path = "/tmp/kinesis/_checkpoints/"

# Delete the checkpoint directory (second argument True requests a recursive
# delete) before starting the write.
dbutils.fs.rm(checkpoint_path, True)

# Append the cleaned stream to the PIN_DATA table in Delta format.
(
    df_pin.writeStream
    .format("delta")
    .outputMode("append")
    .options(checkpointLocation=checkpoint_path, mergeSchema="true")
    .table("PIN_DATA")
)
