In [None]:
# Unmount the S3 bucket if it is currently mounted.
# Unmounting first lets the mount cell further down re-create the mount cleanly after a
# cluster restart, cluster termination, or dataset corruption/deletion.
# Mount name for the bucket
MOUNT_NAME = "/mnt/user-0affe012670f-mount"

# dbutils.fs.unmount raises when the mount point does not exist, which would break a
# top-to-bottom run on a fresh workspace, so only unmount when the mount is present.
if any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(MOUNT_NAME)

In [None]:
# Import url processing
import urllib
import urllib.parse

# Import pyspark functions
from pyspark.sql.functions import *

In [None]:
# Location of the Delta table that is read for the AWS keys
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"
# Load the Delta table into a Spark DataFrame
aws_keys_df = spark.read.load(delta_table_path, format="delta")


In [None]:
# Get the AWS access key and secret key from the Spark DataFrame.
# Both columns are fetched from the same row in a single Spark job, so the key and the
# secret are guaranteed to belong together (two separate collect() calls give no such guarantee).
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    # first() returns None on an empty DataFrame; fail with a message that names the cause
    raise ValueError("No AWS credentials found: the credentials table is empty")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key; safe="" also encodes "/" so the key can be embedded in a URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
# Name of the S3 bucket to mount
AWS_S3_BUCKET = "user-0affe012670f-bucket"
# Path the bucket is mounted under
MOUNT_NAME = "/mnt/user-0affe012670f-mount"
# Source URL built from the access key, the URL-encoded secret key and the bucket name
SOURCE_URL = f"s3n://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_S3_BUCKET}"
# Mount the bucket at MOUNT_NAME
dbutils.fs.mount(source=SOURCE_URL, mount_point=MOUNT_NAME)

In [None]:
# List the contents of the topics folder under the mount point
topics_listing = dbutils.fs.ls("/mnt/user-0affe012670f-mount/topics")
display(topics_listing)

path,name,size,modificationTime
dbfs:/mnt/user-0affe012670f-mount/topics/0affe012670f.geo/,0affe012670f.geo/,0,1722113431954
dbfs:/mnt/user-0affe012670f-mount/topics/0affe012670f.pin/,0affe012670f.pin/,0,1722113431954
dbfs:/mnt/user-0affe012670f-mount/topics/0affe012670f.user/,0affe012670f.user/,0,1722113431954


In [None]:
%sql
-- Turn off the Delta format check setting for this session.
-- NOTE(review): presumably required so the plain-JSON reads below are not rejected; confirm it is still needed.
SET spark.databricks.delta.formatCheck.enabled=false

key,value
spark.databricks.delta.formatCheck.enabled,False


In [None]:
# Root folder of the topic data under the mount point, and the format of the files in it
file_location = "/mnt/user-0affe012670f-mount/topics"
file_type = "json"

# Each topic's JSON objects are read from <file_location>/<user_id>.<topic>/partition=0/
user_id = "0affe012670f"
pin_data_path = "{}/{}.pin/partition=0/".format(file_location, user_id)
geo_data_path = "{}/{}.geo/partition=0/".format(file_location, user_id)
user_data_path = "{}/{}.user/partition=0/".format(file_location, user_id)
# Schema-inference flag (not referenced by the reads in this cell)
infer_schema = "true"

# Read the JSON objects of each topic from the mounted S3 bucket.
# multiLine allows a single JSON record to span several lines; PERMISSIVE keeps malformed records.
df_pin = spark.read.option("multiLine", True).option("mode", "PERMISSIVE").json(pin_data_path)
df_geo = spark.read.option("multiLine", True).option("mode", "PERMISSIVE").json(geo_data_path)
df_user = spark.read.option("multiLine", True).option("mode", "PERMISSIVE").json(user_data_path)

# Show the three raw DataFrames
for topic_df in (df_pin, df_geo, df_user):
    display(topic_df)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,1,71k,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,2008,video,"Karin Peters, Renovated Faith | Transforming Your Home & Heart",Local save in /data/christmas,"Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),b117b8d7-2364-410e-8046-faa2024ce281
event-planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,1,4k,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,4507,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",Free Ways to Market Your Event Planning Business - Learn About Event Planning,54849a25-0e4c-4c9f-b732-6b25782a2e36
beauty,"Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",1,184,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,1467,image,Megan Catalogna,Local save in /data/beauty,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care","""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""",b609b5c3-6b50-4cc5-b371-ff87fa61f352
art,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",1,305,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,20,image,Wall Canvas Mall,Local save in /data/art,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,70755e3d-dcad-4c71-a215-437a01739a02
event-planning,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,1,3k,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,4790,image,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings",Local save in /data/event-planning,"Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,cca701ec-24ed-4cf9-b036-c0a87978ef27
finance,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",1,28k,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,5563,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",How To Live Off Investments In Retirement – Dividends Diversify,007370e5-218a-4347-9ef4-19f3abed30bc
education,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1,1k,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,4291,image,Educate2Empower Publishing,Local save in /data/education,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",Poster: How Are You Feeling Today? — Educate2Empower Publishing,62acf454-cb9f-4099-8a32-1a5b79d08380
christmas,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",1,164,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,2095,video,eyelashdance,Local save in /data/christmas,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,f8bfc64d-88dd-441a-9d5b-19e1a496c45d
finance,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,1,28k,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,5257,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,dcd72c64-8bbf-496a-bd07-165ff8017c4b
diy-and-crafts,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",1,40k,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,3272,video,Sustain My Craft Habit,Local save in /data/diy-and-crafts,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",Spring Thrift Store Craft with Cricut Joy!,7ec6cac9-60be-4081-9030-d16e6963282f


country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),5681,-84.9073,-105.769,2022-06-03T11:04:26
Antarctica (the territory South of 60 deg S),8681,-72.136,-130.529,2019-10-25T14:08:25
Antarctica (the territory South of 60 deg S),4790,45.2499,11.5767,2021-05-24T08:49:25
Heard Island and McDonald Islands,9701,45.5964,111.936,2017-10-31T17:42:09
Holy See (Vatican City State),7180,-22.7118,-167.739,2018-04-22T21:33:50
Slovakia (Slovak Republic),5203,-9.88959,14.5641,2019-06-04T16:23:54
Bouvet Island (Bouvetoya),9938,-88.516,-178.811,2018-04-03T18:40:47
Bouvet Island (Bouvetoya),9899,-88.516,-178.811,2019-11-10T10:07:08
Bouvet Island (Bouvetoya),5382,72.6957,136.639,2022-10-11T14:27:13
United States of America,1477,52.2604,-27.6119,2022-03-20T15:56:25


age,date_joined,first_name,ind,last_name
32,2015-12-18T05:07:36,Alexander,5260,Blanchard
23,2015-10-31T19:20:09,Alexandria,6051,Anderson
46,2016-07-02T08:06:40,Christina,2139,Carpenter
24,2016-08-24T23:59:06,Christopher,4630,Norris
29,2017-07-16T19:09:03,Abigail,5574,Henderson
41,2016-09-17T17:04:09,Alexandra,10476,Miller
54,2017-09-10T00:12:15,Christina,9951,Garcia
30,2016-01-22T19:09:18,Antonio,7350,Gonzalez
29,2017-05-30T10:56:50,Kathryn,316,Rasmussen
20,2015-11-20T09:08:00,Andrew,7695,Alexander


## Cleaning the Data and performing some computations.

In [None]:
from pyspark.sql.functions import col, when
# Clean the pin, geo and user DataFrames by replacing empty-string entries with nulls (None).

def replace_empty_strings_with_null(df):
    """Return `df` with every empty-string entry in its string columns replaced by null.

    Only string-typed columns are compared against "". Comparing a non-string column
    with "" relies on an implicit cast that can never match and may fail outright
    (e.g. under ANSI SQL mode or for complex types), so those columns are passed
    through unchanged. Column names and column order are preserved.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new Spark DataFrame with the same columns as `df`.
    """
    # df.dtypes is a list of (column name, type name) pairs
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    return df.select([
        when(col(c) == "", None).otherwise(col(c)).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])

df_pin_cleaned = replace_empty_strings_with_null(df_pin)
df_geo_cleaned = replace_empty_strings_with_null(df_geo)
df_user_cleaned = replace_empty_strings_with_null(df_user)

display(df_pin_cleaned)
display(df_geo_cleaned)
display(df_user_cleaned)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,1,71k,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,2008,video,"Karin Peters, Renovated Faith | Transforming Your Home & Heart",Local save in /data/christmas,"Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),b117b8d7-2364-410e-8046-faa2024ce281
event-planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,1,4k,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,4507,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",Free Ways to Market Your Event Planning Business - Learn About Event Planning,54849a25-0e4c-4c9f-b732-6b25782a2e36
beauty,"Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",1,184,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,1467,image,Megan Catalogna,Local save in /data/beauty,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care","""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""",b609b5c3-6b50-4cc5-b371-ff87fa61f352
art,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",1,305,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,20,image,Wall Canvas Mall,Local save in /data/art,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,70755e3d-dcad-4c71-a215-437a01739a02
event-planning,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,1,3k,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,4790,image,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings",Local save in /data/event-planning,"Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,cca701ec-24ed-4cf9-b036-c0a87978ef27
finance,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",1,28k,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,5563,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",How To Live Off Investments In Retirement – Dividends Diversify,007370e5-218a-4347-9ef4-19f3abed30bc
education,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1,1k,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,4291,image,Educate2Empower Publishing,Local save in /data/education,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",Poster: How Are You Feeling Today? — Educate2Empower Publishing,62acf454-cb9f-4099-8a32-1a5b79d08380
christmas,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",1,164,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,2095,video,eyelashdance,Local save in /data/christmas,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,f8bfc64d-88dd-441a-9d5b-19e1a496c45d
finance,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,1,28k,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,5257,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,dcd72c64-8bbf-496a-bd07-165ff8017c4b
diy-and-crafts,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",1,40k,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,3272,video,Sustain My Craft Habit,Local save in /data/diy-and-crafts,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",Spring Thrift Store Craft with Cricut Joy!,7ec6cac9-60be-4081-9030-d16e6963282f


country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),5681,-84.9073,-105.769,2022-06-03T11:04:26
Antarctica (the territory South of 60 deg S),8681,-72.136,-130.529,2019-10-25T14:08:25
Antarctica (the territory South of 60 deg S),4790,45.2499,11.5767,2021-05-24T08:49:25
Heard Island and McDonald Islands,9701,45.5964,111.936,2017-10-31T17:42:09
Holy See (Vatican City State),7180,-22.7118,-167.739,2018-04-22T21:33:50
Slovakia (Slovak Republic),5203,-9.88959,14.5641,2019-06-04T16:23:54
Bouvet Island (Bouvetoya),9938,-88.516,-178.811,2018-04-03T18:40:47
Bouvet Island (Bouvetoya),9899,-88.516,-178.811,2019-11-10T10:07:08
Bouvet Island (Bouvetoya),5382,72.6957,136.639,2022-10-11T14:27:13
United States of America,1477,52.2604,-27.6119,2022-03-20T15:56:25


age,date_joined,first_name,ind,last_name
32,2015-12-18T05:07:36,Alexander,5260,Blanchard
23,2015-10-31T19:20:09,Alexandria,6051,Anderson
46,2016-07-02T08:06:40,Christina,2139,Carpenter
24,2016-08-24T23:59:06,Christopher,4630,Norris
29,2017-07-16T19:09:03,Abigail,5574,Henderson
41,2016-09-17T17:04:09,Alexandra,10476,Miller
54,2017-09-10T00:12:15,Christina,9951,Garcia
30,2016-01-22T19:09:18,Antonio,7350,Gonzalez
29,2017-05-30T10:56:50,Kathryn,316,Rasmussen
20,2015-11-20T09:08:00,Andrew,7695,Alexander


In [None]:
#Perform the necessary transformations on the follower_count to ensure every entry is a number. Make sure the data type of this column is an int.

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Converter for follower_count values with a 'k' or 'M' suffix (wrapped as a UDF below)
def convert_follower_count(follower_count):
    """Convert a follower-count value such as "71k", "2M" or "184" to an int.

    Args:
        follower_count: The raw value, normally a string; None is allowed.

    Returns:
        The count as an int, or None when the input is None or cannot be
        interpreted as a number. None (rather than the raw string) is returned
        for unparseable input so the result always fits an integer column.
    """
    if follower_count is None:
        return None

    # str() tolerates values that are already numeric; strip() tolerates stray whitespace
    text = str(follower_count).strip()
    try:
        # round() instead of int() so float error (e.g. 2299.9999999999995) cannot truncate the count
        if text.endswith('k'):
            return round(float(text[:-1]) * 1000)
        if text.endswith('M'):
            return round(float(text[:-1]) * 1000000)
        if text.isdigit():
            return int(text)
    except (ValueError, OverflowError):
        # A suffix with a non-numeric prefix (e.g. "k"), or a non-finite value: treat as missing
        return None

    # No suffix and not a plain non-negative integer: treat as missing
    return None

# Wrap the converter as a Spark UDF whose declared return type is integer
convert_follower_count_udf = udf(f=convert_follower_count, returnType=IntegerType())

# Replace follower_count with the UDF's result for each row
df_pin_transformed = df_pin_cleaned.withColumn(
    "follower_count",
    convert_follower_count_udf(df_pin_cleaned["follower_count"]),
)

display(df_pin_transformed)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,1,71000.0,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,2008,video,"Karin Peters, Renovated Faith | Transforming Your Home & Heart",Local save in /data/christmas,"Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),b117b8d7-2364-410e-8046-faa2024ce281
event-planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,1,4000.0,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,4507,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",Free Ways to Market Your Event Planning Business - Learn About Event Planning,54849a25-0e4c-4c9f-b732-6b25782a2e36
beauty,"Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",1,184.0,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,1467,image,Megan Catalogna,Local save in /data/beauty,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care","""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""",b609b5c3-6b50-4cc5-b371-ff87fa61f352
art,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",1,305.0,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,20,image,Wall Canvas Mall,Local save in /data/art,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,70755e3d-dcad-4c71-a215-437a01739a02
event-planning,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,1,3000.0,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,4790,image,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings",Local save in /data/event-planning,"Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,cca701ec-24ed-4cf9-b036-c0a87978ef27
finance,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",1,28000.0,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,5563,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",How To Live Off Investments In Retirement – Dividends Diversify,007370e5-218a-4347-9ef4-19f3abed30bc
education,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1,1000.0,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,4291,image,Educate2Empower Publishing,Local save in /data/education,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",Poster: How Are You Feeling Today? — Educate2Empower Publishing,62acf454-cb9f-4099-8a32-1a5b79d08380
christmas,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",1,164.0,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,2095,video,eyelashdance,Local save in /data/christmas,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,f8bfc64d-88dd-441a-9d5b-19e1a496c45d
finance,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,1,28000.0,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,5257,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,dcd72c64-8bbf-496a-bd07-165ff8017c4b
diy-and-crafts,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",1,40000.0,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,3272,video,Sustain My Craft Habit,Local save in /data/diy-and-crafts,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",Spring Thrift Store Craft with Cricut Joy!,7ec6cac9-60be-4081-9030-d16e6963282f


In [None]:
# Ensure that each column containing numeric data has a numeric data type in all the dataframes

from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType

# Per-DataFrame mapping of column name -> Spark data type the column should be cast to
pin_numeric_columns = dict(follower_count=IntegerType(), index=IntegerType())
geo_numeric_columns = dict(latitude=FloatType(), longitude=FloatType(), ind=IntegerType())
user_numeric_columns = dict(ind=IntegerType(), age=IntegerType())

# Cast a set of columns to the data types given for them
def convert_columns_to_numeric(df, columns_dict):
    """Return `df` with each column named in `columns_dict` cast to its mapped type.

    Args:
        df: Spark DataFrame whose columns should be cast.
        columns_dict: Mapping of column name -> Spark data type to cast that column to.

    Returns:
        The DataFrame after all casts have been applied, one column at a time.
    """
    converted = df
    for target_column in columns_dict:
        target_type = columns_dict[target_column]
        # withColumn with an existing name replaces that column with the cast version
        converted = converted.withColumn(target_column, col(target_column).cast(target_type))
    return converted

# Cast the numeric columns of each DataFrame using its own column/type mapping
df_pin_numeric = convert_columns_to_numeric(df=df_pin_transformed, columns_dict=pin_numeric_columns)
df_geo_numeric = convert_columns_to_numeric(df=df_geo_cleaned, columns_dict=geo_numeric_columns)
df_user_numeric = convert_columns_to_numeric(df=df_user_cleaned, columns_dict=user_numeric_columns)

# Render the three converted DataFrames, one table each
for numeric_frame in (df_pin_numeric, df_geo_numeric, df_user_numeric):
    display(numeric_frame)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,1,71000.0,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,2008,video,"Karin Peters, Renovated Faith | Transforming Your Home & Heart",Local save in /data/christmas,"Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),b117b8d7-2364-410e-8046-faa2024ce281
event-planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,1,4000.0,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,4507,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",Free Ways to Market Your Event Planning Business - Learn About Event Planning,54849a25-0e4c-4c9f-b732-6b25782a2e36
beauty,"Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",1,184.0,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,1467,image,Megan Catalogna,Local save in /data/beauty,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care","""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""",b609b5c3-6b50-4cc5-b371-ff87fa61f352
art,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",1,305.0,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,20,image,Wall Canvas Mall,Local save in /data/art,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,70755e3d-dcad-4c71-a215-437a01739a02
event-planning,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,1,3000.0,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,4790,image,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings",Local save in /data/event-planning,"Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,cca701ec-24ed-4cf9-b036-c0a87978ef27
finance,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",1,28000.0,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,5563,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",How To Live Off Investments In Retirement – Dividends Diversify,007370e5-218a-4347-9ef4-19f3abed30bc
education,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1,1000.0,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,4291,image,Educate2Empower Publishing,Local save in /data/education,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",Poster: How Are You Feeling Today? — Educate2Empower Publishing,62acf454-cb9f-4099-8a32-1a5b79d08380
christmas,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",1,164.0,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,2095,video,eyelashdance,Local save in /data/christmas,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,f8bfc64d-88dd-441a-9d5b-19e1a496c45d
finance,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,1,28000.0,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,5257,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,Local save in /data/finance,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,dcd72c64-8bbf-496a-bd07-165ff8017c4b
diy-and-crafts,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",1,40000.0,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,3272,video,Sustain My Craft Habit,Local save in /data/diy-and-crafts,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",Spring Thrift Store Craft with Cricut Joy!,7ec6cac9-60be-4081-9030-d16e6963282f


country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),5681,-84.9073,-105.769,2022-06-03T11:04:26
Antarctica (the territory South of 60 deg S),8681,-72.136,-130.529,2019-10-25T14:08:25
Antarctica (the territory South of 60 deg S),4790,45.2499,11.5767,2021-05-24T08:49:25
Heard Island and McDonald Islands,9701,45.5964,111.936,2017-10-31T17:42:09
Holy See (Vatican City State),7180,-22.7118,-167.739,2018-04-22T21:33:50
Slovakia (Slovak Republic),5203,-9.88959,14.5641,2019-06-04T16:23:54
Bouvet Island (Bouvetoya),9938,-88.516,-178.811,2018-04-03T18:40:47
Bouvet Island (Bouvetoya),9899,-88.516,-178.811,2019-11-10T10:07:08
Bouvet Island (Bouvetoya),5382,72.6957,136.639,2022-10-11T14:27:13
United States of America,1477,52.2604,-27.6119,2022-03-20T15:56:25


age,date_joined,first_name,ind,last_name
32,2015-12-18T05:07:36,Alexander,5260,Blanchard
23,2015-10-31T19:20:09,Alexandria,6051,Anderson
46,2016-07-02T08:06:40,Christina,2139,Carpenter
24,2016-08-24T23:59:06,Christopher,4630,Norris
29,2017-07-16T19:09:03,Abigail,5574,Henderson
41,2016-09-17T17:04:09,Alexandra,10476,Miller
54,2017-09-10T00:12:15,Christina,9951,Garcia
30,2016-01-22T19:09:18,Antonio,7350,Gonzalez
29,2017-05-30T10:56:50,Kathryn,316,Rasmussen
20,2015-11-20T09:08:00,Andrew,7695,Alexander


In [None]:
#Clean the data in the save_location column to include only the save location path

from pyspark.sql.functions import regexp_replace

# Strip the leading "Local save in " label so only the path remains in save_location
save_location_path = regexp_replace("save_location", "^Local save in ", "")
df_pin_cleaned_location = df_pin_numeric.withColumn("save_location", save_location_path)

display(df_pin_cleaned_location)

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,1,71000.0,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,2008,video,"Karin Peters, Renovated Faith | Transforming Your Home & Heart",/data/christmas,"Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),b117b8d7-2364-410e-8046-faa2024ce281
event-planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,1,4000.0,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,4507,image,EventPlanning.com | Learn How To Become An Event Planner,/data/event-planning,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",Free Ways to Market Your Event Planning Business - Learn About Event Planning,54849a25-0e4c-4c9f-b732-6b25782a2e36
beauty,"Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",1,184.0,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,1467,image,Megan Catalogna,/data/beauty,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care","""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""",b609b5c3-6b50-4cc5-b371-ff87fa61f352
art,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",1,305.0,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,20,image,Wall Canvas Mall,/data/art,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,70755e3d-dcad-4c71-a215-437a01739a02
event-planning,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,1,3000.0,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,4790,image,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings",/data/event-planning,"Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,cca701ec-24ed-4cf9-b036-c0a87978ef27
finance,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",1,28000.0,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,5563,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,/data/finance,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",How To Live Off Investments In Retirement – Dividends Diversify,007370e5-218a-4347-9ef4-19f3abed30bc
education,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1,1000.0,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,4291,image,Educate2Empower Publishing,/data/education,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",Poster: How Are You Feeling Today? — Educate2Empower Publishing,62acf454-cb9f-4099-8a32-1a5b79d08380
christmas,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",1,164.0,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,2095,video,eyelashdance,/data/christmas,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,f8bfc64d-88dd-441a-9d5b-19e1a496c45d
finance,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,1,28000.0,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,5257,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,/data/finance,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,dcd72c64-8bbf-496a-bd07-165ff8017c4b
diy-and-crafts,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",1,40000.0,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,3272,video,Sustain My Craft Habit,/data/diy-and-crafts,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",Spring Thrift Store Craft with Cricut Joy!,7ec6cac9-60be-4081-9030-d16e6963282f


In [None]:
# Rename the index column to ind.

from pyspark.sql.functions import col

# Give the pin index column the name `ind`, the name the geo and user DataFrames use
df_pin_renamed = df_pin_cleaned_location.withColumnRenamed(existing="index", new="ind")

display(df_pin_renamed)

category,description,downloaded,follower_count,image_src,ind,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
christmas,This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,1,71000.0,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,2008,video,"Karin Peters, Renovated Faith | Transforming Your Home & Heart",/data/christmas,"Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),b117b8d7-2364-410e-8046-faa2024ce281
event-planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,1,4000.0,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,4507,image,EventPlanning.com | Learn How To Become An Event Planner,/data/event-planning,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",Free Ways to Market Your Event Planning Business - Learn About Event Planning,54849a25-0e4c-4c9f-b732-6b25782a2e36
beauty,"Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",1,184.0,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,1467,image,Megan Catalogna,/data/beauty,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care","""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""",b609b5c3-6b50-4cc5-b371-ff87fa61f352
art,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",1,305.0,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,20,image,Wall Canvas Mall,/data/art,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,70755e3d-dcad-4c71-a215-437a01739a02
event-planning,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,1,3000.0,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,4790,image,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings",/data/event-planning,"Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,cca701ec-24ed-4cf9-b036-c0a87978ef27
finance,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",1,28000.0,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,5563,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,/data/finance,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",How To Live Off Investments In Retirement – Dividends Diversify,007370e5-218a-4347-9ef4-19f3abed30bc
education,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1,1000.0,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,4291,image,Educate2Empower Publishing,/data/education,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",Poster: How Are You Feeling Today? — Educate2Empower Publishing,62acf454-cb9f-4099-8a32-1a5b79d08380
christmas,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",1,164.0,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,2095,video,eyelashdance,/data/christmas,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,f8bfc64d-88dd-441a-9d5b-19e1a496c45d
finance,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,1,28000.0,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,5257,image,Dividends Diversify: Money Matters So Build Wealth & Be Rich,/data/finance,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,dcd72c64-8bbf-496a-bd07-165ff8017c4b
diy-and-crafts,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",1,40000.0,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,3272,video,Sustain My Craft Habit,/data/diy-and-crafts,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",Spring Thrift Store Craft with Cricut Joy!,7ec6cac9-60be-4081-9030-d16e6963282f


In [None]:
# Put the pin columns into their final order; columns not listed here are left out
pin_column_order = [
    "ind", "unique_id", "title", "description", "follower_count", "poster_name",
    "tag_list", "is_image_or_video", "image_src", "save_location", "category",
]
df_pin_clean = df_pin_renamed.select(*pin_column_order)

display(df_pin_clean)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
2008,b117b8d7-2364-410e-8046-faa2024ce281,EASIEST Way to Add Ribbon to a Christmas Tree (Simple Ribbon Hack),This STEP-BY-STEP tutorial with video shows you how to add cascading ribbon on Christmas trees. Waterfall ribbon Christmas trees allow you to add any combinations of ribbon & me…,71000.0,"Karin Peters, Renovated Faith | Transforming Your Home & Heart","Diy Christmas Tree Garland,Elegant Christmas Trees,Diy Christmas Decorations Easy,Dollar Tree Christmas,Christmas Tree Toppers,Traditional Christmas Tree,Christmas Crafts,Christmas Décor,Christmas Tree Decorations Ribbon",video,https://i.pinimg.com/videos/thumbnails/originals/3b/de/b9/3bdeb9de975c6533ed4a40818df4d623.0000001.jpg,/data/christmas,christmas
4507,54849a25-0e4c-4c9f-b732-6b25782a2e36,Free Ways to Market Your Event Planning Business - Learn About Event Planning,Do you wish there were ways you can get great marketing for your business for free? Are you exploring ways to grow your event planning business but don’t yet have funds to pay f…,4000.0,EventPlanning.com | Learn How To Become An Event Planner,"Event Planning Template,Event Planning Quotes,Event Planning Checklist,Event Planning Business,Event Planning Design,Wedding Planning,Business Ideas,Catering Business,Wedding Ideas",image,https://i.pinimg.com/originals/c7/4a/7c/c74a7ccc1630bec91c5bbca72abdac7f.jpg,/data/event-planning,event-planning
1467,b609b5c3-6b50-4cc5-b371-ff87fa61f352,"""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""","Buy ""Neutrogena Makeup Remover Cleansing Face Wipes, Daily Cleansing Facial Towelettes to Remove Waterproof Makeup and Mascara, Alcohol-Free, Value Twin Pack, 25 Count, 2 Pack""…",184.0,Megan Catalogna,"Face Skin Care,Diy Skin Care,Anti Aging Skin Care,Natural Skin Care,Natural Beauty,Natural Hair,Piel Natural,Natural Life,Organic Skin Care",image,https://i.pinimg.com/originals/b9/60/25/b96025d6275179059d1c4a4ead5d22da.jpg,/data/beauty,beauty
20,70755e3d-dcad-4c71-a215-437a01739a02,Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room Bedroom Canvas Wall Art Ready to Hang - 1 Panel 60x30 / Floating Frame,"Framed Canvas Home Artwork Decoration Abstract Mountain Nature Scenery Canvas Wall Art for Living Room, Bedroom Canvas Wall Art Ready to Hang Each canvas is professionally print…",305.0,Wall Canvas Mall,"Wall Painting Frames,Framing Canvas Art,Bee Painting,Framed Canvas,Canvas Wall Art,Canvas Paintings,Bedroom Canvas,Bedroom Artwork,Canvas Home",image,https://i.pinimg.com/originals/06/63/7e/06637e1efa23c66e7b2194e4e23ad71a.jpg,/data/art,art
4790,cca701ec-24ed-4cf9-b036-c0a87978ef27,Surviving Your First Year as a Wedding Planner—Part 1 | Pointers For Planners,Pondering beginning a PARTY PLANNING and a Decorating business? Look at how to start a party planning business on a financial limit. If you love arranging and improving gatherin…,3000.0,"Fun Party Plans | Planning Parties, Games, Shower Ideas, Weddings","Wedding Event Planner,Wedding Events,Wedding Planning,Destination Wedding,Wedding Themes,Weddings,Party Events,Tent Wedding,Wedding Table",image,https://i.pinimg.com/originals/db/91/3d/db913d245ae8d4da5c468ebf5f8a0ad3.jpg,/data/event-planning,event-planning
5563,007370e5-218a-4347-9ef4-19f3abed30bc,How To Live Off Investments In Retirement – Dividends Diversify,"You can retire early and have financial freedom. Learn how to live off your investments in early retirement. Money management and your money matters, so make the most of your fi…",28000.0,Dividends Diversify: Money Matters So Build Wealth & Be Rich,"Retirement Money,Investing For Retirement,Early Retirement,Investing Money,Retirement Planning,Financial Planning,Saving Money,Retirement Strategies,Financial Budget",image,https://i.pinimg.com/originals/b5/c2/be/b5c2bef121cc458ebee325128fcc0974.png,/data/finance,finance
4291,62acf454-cb9f-4099-8a32-1a5b79d08380,Poster: How Are You Feeling Today? — Educate2Empower Publishing,"Children, particularly young children, do not have the vocabulary or the experience to draw upon to express exactly how they are feeling. Download and use this printable poster…",1000.0,Educate2Empower Publishing,"Mental Health Counseling,Kids Mental Health,Mental And Emotional Health,Social Emotional Learning,Counseling Activities,Art Therapy Activities,School Counseling,Play Therapy,Social Work Activities",image,https://i.pinimg.com/originals/81/40/37/8140370a065b86bdc4cecab22811823a.png,/data/education,education
2095,f8bfc64d-88dd-441a-9d5b-19e1a496c45d,Eyelashdance™Merry Christmas Front Door Hanging Sign With LED Light Wooden Sign,"These are the best 70 Winter Decor Ideas for the home in the Wonderland Theme Decorations for your Rustic Farmhouse Living room, kitchen home decor ideas. After christmas party,…",164.0,eyelashdance,"Christmas Wood Crafts,Christmas Signs Wood,Rustic Christmas,Christmas Projects,Holiday Crafts,Christmas Time,Christmas Ornaments,Holiday Decor,Christmas Front Doors",video,https://i.pinimg.com/videos/thumbnails/originals/34/11/9e/34119e8339499f4c4d1c1e9405767259.0000001.jpg,/data/christmas,christmas
5257,dcd72c64-8bbf-496a-bd07-165ff8017c4b,VYM Review - Vanguard High Dividend Yield ETF – Dividends Diversify,If you are a beginning investor this article is for you. Get more than 400 dividend stocks with this one Vanguard ETF. You don't need a lot of money. And you don't need to take…,28000.0,Dividends Diversify: Money Matters So Build Wealth & Be Rich,"Stock Market Investing,Investing In Stocks,Investing Money,Saving Money,Investment Tips,Investment Portfolio,Dividend Investing,Dividend Stocks,Planning Budget",image,https://i.pinimg.com/originals/f1/14/ce/f114ce6e0657b64ade645124b9e3f8b1.jpg,/data/finance,finance
3272,7ec6cac9-60be-4081-9030-d16e6963282f,Spring Thrift Store Craft with Cricut Joy!,"If you enjoy thrift store crafts, you'll love this simple DIY upcycling project using the Cricut Joy! #ad Transform a thrifted photo frame into a beautiful and budget-friendly h…",40000.0,Sustain My Craft Habit,"Diy Arts And Crafts,Diy Crafts To Sell,Diy Projects To Make And Sell,Diy Crafts Videos,Diy Vinyl Projects,Ideas For Cricut Projects,Vinil Cricut,Thrift Store Crafts,Cricut Craft Room",video,https://i.pinimg.com/videos/thumbnails/originals/8a/fd/86/8afd863592affcbcc5e7ae64386794dc.0000001.jpg,/data/diy-and-crafts,diy-and-crafts


## Cleaning the df_geo DataFrame

In [None]:
# Create a new column coordinates that contains an array based on the latitude and longitude columns

from pyspark.sql.functions import array

# Build `coordinates` from the numeric-typed geo DataFrame (df_geo_numeric) so that the
# casts applied to latitude, longitude and ind carry through the rest of the geo cleaning.
# Starting from df_geo_cleaned would silently discard that conversion step.
df_geo_with_coordinates = df_geo_numeric.withColumn("coordinates", array("latitude", "longitude"))

display(df_geo_with_coordinates)

country,ind,latitude,longitude,timestamp,coordinates
Antarctica (the territory South of 60 deg S),5681,-84.9073,-105.769,2022-06-03T11:04:26,"List(-84.9073, -105.769)"
Antarctica (the territory South of 60 deg S),8681,-72.136,-130.529,2019-10-25T14:08:25,"List(-72.136, -130.529)"
Antarctica (the territory South of 60 deg S),4790,45.2499,11.5767,2021-05-24T08:49:25,"List(45.2499, 11.5767)"
Heard Island and McDonald Islands,9701,45.5964,111.936,2017-10-31T17:42:09,"List(45.5964, 111.936)"
Holy See (Vatican City State),7180,-22.7118,-167.739,2018-04-22T21:33:50,"List(-22.7118, -167.739)"
Slovakia (Slovak Republic),5203,-9.88959,14.5641,2019-06-04T16:23:54,"List(-9.88959, 14.5641)"
Bouvet Island (Bouvetoya),9938,-88.516,-178.811,2018-04-03T18:40:47,"List(-88.516, -178.811)"
Bouvet Island (Bouvetoya),9899,-88.516,-178.811,2019-11-10T10:07:08,"List(-88.516, -178.811)"
Bouvet Island (Bouvetoya),5382,72.6957,136.639,2022-10-11T14:27:13,"List(72.6957, 136.639)"
United States of America,1477,52.2604,-27.6119,2022-03-20T15:56:25,"List(52.2604, -27.6119)"


In [None]:
# Remove latitude and longitude now that their values are held in `coordinates`
redundant_columns = ("latitude", "longitude")
df_geo_with_coordinates = df_geo_with_coordinates.drop(*redundant_columns)

display(df_geo_with_coordinates)

country,ind,timestamp,coordinates
Antarctica (the territory South of 60 deg S),5681,2022-06-03T11:04:26,"List(-84.9073, -105.769)"
Antarctica (the territory South of 60 deg S),8681,2019-10-25T14:08:25,"List(-72.136, -130.529)"
Antarctica (the territory South of 60 deg S),4790,2021-05-24T08:49:25,"List(45.2499, 11.5767)"
Heard Island and McDonald Islands,9701,2017-10-31T17:42:09,"List(45.5964, 111.936)"
Holy See (Vatican City State),7180,2018-04-22T21:33:50,"List(-22.7118, -167.739)"
Slovakia (Slovak Republic),5203,2019-06-04T16:23:54,"List(-9.88959, 14.5641)"
Bouvet Island (Bouvetoya),9938,2018-04-03T18:40:47,"List(-88.516, -178.811)"
Bouvet Island (Bouvetoya),9899,2019-11-10T10:07:08,"List(-88.516, -178.811)"
Bouvet Island (Bouvetoya),5382,2022-10-11T14:27:13,"List(72.6957, 136.639)"
United States of America,1477,2022-03-20T15:56:25,"List(52.2604, -27.6119)"


In [None]:
from pyspark.sql.functions import col, to_timestamp

# Parse the string-typed "timestamp" column into a Spark timestamp, overwriting
# the column under the same name (a later cell extracts the year from it).
df_geo_with_coordinates = df_geo_with_coordinates.withColumn(
    "timestamp",
    to_timestamp(col("timestamp")),
)

display(df_geo_with_coordinates)

country,ind,timestamp,coordinates
Antarctica (the territory South of 60 deg S),5681,2022-06-03T11:04:26.000+0000,"List(-84.9073, -105.769)"
Antarctica (the territory South of 60 deg S),8681,2019-10-25T14:08:25.000+0000,"List(-72.136, -130.529)"
Antarctica (the territory South of 60 deg S),4790,2021-05-24T08:49:25.000+0000,"List(45.2499, 11.5767)"
Heard Island and McDonald Islands,9701,2017-10-31T17:42:09.000+0000,"List(45.5964, 111.936)"
Holy See (Vatican City State),7180,2018-04-22T21:33:50.000+0000,"List(-22.7118, -167.739)"
Slovakia (Slovak Republic),5203,2019-06-04T16:23:54.000+0000,"List(-9.88959, 14.5641)"
Bouvet Island (Bouvetoya),9938,2018-04-03T18:40:47.000+0000,"List(-88.516, -178.811)"
Bouvet Island (Bouvetoya),9899,2019-11-10T10:07:08.000+0000,"List(-88.516, -178.811)"
Bouvet Island (Bouvetoya),5382,2022-10-11T14:27:13.000+0000,"List(72.6957, 136.639)"
United States of America,1477,2022-03-20T15:56:25.000+0000,"List(52.2604, -27.6119)"


In [None]:
# Final geo DataFrame: keep exactly these columns, in this order:
# ind, country, coordinates, timestamp
df_geo_clean = df_geo_with_coordinates.select(
    ["ind", "country", "coordinates", "timestamp"]
)
display(df_geo_clean)

ind,country,coordinates,timestamp
5681,Antarctica (the territory South of 60 deg S),"List(-84.9073, -105.769)",2022-06-03T11:04:26.000+0000
8681,Antarctica (the territory South of 60 deg S),"List(-72.136, -130.529)",2019-10-25T14:08:25.000+0000
4790,Antarctica (the territory South of 60 deg S),"List(45.2499, 11.5767)",2021-05-24T08:49:25.000+0000
9701,Heard Island and McDonald Islands,"List(45.5964, 111.936)",2017-10-31T17:42:09.000+0000
7180,Holy See (Vatican City State),"List(-22.7118, -167.739)",2018-04-22T21:33:50.000+0000
5203,Slovakia (Slovak Republic),"List(-9.88959, 14.5641)",2019-06-04T16:23:54.000+0000
9938,Bouvet Island (Bouvetoya),"List(-88.516, -178.811)",2018-04-03T18:40:47.000+0000
9899,Bouvet Island (Bouvetoya),"List(-88.516, -178.811)",2019-11-10T10:07:08.000+0000
5382,Bouvet Island (Bouvetoya),"List(72.6957, 136.639)",2022-10-11T14:27:13.000+0000
1477,United States of America,"List(52.2604, -27.6119)",2022-03-20T15:56:25.000+0000


## Clean the df_user DataFrame

In [None]:
from pyspark.sql.functions import concat_ws

# Add "user_name" as "<first_name> <last_name>", joined with a single space
df_user_cleaned = df_user_cleaned.withColumn(
    "user_name",
    concat_ws(" ", "first_name", "last_name"),
)
display(df_user_cleaned)

age,date_joined,first_name,ind,last_name,user_name
32,2015-12-18T05:07:36,Alexander,5260,Blanchard,Alexander Blanchard
23,2015-10-31T19:20:09,Alexandria,6051,Anderson,Alexandria Anderson
46,2016-07-02T08:06:40,Christina,2139,Carpenter,Christina Carpenter
24,2016-08-24T23:59:06,Christopher,4630,Norris,Christopher Norris
29,2017-07-16T19:09:03,Abigail,5574,Henderson,Abigail Henderson
41,2016-09-17T17:04:09,Alexandra,10476,Miller,Alexandra Miller
54,2017-09-10T00:12:15,Christina,9951,Garcia,Christina Garcia
30,2016-01-22T19:09:18,Antonio,7350,Gonzalez,Antonio Gonzalez
29,2017-05-30T10:56:50,Kathryn,316,Rasmussen,Kathryn Rasmussen
20,2015-11-20T09:08:00,Andrew,7695,Alexander,Andrew Alexander


In [None]:
# The full name now lives in user_name, so remove the two columns it was built from
df_user_name = df_user_cleaned.drop(*["first_name", "last_name"])
display(df_user_name)

age,date_joined,ind,user_name
32,2015-12-18T05:07:36,5260,Alexander Blanchard
23,2015-10-31T19:20:09,6051,Alexandria Anderson
46,2016-07-02T08:06:40,2139,Christina Carpenter
24,2016-08-24T23:59:06,4630,Christopher Norris
29,2017-07-16T19:09:03,5574,Abigail Henderson
41,2016-09-17T17:04:09,10476,Alexandra Miller
54,2017-09-10T00:12:15,9951,Christina Garcia
30,2016-01-22T19:09:18,7350,Antonio Gonzalez
29,2017-05-30T10:56:50,316,Kathryn Rasmussen
20,2015-11-20T09:08:00,7695,Andrew Alexander


In [None]:
from pyspark.sql.functions import to_timestamp

# Parse the string-typed "date_joined" column into a Spark timestamp,
# keeping the same column name
df_user_dates = df_user_name.withColumn(
    "date_joined",
    to_timestamp("date_joined"),
)
display(df_user_dates)

age,date_joined,ind,user_name
32,2015-12-18T05:07:36.000+0000,5260,Alexander Blanchard
23,2015-10-31T19:20:09.000+0000,6051,Alexandria Anderson
46,2016-07-02T08:06:40.000+0000,2139,Christina Carpenter
24,2016-08-24T23:59:06.000+0000,4630,Christopher Norris
29,2017-07-16T19:09:03.000+0000,5574,Abigail Henderson
41,2016-09-17T17:04:09.000+0000,10476,Alexandra Miller
54,2017-09-10T00:12:15.000+0000,9951,Christina Garcia
30,2016-01-22T19:09:18.000+0000,7350,Antonio Gonzalez
29,2017-05-30T10:56:50.000+0000,316,Kathryn Rasmussen
20,2015-11-20T09:08:00.000+0000,7695,Andrew Alexander


In [None]:
# Final user DataFrame: keep exactly these columns, in this order:
# ind, user_name, age, date_joined
df_user_clean = df_user_dates.select(
    ["ind", "user_name", "age", "date_joined"]
)
display(df_user_clean)

ind,user_name,age,date_joined
5260,Alexander Blanchard,32,2015-12-18T05:07:36.000+0000
6051,Alexandria Anderson,23,2015-10-31T19:20:09.000+0000
2139,Christina Carpenter,46,2016-07-02T08:06:40.000+0000
4630,Christopher Norris,24,2016-08-24T23:59:06.000+0000
5574,Abigail Henderson,29,2017-07-16T19:09:03.000+0000
10476,Alexandra Miller,41,2016-09-17T17:04:09.000+0000
9951,Christina Garcia,54,2017-09-10T00:12:15.000+0000
7350,Antonio Gonzalez,30,2016-01-22T19:09:18.000+0000
316,Kathryn Rasmussen,29,2017-05-30T10:56:50.000+0000
7695,Andrew Alexander,20,2015-11-20T09:08:00.000+0000


## Querying the data

In [None]:
'''
Find the most popular category in each country
Find the most popular Pinterest category people post to based on their country.
Your query should return a DataFrame that contains the following columns:
country
category
category_count, a new column containing the desired query output
'''

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Match each geo record with its pin record on ind, so every row carries
# both a country and a category
df_joined = df_geo_clean.join(df_pin_clean, df_geo_clean.ind == df_pin_clean.ind, "inner")

# Number of posts for every (country, category) pair
df_category_count = (
    df_joined
    .groupBy("country", "category")
    .count()
    .withColumnRenamed("count", "category_count")
)

# Rank the categories inside each country, highest count first.
# rank() assigns the same rank to tied counts.
windowSpec = Window.partitionBy("country").orderBy(col("category_count").desc())
df_ranked = df_category_count.withColumn("rank", rank().over(windowSpec))

# Keep rank 1 only; a country whose top categories are tied yields several rows
df_top_category_per_country = (
    df_ranked
    .filter(col("rank") == 1)
    .select("country", "category", "category_count")
)

display(df_top_category_per_country)

country,category,category_count
Afghanistan,finance,4
Albania,home-decor,2
Algeria,quotes,4
American Samoa,travel,1
Andorra,education,2
Angola,beauty,1
Anguilla,education,1
Antarctica (the territory South of 60 deg S),event-planning,1
Antarctica (the territory South of 60 deg S),finance,1
Antarctica (the territory South of 60 deg S),tattoos,1


In [None]:
'''
Find which was the most popular category each year
Find how many posts each category had between 2018 and 2022.
Your query should return a DataFrame that contains the following columns:
post_year, a new column that contains only the year from the timestamp column
category
category_count, a new column containing the desired query output
'''

from pyspark.sql.functions import year, rank

# Join df_geo_clean to df_pin_clean on ind so each post has both a timestamp and a category
df_joined = df_geo_clean.join(df_pin_clean, df_geo_clean.ind == df_pin_clean.ind, "inner")

# Derive post_year from the timestamp, keep 2018-2022 (both ends inclusive),
# then count the posts for every (post_year, category) pair
df_posts_by_year = (
    df_joined
    .withColumn("post_year", year("timestamp"))
    .filter(col("post_year").between(2018, 2022))
    .groupBy("post_year", "category")
    .count()
    .withColumnRenamed("count", "category_count")
)

display(df_posts_by_year)

post_year,category,category_count
2019,christmas,2
2021,christmas,3
2019,education,4
2020,education,2
2021,event-planning,4
2018,art,6
2022,finance,4
2022,education,2
2018,tattoos,2
2019,home-decor,6


In [None]:
'''
Find the user with the most followers in each country.
Step 1: For each country find the user with the most followers.
Your query should return a DataFrame that contains the following columns:
country
poster_name
follower_count
Step 2: Based on the above query, find the country with the user with most followers.
Your query should return a DataFrame that contains the following columns:
country
follower_count
This DataFrame should have only one entry.
'''

from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Step 1: Find the user with the most followers in each country.
# Join df_geo_clean to df_pin_clean on ind so each row has both country and follower_count.
df_joined = df_geo_clean.join(df_pin_clean, df_geo_clean.ind == df_pin_clean.ind, "inner")

# row_number() keeps exactly one row per country; when several posters share the
# top follower_count, which of them is kept is not defined by this ordering.
windowSpec = Window.partitionBy("country").orderBy(col("follower_count").desc())
df_most_followers_per_country = (
    df_joined
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") == 1)
    .select("country", "poster_name", "follower_count")
)

# Step 2: Find the country with the user with the most followers overall.
# A plain sort + limit(1) returns the single top row without an unpartitioned
# window, which would pull every row into one partition just to number them.
df_country_with_most_followers = (
    df_most_followers_per_country
    .orderBy(col("follower_count").desc())
    .limit(1)
    .select("country", "follower_count")
)

display(df_country_with_most_followers)

country,follower_count
Algeria,5000000


In [None]:
'''
Find the most popular category for each age group.
What is the most popular category people post to based on the following age groups:
18-24
25-35
36-50
+50
Your query should return a DataFrame that contains the following columns:
age_group, a new column based on the original age column
category
category_count, a new column containing the desired query output
'''

from pyspark.sql.window import Window
from pyspark.sql.functions import when, col, rank

# Join df_user_clean to df_pin_clean on ind so each post has both an age and a category
df_join_age = df_user_clean.join(df_pin_clean, df_user_clean.ind == df_pin_clean.ind, "inner")

# Define age groups.
# Every bucket has an explicit condition: a catch-all otherwise("50+") would also
# label ages below 18 and missing ages as "50+". Rows matching no bucket get a
# null age_group and are excluded from the result.
df_join_age_grouped = (
    df_join_age
    .withColumn(
        "age_group",
        when((col("age") >= 18) & (col("age") <= 24), "18-24")
        .when((col("age") >= 25) & (col("age") <= 35), "25-35")
        .when((col("age") >= 36) & (col("age") <= 50), "36-50")
        .when(col("age") > 50, "50+")
    )
    .filter(col("age_group").isNotNull())
)

# Group by age_group and category, then count posts
df_category_count_by_age_group = (
    df_join_age_grouped
    .groupBy("age_group", "category")
    .count()
    .withColumnRenamed("count", "category_count")
)

# Rank categories within each age group, highest count first
windowSpec = Window.partitionBy("age_group").orderBy(col("category_count").desc())

# Keep rank 1 only; rank() gives tied categories the same rank, so an age group
# with tied top categories yields several rows
df_top_category_by_age_group = (
    df_category_count_by_age_group
    .withColumn("rank", rank().over(windowSpec))
    .filter(col("rank") == 1)
    .select("age_group", "category", "category_count")
)

display(df_top_category_by_age_group)

age_group,category,category_count
18-24,quotes,8
25-35,travel,6
36-50,art,3
36-50,event-planning,3
36-50,beauty,3
36-50,vehicles,3
50+,event-planning,3


In [None]:
'''
Find the median follower count for different age groups.
What is the median follower count for users in the following age groups:
18-24
25-35
36-50
+50
Your query should return a DataFrame that contains the following columns:
age_group, a new column based on the original age column
median_follower_count, a new column containing the desired query output
'''

from pyspark.sql.functions import col, expr, when

# Join df_user_clean to df_pin_clean to add follower_count column
df_join_follower_count = df_user_clean.join(df_pin_clean, df_user_clean.ind == df_pin_clean.ind, "inner")

# Define age groups.
# Every bucket has an explicit condition: a catch-all otherwise("50+") would also
# label ages below 18 and missing ages as "50+". Rows matching no bucket get a
# null age_group and are excluded from the result.
df_join_follower_count_grouped = (
    df_join_follower_count
    .withColumn(
        "age_group",
        when((col("age") >= 18) & (col("age") <= 24), "18-24")
        .when((col("age") >= 25) & (col("age") <= 35), "25-35")
        .when((col("age") >= 36) & (col("age") <= 50), "36-50")
        .when(col("age") > 50, "50+")
    )
    .filter(col("age_group").isNotNull())
)

# Calculate median follower count for each age group.
# A grouped aggregate yields one row per age group directly, instead of computing
# the window aggregate on every row and collapsing duplicates with distinct().
# percentile_approx returns an approximate percentile (here the 0.5 quantile).
df_median_follower_count = (
    df_join_follower_count_grouped
    .groupBy("age_group")
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))
    .orderBy("age_group")
)

display(df_median_follower_count)

age_group,median_follower_count
18-24,104000
25-35,28000
36-50,13000
50+,2000


In [None]:
'''
Find how many users have joined each year.
Find how many users have joined between 2015 and 2020.
Your query should return a DataFrame that contains the following columns:
post_year, a new column that contains only the year from the timestamp column
number_users_joined, a new column containing the desired query output
'''

from pyspark.sql.functions import year

# Derive post_year from date_joined and keep 2015-2020 (both ends inclusive)
df_users_joined = (
    df_user_clean
    .withColumn("post_year", year("date_joined"))
    .filter(col("post_year").between(2015, 2020))
)

# One row per post_year with the number of distinct ind values that joined in it
df_users_joined_count = (
    df_users_joined
    .groupBy("post_year")
    .agg(countDistinct("ind").alias("number_users_joined"))
)

display(df_users_joined_count)

post_year,number_users_joined
2015,43
2016,58
2017,23


In [None]:
# Find the median follower count of users based on their joining year.
'''
Find the median follower count of users that have joined between 2015 and 2020.
Your query should return a DataFrame that contains the following columns:
post_year, a new column that contains only the year from the timestamp column
median_follower_count, a new column containing the desired query output
'''

from pyspark.sql.functions import col, expr, year

# Extract year from date_joined and keep users who joined between 2015 and 2020 (inclusive)
df_users_filtered = (
    df_user_clean
    .withColumn("post_year", year("date_joined"))
    .filter((col("post_year") >= 2015) & (col("post_year") <= 2020))
)

# Join df_users_filtered with df_pin_clean on ind to get follower_count
df_join_follower = df_users_filtered.join(df_pin_clean, "ind", "inner")

# Calculate median follower count for each post_year.
# A grouped aggregate yields one row per year directly, instead of computing the
# window aggregate on every row and collapsing duplicates with distinct().
# percentile_approx returns an approximate percentile (here the 0.5 quantile).
df_median_follower_by_year = (
    df_join_follower
    .groupBy("post_year")
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))
    .orderBy("post_year")
)

display(df_median_follower_by_year)

post_year,median_follower_count
2015,110000
2016,23000
2017,3000


In [None]:
'''
Find median count of followers based on their joining year and age group.
Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.
Your query should return a DataFrame that contains the following columns:
age_group, a new column based on the original age column
post_year, a new column that contains only the year from the timestamp column
median_follower_count, a new column containing the desired query output
'''

from pyspark.sql.functions import col, expr, when, year

# Define age groups based on age.
# The buckets are the same ones used by the other age-group queries in this
# notebook (18-24, 25-35, 36-50, 50+) so results are comparable across queries.
# Rows matching no bucket (age below 18 or missing) get a null age_group and
# are excluded from the result.
df_users_age_group = (
    df_user_clean
    .withColumn(
        "age_group",
        when((col("age") >= 18) & (col("age") <= 24), "18-24")
        .when((col("age") >= 25) & (col("age") <= 35), "25-35")
        .when((col("age") >= 36) & (col("age") <= 50), "36-50")
        .when(col("age") > 50, "50+")
    )
    .filter(col("age_group").isNotNull())
)

# Extract year from date_joined and keep users who joined between 2015 and 2020 (inclusive)
df_users_filtered_age_group = (
    df_users_age_group
    .withColumn("post_year", year("date_joined"))
    .filter((col("post_year") >= 2015) & (col("post_year") <= 2020))
)

# Join df_users_filtered_age_group with df_pin_clean on ind to get follower_count
df_join_follower_age_group = df_users_filtered_age_group.join(df_pin_clean, "ind", "inner")

# Calculate median follower count for each (age_group, post_year) pair.
# A grouped aggregate yields one row per pair directly, instead of computing the
# window aggregate on every row and collapsing duplicates with distinct().
# percentile_approx returns an approximate percentile (here the 0.5 quantile).
df_median_follower_by_year_age_group = (
    df_join_follower_age_group
    .groupBy("age_group", "post_year")
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))
    .orderBy("post_year", "age_group")
)

display(df_median_follower_by_year_age_group)

age_group,post_year,median_follower_count
18-24,2015,132000
25-34,2015,42000
35-44,2015,176
45-54,2015,2000
18-24,2016,42000
25-34,2016,23000
35-44,2016,3000
45-54,2016,4000
55-64,2016,12000
18-24,2017,77
