In [0]:
df_pin = spark.table("pin_data")
df_geo= spark.table("geo_data")
df_user=spark.table("user_data")
df_pin.createOrReplaceTempView("df_pin")
df_geo.createOrReplaceTempView("df_geo")
df_user.createOrReplaceTempView("df_user")



In [0]:
df_pin.printSchema()

In [0]:
df_geo.printSchema()

In [0]:
df_user.printSchema()

In [0]:
popular_categories_df = spark.sql("""
    WITH CategoryCounts AS (
        SELECT 
            df_pin.category as category,
            COUNT(df_pin.category) as category_count,
            df_geo.country as country,
            ROW_NUMBER() OVER(PARTITION BY df_geo.country ORDER BY COUNT(df_pin.category) DESC) as row_num
        FROM 
            df_pin
        JOIN
            df_geo
        ON  
            df_geo.ind = df_pin.ind
        GROUP BY 
            df_geo.country, df_pin.category
    )
    SELECT 
        country,
        category,
        category_count
    FROM 
        CategoryCounts
    WHERE 
        row_num = 1
""")
display(popular_categories_df)


country,category,category_count
Afghanistan,education,34
Albania,beauty,31
Algeria,quotes,64
American Samoa,beauty,20
Andorra,tattoos,21
Angola,diy-and-crafts,18
Anguilla,diy-and-crafts,11
Antarctica (the territory South of 60 deg S),finance,10
Antigua and Barbuda,christmas,26
Argentina,tattoos,22


In [0]:
posts_per_category_2018_2022 = spark.sql("""
    SELECT
        YEAR(geo.timestamp) as post_year,
        pin.category as category,
        COUNT(pin.category) as category_count
    FROM 
        df_pin pin
    JOIN
        df_geo geo
    ON
        geo.ind = pin.ind
    WHERE
        YEAR(geo.timestamp) BETWEEN 2018 AND 2022
    GROUP BY
        YEAR(geo.timestamp), pin.category
    ORDER BY
        post_year, category_count DESC
""")

display(posts_per_category_2018_2022)


post_year,category,category_count
2018,art,89
2018,travel,60
2018,education,60
2018,diy-and-crafts,59
2018,quotes,58
2018,beauty,52
2018,vehicles,44
2018,christmas,40
2018,mens-fashion,35
2018,tattoos,26


In [0]:
top_user_per_country=spark.sql("""
    with CountryMaxFollowers as (
        select
        geo.country as country,
        pin.poster_name as poster_name,
        pin.follower_count as follower_count,
        row_number() over (partition by country order by pin.follower_count desc)
        as rank
        from
            df_pin pin
        join
            df_geo geo
        on
            geo.ind=pin.ind 

        
    )
    select 
        country,
        poster_name,
        follower_count
    from CountryMaxFollowers
    where
    rank=1
                               """)
display(top_user_per_country)
top_user_per_country.createOrReplaceTempView("top_user_per_country_view")

country_with_user_with_most_followers=spark.sql("""
SELECT 
        country, 
        MAX(follower_count) as follower_count
    FROM 
        top_user_per_country_view
    GROUP BY 
        country
    ORDER BY 
        follower_count DESC
    LIMIT 1
                                         """)
display(country_with_user_with_most_followers)

country,poster_name,follower_count
Afghanistan,9GAG,3000000
Albania,The Minds Journal,5000000
Algeria,YourTango,942000
American Samoa,Mamas Uncut,8000000
Andorra,Teachers Pay Teachers,1000000
Angola,Tastemade,8000000
Anguilla,"Kristen | Lifestyle, Mom Tips & Teacher Stuff Blog",92000
Antarctica (the territory South of 60 deg S),Refinery29,1000000
Antigua and Barbuda,Country Living Magazine,1000000
Argentina,Next Luxury,800000


country,follower_count
American Samoa,8000000


In [0]:
category_popularity_by_age_group=spark.sql("""
    with agegroups as (
        SELECT
            ind,
            CASE 
                WHEN age between 18 and 24 then "18-24"
                WHEN age between 25 and 35 then "25-35"
                WHEN age between 36 and 50 then "36-50"
                ELSE "+50"
            END AS age_group
        from
        df_user
    ),
    category_age_group as(
        select 
            agegroups.age_group,
            pin.category,
            count(pin.category) as category_count,
            row_number() over(partition by age_group order by count(pin.category)desc) as rank
        from
            df_pin pin
        join
            agegroups
        on 
            pin.ind=agegroups.ind
        group by
            agegroups.age_group,
            pin.category
)
        SELECT
    age_group,
    category,
    category_count
FROM category_age_group
where rank=1



""")
display(category_popularity_by_age_group)

age_group,category,category_count
+50,beauty,45
18-24,tattoos,132
25-35,travel,90
36-50,diy-and-crafts,65


In [0]:
median_follower_count=spark.sql("""
    with agegroups as (
        SELECT
            ind,
            CASE 
                WHEN age between 18 and 24 then "18-24"
                WHEN age between 25 and 35 then "25-35"
                WHEN age between 36 and 50 then "36-50"
                ELSE "+50"
            END AS age_group
        from
        df_user
    )
    select
         percentile_approx(pin.follower_count, 0.5) AS median_follower_count,
        agegroups.age_group as age_group
    from 
        df_pin pin
    join
        agegroups
    on
        agegroups.ind=pin.ind
    group by
        age_group

""")
display(median_follower_count)

median_follower_count,age_group
6000,36-50
1000,+50
108000,18-24
28000,25-35


In [0]:
yearly_users_joining=spark.sql("""
    SELECT
        year(date_joined) as post_year,
        count(ind) as number_users_joined
    FROM 
        df_user
    GROUP BY
        year(date_joined)
        """)
display(yearly_users_joining)

post_year,number_users_joined
2015.0,54
,464
2016.0,66
2017.0,24
