### Q1. [User with Most Approved Flags](https://platform.stratascratch.com/coding/2104-user-with-most-approved-flags/official-solution?code_type=6)


In [2]:
import os
import sys

# Make PySpark use the interpreter that runs this kernel, for both the driver
# and the workers, so the two sides cannot end up on different Python versions.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Imported once, after the environment variables above have been set.
from pyspark.sql import SparkSession


# Creating a spark session (getOrCreate returns the already-running session
# if there is one, so re-running this cell is harmless)
spark = SparkSession.builder.appName('SparkLearning').getOrCreate()

In [3]:
# Import your libraries
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, BooleanType, StructField, StructType, DateType
from pyspark.sql.window import Window

In [9]:
# Schemas and placeholder DataFrames for the two input tables.
# Every column is declared nullable.

# user_flags: all four columns are strings
user_flags_schema = StructType([
    StructField(column_name, StringType(), nullable=True)
    for column_name in ("user_firstname", "user_lastname", "video_id", "flag_id")
])

# flag_review: mixed column types, listed as (name, type) pairs
flag_review_schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in (
        ("flag_id", StringType()),
        ("reviewed_by_yt", BooleanType()),
        ("reviewed_date", DateType()),
        ("reviewed_outcome", StringType()),
    )
])

# No rows are supplied here; each frame only carries its schema.
user_flags = spark.createDataFrame([], schema=user_flags_schema)
flag_review = spark.createDataFrame([], schema=flag_review_schema)

# Print both schemas, user_flags first.
user_flags.printSchema()
flag_review.printSchema()

root
 |-- user_firstname: string (nullable = true)
 |-- user_lastname: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- flag_id: string (nullable = true)

root
 |-- flag_id: string (nullable = true)
 |-- reviewed_by_yt: boolean (nullable = true)
 |-- reviewed_date: date (nullable = true)
 |-- reviewed_outcome: string (nullable = true)



In [10]:
# Approach :=> 1
# Join each flag to its review and keep only the approved ones
# (outcome compared case-insensitively).
approved_flags = (
    user_flags.join(flag_review, on="flag_id", how="inner")
    .filter(F.lower(F.col("reviewed_outcome")) == "approved")
)

# Count distinct videos per user. Group on the two name columns rather than on
# a concatenated username: F.concat returns NULL when either part is NULL, so
# grouping on that string would merge every user with a missing name part into
# a single NULL group and pool their video counts together.
videos_per_user = (
    approved_flags
    .groupBy("user_firstname", "user_lastname")
    .agg(F.countDistinct("video_id").alias("video_count"))
)

# Rank users by their count, highest first; tied users share rank 1.
# The window is unpartitioned because the ranking is global.
ranked_users = videos_per_user.withColumn(
    "rank", F.rank().over(Window.orderBy(F.desc("video_count")))
)

# Keep the top-ranked user(s) and build the display name last.
# concat_ws skips NULL parts, so a user with only one name part still gets a
# readable username instead of NULL.
result = (
    ranked_users
    .filter(F.col("rank") == 1)
    .select(F.concat_ws(" ", "user_firstname", "user_lastname").alias("username"))
)
result.show()

+--------+
|username|
+--------+
+--------+



In [12]:
# Approach :=> 2
# Count the distinct approved videos flagged by each (firstname, lastname) pair.
df = (
    user_flags.join(flag_review, on="flag_id", how="inner")
    # Compare case-insensitively so outcomes such as "Approved" or "approved"
    # are not silently dropped (this matches the filter used in Approach 1).
    .filter(F.upper(F.col("reviewed_outcome")) == "APPROVED")
    .groupBy(F.col("user_firstname"), F.col("user_lastname"))
    .agg(F.countDistinct("video_id").alias("total_video_count"))
)

# Rank users by total_video_count (highest first) and keep only the top-ranked
# users; ties all receive rank 1. The window is unpartitioned because the
# ranking is global.
windowSpec = Window.orderBy(F.col("total_video_count").desc())
df = (
    df.withColumn("rank", F.dense_rank().over(windowSpec))
    .filter(F.col("rank") == 1)
    # Select only the username; the helper columns need no separate drop.
    # concat_ws skips NULL name parts instead of turning the whole name NULL.
    .select(F.concat_ws(" ", "user_firstname", "user_lastname").alias("username"))
)

df.show()

+--------+
|username|
+--------+
+--------+

