### Q1. [User with Most Approved Flags](https://platform.stratascratch.com/coding/2104-user-with-most-approved-flags/official-solution?code_type=6)


In [2]:
import os
import sys

# Make the PySpark workers and the driver both use the interpreter running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

from pyspark.sql import SparkSession


# Create the Spark session (application name 'SparkLearning') that the DataFrame
# cells use; getOrCreate() returns an already-active session if one exists.
spark = SparkSession.builder.appName('SparkLearning').getOrCreate()

In [16]:
# Import your libraries
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, BooleanType, StructField, StructType, DateType
from pyspark.sql.window import Window
from InputToDataFrame import convert_input_to_df

In [30]:
# user_flag_data: raw tab-separated sample rows for the user_flags table.
# The first line of the text is the column header
# (user_firstname, user_lastname, video_id, flag_id); each following line is one flag.
input_data = '''
user_firstname	user_lastname	video_id	flag_id
Richard	Hasson	y6120QOlsfU	0cazx3
Mark	May	Ct6BUPvE2sM	1cn76u
Gina	Korman	dQw4w9WgXcQ	1i43zk
Mark	May	Ct6BUPvE2sM	1n0vef
Mark	May	jNQXAC9IVRw	1sv6ib
Gina	Korman	dQw4w9WgXcQ	20xekb
Mark	May	5qap5aO4i9A	4cvwuv
'''


# convert_input_to_df is a project-local helper (imported from InputToDataFrame, not
# shown here). Per the output recorded under this cell it yields a list of tuples,
# one per data row, with the header line left out.
user_flags_data = convert_input_to_df(input_data)
# Print the parsed rows so the parsing result can be checked by eye.
print(user_flags_data)

[('Richard', 'Hasson', 'y6120QOlsfU', '0cazx3'), ('Mark', 'May', 'Ct6BUPvE2sM', '1cn76u'), ('Gina', 'Korman', 'dQw4w9WgXcQ', '1i43zk'), ('Mark', 'May', 'Ct6BUPvE2sM', '1n0vef'), ('Mark', 'May', 'jNQXAC9IVRw', '1sv6ib'), ('Gina', 'Korman', 'dQw4w9WgXcQ', '20xekb'), ('Mark', 'May', '5qap5aO4i9A', '4cvwuv')]


In [21]:
# flag_review_data: raw tab-separated sample rows for the flag_review table
# (columns, per flag_review_schema: flag_id, reviewed_by_yt, reviewed_date,
# reviewed_outcome). Unlike the user_flags sample, this text has NO header line.
# NOTE(review): the output recorded under this cell lists only 1cn76u..4cvwuv; the
# first row (0cazx3) and the last three rows (4sd6dv, 6jjkvn, 7ks264) are absent.
# Presumably convert_input_to_df consumes the first line as a header and/or the
# recorded output is stale. Confirm against the helper and re-run the cell.
# NOTE(review): rows 0cazx3 and 4cvwuv carry trailing tabs; verify the helper
# still produces 4-field tuples for them.
input_data = '''
0cazx3	FALSE		
1cn76u	TRUE	2022-03-15	REMOVED
1i43zk	TRUE	2022-03-15	REMOVED
1n0vef	TRUE	2022-03-15	REMOVED
1sv6ib	TRUE	2022-03-15	APPROVED
20xekb	TRUE	2022-03-17	REMOVED
4cvwuv	TRUE	2022-03-15	APPROVED	
4sd6dv	TRUE	2022-03-14	REMOVED
6jjkvn	TRUE	2022-03-16	APPROVED
7ks264	TRUE	2022-03-15	APPROVED
'''

# Parse the text with the project-local helper (not shown here) and print the
# resulting rows for a visual check.
flag_review_data = convert_input_to_df(input_data)
print(flag_review_data)

[('1cn76u', 'TRUE', '2022-03-15', 'REMOVED'), ('1i43zk', 'TRUE', '2022-03-15', 'REMOVED'), ('1n0vef', 'TRUE', '2022-03-15', 'REMOVED'), ('1sv6ib', 'TRUE', '2022-03-15', 'APPROVED'), ('20xekb', 'TRUE', '2022-03-17', 'REMOVED'), ('4cvwuv', 'TRUE', '2022-03-15', 'APPROVED')]


In [35]:
# DataFrames
# Schema of the user_flags table: four nullable string columns, in this order.
user_flags_schema = StructType([
    StructField(column_name, StringType(), nullable=True)
    for column_name in ("user_firstname", "user_lastname", "video_id", "flag_id")
])

# Schema of the flag_review table: four nullable string columns, in this order.
# Note that reviewed_by_yt and reviewed_date are kept as plain strings.
flag_review_schema = StructType([
    StructField(column_name, StringType(), nullable=True)
    for column_name in ("flag_id", "reviewed_by_yt", "reviewed_date", "reviewed_outcome")
])

# Build the user_flags DataFrame from the parsed tuples, then show its schema and rows.
user_flags = spark.createDataFrame(data=user_flags_data, schema=user_flags_schema)
user_flags.printSchema()
user_flags.show()

# Divider line between the two tables in the cell output.
print("=" * 30)

# Build the flag_review DataFrame the same way and show its schema and rows.
flag_review = spark.createDataFrame(data=flag_review_data, schema=flag_review_schema)
flag_review.printSchema()
flag_review.show()

root
 |-- user_firstname: string (nullable = true)
 |-- user_lastname: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- flag_id: string (nullable = true)

+--------------+-------------+-----------+-------+
|user_firstname|user_lastname|   video_id|flag_id|
+--------------+-------------+-----------+-------+
|       Richard|       Hasson|y6120QOlsfU| 0cazx3|
|          Mark|          May|Ct6BUPvE2sM| 1cn76u|
|          Gina|       Korman|dQw4w9WgXcQ| 1i43zk|
|          Mark|          May|Ct6BUPvE2sM| 1n0vef|
|          Mark|          May|jNQXAC9IVRw| 1sv6ib|
|          Gina|       Korman|dQw4w9WgXcQ| 20xekb|
|          Mark|          May|5qap5aO4i9A| 4cvwuv|
+--------------+-------------+-----------+-------+

root
 |-- flag_id: string (nullable = true)
 |-- reviewed_by_yt: string (nullable = true)
 |-- reviewed_date: string (nullable = true)
 |-- reviewed_outcome: string (nullable = true)

+-------+--------------+-------------+----------------+
|flag_id|reviewed_by_y

In [33]:
# Approach :=> 1
# Find the user(s) who flagged the most distinct videos whose flags were approved.
# Steps: join flags to their reviews, keep approved ones, count distinct videos per
# username, rank by that count, and keep everyone tied for first place.
result = (
    user_flags.join(flag_review, on="flag_id", how="inner")
    # Case-insensitive match on the review outcome.
    .filter(F.lower(F.col("reviewed_outcome")) == "approved")
    # concat_ws skips NULL parts, so a user with a missing first or last name keeps a
    # non-NULL username. Plain concat returns NULL there, which would merge all such
    # users into a single NULL group and add their counts together.
    .withColumn("username", F.concat_ws(" ", F.col("user_firstname"), F.col("user_lastname")))
    .groupBy("username")
    .agg(F.countDistinct("video_id").alias("video_count"))
    # rank() assigns 1 to every user tied on the highest count.
    .withColumn("rank", F.rank().over(Window.orderBy(F.desc("video_count"))))
    .filter(F.col("rank") == 1)
    .select("username")
)
result.show()

+--------+
|username|
+--------+
|Mark May|
+--------+



In [34]:
# Approach :=> 2
# Count, per (first name, last name), the distinct videos whose flags were approved.
df = (
    user_flags.join(flag_review, on="flag_id", how="inner")
    # Case-insensitive outcome match, consistent with Approach 1.
    .filter(F.upper(F.col("reviewed_outcome")) == "APPROVED")
    .groupBy(F.col("user_firstname"), F.col("user_lastname"))
    .agg(F.countDistinct("video_id").alias("total_video_count"))
)

# Rank users by total_video_count and keep only the top-ranked users (ties included).
windowSpec = Window.orderBy(F.col("total_video_count").desc())
df = df.withColumn("rank", F.dense_rank().over(windowSpec)).filter(F.col("rank") == 1)

# Select the usernames. concat_ws skips NULL name parts, whereas `||` would turn the
# whole username into NULL. The select keeps only `username`, so no separate drop of
# the helper columns is needed.
df = df.select(F.concat_ws(" ", F.col("user_firstname"), F.col("user_lastname")).alias("username"))

df.show()

+--------+
|username|
+--------+
|Mark May|
+--------+



# Q2.