In [3]:
'''
Given a dataset containing user information with duplicate user_id values, write a PySpark query to remove duplicate rows while retaining the row with the latest created_date for each user_id. The result should contain the latest entry for each user.

Starter Code
Below is the starter code to create the input DataFrame:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import date

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RemoveDuplicates") \
    .getOrCreate()

# Define schema
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("user_name", StringType(), True),
    StructField("created_date", DateType(), True),
    StructField("email", StringType(), True)
])

# Sample data
data = [
    (1, "Alice", date(2023, 5, 10), "alice@example.com"),
    (1, "Alice", date(2023, 6, 15), "alice_new@example.com"),
    (2, "Bob", date(2023, 7, 1), "bob@example.com"),
    (3, "Charlie", date(2023, 5, 20), "charlie@example.com"),
    (3, "Charlie", date(2023, 6, 25), "charlie_updated@example.com"),
    (4, "David", date(2023, 8, 5), "david@example.com")
]

# Create DataFrame
user_df = spark.createDataFrame(data, schema)
user_df.show()
Sample Output Schema and Example Table
Use display(df) to show the final DataFrame.

Output Schema:

Column	Type
user_id	Integer
user_name	String
created_date	Date
email	String
Example Output:

user_id	user_name	created_date	email
1	Alice	2023-06-15	alice_new@example.com
2	Bob	2023-07-01	bob@example.com
3	Charlie	2023-06-25	charlie_updated@example.com
4	David	2023-08-05	david@example.com
Explanation
The user_id column contains duplicate values.
The row with the most recent created_date for each user_id is retained.
The output excludes all older entries for the same user_id.
'''
# Initialize Spark session
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import date

# Obtain (or reuse) the Spark session used by the rest of this script.
spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Column layout of the user table; every field is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("user_id", IntegerType()),
        ("user_name", StringType()),
        ("created_date", DateType()),
        ("email", StringType()),
    )
])

# Sample rows as (user_id, user_name, created_date, email).
# user_id 1 and 3 each appear twice with different created_date values.
data = [
    (1, "Alice", date(2023, 5, 10), "alice@example.com"),
    (1, "Alice", date(2023, 6, 15), "alice_new@example.com"),
    (2, "Bob", date(2023, 7, 1), "bob@example.com"),
    (3, "Charlie", date(2023, 5, 20), "charlie@example.com"),
    (3, "Charlie", date(2023, 6, 25), "charlie_updated@example.com"),
    (4, "David", date(2023, 8, 5), "david@example.com"),
]

# Materialise the sample rows as a DataFrame with the explicit schema.
user_df = spark.createDataFrame(data=data, schema=schema)

# Rank each user's rows newest-first. created_date alone is not a total order:
# two rows for the same user_id sharing a created_date would tie, and
# row_number() would then number them arbitrarily, so the retained row could
# differ between runs. The remaining columns act as tie-breakers to make the
# choice reproducible.
window_spec = Window.partitionBy("user_id").orderBy(
    F.col("created_date").desc(),
    F.col("email").desc(),
    F.col("user_name").desc(),
)

df_result = (
    user_df.withColumn("row_number", F.row_number().over(window_spec))
    # row_number == 1 is the latest entry within each user_id partition.
    .filter(F.col("row_number") == 1)
    # The ranking column is only a helper; drop it to restore the input schema.
    .drop("row_number")
)

df_result.show()

+-------+---------+------------+--------------------+
|user_id|user_name|created_date|               email|
+-------+---------+------------+--------------------+
|      1|    Alice|  2023-06-15|alice_new@example...|
|      2|      Bob|  2023-07-01|     bob@example.com|
|      3|  Charlie|  2023-06-25|charlie_updated@e...|
|      4|    David|  2023-08-05|   david@example.com|
+-------+---------+------------+--------------------+

