In [15]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import java.time.Instant
import java.time.ZoneId
import java.time.format.DateTimeFormatter

// Step 1: Obtain the SparkSession for this job (getOrCreate returns an already-active
// session when one exists instead of building a second one).
val spark = SparkSession
  .builder()
  .appName("Time-Based Data Partitioning for Ratings")
  .getOrCreate()

// Step 2: Read ratings.csv from the GCS bucket into a DataFrame.
// The first line is treated as the header. No schema is supplied and schema inference
// is not enabled, so every column (userId, movieId, rating, timestamp) is read as a string.
val ratingsPath = "gs://deva_vasadi/movie_lens_data/ratings.csv"
val ratingsDF = spark.read
  .format("csv")
  .option("header", "true")
  .load(ratingsPath)

// Step 3: Derive an integer "year" column from the "timestamp" column.
// The string timestamp is cast to long (seconds since the Unix epoch), rendered as a
// datetime by from_unixtime, and then reduced to its calendar year.
// NOTE(review): from_unixtime presumably renders in the Spark session time zone, so a
// rating made close to New Year could land in a different year on a differently
// configured cluster -- confirm whether UTC should be pinned.
val ratingsWithYearDF = {
  val epochSeconds = col("timestamp").cast("long")
  ratingsDF.withColumn("year", year(from_unixtime(epochSeconds)))
}

// Print the first 20 rows as a quick sanity check of the derived column.
ratingsWithYearDF.show(20)

+------+-------+------+---------+----+
|userId|movieId|rating|timestamp|year|
+------+-------+------+---------+----+
|     1|     17|   4.0|944249077|1999|
|     1|     25|   1.0|944250228|1999|
|     1|     29|   2.0|943230976|1999|
|     1|     30|   5.0|944249077|1999|
|     1|     32|   5.0|943228858|1999|
|     1|     34|   2.0|943228491|1999|
|     1|     36|   1.0|944249008|1999|
|     1|     80|   5.0|944248943|1999|
|     1|    110|   3.0|943231119|1999|
|     1|    111|   5.0|944249008|1999|
|     1|    161|   1.0|943231162|1999|
|     1|    166|   5.0|943228442|1999|
|     1|    176|   4.0|944079496|1999|
|     1|    223|   3.0|944082810|1999|
|     1|    232|   5.0|943228442|1999|
|     1|    260|   5.0|943228696|1999|
|     1|    302|   4.0|944253272|1999|
|     1|    306|   5.0|944248888|1999|
|     1|    307|   5.0|944253207|1999|
|     1|    322|   4.0|944053801|1999|
+------+-------+------+---------+----+
only showing top 20 rows



spark = org.apache.spark.sql.SparkSession@40930058
ratingsPath = gs://deva_vasadi/movie_lens_data/ratings.csv
ratingsDF = [userId: string, movieId: string ... 2 more fields]
ratingsWithYearDF = [userId: string, movieId: string ... 3 more fields]


[userId: string, movieId: string ... 3 more fields]

In [14]:
// Step 4: Transformation - Filter out invalid or incomplete records.
// Every source column must be present. The derived "year" is checked as well: the
// source columns are strings, so a timestamp that is non-null but not numeric passes
// the timestamp check while its cast to long (and therefore its year) is null under
// Spark's default non-ANSI casting. Without this check such rows would be written
// under the null-partition directory (year=__HIVE_DEFAULT_PARTITION__) instead of a
// real year.
val validRatingsDF = ratingsWithYearDF
  .filter(
    col("userId").isNotNull &&
      col("movieId").isNotNull &&
      col("rating").isNotNull &&
      col("timestamp").isNotNull &&
      col("year").isNotNull
  )

// Step 5: Save the data to HDFS as Parquet, partitioned by year
// (one year=<value> subdirectory per distinct year under outputPath).
println("Saving...")

val outputPath = "hdfs:///user/casestudies/casestudy5"

// At most 1,000,000 rows are written.
// NOTE(review): limit without an ordering does not guarantee WHICH rows are kept, so
// reruns may write a different subset -- presumably a deliberate size cap; confirm.
// mode("overwrite") replaces whatever already exists at outputPath.
validRatingsDF
  .limit(1000000)
  .write
  .format("parquet")
  .mode("overwrite")
  .partitionBy("year")
  .save(outputPath)

println("Program complete")

Saving...
Program complete


validRatingsDF = [userId: string, movieId: string ... 3 more fields]
outputPath = hdfs:///user/casestudies/casestudy5


hdfs:///user/casestudies/casestudy5