In [4]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.util.Random

// Step 1: Obtain the SparkSession (reuses the active one if it already exists)
val spark = SparkSession.builder.appName("Generate Metadata JSON").getOrCreate()

// Step 2: Read movies.csv from GCS; the first line of the file supplies the column names
val moviesPath = "gs://deva_vasadi/movie_lens_data/movies.csv"
val moviesDF = spark.read
  .format("csv")
  .option("header", "true")
  .load(moviesPath)

// Step 3: Extract releaseYear from title or assign a random year
//
// extractYear returns the first four-digit number found in parentheses in the
// title, e.g. "(1995)" -> "1995". When the title is null or contains no such
// group, it falls back to a random year between 1980 and 2023 (inclusive).
val extractYear = {
  // Compiled once and captured by the UDF closure instead of once per row.
  val yearPattern = "\\((\\d{4})\\)".r

  udf((title: String) => {
    Option(title) // a null title would otherwise throw inside findFirstMatchIn
      .flatMap(t => yearPattern.findFirstMatchIn(t))
      .map(_.group(1))
      .getOrElse((1980 + Random.nextInt(2023 - 1980 + 1)).toString)
  }).asNondeterministic() // the random fallback means repeated evaluation may differ
}

// Build the metadata DataFrame: movieId, title and the derived releaseYear
val metadataDF = moviesDF.select(
  col("movieId"),
  col("title"),
  extractYear(col("title")).as("releaseYear")
)

// Step 4: Write the metadata as JSON to GCS
val outputPath = "gs://deva_vasadi/movie_lens_data/metadata.json"

// coalesce(1) collapses the data to one partition so the output consists of a
// single part file; an existing output at this path is overwritten.
metadataDF
  .coalesce(1)
  .write
  .mode("overwrite")
  .format("json")
  .save(outputPath)

println(s"Metadata written successfully to $outputPath!")


Metadata written successfully to gs://deva_vasadi/movie_lens_data/metadata.json!


spark = org.apache.spark.sql.SparkSession@2781f5b0
moviesPath = gs://deva_vasadi/movie_lens_data/movies.csv
moviesDF = [movieId: string, title: string ... 1 more field]
extractYear = SparkUserDefinedFunction($Lambda$5365/0x0000000801f4b840@3ff1a291,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
metadataDF = [movieId: string, title: string ... 1 more field]
outputPath = gs://deva_vasadi/movie_lens_data/metadata.json


gs://deva_vasadi/movie_lens_data/metadata.json

In [5]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.json4s._
import org.json4s.jackson.JsonMethods._
import java.io.PrintWriter
import scala.util.Random

// Step 4: Read metadata.json from GCS and combine with movies data
// Each input line is parsed as one JSON document with json4s.
val metadataRDD = spark.sparkContext.textFile("gs://deva_vasadi/movie_lens_data/metadata.json")
val parsedMetadataRDD = metadataRDD.map { jsonLine =>
  implicit val jsonFormats: Formats = DefaultFormats
  val record = parse(jsonLine)
  // Keep only the two fields needed for the join: (movieId, releaseYear)
  ((record \ "movieId").extract[String], (record \ "releaseYear").extract[String])
}

// Turn the (movieId, releaseYear) pairs into a two-column DataFrame
val metadataFromJsonDF = parsedMetadataRDD.toDF("movieId", "releaseYear")
//metadataFromJsonDF.show()

metadataRDD = gs://deva_vasadi/movie_lens_data/metadata.json MapPartitionsRDD[18] at textFile at <console>:51
parsedMetadataRDD = MapPartitionsRDD[19] at map at <console>:52
metadataFromJsonDF = [movieId: string, releaseYear: string]


[movieId: string, releaseYear: string]

In [8]:
// Validation Step: count the rows of moviesDF whose title does not end in "(dddd)"
val missingYearCount = moviesDF
  .where(not(col("title").rlike("\\(\\d{4}\\)$")))
  .count()

missingYearCount match {
  case missing if missing > 0 =>
    println(s"Validation failed: $missing movies do not have years in their titles.")
  case _ =>
    println("Validation passed: All movies have years in their titles.")
}

Validation failed: 797 movies do not have years in their titles.


missingYearCount = 797


797

In [6]:
// Join metadata with moviesDF and make sure every title ends with "(year)".
//
// The title is rewritten with Column expressions rather than a row-wise map:
//   - titles that already end in a parenthesised four-digit year are kept as-is
//     (same rlike pattern as the validation cells);
//   - rows with no releaseYear after the left join keep their original title
//     instead of getting a literal "(null)" suffix;
//   - a null title stays null instead of throwing a NullPointerException.
val enrichedMoviesDF = {
  val titleCol = col("title")
  val yearCol = col("releaseYear")
  val alreadyHasYear = titleCol.rlike("\\(\\d{4}\\)$")

  moviesDF
    .join(metadataFromJsonDF, Seq("movieId"), "left")
    .select(
      col("movieId"),
      when(alreadyHasYear || yearCol.isNull, titleCol)
        .otherwise(concat(titleCol, lit(" ("), yearCol, lit(")")))
        .as("title"),
      col("genres")
    )
}

// Step 5: Save final DataFrame to HDFS in Parquet format
val outputPath = "hdfs:///user/casestudies/casestudy3/enriched-movies"
enrichedMoviesDF.write.mode("overwrite").parquet(outputPath)

println("Enriched movies data saved successfully!")

Enriched movies data saved successfully!


enrichedMoviesDF = [movieId: string, title: string ... 1 more field]
outputPath = hdfs:///user/casestudies/casestudy3/enriched-movies


hdfs:///user/casestudies/casestudy3/enriched-movies

In [7]:
// Validation Step: count the rows of enrichedMoviesDF whose title does not end in "(dddd)"
val missingYearCount = enrichedMoviesDF
  .where(not(col("title").rlike("\\(\\d{4}\\)$")))
  .count()

missingYearCount match {
  case missing if missing > 0 =>
    println(s"Validation failed: $missing movies do not have years in their titles.")
  case _ =>
    println("Validation passed: All movies have years in their titles.")
}

Validation passed: All movies have years in their titles.


missingYearCount = 0


0