# Big Data Project - Job 1

In [1]:
import org.apache.spark

Intitializing Scala interpreter ...

Spark Web UI available at http://host.docker.internal:4041
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1742550239363)
SparkSession available as 'spark'


import org.apache.spark


In [2]:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * Reads a review CSV file and returns it as an RDD of tuples
 * (asin, reviewText, overall, category, summary).
 *
 * The file is read with a header row, `,` as delimiter, `"` as both the quote and
 * the escape character, and multiline records enabled. The `overall` column is read
 * as a string and parsed to a Double; when that fails for any reason (missing value,
 * non-numeric text, ...) the rating falls back to 0.0 instead of failing the row.
 *
 * @param path  location of the CSV file
 * @param spark session used to read the file
 * @return one tuple per CSV record
 * */
def create(path: String, spark: SparkSession):  RDD[(String, String, Double, String, String)] = {
  val csvOptions = Map(
    "header" -> "true",
    "quote" -> "\"",
    "delimiter" -> ",",
    "multiline" -> "true",
    "escape" -> "\""
  )
  val rows = spark.read.options(csvOptions).csv(path).rdd

  rows.map { row =>
    val productId = row.getAs[String]("asin")
    val text = row.getAs[String]("reviewText")
    // Best-effort parse: any failure yields the 0.0 fallback.
    val rating =
      try row.getAs[String]("overall").toDouble
      catch { case _: Exception => 0.0 }
    val productCategory = row.getAs[String]("category")
    val headline = row.getAs[String]("summary")

    (productId, text, rating, productCategory, headline)
  }
}

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
create: (path: String, spark: org.apache.spark.sql.SparkSession)org.apache.spark.rdd.RDD[(String, String, Double, String, String)]


In [3]:
/**
 * Normalises a piece of text for word counting.
 *
 * The string is lower-cased, every character that is not an ASCII letter, a digit,
 * a blank space or an apostrophe (') is replaced with a blank space, runs of
 * whitespace are collapsed into a single blank space and the result is trimmed.
 *
 * @param s the text to clean; `null` is treated as an empty text
 * @return the cleaned text, possibly the empty string
 * */
def cleanString(s: String): String = {
  // A null input is mapped to "" so one missing value cannot abort the caller.
  Option(s).fold("") { text =>
    text
      // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
      .toLowerCase(java.util.Locale.ROOT)
      // The text is already lower-case, so a-z is enough. (A range written as A-z
      // would also let through [ \ ] ^ _ and `, which sit between 'Z' and 'a'.)
      .replaceAll("[^a-z0-9 ']", " ")
      .replaceAll("\\s+", " ")
      .trim()
  }
}

/**
 * Maps a numeric rating to a rating class label.
 *
 * - rating <= 2  gives "low rating"
 * - rating == 3  gives "medium rating"
 * - anything else gives "high rating"; this includes every other value above 2
 *   (for example 2.5 or 4.0) as well as NaN
 *
 * @param rating the numeric rating
 * @return one of "low rating", "medium rating", "high rating"
 * */
def classifyRating(rating: Double): String = {
  if (rating <= 2) "low rating"
  else if (rating == 3) "medium rating"
  else "high rating"
}

cleanString: (s: String)String
classifyRating: (rating: Double)String


In [4]:
// Obtain the active session, creating one if none exists yet.
val sparkSession: SparkSession = SparkSession.builder().getOrCreate()

// Load one RDD of review tuples per category CSV file.
val rddReviewAppliances = create("../../../../dataset/Appliances_5_part0.csv", sparkSession)
val rddReviewSoftware = create("../../../../dataset/Software_5_part0.csv", sparkSession)

// Unpersist every RDD that is still registered as persistent in this context.
sparkSession.sparkContext.getPersistentRDDs.values.foreach(_.unpersist())

// Union of the two RDDs.
val rddUnion = rddReviewAppliances.union(rddReviewSoftware)

sparkSession: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@602a21a1
rddReviewAppliances: org.apache.spark.rdd.RDD[(String, String, Double, String, String)] = MapPartitionsRDD[9] at map at <console>:38
rddReviewSoftware: org.apache.spark.rdd.RDD[(String, String, Double, String, String)] = MapPartitionsRDD[19] at map at <console>:38
rddUnion: org.apache.spark.rdd.RDD[(String, String, Double, String, String)] = UnionRDD[20] at union at <console>:37


In [5]:
// For every review with a non-empty cleaned text: (category, (rating class, number of words)).
val categoryRatingReviewWords =
  rddUnion
    // Drop the id and the summary, clean the review text and turn the rating into its class.
    .map { case (_, review, rating, category, _) =>
      (category, cleanString(review), classifyRating(rating))
    }
    // Discard reviews whose cleaned text is empty.
    .filter { case (_, cleaned, _) => cleaned.nonEmpty }
    // Key by category and replace the text with its word count.
    .map { case (category, cleaned, ratingClass) =>
      (category, (ratingClass, cleaned.split(" ").length))
    }

categoryRatingReviewWords: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[23] at map at <console>:36


In [6]:
// Total number of words per category, as (category, total) with the total held as a Double.
val totAllWordPerCategory =
  categoryRatingReviewWords
    // Keep only the word count as value; the key (category) is left untouched.
    .mapValues { case (_, wordCount) => wordCount }
    // Sum the counts of each category, starting from 0.0.
    .aggregateByKey(0.0)(_ + _, _ + _)

totAllWordPerCategory: org.apache.spark.rdd.RDD[(String, Double)] = ShuffledRDD[25] at aggregateByKey at <console>:32


In [7]:
// Share of each category's words that belongs to each rating class:
// (category, rating class, words of that class / words of the whole category).
val wordFreqSubCategory =
  categoryRatingReviewWords
    // Attach the per-category total to every (rating class, word count) record.
    .join(totAllWordPerCategory)
    // Move category, rating class and category total into the key so that the
    // total is still available after the per-class sum below.
    .map { case (category, ((ratingClass, wordCount), categoryTotal)) =>
      ((category, ratingClass, categoryTotal), wordCount)
    }
    // Number of words per (category, rating class).
    .reduceByKey((left, right) => left + right)
    // Ratio between the words of the class and the words of the category, flattened to a triple.
    .map { case ((category, ratingClass, categoryTotal), classWords) =>
      (category, ratingClass, classWords / categoryTotal)
    }

wordFreqSubCategory: org.apache.spark.rdd.RDD[(String, String, Double)] = MapPartitionsRDD[31] at map at <console>:38


In [8]:
// Action: runs the job and returns every (category, rating class, ratio) row to the driver as an Array.
wordFreqSubCategory.collect()

res0: Array[(String, String, Double)] = Array((Software_5,high rating,0.6494401176760477), (Software_5,low rating,0.18477121949098377), (Software_5,medium rating,0.16578866283296856), (Appliances_5,low rating,0.1593303235515425), (Appliances_5,high rating,0.7444507148231754), (Appliances_5,medium rating,0.09621896162528217))
