# Lecture 10: In-Class Demo

In [None]:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

## 1. RDD Creation
### Note: to display the contents of an RDD, `.collect()` and `.foreach(func)` are used in the following examples.

In [None]:

// textFile() method: create an RDD of strings from a text file.
// `sc` is the SparkContext supplied by the notebook kernel.
val lines = sc.textFile("file:///home/dr_wang1982/infs3208/data/txtDemo.txt")
lines.collect().foreach(println)

// parallelize() method: create RDDs from in-memory collections.
// NOTE: these definitions must stay enabled, because rddA and rddS are
// used by the cells that follow (transformations, actions, key/value pairs).
val dataA = Array(1, 2, 3, 4, 5)
val rddA = sc.parallelize(dataA)

val dataS = List("MapReduce is good","Spark is fast","Spark is better than MapReduce")
val rddS = sc.parallelize(dataS)

// Uncomment to print the contents of the two RDDs.
//rddA.collect().foreach(println)
//println("-------------")
//rddS.collect().foreach(println)

// Spark Web UI - http://35.197.180.110:8080/

## 2. RDD Operations

In [None]:
// Transformations: map() adds 10 to every element of rddA;
// filter() then keeps only the shifted values greater than 12.
for (shifted <- rddA.map(_ + 10).collect()) println(shifted)
println("-------------")
for (kept <- rddA.map(_ + 10).filter(_ > 12).collect()) println(kept)

In [None]:
// flatMap: split every sentence into words on the RDD itself, then collect
// the flattened words to the driver for printing. Calling collect() first
// would run flatMap on a local Array instead of demonstrating the RDD
// transformation; the printed output is the same either way.
rddS.flatMap(l => l.split(" ")).collect().foreach(println)

In [None]:
{
// Pseudo-set transformations on two small RDDs of strings.
// Uncomment one group below at a time to print the result of that operation.
val rddSet1 = sc.parallelize(Seq("coffee", "coffee", "panda", "monkey", "tea"))
val rddSet2 = sc.parallelize(Seq("coffee", "monkey", "kitty"))

// distinct
//rddSet1.distinct().collect().foreach(println)
//println("-------------")
// union
//rddSet1.union(rddSet2).collect().foreach(println)
//println("-------------")
// intersection (in both directions)
//rddSet1.intersection(rddSet2).collect().foreach(println)
//rddSet2.intersection(rddSet1).collect().foreach(println)
//println("-------------")
// subtract
//rddSet1.subtract(rddSet2).collect().foreach(println)
//println("-------------")
// cartesian product of rddA and rddS (defined in the RDD creation cell)
//rddA.cartesian(rddS).collect().foreach(println)
}

In [None]:
{
// Action operations on rddA.
// First print the input elements.
for (elem <- rddA.collect()) println(elem)
// reduce(func): combine all elements with multiplication.
val s = rddA.reduce(_ * _)
println(s"The multiplication of the RDD itself: $s")
println("-------------")
// take(n): print the first three elements.
for (elem <- rddA.take(3)) println(elem)
println("-------------")
}

In [None]:
// A small RDD with a duplicate value, used to try out several actions.
val rddSample = sc.parallelize(Seq(1, 2, 3, 3))
// Uncomment any line below to try the corresponding action.
//println(rddSample.reduce( (x,y) => x+y ))
//rddSample.collect.foreach(println)
//println(rddSample.count())
//rddSample.take(2).foreach(println)
//rddSample.top(2).foreach(println)
//rddSample.countByValue().foreach(println)
//rddSample.collect.foreach(println)
// Print how many partitions the RDD was split into.
println(rddSample.partitions.length)

In [None]:
// Partition & repartition
// repartition() returns a new RDD; the original keeps its partition count,
// which is why both sizes are printed for comparison.
//println(rddSample.partitions.size)
val rddSampleRePar = rddSample.repartition(numPartitions = 1)
println(rddSample.partitions.length)
println(rddSampleRePar.partitions.length)

## 3. Key/Value Pair Creation and Transformations

In [None]:
// Build key/value pairs with map(): every word of every sentence becomes (word, 1).
val rddPairS = rddS.flatMap(_.split(" ")).map(word => (word, 1))
for (pair <- rddPairS.collect()) println(pair)

In [None]:
// reduceByKey: add up the 1s of every word to obtain its count.
for (wordCount <- rddPairS.reduceByKey(_ + _).collect()) println(wordCount)


// Two parallel collections: a student id and the score at the same position.
val rddStudent = sc.parallelize(List("s123456","s123456","s123456", "s123456", "s654321", "s654321", "s654321", "s654321", "s654321"))
val rddScores = sc.parallelize(Array(78,80,65,90,80,40,50,90,80))
println("-------------")
// zip pairs the two RDDs element by element into (studentId, score).
val rddPairScores = rddStudent.zip(rddScores)
for (studentScore <- rddPairScores.collect()) println(studentScore)
//rddPairScores.reduceByKey((a,b) => (a+b)).collect.foreach(println)
println("-------------")
// groupByKey: gather all values that share a key.
for (grouped <- rddPairS.groupByKey().collect()) println(grouped)
//rddPairScores.groupByKey().collect.foreach(println)

In [None]:
// keys and values: print every key of the pair RDD, then every value.
for (key <- rddPairS.keys.collect()) println(key)
for (value <- rddPairS.values.collect()) println(value)

In [None]:
// sortByKey: word counts ordered by key, first ascending and then descending.
val rddCount = rddPairS.reduceByKey(_ + _)
for (entry <- rddCount.sortByKey().collect()) print(entry)
println()
for (entry <- rddCount.sortByKey(ascending = false).collect()) print(entry)

In [None]:
// join
// Build (studentNo, name) pairs by zipping two small RDDs.
val rddStuNo = sc.parallelize(List("s123456", "s654321"))
val rddStuName = sc.parallelize(List("John", "Mary"))
val rddStuDemo = rddStuNo.zip(rddStuName)
for (student <- rddStuDemo.collect()) println(student)
// rddPairScores (created in an earlier cell) holds the (studentNo, score) pairs.
// Join the two RDDs on the student number.
for (joined <- rddStuDemo.join(rddPairScores).collect()) println(joined)
// The order of the join matters: it decides the order of the paired values.
for (joined <- rddPairScores.join(rddStuDemo).collect()) println(joined)

## 4. RDD Programming Examples

In [None]:
// Word count.
// Note: this is a simple demo without NLP pre-processing steps such as stop-word
// removal, stemming or tokenization. Please refer to the NLP material for a better outcome.
val lines = sc.textFile("file:///home/dr_wang1982/infs3208/data/shakespeare.txt")
// Split every line on single spaces, emit (word, 1), then sum the 1s per word.
val rddWC1 = lines.flatMap(_.split(" "))
val rddWC2 = rddWC1.map(word => word -> 1)
val rddCount = rddWC2.reduceByKey(_ + _)

//rddCount.collect.foreach(println)
//lines.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a,b) =>(a+b)).collect.foreach(println)

// Print the words ordered by descending count.
for (entry <- rddCount.sortBy(_._2, ascending = false).collect()) println(entry)

In [None]:
// Calculate averaged marks
{
// Input: the (studentNo, score) pairs created in an earlier cell.
rddPairScores.collect.foreach(println)
println("-------------")
// Attach a count of 1 to every score so that sums and counts are reduced together.
val rdd1 = rddPairScores.mapValues(score => (score, 1))
rdd1.collect.foreach(println)
//rdd1.values.collect.foreach(println)
// Per student: (sum of scores, number of scores).
val rdd2 = rdd1.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
rdd2.collect.foreach(println)
// Convert the sum to Double before dividing: Int / Int truncates the result
// (for example 313 / 4 would give 78 instead of 78.25).
val rdd3 = rdd2.mapValues { case (total, count) => total.toDouble / count }
println("-------------")
rdd3.collect.foreach(println)
}

In [None]:
// Get top value of sale records
// data is stored in csv format: separation is ","
{
val n = 5
val lines = sc.textFile("file:///home/dr_wang1982/infs3208/data/sales.txt")
// lines.collect.foreach(println)
// Keep only the 4th comma-separated field (index 3) of every record, parsed as Double.
val values = lines.map(l => l.split(",")(3).toDouble)
//values.collect.foreach(println)
println("-------------")
println(s"Top $n values are:")
// top(n) returns the n largest values in descending order without a full
// shuffle sort of the RDD (same output as sortBy(descending).take(n)).
values.top(n).foreach(println)
println("-------------")
//Get Min or Max
// top(1) / takeOrdered(1) yield the max / min; like take(1), they return an
// empty array (so nothing is printed) when the RDD is empty.
values.top(1).foreach(println)
values.takeOrdered(1).foreach(println)

}


In [None]:
// sorting across multiple files
{
// Read the "sort" path (presumably a directory holding several files),
// requesting a minimum of 4 partitions.
val lines = sc.textFile("file:///home/dr_wang1982/infs3208/data/sort",4)
lines.collect.foreach(println)

// Parse once and reuse for both sort orders; every line is parsed with toInt,
// so a line that is not a plain integer fails the job.
val rdd1 = lines.map(a => a.toInt)

println("sorted results")
println("-------------")
// Descending order.
rdd1.sortBy(a => a, ascending = false).collect.foreach(println)
println("-------------")
// Ascending order.
//val rdd2 = rdd1.sortByKey()
val rdd2 = rdd1.sortBy(a => a)
rdd2.collect.foreach(println)
//rdd2.saveAsTextFile("file:///home/dr_wang1982/infs3208/data/results")
}

In [None]:
// Movie Rating Example
{
val ratingSmall = "file:///home/dr_wang1982/infs3208/data/movie/ratings_small.csv"
val moviesSmall = "file:///home/dr_wang1982/infs3208/data/movie/movies_small.csv"
// Alternative movie list (not used below):
// val moviesLatest = "file:///home/dr_wang1982/infs3208/data/movie/movies_latest.csv"
// Movie Rating
val ratingLines = sc.textFile(ratingSmall)
//ratingLines.collect.foreach(println)
val rdd1 = ratingLines.map(line => line.split(","))
// Key each record by field 1 (as Int) with field 2 (as Double) as the value;
// presumably (movieId, rating) - confirm against the CSV layout.
val rdd2 = rdd1.map(x => (x(1).toInt, x(2).toDouble))
//rdd2.collect.foreach(println)
//rdd2.groupByKey().collect.foreach(println)

// Average value per key. The (sum, count) pairs are combined with reduceByKey
// instead of groupByKey, so partial sums are merged before the shuffle rather
// than shipping every single rating across the cluster.
val rdd3 = rdd2
  .mapValues(rating => (rating, 1))
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (total, count) => total / count }
// Equivalent (but slower) groupByKey forms:
//val rdd3 = rdd2.groupByKey().map(d => (d._1,d._2.sum/d._2.size))
//val rdd3 = rdd2.groupByKey().mapValues(ratings => ratings.sum/ratings.size)
//rdd3.collect.foreach(println)

val movieLines = sc.textFile(moviesSmall)
// movieLines.collect.foreach(println)
// Key each movie record by field 0 (as Int) with field 1 as the value.
// NOTE(review): a plain split(",") breaks values that themselves contain commas.
val rdd4 = movieLines.map(line => line.split(",")).map(x => (x(0).toInt, x(1)))
//rdd3.join(rdd4).collect.foreach(println)
//rdd4.join(rdd3).collect.foreach(println)
//rdd4.join(rdd3).map(x => (x._1, x._2._1, x._2._2)).collect.foreach(println)
// Flatten the joined (key, (movieField, average)) into a triple.
val allRankings = rdd4.join(rdd3).map(x => (x._1, x._2._1, x._2._2))
// Keep the 100 entries with the highest average.
val top100 = allRankings.sortBy(x => x._3, false).take(100)
top100.foreach(println)
}

In [None]:
// A shorter version
// Average of field 2 per key (field 1) of the ratings file.
// The parentheses keep each multi-line chain a single expression.
val rdd1 = (
  sc.textFile("file:///home/dr_wang1982/infs3208/data/movie/ratings_small.csv")
    .map(_.split(","))
    .map(cols => (cols(1).toInt, cols(2).toDouble))
    .groupByKey()
    .map { case (id, ratings) => (id, ratings.sum / ratings.size) }
)
// Join the movie records with the averages, flatten to a triple,
// and keep the 100 entries with the highest average.
val top100 = (
  sc.textFile("file:///home/dr_wang1982/infs3208/data/movie/movies_small.csv")
    .map(_.split(","))
    .map(cols => (cols(0).toInt, cols(1)))
    .join(rdd1)
    .map { case (id, (movie, avg)) => (id, movie, avg) }
    .sortBy(_._3, ascending = false)
    .take(100)
)
for (entry <- top100) println(entry)

In [None]:
// One-line version (still a single expression, only wrapped across lines for readability)
val top100 = (
  sc.textFile("file:///home/dr_wang1982/infs3208/data/movie/movies_small.csv")
    .map(_.split(","))
    .map(cols => (cols(0).toInt, cols(1)))
    .join(
      sc.textFile("file:///home/dr_wang1982/infs3208/data/movie/ratings_small.csv")
        .map(_.split(","))
        .map(cols => (cols(1).toInt, cols(2).toDouble))
        .groupByKey()
        .map { case (id, ratings) => (id, ratings.sum / ratings.size) }
    )
    .map { case (id, (movie, avg)) => (id, movie, avg) }
    .sortBy(_._3, ascending = false)
    .take(100)
)
for (entry <- top100) println(entry)