### Initialize Spark session

In [15]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SaveMode

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path

println("Initializing Spark context...")
val conf = new SparkConf().setAppName("Example App")
val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()
//if you look in Spark Master UI, a application will be running after this

Initializing Spark context...


conf = org.apache.spark.SparkConf@3311e290
spark = org.apache.spark.sql.SparkSession@1384d805


lastException: Throwable = null


org.apache.spark.sql.SparkSession@1384d805

### Initialize HDFS client

In [16]:
val hdfsPrefix = sys.env("HDFS_URL")
val hadoopConf = new Configuration()
hadoopConf.set("fs.defaultFS", sys.env("HDFS_URL"))
val hdfs = FileSystem.get(hadoopConf)

hdfsPrefix = hdfs://namenode1:8020
hadoopConf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
hdfs = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-240526695_40, ugi=root (auth:SIMPLE)]]


DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-240526695_40, ugi=root (auth:SIMPLE)]]

### Load real samples and obfuscate them

In [4]:
println("Copying samples file to HDFS...")
//NEVER SAVE REAL SAMPLES IN GIT!
val srcPath = new Path("/notebooks/obfuscate-geo-samples/fake-samples.csv")
val destPath = new Path("hdfs:///tmp/real-samples.csv")
hdfs.copyFromLocalFile(srcPath, destPath)

Copying samples file to HDFS...


srcPath = /notebooks/obfuscate-geo-samples/fake-samples.csv
destPath = hdfs:/tmp/real-samples.csv


lastException: Throwable = null


hdfs:/tmp/real-samples.csv

In [5]:
println("Load CSV from HDFS to Dataframe")
val df = spark.read
          .format("com.databricks.spark.csv")
          .option("inferSchema", "true")
          .option("header", "true")
          .load(hdfsPrefix + "/tmp/real-samples.csv")
// df.describe().show()

Load CSV from HDFS to Dataframe


df = [time: timestamp, personId: string ... 3 more fields]


[time: timestamp, personId: string ... 3 more fields]

In [14]:
println("Create partitioned file by personId in HDFS")
import org.apache.spark.HashPartitioner
val df1 = df.repartition(50, $"personId")
df1.write
   .format("com.databricks.spark.csv")
   .mode("overwrite")
   .save(hdfsPrefix + "/tmp/real-samples-partitioned.csv")

Create partitioned file by personId in HDFS


lastException = null


Name: java.util.NoSuchElementException
Message: None.get
StackTrace:   at scala.None$.get(Option.scala:347)
  at scala.None$.get(Option.scala:345)
  at org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker$.metrics(BasicWriteStatsTracker.scala:173)
  at org.apache.spark.sql.execution.command.DataWritingCommand$class.metrics(DataWritingCommand.scala:51)
  at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics$lzycompute(InsertIntoHadoopFsRelationCommand.scala:47)
  at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics(InsertIntoHadoopFsRelationCommand.scala:47)
  at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics$lzycompute(commands.scala:100)
  at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics(commands.scala:100)
  at org.apache.spark.sql.execution.SparkPlanInfo$.fromSparkPlan(SparkPlanInfo.scala:56)
  at org.apache.spark.sql.execution.SQLExecution$$anonfun$wit

In [7]:
println("Load partitioned CSV from HDFS")
val df2 = spark.read
    .format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(hdfsPrefix + "/tmp/real-samples-partitioned.csv")

println("Partitions loaded: " + df2.rdd.partitions.size)

Load partitioned CSV from HDFS
Partitions loaded: 2


df2 = [2019-04-18T03:44:31.000Z: timestamp, bbb: string ... 3 more fields]


[2019-04-18T03:44:31.000Z: timestamp, bbb: string ... 3 more fields]

In [8]:
import java.util.UUID.randomUUID

println("Group by user")
val rdd2 = df2.rdd.groupBy(r => r(1))

println("Create random ids and translate positions randomly for each user")
val rdd3 = rdd2.map(e => {
    val rnd = scala.util.Random
    val (k, v) = e
    val idd = randomUUID().toString
    val latd = rnd.nextFloat*0.1
    val lond = rnd.nextFloat*0.1
    val nm = v.map(a => {
        (a(0),idd,a(2).asInstanceOf[Double]+latd,a(3).asInstanceOf[Double]+lond,a(4))
    })
    (k, nm)
})

println("Flatten results")
val rdd4 = rdd3.flatMap(e => {
    val (k, v) = e
    v
})

println("Sort results by timestamp")
val rdd5 = rdd4.sortBy[String](e => {
    val (t, v, v2, v3, v4) = e
    t.toString
})
val rdd6 = rdd5.map(e => {
    (e._1.toString, e._2.toString, e._3.toString, e._4.toString, e._5.toString)
})

// rdd5.foreach(e => {
//   println("#3333#" + e)
// })

Group by user
Create random ids and translate positions randomly for each user
Flatten results
Sort results by timestamp


rdd2 = ShuffledRDD[32] at groupBy at <console>:37
rdd3 = MapPartitionsRDD[33] at map at <console>:40
rdd4 = MapPartitionsRDD[34] at flatMap at <console>:53
rdd5 = MapPartitionsRDD[39] at sortBy at <console>:59
rdd6 = MapPartitionsRDD[40] at map at <console>:63


MapPartitionsRDD[40] at map at <console>:63

### Save obfuscated file

In [9]:
println("Saving to HDFS")
val fp = hdfsPrefix + "/output/obfuscated-samples.csv"
hdfs.delete(new org.apache.hadoop.fs.Path(fp), true)
rdd6.saveAsTextFile(fp)

Saving to HDFS


fp = hdfs://namenode1:8020/output/obfuscated-samples.csv


hdfs://namenode1:8020/output/obfuscated-samples.csv

In [10]:
import java.io.{BufferedWriter, FileWriter}
import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import au.com.bytecode.opencsv.CSVWriter
import scala.util.Random

println("Save obfuscated file to workspace")
val obr = rdd6.collect()
val obr2 = obr.toList.map(e => Array(e._1.toString, e._2.toString, e._3.toString, e._4.toString, e._5.toString))
val outputFile = new BufferedWriter(new FileWriter("/notebooks/obfuscate-geo-samples/obfuscated-samples.csv"))
val csvWriter = new CSVWriter(outputFile, ',', CSVWriter.NO_QUOTE_CHARACTER)
val csvSchema = Array("time", "person_id", "lat", "lon", "type")
println("Writing CSV file")
csvWriter.writeNext(csvSchema)
csvWriter.writeAll(obr2.toList)
outputFile.close()

Save obfuscated file to workspace
Writing CSV file


obr = Array((2019-04-18 03:49:11.0,5535f25f-0b1d-40c2-a228-692363f9d9d1,-8.564107401733398,-31.26030477534485,gps), (2019-04-18 03:55:33.0,f9e65ed5-f4aa-4624-8e1f-25e3fa44b062,-22.093605798416135,-43.417351112451556,gps), (2019-04-18 03:57:52.0,2aa7fe75-4fb9-456e-b574-adab37f46996,-7.656091567745208,-34.14666087457276,gps), (2019-04-18 04:03:31.0,56ed6483-d2e7-4382-a79d-93aaa1e5f7b3,-3.405296737012863,-60.96523625393486,gps), (2019-04-18 04:09:27.0,5535f25f-0b1d-40c2-a228-692363f9d9d1,-8.564119401733398,-32.26032277534485,gps), (2019-04-18 04:17:3...


Array((2019-04-18 03:49:11.0,5535f25f-0b1d-40c2-a228-692363f9d9d1,-8.564107401733398,-31.26030477534485,gps), (2019-04-18 03:55:33.0,f9e65ed5-f4aa-4624-8e1f-25e3fa44b062,-22.093605798416135,-43.417351112451556,gps), (2019-04-18 03:57:52.0,2aa7fe75-4fb9-456e-b574-adab37f46996,-7.656091567745208,-34.14666087457276,gps), (2019-04-18 04:03:31.0,56ed6483-d2e7-4382-a79d-93aaa1e5f7b3,-3.405296737012863,-60.96523625393486,gps), (2019-04-18 04:09:27.0,5535f25f-0b1d-40c2-a228-692363f9d9d1,-8.564119401733398,-32.26032277534485,gps), (2019-04-18 04:17:3...

### Remove temp files

In [11]:
val tmpPath = new Path("/tmp")
hdfs.delete(tmpPath, true)

tmpPath = /tmp


true

### Stop application

In [12]:
println("Stop Spark session")
spark.stop()
//if you look in Spark Master UI, no application will be running after stop

Stop Spark session
