### Initialize Spark session

In [2]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SaveMode

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path

println("Initializing Spark context...")
val conf = new SparkConf()
            .setAppName("Obfuscate samples")
            .set("spark.cores.max", "2")
val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()
//if you look in Spark Master UI, a application will be running after this

Initializing Spark context...


conf = org.apache.spark.SparkConf@7f270858
spark = org.apache.spark.sql.SparkSession@1a27ceab


org.apache.spark.sql.SparkSession@1a27ceab

### Initialize HDFS client

In [3]:
val hdfsPrefix = sys.env("HDFS_URL")
val hadoopConf = new Configuration()
hadoopConf.set("fs.defaultFS", sys.env("HDFS_URL"))
val hdfs = FileSystem.get(hadoopConf)

hdfsPrefix = hdfs://namenode1:8020
hadoopConf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
hdfs = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_705393058_42, ugi=root (auth:SIMPLE)]]


DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_705393058_42, ugi=root (auth:SIMPLE)]]

### Load real samples and obfuscate them

In [4]:
println("Copying samples file to HDFS...")
//NEVER SAVE REAL SAMPLES IN GIT!
val srcPath = new Path("/notebooks/obfuscate-geo-samples/real-samples.csv")
val destPath = new Path("hdfs:///tmp/real-samples.csv")
hdfs.copyFromLocalFile(srcPath, destPath)

Copying samples file to HDFS...


srcPath = /notebooks/obfuscate-geo-samples/real-samples.csv
destPath = hdfs:/tmp/real-samples.csv


hdfs:/tmp/real-samples.csv

In [14]:
println("Load CSV from HDFS to Dataframe")
val df = spark.read
          .format("com.databricks.spark.csv")
          .option("inferSchema", "true")
          .option("header", "true")
          .load(hdfsPrefix + "/tmp/real-samples.csv")
println("Partitions: " + df.rdd.partitions.size)

Load CSV from HDFS to Dataframe
Partitions: 2


df = [time: timestamp, personId: string ... 3 more fields]


[time: timestamp, personId: string ... 3 more fields]

In [17]:
println("Create partitioned file by personId in HDFS")
import org.apache.spark.HashPartitioner
val df1 = df.repartition(50, $"personId")
println("Partitions: " + df1.rdd.partitions.size)
df1.write
   .format("csv")
   .mode("overwrite")
   .save(hdfsPrefix + "/tmp/real-samples-partitioned.csv")

Create partitioned file by personId in HDFS
Partitions: 50


df1 = [time: timestamp, personId: string ... 3 more fields]


[time: timestamp, personId: string ... 3 more fields]

In [19]:
println("Load partitioned CSV from HDFS")
val df2 = spark.read
    .format("com.databricks.spark.csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(hdfsPrefix + "/tmp/real-samples-partitioned.csv")

println("Partitions loaded: " + df2.rdd.partitions.size)

Load partitioned CSV from HDFS
Partitions loaded: 2


df2 = [2020-04-18T03:43:16.000Z: timestamp, AAAAAAAAAAAAAAAAAAAAAKVBLyUmGgdtVg: string ... 3 more fields]


[2020-04-18T03:43:16.000Z: timestamp, AAAAAAAAAAAAAAAAAAAAAKVBLyUmGgdtVg: string ... 3 more fields]

In [20]:
import java.util.UUID.randomUUID

println("Group by user")
val rdd2 = df2.rdd.groupBy(r => r(1))

println("Create random ids and translate positions randomly for each user")
val rdd3 = rdd2.map(e => {
    val rnd = scala.util.Random
    val (k, v) = e
    val idd = randomUUID().toString
    val latd = rnd.nextFloat*0.1
    val lond = rnd.nextFloat*0.1
    val nm = v.map(a => {
        (a(0),idd,a(2).asInstanceOf[Double]+latd,a(3).asInstanceOf[Double]+lond,a(4))
    })
    (k, nm)
})

println("Flatten results")
val rdd4 = rdd3.flatMap(e => {
    val (k, v) = e
    v
})

println("Sort results by timestamp")
val rdd5 = rdd4.sortBy[String](e => {
    val (t, v, v2, v3, v4) = e
    t.toString
})
val rdd6 = rdd5.map(e => {
    (e._1.toString, e._2.toString, e._3.toString, e._4.toString, e._5.toString)
})

// rdd5.foreach(e => {
//   println("#3333#" + e)
// })

Group by user
Create random ids and translate positions randomly for each user
Flatten results
Sort results by timestamp


rdd2 = ShuffledRDD[173] at groupBy at <console>:61
rdd3 = MapPartitionsRDD[174] at map at <console>:64
rdd4 = MapPartitionsRDD[175] at flatMap at <console>:77
rdd5 = MapPartitionsRDD[180] at sortBy at <console>:83
rdd6 = MapPartitionsRDD[181] at map at <console>:87


MapPartitionsRDD[181] at map at <console>:87

### Save obfuscated file

In [None]:
println("Saving to HDFS")

val fp = hdfsPrefix + "/output/obfuscated-samples.csv"
hdfs.delete(new org.apache.hadoop.fs.Path(fp), true)
val df = rdd6.toDF("time","personId","lat","lon","type")
val df22 = df.repartition(50, $"personId")
df22.write
    .format("csv")
    .option("header", "true")
    .save(fp)
println("Saved obfuscated samples using " + df22.rdd.getNumPartitions + " partitions")

Saving to HDFS


In [9]:
import java.io.{BufferedWriter, FileWriter}
import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import au.com.bytecode.opencsv.CSVWriter
import scala.util.Random

println("Save obfuscated file to workspace")
val obr = rdd6.collect()
val obr2 = obr.toList.map(e => Array(e._1.toString, e._2.toString, e._3.toString, e._4.toString, e._5.toString))
val outputFile = new BufferedWriter(new FileWriter("/notebooks/obfuscate-geo-samples/obfuscated-samples.csv"))
val csvWriter = new CSVWriter(outputFile, ',', CSVWriter.NO_QUOTE_CHARACTER)
val csvSchema = Array("time", "person_id", "lat", "lon", "type")
println("Writing CSV file")
csvWriter.writeNext(csvSchema)
csvWriter.writeAll(obr2.toList)
outputFile.close()

Save obfuscated file to workspace
Writing CSV file


obr = Array((2020-04-18 04:03:31.0,a61813bc-67f9-400d-a9b3-8b99f8db4bb0,-2.9402094603157045,-59.944673426347734,gps), (2020-04-18 04:09:57.0,96f39f44-e120-4159-a562-9a64cdf9c845,-8.119173935214995,-35.31061597655297,gps), (2020-04-18 04:24:21.0,a61813bc-67f9-400d-a9b3-8b99f8db4bb0,-2.9402204603157043,-59.944698426347735,gps), (2020-04-18 04:30:05.0,96f39f44-e120-4159-a562-9a64cdf9c845,-8.119234935214996,-35.310576976552966,gps), (2020-04-18 04:38:06.0,f4e3e5cb-2439-4de3-8657-b2d9037eefc6,-27.55300440788078,-48.42691769392586,gps), (2020-04-18 04:4...


Array((2020-04-18 04:03:31.0,a61813bc-67f9-400d-a9b3-8b99f8db4bb0,-2.9402094603157045,-59.944673426347734,gps), (2020-04-18 04:09:57.0,96f39f44-e120-4159-a562-9a64cdf9c845,-8.119173935214995,-35.31061597655297,gps), (2020-04-18 04:24:21.0,a61813bc-67f9-400d-a9b3-8b99f8db4bb0,-2.9402204603157043,-59.944698426347735,gps), (2020-04-18 04:30:05.0,96f39f44-e120-4159-a562-9a64cdf9c845,-8.119234935214996,-35.310576976552966,gps), (2020-04-18 04:38:06.0,f4e3e5cb-2439-4de3-8657-b2d9037eefc6,-27.55300440788078,-48.42691769392586,gps), (2020-04-18 04:4...

### Remove temp files

In [10]:
val tmpPath = new Path("/tmp")
hdfs.delete(tmpPath, true)

tmpPath = /tmp


true

### Stop application

In [11]:
println("Stop Spark session")
spark.stop()
//if you look in Spark Master UI, no application will be running after stop

Stop Spark session


Waiting for a Spark session to start...