### Initialize HDFS client

In [1]:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path

// Base URL of the HDFS NameNode, read once from the environment and reused below so the
// path prefix and the client configuration can never point at different clusters.
// Fails fast with a descriptive message when the variable is missing.
val hdfsPrefix: String = sys.env.getOrElse(
  "HDFS_URL",
  throw new NoSuchElementException(
    "Environment variable HDFS_URL is not set (expected something like hdfs://namenode:8020)"
  )
)

// Hadoop configuration whose default file system is the NameNode above.
val hadoopConf = new Configuration()
hadoopConf.set("fs.defaultFS", hdfsPrefix)

// File system handle resolved from `fs.defaultFS`; later cells use it to copy and delete files.
val hdfs = FileSystem.get(hadoopConf)

hdfsPrefix = hdfs://namenode1:8020
hadoopConf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
hdfs = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1885073787_40, ugi=root (auth:SIMPLE)]]


DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1885073787_40, ugi=root (auth:SIMPLE)]]

### Initialize Spark session

In [2]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Step 1: publish the application jar on HDFS. `spark.jars` below points at the HDFS copy,
// so the jar is reachable from the workers and not only from this machine's local disk.
println("Copying Application Jar to HDFS so that workers can see it too")
val rnd = scala.util.Random // NOTE(review): not referenced in the visible cells; confirm before removing
val appJarPath = "/app/csv-to-kafka.jar"
hdfs.copyFromLocalFile(new Path("/app/app.jar"), new Path(appJarPath))

// Step 2: build the Spark configuration and obtain (or reuse) the session.
println("Initializing Spark context...")
val conf = new SparkConf()
  .setAppName("CSV to Kafka")
  .setAll(Seq(
    "spark.cores.max" -> "2",                  // cap the cores this application may claim
    "spark.jars" -> s"$hdfsPrefix$appJarPath"  // full HDFS URL of the jar uploaded above
  ))
val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()
// From this point on, the Spark Master UI shows an application as running.

Copying Application Jar to HDFS so that workers can see it too
Initializing Spark context...


rnd = scala.util.Random$@3ed733e9
appJarPath = /app/csv-to-kafka.jar
conf = org.apache.spark.SparkConf@6a6e5077
spark = org.apache.spark.sql.SparkSession@7da32949


org.apache.spark.sql.SparkSession@7da32949

### Load partitioned CSV data from HDFS

In [3]:
println("Load partitioned CSV from HDFS")
// Read the CSV with Spark's built-in CSV data source rather than the legacy
// "com.databricks.spark.csv" alias of the former external spark-csv package.
// The first line is treated as a header and column types are inferred from the data.
val df = spark.read
  .option("inferSchema", "true")
  .option("header", "true")
  .csv(s"$hdfsPrefix/output/obfuscated-samples.csv")

// Report how many partitions the load produced, then print summary statistics per column.
println(s"Partitions loaded: ${df.rdd.partitions.length}")
df.describe().show()

Load partitioned CSV from HDFS
Partitions loaded: 2
+-------+--------------------+------------------+-------------------+----+
|summary|            personId|               lat|                lon|type|
+-------+--------------------+------------------+-------------------+----+
|  count|                   4|                 4|                  4|   4|
|   mean|                null|-5.966223979991914|  -49.1150839571495|null|
| stddev|                null|3.1309343545864228| 18.583792823102733|null|
|    min|f57ded61-bb16-4f5...|-8.824724236896516| -68.94285644612884| gps|
|    max|fe398302-eace-4ed...|-3.157759723087311|-32.287343468170164| gps|
+-------+--------------------+------------------+-------------------+----+



df = [time: timestamp, personId: string ... 3 more fields]


[time: timestamp, personId: string ... 3 more fields]

### Send data to Kafka topic

In [4]:
import org.apache.spark.sql.avro._
// Address of the schema registry service.
// NOTE(review): no visible cell reads this value (the Kafka write below emits JSON); confirm it is still needed.
val schemaRegistryAddr: String = "http://schema-registry:8081"


schemaRegistryAddr = http://schema-registry:8081


http://schema-registry:8081

In [8]:
println("Writing dataframe locations to Kafka topic 'locations'")
// Shape every row into the key/value pair the Kafka sink expects:
//   key   = personId (as a string)
//   value = the whole row serialised as a JSON object
df.selectExpr(
    "CAST(personId AS STRING) AS key",
    "to_json(struct(*)) AS value"
  )
  .write
  .format("kafka")
  .options(Map(
    "kafka.bootstrap.servers" -> "kafka:29092",
    "topic" -> "locations"
  ))
  .save()
println("done")

Writing dataframe locations to Kafka topic 'locations'
done


lastException: Throwable = null


### Cleanup

In [None]:
// Remove the application jar that was uploaded to HDFS for the workers.
// FileSystem.delete takes a Path, not a String, so the path string must be wrapped;
// the second argument (true) requests a recursive delete.
hdfs.delete(new Path(appJarPath), true)

In [25]:
// Shut down the Spark session created earlier in this notebook.
println("Stop Spark session")
spark.stop()
// After stop(), the Spark Master UI no longer lists a running application for this notebook.

Stop Spark session
