# Spylon Kernel Test with Spark 2.4.4

I use a local SBT installation via /misc/build/0/classes.

This must use the same Scala version as Spark - which is 2.11

# Initialization of the spark context

Note that we can set things like driver memory etc.

If `launcher._spark_home` is not set it will default to looking at the `SPARK_HOME` environment variable.

I run on a cluster owned by the hadoop user who is a member of my group devel.

I build new features for Scala and access them via /misc/build/0/classes. I have to restart the kernel to access any new classes. And must relaunch Spark to access changes.

I can't change the spark.sql.warehouse.dir

In [1]:
%%init_spark
launcher.num_executors = 4
launcher.executor_cores = 2
launcher.driver_memory = '4g'
launcher.conf.set("spark.sql.warehouse.dir", "file:/misc/build/0/spark-eg0/spark-warehouse")
launcher.conf.set("spark.sql.catalogImplementation", "hive")
launcher.conf.set("spark.hadoop.fs.permissions.umask-mode", "002")
launcher.conf.set("spark.driver.extraClassPath", ":/misc/build/0/classes/:/usr/share/java/postgresql.jar")

In [2]:
import artikus.spark.U

Intitializing Scala interpreter ...

Spark Web UI available at http://k1:8088/proxy/application_1579265248054_0002
SparkContext available as 'sc' (version = 2.4.4, master = yarn, app id = application_1579265248054_0002)
SparkSession available as 'spark'


import artikus.spark.U


In [3]:
val cl = spark.getClass().getClassLoader()
cl.asInstanceOf[java.net.URLClassLoader].getURLs.map(x => x.toString())

cl: ClassLoader = sun.misc.Launcher$AppClassLoader@7106e68e
res0: Array[String] = Array(file:/misc/build/0/spark-eg0/target/scala-2.11/classes/, file:/usr/share/java/postgresql.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/conf/, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/commons-net-3.1.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/jackson-jaxrs-1.9.13.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/jsr305-1.3.9.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/mesos-1.4.0-shaded-protobuf.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/hadoop-annotations-2.7.3.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/logging-interceptor-3.12.0.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/generex-1.0.1.jar, file:/misc/share/0/spark-2.4.4-bin...

In [4]:
// These are from the /misc/build/0/classes
U.identity
U.printClass(spark)
U.alert("hello")

class org.apache.spark.sql.SparkSession
hello


In [5]:
U.classes(spark)

res2: Seq[String] = ArraySeq(file:/misc/build/0/spark-eg0/target/scala-2.11/classes/, file:/usr/share/java/postgresql.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/conf/, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/commons-net-3.1.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/jackson-jaxrs-1.9.13.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/jsr305-1.3.9.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/mesos-1.4.0-shaded-protobuf.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/hadoop-annotations-2.7.3.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/logging-interceptor-3.12.0.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/generex-1.0.1.jar, file:/misc/share/0/spark-2.4.4-bin-hadoop2.7/jars/hadoop-mapreduce-client-core-2.7.3.jar, fil...

In [6]:
U.flist(".")

res3: List[java.io.File] = List(./spark-util.ipynb, ./classpath.lst, ./text1.csv, ./akka.log, ./README, ./build.sbt, ./spark0.ipynb, ./.gitmodules, ./.gitignore, ./rev-users.csv, ./.sbtopts, ./derby.log, ./weaves.log, ./.Rhistory, ./rev-devices.csv, ./.emacs.desktop, ./flights.csv, ./LICENSE, ./spark1.ipynb)


In [7]:
%%python
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Of no use for a Spylon notebook

## Spark Configuration

Some basic operations.


In [8]:
spark // spark is the SQL session

res4: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@46b129f5


In [9]:
spark.version

res5: String = 2.4.4


In [10]:
spark.conf.getAll foreach (x => println(x._1 + " --> " + x._2))

spark.sql.warehouse.dir --> file:/misc/build/0/spark-eg0/spark-warehouse
spark.hadoop.fs.permissions.umask-mode --> 002
spark.driver.host --> j1.host
spark.serializer.objectStreamReset --> 100
spark.driver.port --> 36643
spark.rdd.compress --> True
spark.executorEnv.PYTHONPATH --> {{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.7-src.zip
spark.repl.class.uri --> spark://j1.host:36643/classes
spark.repl.class.outputDir --> /var/tmp/tmpis43fpb2
spark.app.name --> spylon-kernel
spark.driver.memory --> 4g
spark.executor.instances --> 4
spark.ui.showConsoleProgress --> true
spark.executor.id --> driver
spark.submit.deployMode --> client
spark.master --> yarn
spark.yarn.archive --> hdfs:///user/spark/spark-libs-2_4_4.jar
spark.ui.filters --> org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
spark.driver.extraClassPath --> :/misc/build/0/classes/:/usr/share/java/postgresql.jar
spark.executor.extraClassPath --> /misc/build/0/classes:/usr/share/java/postgresql.jar
spark.sql.catalogImplementa

In [11]:
val dbs = spark.catalog.listDatabases()

dbs: org.apache.spark.sql.Dataset[org.apache.spark.sql.catalog.Database] = [name: string, description: string ... 1 more field]


In [12]:
U.printClass(dbs)
dbs.show()

class org.apache.spark.sql.Dataset
+-------+--------------------+--------------------+
|   name|         description|         locationUri|
+-------+--------------------+--------------------+
|default|Default Hive data...|file:/misc/build/...|
+-------+--------------------+--------------------+



In [13]:
val d0 = spark.catalog.listDatabases().take(1)
d0(0).locationUri

d0: Array[org.apache.spark.sql.catalog.Database] = Array(Database[name='default', description='Default Hive database', path='file:/misc/build/0/spark-srcs/spark-warehouse'])
res8: String = file:/misc/build/0/spark-srcs/spark-warehouse


## SparkSession operations

Basic operations
https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-SparkSession.html#createDataset

In [14]:
val strings = spark.emptyDataset[String]
strings.printSchema

root
 |-- value: string (nullable = true)



strings: org.apache.spark.sql.Dataset[String] = [value: string]


In [15]:
val one = spark.createDataset(Seq(1))
one.show
one.printSchema

+-----+
|value|
+-----+
|    1|
+-----+

root
 |-- value: integer (nullable = false)



one: org.apache.spark.sql.Dataset[Int] = [value: int]


In [16]:
// Use an implicit requires a "spark" in the namespace.
import spark.implicits._

val one = Seq(1).toDS
one.show
one.printSchema

+-----+
|value|
+-----+
|    1|
+-----+

root
 |-- value: integer (nullable = false)



import spark.implicits._
one: org.apache.spark.sql.Dataset[Int] = [value: int]


In [17]:
// Using spark.range()
val range0 = spark.range(start = 0, end = 4, step = 2, numPartitions = 5)
range0.show

+---+
| id|
+---+
|  0|
|  2|
+---+



range0: org.apache.spark.sql.Dataset[Long] = [id: bigint]


In [18]:
// More packing

In [19]:
val sc = spark.sparkContext

sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@1d547714


In [20]:
val data = Seq("a", "b", "c", "d") zip (0 to 4)

U.printClass(data)

class scala.collection.immutable.$colon$colon


data: Seq[(String, Int)] = List((a,0), (b,1), (c,2), (d,3))


In [21]:
val data = Seq("foo", "bar", "baz") zip 1 :: 2 :: 3 :: Nil
val data1 = Seq("foo", "bar", "bar") zip 4 :: 5 :: 6 :: Nil

data: Seq[(String, Int)] = List((foo,1), (bar,2), (baz,3))
data1: Seq[(String, Int)] = List((foo,4), (bar,5), (bar,6))


In [22]:
val ds = spark.createDataset(data)

val ds1 = sc.parallelize(data)

U.printClass(ds)
U.printClass(ds1)

val ds2 = sc.parallelize(data1)

ds1.join(ds2).take(5)

class org.apache.spark.sql.Dataset
class org.apache.spark.rdd.ParallelCollectionRDD


ds: org.apache.spark.sql.Dataset[(String, Int)] = [_1: string, _2: int]
ds1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[5] at parallelize at <console>:35
ds2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[6] at parallelize at <console>:40
res15: Array[(String, (Int, Int))] = Array((bar,(2,6)), (bar,(2,5)), (foo,(1,4)))


In [23]:
// Local file URI
// non-existent file loads
// /misc/build/0/prog-scala-2nd-ed-code-examples
val local2 = U.local1(".")

local2: String => java.net.URI = <function1>


In [24]:
val f1 = "rev-users.csv"
val file = sc.textFile(local2(f1).toString())
U.printClass(file)

class org.apache.spark.rdd.MapPartitionsRDD


f1: String = rev-users.csv
file: org.apache.spark.rdd.RDD[String] = file:/misc/build/0/spark-eg0/rev-users.csv MapPartitionsRDD[11] at textFile at <console>:33


In [25]:
// This file has a header row
// Take the first row, index into it, split and return a sequence
val h2 = file.take(1)(0).split(",").toSeq

// Get the remainder by using subtract
// convert the header row back to an RDD using parallelize
val r1 = file.subtract(sc.parallelize(file.take(1)))

h2: Seq[String] = WrappedArray(user_id, birth_year, country, city, created_date, user_settings_crypto_unlocked, plan, attributes_notifications_marketing_push, attributes_notifications_marketing_email, num_contacts, num_referrals, num_successful_referrals)
r1: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[16] at subtract at <console>:38


In [26]:
// Look at the underlying row
r1.take(1)

res17: Array[String] = Array(user_1113,1954,GB,Billericay,2018-01-26 07:34:13.040468,1,PREMIUM,,,2,0,0)


In [27]:
// Now map over the quantities
// The transformations are only applied when we take(), use the column names from h2.
val df0 = r1.map(_.split(",")).map{case Array(a,b,c,d,e,f,g,h,i,j,k,l) => 
(a,b.toInt,c,d,e,f.toInt,g,h,i,j.toInt,k.toInt,l.toInt)}.toDF(h2:_*)
df0.take(1)

df0: org.apache.spark.sql.DataFrame = [user_id: string, birth_year: int ... 10 more fields]
res18: Array[org.apache.spark.sql.Row] = Array([user_1113,1954,GB,Billericay,2018-01-26 07:34:13.040468,1,PREMIUM,,,2,0,0])


In [28]:
val f2 = "rev-devices.csv"
val file2 = sc.textFile(local2(f2).toString())
U.printClass(file2)

class org.apache.spark.rdd.MapPartitionsRDD


f2: String = rev-devices.csv
file2: org.apache.spark.rdd.RDD[String] = file:/misc/build/0/spark-eg0/rev-devices.csv MapPartitionsRDD[24] at textFile at <console>:33


In [31]:
// But error results here if file does not exist
// Or returns empty array if it is empty
val lens = file.map(s => s.length)
file.take(5)
lens.take(5)

lens: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[33] at map at <console>:34
res22: Array[Int] = Array(212, 73, 68, 74, 72)


In [30]:
val x0 = file.take(1)

// Some arbitrary file processing - append a number to each line
val pairs = file.map(s => (s, 911))
val counts = pairs.reduceByKey((a, b) => a + b)

In [30]:
val counts1 = counts.repartition(1)

U.rmdir("counts1")
counts1.saveAsTextFile(local2("counts1").toString())

org.apache.spark.SparkException:  Job aborted.

In [None]:
val pairs = file.map(x => (x.split(",")(0), x))

val pairs1 = pairs.join(pairs)

In [None]:
// Make some (K, V) tuples

println(x0(0))

val x1 = x0(0).split(",").toSeq

In [None]:
val df0 = file.map(_.split(",")).map{case Array(a,b,c,d,e,f,g,h,i,j,k,l) => 
(a,b,c,d,e,f,g,h,i,j,k,l)}.toDF(x1:_*)

In [None]:
// The x1:_* is to be preferred to this

// val fileToDf = file.map(_.split(",")).map{case Array(a,b,c,d,e,f,g,h,i,j,k,l) => 
// (a,b,c,d,e,f,g,h,i,j,k,l)}.toDF("user_id", "birth_year", "country", "city", "created_date", "user_settings_crypto_unlocked", "plan", "attributes_notifications_marketing_push", "attributes_notifications_marketing_email", "num_contacts", "num_referrals", "num_successful_referrals")

In [None]:
val df0 = file.map(_.split(",")).map{case Array(a,b,c,d,e,f,g,h,i,j,k,l) => 
(a,b.toInt,c,d,e,f,g,h,i,j,k,l)}.toDF(x1:_*)

In [None]:
fileToDf.show(3)

In [None]:
file.map(_.split(",")).take(1)

In [None]:
val df1 = file.subtract(sc.parallelize(file.take(1)))

In [None]:
U.printClass(sc)

In [None]:
df1.take(1)

In [None]:
def split(f1:String, sep:String)(implicit sc: org.apache.spark.SparkContext) : org.apache.spark.rdd.RDD[String] = {
    val f = sc.textFile(f1)
    return f
}

In [None]:
split(local2(f1).toString(), ",")(sc)

In [None]:
U.printClass(sc)