# Spylon Kernel Test with Spark 3.4.0

This notebook has been updated from Spark 2.4. Classes built with a local SBT installation are picked up from /misc/build/0/classes.

This must use the same Scala version as Spark - which is 2.13 (it was 2.11).

I haven't recompiled the Scala source code in src - the artikus.spark classes.

Once a Spark context is instantiated, its web UI should be accessible at http://j1:4040 if the host of this notebook is j1. The hostname is the value of `spark.driver.host`.

## Initialization of the spark context

Note that we can set things like driver memory etc.

If `launcher._spark_home` is not set it will default to looking at the `SPARK_HOME` environment variable.

I run on a cluster owned by the hadoop user who is a member of my group devel.

I build new features in Scala and access them via /misc/build/0/classes. I have to restart the kernel to access any new classes, and I must relaunch Spark to pick up changes.

I can't change the spark.sql.warehouse.dir

In [None]:
%%init_spark
# Resources requested through the launcher: executor count, cores per executor, driver memory.
launcher.num_executors = 4
launcher.executor_cores = 2
launcher.driver_memory = '4g'
# Warehouse directory on the local file system, with the Hive catalog implementation.
launcher.conf.set("spark.sql.warehouse.dir", "file:/misc/build/0/spark-eg0/spark-warehouse")
launcher.conf.set("spark.sql.catalogImplementation", "hive")
# NOTE(review): umask 002 presumably keeps written files group-writable for the shared group - confirm.
launcher.conf.set("spark.hadoop.fs.permissions.umask-mode", "002")
# Driver classpath: the locally built classes plus postgresql.jar from /usr/share/java.
# NOTE(review): the value starts with ':' which adds an empty leading classpath entry - confirm this is intended.
launcher.conf.set("spark.driver.extraClassPath", ":/misc/build/0/classes/:/usr/share/java/postgresql.jar")

In [None]:
import artikus.spark.U

In [None]:
// List the URLs of the class loader that loaded the session's class.
// Only a URLClassLoader exposes its URLs. On Java 9+ the application class loader is
// not a URLClassLoader (and getClassLoader may return null), so match on the type
// instead of casting unconditionally, which would throw ClassCastException.
val cl = spark.getClass().getClassLoader()
cl match {
  case urlLoader: java.net.URLClassLoader => urlLoader.getURLs.map(_.toString)
  case other =>
    println(s"Class loader $other does not expose its URLs")
    Array.empty[String]
}

In [None]:
// Quick check that the helper object U (imported from artikus.spark) is reachable.
// These are from the /misc/build/0/classes
U.identity
U.printClass(spark)
U.alert("hello")

In [None]:
U.classes(spark)

In [None]:
U.flist(".")

In [None]:
%%python
# Ask IPython to echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Of no use for a Spylon notebook

## Spark Configuration

Some basic operations.


In [None]:
spark // spark is the SQL session

In [None]:
spark.version

In [None]:
// Print every runtime configuration entry as "key --> value", one entry per line.
spark.conf.getAll.foreach { case (key, value) => println(s"$key --> $value") }

In [None]:
// List the databases known to the session catalog; dbs is inspected in the next cell.
val dbs = spark.catalog.listDatabases()

In [None]:
// Pass dbs to the project helper U.printClass (defined in artikus.spark.U), then display its rows.
U.printClass(dbs)
dbs.show()

In [None]:
// Bring at most one catalog entry to the driver and read its location URI.
// d0(0) fails with an index error if no database was returned.
val d0 = spark.catalog.listDatabases().take(1)
d0(0).locationUri

## SparkSession operations

Basic operations
https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-SparkSession.html#createDataset

In [None]:
// An empty Dataset of String; printSchema prints the schema of that dataset.
val strings = spark.emptyDataset[String]
strings.printSchema

In [None]:
// A one-row Dataset built from a local Seq; display its contents and then its schema.
val one = spark.createDataset(Seq(1))
one.show
one.printSchema

In [None]:
// toDS is provided by the session's implicits, so a value named `spark` must be in
// scope for this import to resolve.
import spark.implicits._

// The same one-element data, this time built with the toDS syntax instead of createDataset.
val one = Seq(1).toDS
one.show
one.printSchema

In [None]:
// Using spark.range() with every argument named: values from 0 up to (but excluding) 4
// in steps of 2, requested across 5 partitions.
val range0 = spark.range(start = 0, end = 4, step = 2, numPartitions = 5)
range0.show

In [None]:
// More packing

In [None]:
// The SparkContext behind the session; used in later cells for the RDD API (parallelize, textFile).
val sc = spark.sparkContext

In [None]:
// Pair each letter with an index. The range 0 to 4 has five elements, but zip stops at
// the shorter side, so four pairs are produced.
val data = Seq("a", "b", "c", "d").zip(0 to 4)

U.printClass(data)

In [None]:
// Two small sequences of (String, Int) pairs; the key "bar" appears twice in data1.
val data = Seq("foo", "bar", "baz").zip(List(1, 2, 3))
val data1 = Seq("foo", "bar", "bar").zip(List(4, 5, 6))

In [None]:
// Dataset built from the local pairs in data.
val ds = spark.createDataset(data)

// The same pairs as an RDD. Despite the ds prefix, ds1 and ds2 come from
// sc.parallelize and are RDDs, not Datasets.
val ds1 = sc.parallelize(data)

U.printClass(ds)
U.printClass(ds1)

val ds2 = sc.parallelize(data1)

// Join the two pair RDDs on their String key and fetch up to five joined records.
ds1.join(ds2).take(5)

In [None]:
// Local file URI
// non-existent file loads
// /misc/build/0/prog-scala-2nd-ed-code-examples
// local2 is applied to a file name in later cells (local2(name).toString()) to get the
// path handed to sc.textFile. NOTE(review): U.local1 is a project helper - presumably it
// resolves names against the given directory; confirm in artikus.spark.U.
val local2 = U.local1(".")

In [None]:
// Resolve rev-users.csv through local2 and read it as an RDD of text lines.
val f1 = "rev-users.csv"
val file = sc.textFile(local2(f1).toString())
U.printClass(file)

In [None]:
// This file has a header row.
// Take the first line, split it on commas and keep the column names as a Seq.
// Indexing with (0) fails if the file is empty, because take(1) then returns an empty array.
val h2 = file.take(1)(0).split(",").toSeq

// Get the remaining lines with subtract: the header line is turned back into a
// one-element RDD with parallelize and removed from file. Every line equal to the
// header is removed, and file.take(1) is evaluated a second time here.
val r1 = file.subtract(sc.parallelize(file.take(1)))

In [None]:
// Look at the underlying row
r1.take(1)

In [None]:
// Parse each data line of r1 into a 12-tuple, converting five of the fields to Int, and
// label the columns with the header names in h2. Nothing runs until an action such as
// take() is called. The field names below follow the column list in the commented-out
// toDF call further down the notebook - confirm against the file.
val df0 = r1.map(_.split(",")).map {
  case Array(userId, birthYear, country, city, createdDate, cryptoUnlocked, plan,
             pushOptIn, emailOptIn, numContacts, numReferrals, numSuccessfulReferrals) =>
    (userId, birthYear.toInt, country, city, createdDate, cryptoUnlocked.toInt, plan,
     pushOptIn, emailOptIn, numContacts.toInt, numReferrals.toInt, numSuccessfulReferrals.toInt)
}.toDF(h2: _*)
df0.take(1)

In [None]:
// Resolve rev-devices.csv through local2 and read it as an RDD of text lines.
val f2 = "rev-devices.csv"
val file2 = sc.textFile(local2(f2).toString())
U.printClass(file2)

In [None]:
// Reading is lazy: a missing file only produces an error here, once take() runs,
// and an empty file gives back an empty array.
val lens = file.map(_.length)
file.take(5)
lens.take(5)

In [None]:
// Keep the first line of the file on the driver for later cells.
val x0 = file.take(1)

// Arbitrary processing example: tag every line with the constant 911, then add up the
// constants of identical lines.
val pairs = file.map(line => (line, 911))
val counts = pairs.reduceByKey(_ + _)

In [None]:
// Repartition counts into a single partition before writing it out as text.
val counts1 = counts.repartition(1)

// NOTE(review): U.rmdir is a project helper - presumably it clears an earlier "counts1"
// output so the save below can write to that location; confirm in artikus.spark.U.
U.rmdir("counts1")
counts1.saveAsTextFile(local2("counts1").toString())

In [None]:
// Key every line by its first comma-separated field, keeping the whole line as the value.
val pairs = file.keyBy(_.split(",")(0))

// Join the keyed lines with themselves on that key.
val pairs1 = pairs.join(pairs)

In [None]:
// Print the first line captured in x0, then split it on commas to get the column names (x1).

println(x0(0))

val x1 = x0(0).split(",").toSeq

In [None]:
// Turn every line of file into a row of twelve String columns named after x1.
// file still includes the header line, so that line becomes a row as well.
val df0 = file.map(_.split(",")).map {
  case Array(c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12) =>
    (c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12)
}.toDF(x1: _*)

In [None]:
// The x1:_* is to be preferred to this

// val fileToDf = file.map(_.split(",")).map{case Array(a,b,c,d,e,f,g,h,i,j,k,l) => 
// (a,b,c,d,e,f,g,h,i,j,k,l)}.toDF("user_id", "birth_year", "country", "city", "created_date", "user_settings_crypto_unlocked", "plan", "attributes_notifications_marketing_push", "attributes_notifications_marketing_email", "num_contacts", "num_referrals", "num_successful_referrals")

In [None]:
// Twelve-column frame named after x1, with the second column parsed as Int.
// file still contains the header line, whose second field is a column name rather than
// a number; without the filter, toInt throws NumberFormatException as soon as an action
// runs. The header text is copied into a block-local value so that only a String is
// captured by the filter closure.
val df0 = {
  val headerLine = x0(0)
  file.filter(_ != headerLine).map(_.split(",")).map {
    case Array(a, b, c, d, e, f, g, h, i, j, k, l) =>
      (a, b.toInt, c, d, e, f, g, h, i, j, k, l)
  }.toDF(x1: _*)
}

In [None]:
// fileToDf only exists in the commented-out example; the live DataFrame is df0.
df0.show(3)

In [None]:
file.map(_.split(",")).take(1)

In [None]:
// Drop the header by subtracting a one-element RDD holding the first line from file.
// Despite the df prefix, the result of subtract is an RDD of lines, not a DataFrame.
val df1 = file.subtract(sc.parallelize(file.take(1)))

In [None]:
U.printClass(sc)

In [None]:
df1.take(1)

In [None]:
/** Opens the text file at `f1` as an RDD of lines using the implicit SparkContext.
  *
  * @param f1  path or URI handed to `sc.textFile`
  * @param sep separator; accepted but not used by the current body
  * @param sc  Spark context used to read the file
  * @return the lines of the file, unsplit
  */
def split(f1: String, sep: String)(implicit sc: org.apache.spark.SparkContext): org.apache.spark.rdd.RDD[String] =
  sc.textFile(f1)

In [None]:
split(local2(f1).toString(), ",")(sc)

In [None]:
U.printClass(sc)