# SparkMagic

In [1]:
%help

Magic,Example,Explanation
info,%%info,Outputs session information for the current Livy endpoint.
cleanup,%%cleanup -f,"Deletes all sessions for the current Livy endpoint, including this notebook's session. The force flag is mandatory."
delete,%%delete -f -s 0,Deletes a session by number for the current Livy endpoint. Cannot delete this kernel's session.
logs,%%logs,Outputs the current session's Livy logs.
configure,"%%configure -f {""executorMemory"": ""1000M"", ""executorCores"": 4}",Configure the session creation parameters. The force flag is mandatory if a session has already been  created and the session will be dropped and recreated. Look at Livy's POST /sessions Request Body for a list of valid parameters. Parameters must be passed in as a JSON string.
spark,%%spark -o df df = spark.read.parquet('...,"Executes spark commands.  Parameters:  -o VAR_NAME: The Spark dataframe of name VAR_NAME will be available in the %%local Python context as a  Pandas dataframe with the same name.  -m METHOD: Sample method, either take or sample.  -n MAXROWS: The maximum number of rows of a dataframe that will be pulled from Livy to Jupyter.  If this number is negative, then the number of rows will be unlimited.  -r FRACTION: Fraction used for sampling."
sql,%%sql -o tables -q SHOW TABLES,"Executes a SQL query against the variable sqlContext (Spark v1.x) or spark (Spark v2.x).  Parameters:  -o VAR_NAME: The result of the SQL query will be available in the %%local Python context as a  Pandas dataframe.  -q: The magic will return None instead of the dataframe (no visualization).  -m, -n, -r are the same as the %%spark parameters above."
local,%%local a = 1,All the code in subsequent lines will be executed locally. Code must be valid Python code.


In [None]:
%%configure -f
{
    "name": "sparkmagic-demo",
    "kind": "spark",
    "driverMemory": "1000M",
    "numExecutors": 2,
    "executorCores": 4,
    "executorMemory": "4G"
}

In [2]:
%%spark
// Evaluate the version string of the session's SparkContext.
sc.version

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,,spark,idle,,,✔


SparkSession available as 'spark'.
res1: String = 2.1.0

In [3]:
// Look up which catalog implementation this SparkSession is configured with.
spark.conf.get("spark.sql.catalogImplementation")

res2: String = in-memory

In [4]:
// Dump every runtime setting of the SparkSession, one (key,value) tuple per line.
for (entry <- spark.conf.getAll) println(entry)

(spark.driver.host,172.26.0.2)
(spark.livy.spark_major_version,2)
(spark.driver.port,42821)
(hive.metastore.warehouse.dir,file:/usr/spark-2.1.0/spark-warehouse/)
(spark.repl.class.uri,spark://172.26.0.2:42821/classes)
(spark.jars,file:/apps/livy-server-0.3.0/rsc-jars/spark-tags_2.10-2.1.0.jar,file:/apps/livy-server-0.3.0/rsc-jars/livy-api-0.3.0.jar,file:/apps/livy-server-0.3.0/rsc-jars/unused-1.0.0.jar,file:/apps/livy-server-0.3.0/rsc-jars/netty-all-4.0.29.Final.jar,file:/apps/livy-server-0.3.0/rsc-jars/livy-rsc-0.3.0.jar,file:/apps/livy-server-0.3.0/repl_2.11-jars/commons-codec-1.9.jar,file:/apps/livy-server-0.3.0/repl_2.11-jars/livy-core_2.11-0.3.0.jar,file:/apps/livy-server-0.3.0/repl_2.11-jars/livy-repl_2.11-0.3.0.jar)
(spark.repl.class.outputDir,/tmp/spark793968166953382542)
(spark.app.name,livy-session-1)
(spark.driver.memory,1000M)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.yarn.maxAppAttempts,1)
(spark.master,local)
(spark.yarn.submit.waitAppCompletion,f

In [5]:
// Print the Spark version, then each SparkContext setting as a (key,value) tuple.
println(sc.version)
for (setting <- sc.getConf.getAll) println(setting)

(spark.master,local)
(spark.repl.class.uri,spark://172.26.0.2:42821/classes)
(spark.sql.catalogImplementation,in-memory)
(spark.app.name,livy-session-1)
(spark.driver.host,172.26.0.2)
(spark.yarn.submit.waitAppCompletion,false)
(spark.executor.id,driver)
(spark.executor.cores,2)
(spark.driver.port,42821)
(spark.jars,file:/apps/livy-server-0.3.0/rsc-jars/spark-tags_2.10-2.1.0.jar,file:/apps/livy-server-0.3.0/rsc-jars/livy-api-0.3.0.jar,file:/apps/livy-server-0.3.0/rsc-jars/unused-1.0.0.jar,file:/apps/livy-server-0.3.0/rsc-jars/netty-all-4.0.29.Final.jar,file:/apps/livy-server-0.3.0/rsc-jars/livy-rsc-0.3.0.jar,file:/apps/livy-server-0.3.0/repl_2.11-jars/commons-codec-1.9.jar,file:/apps/livy-server-0.3.0/repl_2.11-jars/livy-core_2.11-0.3.0.jar,file:/apps/livy-server-0.3.0/repl_2.11-jars/livy-repl_2.11-0.3.0.jar)
(spark.app.id,local-1600530350627)
(spark.yarn.maxAppAttempts,1)
(spark.livy.spark_major_version,2)
(spark.submit.deployMode,client)
(spark.repl.class.outputDir,/tmp/spark79396816

# Data loading

### Create DataFrame from CSV file

In [6]:
// Load the insurance sample CSV into a DataFrame: the first row supplies the column
// names, column types are inferred from the data, and fields are comma-separated.
// Trailing dots keep the chained call valid when the REPL reads it line by line.
val df = spark.read.
    options(Map("header" -> "true", "inferSchema" -> "true", "delimiter" -> ",")).
    csv("file:///data/insurance_sample.csv")

df: org.apache.spark.sql.DataFrame = [policyID: int, statecode: string ... 16 more fields]

In [7]:
// Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- policyID: integer (nullable = true)
 |-- statecode: string (nullable = true)
 |-- county: string (nullable = true)
 |-- eq_site_limit: double (nullable = true)
 |-- hu_site_limit: double (nullable = true)
 |-- fl_site_limit: double (nullable = true)
 |-- fr_site_limit: double (nullable = true)
 |-- tiv_2011: double (nullable = true)
 |-- tiv_2012: double (nullable = true)
 |-- eq_site_deductible: double (nullable = true)
 |-- hu_site_deductible: double (nullable = true)
 |-- fl_site_deductible: double (nullable = true)
 |-- fr_site_deductible: integer (nullable = true)
 |-- point_latitude: double (nullable = true)
 |-- point_longitude: double (nullable = true)
 |-- line: string (nullable = true)
 |-- construction: string (nullable = true)
 |-- point_granularity: integer (nullable = true)

In [8]:
// Preview the first 5 rows of the DataFrame as a text table.
df.show(5)

+--------+---------+-----------+-------------+-------------+-------------+-------------+---------+----------+------------------+------------------+------------------+------------------+--------------+---------------+-----------+------------+-----------------+
|policyID|statecode|     county|eq_site_limit|hu_site_limit|fl_site_limit|fr_site_limit| tiv_2011|  tiv_2012|eq_site_deductible|hu_site_deductible|fl_site_deductible|fr_site_deductible|point_latitude|point_longitude|       line|construction|point_granularity|
+--------+---------+-----------+-------------+-------------+-------------+-------------+---------+----------+------------------+------------------+------------------+------------------+--------------+---------------+-----------+------------+-----------------+
|  119736|       FL|CLAY COUNTY|     498960.0|     498960.0|     498960.0|     498960.0| 498960.0|  792148.9|               0.0|            9979.2|               0.0|                 0|     30.102261|     -81.711777|Resi

### Save data into a table

In [9]:
// SaveMode is needed for the write below; SQLContext is imported but not referenced in this cell.
import org.apache.spark.sql.{SQLContext, SaveMode}

// Store the DataFrame as the table "insurance", overwriting any existing table with that name.
df.write.mode(SaveMode.Overwrite).saveAsTable("insurance")

# Tables

In [None]:
// List the databases visible to this Spark session.
// The result gets its own name so that `df` keeps pointing at the insurance
// DataFrame; re-binding `df` here would make earlier cells (printSchema, show)
// operate on the database list if they were re-run afterwards.
val databases = spark.sql("SHOW DATABASES")
databases.show()

In [None]:
// Show the tables registered in the "default" database of the session catalog.
spark.catalog.listTables("default").show()

### Data query

In [None]:
%%sql
SELECT *
FROM insurance
LIMIT 2