## DataFrame from RDD - SparkSQL is for structured data

In [2]:
val a = sc.parallelize(1 to 10) // distribute the local range 1 to 10 as an RDD[Int]

a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:25


In [3]:
// collect() is an action: it returns every element of the RDD as a local Array
a.collect()

res0: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)


In [4]:
val b = a.map(n => (n, n + 1)) // pair every element n with its successor: (n, n + 1)

b: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[1] at map at <console>:26


In [5]:
// bring the (Int, Int) pairs back as a local Array to inspect them
b.collect()

res1: Array[(Int, Int)] = Array((1,2), (2,3), (3,4), (4,5), (5,6), (6,7), (7,8), (8,9), (9,10), (10,11))


In [7]:
val dataFrame = b.toDF("first", "second") // turn the RDD of pairs into a two-column DataFrame

dataFrame: org.apache.spark.sql.DataFrame = [first: int, second: int]


In [9]:
// print the DataFrame as a text table
dataFrame.show()

+-----+------+
|first|second|
+-----+------+
|    1|     2|
|    2|     3|
|    3|     4|
|    4|     5|
|    5|     6|
|    6|     7|
|    7|     8|
|    8|     9|
|    9|    10|
|   10|    11|
+-----+------+



## DataFrame from a List

In [28]:
// (name, popularity) pairs held in a plain local List
val cartoonCharacters = List(
  ("Bugs", 1), ("Elmer", 5), ("Daffy", 3), ("Sylvester", 6), ("Tweety", 7),
  ("Sam", 9), ("Porky", 8), ("Fog Horn", 10), ("Coyote", 2), ("Road Runner", 4)
)
// A Seq can be used in exactly the same way:
//   val cartoonCharacters = Seq(
//     ("Bugs", 1), ("Elmer", 5), ("Daffy", 3), ("Sylvester", 6), ("Tweety", 7),
//     ("Sam", 9), ("Porky", 8), ("Fog Horn", 10), ("Coyote", 2), ("Road Runner", 4)
//   )

cartoonCharacters: List[(String, Int)] = List((Bugs,1), (Elmer,5), (Daffy,3), (Sylvester,6), (Tweety,7), (Sam,9), (Porky,8), (Fog Horn,10), (Coyote,2), (Road Runner,4))


In [32]:
// convert the local List of tuples into a DataFrame, naming its two columns
val characterPopularity = cartoonCharacters.toDF("Name", "Popularity")

characterPopularity: org.apache.spark.sql.DataFrame = [Name: string, Popularity: int]


In [33]:
// print the DataFrame as a text table
characterPopularity.show()

+-----------+----------+
|       Name|Popularity|
+-----------+----------+
|       Bugs|         1|
|      Elmer|         5|
|      Daffy|         3|
|  Sylvester|         6|
|     Tweety|         7|
|        Sam|         9|
|      Porky|         8|
|   Fog Horn|        10|
|     Coyote|         2|
|Road Runner|         4|
+-----------+----------+



In [34]:
// Run some queries: register the DataFrame under a name that spark.sql can refer to.
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
characterPopularity.createOrReplaceTempView("cartoonCharacters")

In [192]:
// look up a single character by name
spark.sql("SELECT * FROM cartoonCharacters WHERE Name = 'Bugs'").show()

+----+----------+
|Name|Popularity|
+----+----------+
|Bugs|         1|
+----+----------+



In [41]:
// count the rows of cartoonCharacters
spark.sql("select count(*) from cartoonCharacters").show()

+--------+
|count(1)|
+--------+
|      10|
+--------+



## Specifying a Schema Programmatically using StructType

In [162]:
import org.apache.spark.sql.{SQLContext,Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import org.apache.spark.sql.{SQLContext, Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


In [167]:
// Explicit two-column schema: a string name and an integer rank, both nullable.
// NOTE(review): "Popluarity" looks like a typo for "Popularity"; it is kept as-is
// because the recorded outputs in this notebook use this exact column name.
val schema = StructType(Array(
  StructField("Name", StringType, nullable = true),
  StructField("Popluarity Rank", IntegerType, nullable = true)
))

schema: org.apache.spark.sql.types.StructType = StructType(StructField(Name,StringType,true), StructField(Popluarity Rank,IntegerType,true))


In [169]:
val cartoonCharStructured = sc.parallelize(Seq("Tom", "Jerry", "Spike")).map(name => (name, name.length + 2)) // (name, name length + 2)

cartoonCharStructured: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[155] at map at <console>:28


In [170]:
// inspect the (String, Int) pairs locally
cartoonCharStructured.collect()

res69: Array[(String, Int)] = Array((Tom,5), (Jerry,7), (Spike,7))


In [172]:
val charStructuredRows = cartoonCharStructured.map { case (name, rank) => Row(name, rank) } // wrap each tuple in a generic Row

charStructuredRows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[157] at map at <console>:29


In [173]:
// inspect the Row objects locally
charStructuredRows.collect()

res70: Array[org.apache.spark.sql.Row] = Array([Tom,5], [Jerry,7], [Spike,7])


In [179]:
// combine the RDD[Row] with the explicit schema to build the DataFrame
val charDataFrame = spark.createDataFrame(charStructuredRows, schema)

charDataFrame: org.apache.spark.sql.DataFrame = [Name: string, Popluarity Rank: int]


In [187]:
// print the DataFrame as a text table
charDataFrame.show()

+-----+---------------+
| Name|Popluarity Rank|
+-----+---------------+
|  Tom|              5|
|Jerry|              7|
|Spike|              7|
+-----+---------------+



### For Inferred schema in class see the spark-shell-sparksql-inferred-schema-in-class.txt file

## Loading different formats

In [185]:
// Read the CSV with Spark's built-in CSV reader. The legacy "com.databricks.spark.csv"
// format name comes from the external spark-csv package; Spark 2.x ships CSV support.
val starwarsCharacters = (spark.read
                          .option("header", "true")       // first line holds the column names
                          .option("inferSchema", "true")  // derive column types from the data
                          .option("delimiter", ",")
                          .csv("/spark-files/StarWars.csv"))

starwarsCharacters: org.apache.spark.sql.DataFrame = [name: string, height: int ... 5 more fields]


In [186]:
// print the loaded CSV data as a text table
starwarsCharacters.show()

+----------------+------+------+--------+---------+-------+-----------+
|            name|height|weight|eyecolor|haircolor|   jedi|    species|
+----------------+------+------+--------+---------+-------+-----------+
|Anakin Skywalker|   188|    84|    blue|    blond|   jedi|      human|
|   Padme Amidala|   165|    45|   brown|    brown|no_jedi|      human|
|  Luke Skywalker|   172|    77|    blue|    blond|   jedi|      human|
|  Leia Skywalker|   150|    49|   brown|    brown|no_jedi|      human|
|    Qui-Gon Jinn|   193|    89|    blue|    brown|   jedi|      human|
|  Obi-Wan Kenobi|   182|    77|bluegray|   auburn|   jedi|      human|
|        Han Solo|   180|    80|   brown|    brown|no_jedi|      human|
| Sheev Palpatine|   173|    75|    blue|      red|no_jedi|      human|
|           R2-D2|    96|    32|    null|     null|no_jedi|      droid|
|           C-3PO|   167|    75|    null|     null|no_jedi|      droid|
|            Yoda|    66|    17|   brown|    brown|   jedi|     

In [190]:
// Register the DataFrame under a name that spark.sql can refer to.
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
starwarsCharacters.createOrReplaceTempView("StarWarsCharacters")

In [216]:
// filter the characters by species
spark.sql("SELECT * FROM StarWarsCharacters WHERE species = 'wookiee'").show()

+---------+------+------+--------+---------+-------+-------+
|     name|height|weight|eyecolor|haircolor|   jedi|species|
+---------+------+------+--------+---------+-------+-------+
|Chewbacca|   228|   112|    blue|    brown|no_jedi|wookiee|
+---------+------+------+--------+---------+-------+-------+



### XML

<pre>spark-shell --packages com.databricks:spark-xml_2.10:0.4.1
Ivy Default Cache set to: /home/hadoopuser/.ivy2/cache
The jars for the packages stored in: /home/hadoopuser/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/spark/spark-2.4.4/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.databricks#spark-xml_2.10 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-997ea178-376d-4ea1-9db6-008d5230a16f;1.0
	confs: [default]
	found com.databricks#spark-xml_2.10;0.4.1 in central
:: resolution report :: resolve 236ms :: artifacts dl 3ms
	:: modules in use:
	com.databricks#spark-xml_2.10;0.4.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-997ea178-376d-4ea1-9db6-008d5230a16f
	confs: [default]
	0 artifacts copied, 1 already retrieved (0kB/7ms)
24/06/16 23:55:02 WARN util.Utils: Your hostname, hadoopuser-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/06/16 23:55:02 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/16 23:55:02 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to &quot;WARN&quot;.
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 23:55:11 WARN util.Utils: Service &apos;SparkUI&apos; could not bind on port 4040. Attempting port 4041.
24/06/16 23:55:11 WARN util.Utils: Service &apos;SparkUI&apos; could not bind on port 4041. Attempting port 4042.
24/06/16 23:55:11 WARN util.Utils: Service &apos;SparkUI&apos; could not bind on port 4042. Attempting port 4043.
24/06/16 23:55:11 WARN util.Utils: Service &apos;SparkUI&apos; could not bind on port 4043. Attempting port 4044.
Spark context Web UI available at http://10.0.2.15:4044
Spark context available as &apos;sc&apos; (master = local[*], app id = local-1718607311354).
Spark session available as &apos;spark&apos;.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  &apos;_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/
         
Using Scala version 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_261)
Type in expressions to have them evaluated.
Type :help for more information.

scala&gt; val employeeDF = spark.read.format(&quot;com.databricks.spark.xml&quot;).option(&quot;inferSchema&quot;,&quot;true&quot;).option(&quot;rootTag&quot;,&quot;employees&quot;).option(&quot;rowTag&quot;,&quot;employee&quot;).load(&quot;/spark-files/employees.xml&quot;)
employeeDF: org.apache.spark.sql.DataFrame = [address: struct&lt;city: string, country: string ... 1 more field&gt;, dept_no: bigint ... 3 more fields]

scala&gt; employeeDF.show
+--------------------+-------+--------+------+------+
|             address|dept_no|emp_name|emp_no|salary|
+--------------------+-------+--------+------+------+
|[Paris, London, 2...|      2|     jon|    10| 15000|
|[Texas, America, ...|      5|    Adom|    11| 25000|
+--------------------+-------+--------+------+------+


scala&gt; 
</pre>

### JSON

In [218]:
// load the JSON file into a DataFrame; column names and types are taken from the data
val usstates = spark.read.json("/spark-files/us_states.json")

usstates: org.apache.spark.sql.DataFrame = [census_division: string, census_region: string ... 2 more fields]


In [219]:
// bring all rows back as a local Array[Row]
usstates.collect()

res81: Array[org.apache.spark.sql.Row] = Array([East South Central,South,Alabama,AL], [Pacific,West,Alaska,AK], [Mountain,West,Arizona,AZ], [West South Central,South,Arkansas,AR], [Pacific,West,California,CA], [Mountain,West,Colorado,CO], [New England,Northeast,Connecticut,CT], [South Atlantic,South,Delaware,DE], [South Atlantic,South,District Of Columbia,DC], [South Atlantic,South,Florida,FL], [South Atlantic,South,Georgia,GA], [Pacific,West,Hawaii,HI], [Mountain,West,Idaho,ID], [East North Central,Midwest,Illinois,IL], [East North Central,Midwest,Indiana,IN], [West North Central,Midwest,Iowa,IA], [West North Central,Midwest,Kansas,KS], [East South Central,South,Kentucky,KY], [West South Central,South,Louisiana,LA], [New England,Northeast,Maine,ME], [South Atlantic,South,Maryland,MD], ...

In [220]:
// print the column names, types and nullability as a tree
usstates.printSchema()

root
 |-- census_division: string (nullable = true)
 |-- census_region: string (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)



In [221]:
// Register the DataFrame under a name that spark.sql can refer to.
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
usstates.createOrReplaceTempView("usstates")

In [224]:
// show() prints only the first 20 rows by default, so ask for 51 explicitly
// (presumably 50 states + District Of Columbia — confirm against the data file)
spark.sql("SELECT * FROM usstates").show(numRows = 51)

+------------------+-------------+--------------------+-----+
|   census_division|census_region|                name|state|
+------------------+-------------+--------------------+-----+
|East South Central|        South|             Alabama|   AL|
|           Pacific|         West|              Alaska|   AK|
|          Mountain|         West|             Arizona|   AZ|
|West South Central|        South|            Arkansas|   AR|
|           Pacific|         West|          California|   CA|
|          Mountain|         West|            Colorado|   CO|
|       New England|    Northeast|         Connecticut|   CT|
|    South Atlantic|        South|            Delaware|   DE|
|    South Atlantic|        South|District Of Columbia|   DC|
|    South Atlantic|        South|             Florida|   FL|
|    South Atlantic|        South|             Georgia|   GA|
|           Pacific|         West|              Hawaii|   HI|
|          Mountain|         West|               Idaho|   ID|
|East No