## DataFrame from RDD - SparkSQL is for structured data

In [2]:
// Build an RDD[Int] from the local range 1..10 (inclusive)
val a = sc.parallelize(1 to 10)

a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:25


In [3]:
a.collect()  // bring the RDD's elements back as a local Array for inspection

res0: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)


In [4]:
val b = a.map(n => (n, n + 1)) // pair every element n with its successor: (n, n + 1)

b: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[1] at map at <console>:26


In [5]:
b.collect()  // inspect the (n, n + 1) pairs as a local Array

res1: Array[(Int, Int)] = Array((1,2), (2,3), (3,4), (4,5), (5,6), (6,7), (7,8), (8,9), (9,10), (10,11))


In [7]:
// Turn the RDD of (Int, Int) pairs into a two-column DataFrame; the
// arguments to toDF name the columns in tuple order.
val dataFrame = b.toDF("first","second")

dataFrame: org.apache.spark.sql.DataFrame = [first: int, second: int]


In [9]:
dataFrame.show()  // print the DataFrame as a text table

+-----+------+
|first|second|
+-----+------+
|    1|     2|
|    2|     3|
|    3|     4|
|    4|     5|
|    5|     6|
|    6|     7|
|    7|     8|
|    8|     9|
|    9|    10|
|   10|    11|
+-----+------+



## DataFrame from a List

In [28]:
// Local collection of (character name, popularity) pairs.
val cartoonCharacters = List(
  "Bugs" -> 1, "Elmer" -> 5, "Daffy" -> 3, "Sylvester" -> 6, "Tweety" -> 7,
  "Sam" -> 9, "Porky" -> 8, "Fog Horn" -> 10, "Coyote" -> 2, "Road Runner" -> 4
)
// A Seq works exactly the same way as a List here, e.g.
//  val cartoonCharacters = Seq(
//    "Bugs" -> 1, "Elmer" -> 5, "Daffy" -> 3, "Sylvester" -> 6, "Tweety" -> 7,
//    "Sam" -> 9, "Porky" -> 8, "Fog Horn" -> 10, "Coyote" -> 2, "Road Runner" -> 4
//  )

cartoonCharacters: List[(String, Int)] = List((Bugs,1), (Elmer,5), (Daffy,3), (Sylvester,6), (Tweety,7), (Sam,9), (Porky,8), (Fog Horn,10), (Coyote,2), (Road Runner,4))


In [32]:
// Build a DataFrame straight from the local List; the arguments to toDF
// name the columns in tuple order (String -> "Name", Int -> "Popularity").
val characterPopularity = cartoonCharacters.toDF("Name", "Popularity")

characterPopularity: org.apache.spark.sql.DataFrame = [Name: string, Popularity: int]


In [33]:
characterPopularity.show()  // print the DataFrame as a text table

+-----------+----------+
|       Name|Popularity|
+-----------+----------+
|       Bugs|         1|
|      Elmer|         5|
|      Daffy|         3|
|  Sylvester|         6|
|     Tweety|         7|
|        Sam|         9|
|      Porky|         8|
|   Fog Horn|        10|
|     Coyote|         2|
|Road Runner|         4|
+-----------+----------+



In [34]:
// Run some queries: register the DataFrame as a temporary view so it can be
// queried with SQL under the name "cartoonCharacters".
// createOrReplaceTempView supersedes registerTempTable, which has been
// deprecated since Spark 2.0.
characterPopularity.createOrReplaceTempView("cartoonCharacters")

In [192]:
// SQL over the temp table: fetch the row whose Name is 'Bugs' and print it
spark.sql("SELECT * FROM cartoonCharacters WHERE Name = 'Bugs'").show()

+----+----------+
|Name|Popularity|
+----+----------+
|Bugs|         1|
+----+----------+



In [41]:
// count the rows in the temp table
spark.sql("select count(*) from cartoonCharacters").show()

+--------+
|count(1)|
+--------+
|      10|
+--------+



## Specifying a Schema Programmatically using StructType

In [162]:
import org.apache.spark.sql.{SQLContext,Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import org.apache.spark.sql.{SQLContext, Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


In [167]:
// Explicit schema: a nullable string column and a nullable integer column.
// NOTE(review): "Popluarity" looks like a typo for "Popularity"; it is kept
// as-is here because renaming the column changes the DataFrame's header.
val schema = StructType(Seq(
  StructField("Name", StringType, nullable = true),
  StructField("Popluarity Rank", IntegerType, nullable = true)
))

schema: org.apache.spark.sql.types.StructType = StructType(StructField(Name,StringType,true), StructField(Popluarity Rank,IntegerType,true))


In [169]:
// Pair each name with a made-up rank: the name's length plus 2
val cartoonCharStructured = sc.parallelize(Seq("Tom", "Jerry", "Spike")).map(name => (name, name.length + 2))

cartoonCharStructured: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[155] at map at <console>:28


In [170]:
cartoonCharStructured.collect()  // inspect the (name, rank) pairs as a local Array

res69: Array[(String, Int)] = Array((Tom,5), (Jerry,7), (Spike,7))


In [172]:
// Wrap each (name, rank) tuple in a generic Row so the explicit schema can be applied to it
val charStructuredRows = cartoonCharStructured.map { case (name, rank) => Row(name, rank) }

charStructuredRows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[157] at map at <console>:29


In [173]:
charStructuredRows.collect()  // inspect the Row objects as a local Array

res70: Array[org.apache.spark.sql.Row] = Array([Tom,5], [Jerry,7], [Spike,7])


In [179]:
// Combine the RDD[Row] with the explicit schema to produce a DataFrame
val charDataFrame = spark.createDataFrame(charStructuredRows, schema)

charDataFrame: org.apache.spark.sql.DataFrame = [Name: string, Popluarity Rank: int]


In [187]:
charDataFrame.show()  // print the DataFrame as a text table

+-----+---------------+
| Name|Popluarity Rank|
+-----+---------------+
|  Tom|              5|
|Jerry|              7|
|Spike|              7|
+-----+---------------+



### For Inferred schema in class see the spark-shell-sparksql-inferred-schema-in-class.txt file

## Loading different formats

In [185]:
// Load the Star Wars CSV with Spark's built-in CSV reader. The legacy
// "com.databricks.spark.csv" format name refers to the external spark-csv
// package from Spark 1.x; since Spark 2.0 the CSV source ships with Spark,
// so the built-in reader is used directly.
//   header      - treat the first line as column names
//   inferSchema - derive column types from the data instead of reading everything as strings
//   delimiter   - field separator
val starwarsCharacters = (spark.read
                          .option("header", "true")
                          .option("inferSchema", "true")
                          .option("delimiter", ",")
                          .csv("/spark-files/StarWars.csv"))

starwarsCharacters: org.apache.spark.sql.DataFrame = [name: string, height: int ... 5 more fields]


In [186]:
starwarsCharacters.show()  // print the loaded DataFrame as a text table

+----------------+------+------+--------+---------+-------+-----------+
|            name|height|weight|eyecolor|haircolor|   jedi|    species|
+----------------+------+------+--------+---------+-------+-----------+
|Anakin Skywalker|   188|    84|    blue|    blond|   jedi|      human|
|   Padme Amidala|   165|    45|   brown|    brown|no_jedi|      human|
|  Luke Skywalker|   172|    77|    blue|    blond|   jedi|      human|
|  Leia Skywalker|   150|    49|   brown|    brown|no_jedi|      human|
|    Qui-Gon Jinn|   193|    89|    blue|    brown|   jedi|      human|
|  Obi-Wan Kenobi|   182|    77|bluegray|   auburn|   jedi|      human|
|        Han Solo|   180|    80|   brown|    brown|no_jedi|      human|
| Sheev Palpatine|   173|    75|    blue|      red|no_jedi|      human|
|           R2-D2|    96|    32|    null|     null|no_jedi|      droid|
|           C-3PO|   167|    75|    null|     null|no_jedi|      droid|
|            Yoda|    66|    17|   brown|    brown|   jedi|     

In [190]:
// Register the DataFrame as a temporary view so it can be queried with SQL
// under the name "StarWarsCharacters". createOrReplaceTempView supersedes
// registerTempTable, which has been deprecated since Spark 2.0.
starwarsCharacters.createOrReplaceTempView("StarWarsCharacters")

In [191]:
// list every character whose species is 'wookiee'
spark.sql("SELECT * FROM StarWarsCharacters WHERE species = 'wookiee'").show()

+---------+------+------+--------+---------+-------+-------+
|     name|height|weight|eyecolor|haircolor|   jedi|species|
+---------+------+------+--------+---------+-------+-------+
|Chewbacca|   228|   112|    blue|    brown|no_jedi|wookiee|
+---------+------+------+--------+---------+-------+-------+

