## DataFrame from RDD - SparkSQL is for structured data

In [2]:
val a = sc.parallelize(1 to 10) // build an RDD[Int] from the local range 1 to 10

a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:25


In [3]:
// Bring every element of the RDD back to the driver as a local Array.
a.collect()

res0: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)


In [4]:
// Pair each element with its successor: n becomes (n, n + 1).
val b = a.map { n => (n, n + 1) }

b: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[1] at map at <console>:26


In [5]:
// Materialize the pair RDD on the driver to inspect the tuples.
b.collect()

res1: Array[(Int, Int)] = Array((1,2), (2,3), (3,4), (4,5), (5,6), (6,7), (7,8), (8,9), (9,10), (10,11))


In [7]:
// Convert the pair RDD into a two-column DataFrame, naming the tuple fields.
val dataFrame = b.toDF("first", "second")

dataFrame: org.apache.spark.sql.DataFrame = [first: int, second: int]


In [9]:
// Print the DataFrame as a formatted table.
dataFrame.show()

+-----+------+
|first|second|
+-----+------+
|    1|     2|
|    2|     3|
|    3|     4|
|    4|     5|
|    5|     6|
|    6|     7|
|    7|     8|
|    8|     9|
|    9|    10|
|   10|    11|
+-----+------+



## DataFrame from a List

In [28]:
// Local collection of (name, popularity) pairs used to build a DataFrame.
// A Seq works exactly the same way: replace `List(` with `Seq(`.
val cartoonCharacters = List(
  "Bugs" -> 1,
  "Elmer" -> 5,
  "Daffy" -> 3,
  "Sylvester" -> 6,
  "Tweety" -> 7,
  "Sam" -> 9,
  "Porky" -> 8,
  "Fog Horn" -> 10,
  "Coyote" -> 2,
  "Road Runner" -> 4
)

cartoonCharacters: List[(String, Int)] = List((Bugs,1), (Elmer,5), (Daffy,3), (Sylvester,6), (Tweety,7), (Sam,9), (Porky,8), (Fog Horn,10), (Coyote,2), (Road Runner,4))


In [32]:
// Turn the local list of (String, Int) tuples into a DataFrame with columns Name and Popularity.
val characterPopularity = cartoonCharacters.toDF("Name", "Popularity")

characterPopularity: org.apache.spark.sql.DataFrame = [Name: string, Popularity: int]


In [33]:
// Print the character/popularity table.
characterPopularity.show()

+-----------+----------+
|       Name|Popularity|
+-----------+----------+
|       Bugs|         1|
|      Elmer|         5|
|      Daffy|         3|
|  Sylvester|         6|
|     Tweety|         7|
|        Sam|         9|
|      Porky|         8|
|   Fog Horn|        10|
|     Coyote|         2|
|Road Runner|         4|
+-----------+----------+



In [34]:
// run some queries
// Register the DataFrame as a temporary view so it can be referenced by name in spark.sql.
// createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
characterPopularity.createOrReplaceTempView("cartoonCharacters")

In [40]:
// Look up the single row whose Name is 'Bugs' and print it.
spark.sql("select * from cartoonCharacters where Name = 'Bugs'").show()

+----+----------+
|Name|Popularity|
+----+----------+
|Bugs|         1|
+----+----------+



In [41]:
// Count the rows in the registered cartoonCharacters table and print the result.
spark.sql("select count(*) from cartoonCharacters").show()

+--------+
|count(1)|
+--------+
|      10|
+--------+



## Inferring Schema from inside a class

In [42]:
// Load the CSV as an RDD[String] of raw text lines; the header row is still present as the first line.
val yahooStocks = sc.textFile("/user/yahoo_stocks.csv")

yahooStocks: org.apache.spark.rdd.RDD[String] = /user/yahoo_stocks.csv MapPartitionsRDD[15] at textFile at <console>:25


In [43]:
// Pull every line of the file back to the driver to eyeball the raw contents.
// NOTE(review): this loads the whole file into driver memory; presumably the file is small,
// otherwise take(n) would be the safer way to preview it.
yahooStocks.collect()

res22: Array[String] = Array(Date,Open,High,Low,Close,Volume,Adj Close, 2001-01-02,30.3125,30.375,27.50,28.1875,21939200,14.09375, 2000-12-29,30.3125,31.1875,29.5625,30.0625,20893400,15.03125, 2000-12-28,29.4375,31.75,29.125,31.00,24374600,15.50, 2000-12-27,31.00,31.50,29.125,29.75,22045400,14.875, 2000-12-26,32.00,34.00,30.125,31.1875,37536200,15.59375, 2000-12-22,26.4375,29.875,26.0625,29.5625,28347400,14.78125, 2000-12-21,26.75,28.25,25.0625,25.625,27794400,12.8125, 2000-12-20,25.8125,28.375,25.50,27.9375,44862800,13.96875, 2000-12-19,30.5625,31.9687,28.00,28.00,36131600,14.00, 2000-12-18,33.875,34.00,30.25,32.00,31697600,16.00, 2000-12-15,32.00,34.00,31.0625,33.00,40448000,16.50, 2000-12-14,35.3125,35.9062,31.9375,32.00,20899800,16.00, 2000-12-13,38.3125,38.625,34.25,34.875,33640400...