# Spark DataFrames and Magellan

We recommend reading the [Spark SQL programming guide](https://spark.apache.org/docs/2.1.1/sql-programming-guide.html#sql) and visiting the [Magellan repository](https://github.com/harsha2010/magellan).

In [6]:
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

In [9]:
// Obtain the SQLContext through SparkSession rather than the
// `new SQLContext(sc)` constructor, which is deprecated since Spark 2.0.
// `getOrCreate()` is expected to reuse the SparkContext already running in
// this notebook (the predefined `sc`) instead of starting a new one.
val sqlContext = org.apache.spark.sql.SparkSession.builder().getOrCreate().sqlContext
// Brings `toDF` and the `$"column"` syntax into scope for the cells below.
import sqlContext.implicits._

sqlContext = org.apache.spark.sql.SQLContext@21b85e2a




org.apache.spark.sql.SQLContext@21b85e2a

In [13]:
// Storage locations of the ecolidar data set on HDFS.
var dir_path = "hdfs:///user/emma/ecolidar/"
var offline_dir_path = "hdfs:///user/emma/ecolidar/"
// Identifier of the LAS cell to analyse; the Parquet file name is derived from it.
var las_cell = "C_25EZ2"
var parquet_file = s"$las_cell.parquet"

dir_path = hdfs:///user/emma/ecolidar/
offline_dir_path = hdfs:///user/emma/ecolidar/
las_cell = C_25EZ2
parquet_file = C_25EZ2.parquet


lastException: Throwable = null


C_25EZ2.parquet

## Load DataFrame

The dataframe is loaded from a Parquet file.

In [11]:
// Load the Parquet file of the selected LAS cell into a DataFrame.
val df = {
  val parquetReader = sqlContext.read
  parquetReader.parquet(parquet_file)
}

df = [x: int, y: int ... 3 more fields]


[x: int, y: int ... 3 more fields]

In [30]:
// Preview the first 10 rows of the DataFrame.
df.take(10)

0,1,2,3,4
712,2,125667105,-30943,33
736,2,125667415,-30633,87
678,2,125667715,-30333,-125
813,2,125668046,-30002,-50
900,2,125668356,-29692,4
774,2,125668657,-29391,49
736,2,125668940,-29108,76
749,2,125669238,-28810,118
800,2,125669528,-28520,-104
724,2,125669852,-28196,-36


## Create a Temporary table or view

It is possible to create a temporary table or view from a dataframe.

In [14]:
// Register the DataFrame as a temporary view named "<las_cell>_tab" so it can be queried with SQL.
df.createOrReplaceTempView(s"${las_cell}_tab")

tempTab: Unit = ()


## Show existing tables

In [17]:
// Print the tables and temporary views known to this SQL context (at most 20 rows, the default).
sqlContext.sql("show tables").show(20)

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
|        |c_25ez2_tab|       true|
+--------+-----------+-----------+



lastException: Throwable = null


## Run a simple SQL query

In [19]:
// Select the first 10 rows from the temporary view registered for this LAS cell.
val res = sqlContext.sql(s"select * from ${las_cell}_tab limit 10")

res = [x: int, y: int ... 3 more fields]


[x: int, y: int ... 3 more fields]

In [20]:
// Print the query result as a text table, spelling out the default row limit and truncation.
res.show(20, truncate = true)

+---+---+---------+---------+------------------+
|  x|  y|        z|intensity|raw_classification|
+---+---+---------+---------+------------------+
|712|  2|125667105|   -30943|                33|
|736|  2|125667415|   -30633|                87|
|678|  2|125667715|   -30333|              -125|
|813|  2|125668046|   -30002|               -50|
|900|  2|125668356|   -29692|                 4|
|774|  2|125668657|   -29391|                49|
|736|  2|125668940|   -29108|                76|
|749|  2|125669238|   -28810|               118|
|800|  2|125669528|   -28520|              -104|
|724|  2|125669852|   -28196|               -36|
+---+---+---------+---------+------------------+



# Magellan

In [22]:
import magellan.{Point, Polygon}
import org.apache.spark.sql.magellan.dsl.expressions._
import org.apache.spark.sql.types._

## Data structures

In [28]:
// Only 2D points are supported, so every (x, y) pair becomes a single point value.
val points = {
  val coordinates = Seq((-1.0, -1.0), (-1.0, 1.0), (1.0, -1.0))
  sc.parallelize(coordinates)
    .toDF("x", "y")
    .select(point($"x", $"y").as("point"))
}

points.show()

+-----------------+
|            point|
+-----------------+
|Point(-1.0, -1.0)|
| Point(-1.0, 1.0)|
| Point(1.0, -1.0)|
+-----------------+



points = [point: point]


[point: point]

## Create a list of points from a DataFrame

In [29]:
// Build a single "point" column from the x and y columns of the loaded DataFrame.
val lidar_points = {
  val xyPoint = point($"x", $"y").as("point")
  df.select(xyPoint)
}

lidar_points.show()

+-----------------+
|            point|
+-----------------+
|Point(712.0, 2.0)|
|Point(736.0, 2.0)|
|Point(678.0, 2.0)|
|Point(813.0, 2.0)|
|Point(900.0, 2.0)|
|Point(774.0, 2.0)|
|Point(736.0, 2.0)|
|Point(749.0, 2.0)|
|Point(800.0, 2.0)|
|Point(724.0, 2.0)|
|Point(622.0, 2.0)|
|Point(576.0, 2.0)|
|Point(724.0, 2.0)|
|Point(761.0, 2.0)|
|Point(690.0, 2.0)|
|Point(724.0, 2.0)|
|Point(775.0, 2.0)|
|Point(737.0, 2.0)|
|Point(737.0, 2.0)|
|Point(632.0, 1.0)|
+-----------------+
only showing top 20 rows



lidar_points = [point: point]


[point: point]