# Introduction to Spark DataFrames - Scala

[Link to the examples used](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-scala.html)



In [1]:
// Sanity check: print the Spark version this notebook was tested with next to
// the version reported by the running SparkContext (`sc`).
println(s"This notebook is tested with Spark 3.1.2.\nYou are using ${sc.version}.")

Intitializing Scala interpreter ...

Spark Web UI available at http://e507a37883e8:4040
SparkContext available as 'sc' (version = 3.1.2, master = local[*], app id = local-1632307229837)
SparkSession available as 'spark'


This notebook is tested with Spark 3.1.2.
You are using 3.1.2.


## Create DataFrames

In [2]:
// Domain model: a department, an employee, and a department paired with its staff.
case class Department(id: String, name: String)
case class Employee(firstName: String, lastName: String, email: String, salary: Int)
case class DepartmentWithEmployees(department: Department, employees: Seq[Employee])

// Four sample departments (case-class companions make `new` unnecessary).
val department1 = Department("123456", "Computer Science")
val department2 = Department("789012", "Mechanical Engineering")
val department3 = Department("345678", "Theater and Drama")
val department4 = Department("901234", "Indoor Recreation")

// Five sample employees. employee3 has no last name and employee4 has no first
// name; the nulls are deliberate so later cells have missing values to handle.
val employee1 = Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000)
val employee2 = Employee("xiangrui", "meng", "no-reply@stanford.edu", 120000)
val employee3 = Employee("matei", null, "no-reply@waterloo.edu", 140000)
val employee4 = Employee(null, "wendell", "no-reply@princeton.edu", 160000)
val employee5 = Employee("michael", "jackson", "no-reply@neverla.nd", 80000)

// Attach two employees to each department. employee2, employee3 and employee4
// each appear in two departments, so they show up twice once the data is flattened.
val departmentWithEmployees1 = DepartmentWithEmployees(department1, Seq(employee1, employee2))
val departmentWithEmployees2 = DepartmentWithEmployees(department2, Seq(employee3, employee4))
val departmentWithEmployees3 = DepartmentWithEmployees(department3, Seq(employee5, employee4))
val departmentWithEmployees4 = DepartmentWithEmployees(department4, Seq(employee2, employee3))

defined class Department
defined class Employee
defined class DepartmentWithEmployees
department1: Department = Department(123456,Computer Science)
department2: Department = Department(789012,Mechanical Engineering)
department3: Department = Department(345678,Theater and Drama)
department4: Department = Department(901234,Indoor Recreation)
employee1: Employee = Employee(michael,armbrust,no-reply@berkeley.edu,100000)
employee2: Employee = Employee(xiangrui,meng,no-reply@stanford.edu,120000)
employee3: Employee = Employee(matei,null,no-reply@waterloo.edu,140000)
employee4: Employee = Employee(null,wendell,no-reply@princeton.edu,160000)
employee5: Employee = Employee(michael,jackson,no-reply@neverla.nd,80000)
departmentWithEmployees1: DepartmentWithEmployees = DepartmentWithEmployees(Depar...


### Create DataFrames from a list of the case classes

In [3]:
// Build two DataFrames, each from a two-element Seq of the case-class instances.
// NOTE(review): toDF() on a Seq needs the session's implicits (spark.implicits._)
// in scope — presumably pre-imported by the notebook kernel; confirm when running elsewhere.
val departmentsWithEmployeesSeq1 = Seq(departmentWithEmployees1, departmentWithEmployees2)
val df1 = departmentsWithEmployeesSeq1.toDF()


val departmentsWithEmployeesSeq2 = Seq(departmentWithEmployees3, departmentWithEmployees4)
val df2 = departmentsWithEmployeesSeq2.toDF()

departmentsWithEmployeesSeq1: Seq[DepartmentWithEmployees] = List(DepartmentWithEmployees(Department(123456,Computer Science),List(Employee(michael,armbrust,no-reply@berkeley.edu,100000), Employee(xiangrui,meng,no-reply@stanford.edu,120000))), DepartmentWithEmployees(Department(789012,Mechanical Engineering),List(Employee(matei,null,no-reply@waterloo.edu,140000), Employee(null,wendell,no-reply@princeton.edu,160000))))
df1: org.apache.spark.sql.DataFrame = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>>]
departmentsWithEmployeesSeq2: Seq[DepartmentWithEmployees] = List(DepartmentWithEmployees(Department(345678,Theater and Drama),List(Employee(michael,jackson,no-reply@neverla.nd,80000), Employee(null,wendell...


In [4]:
// Print df1's schema as a tree: a `department` struct plus an `employees` array of structs.
df1.printSchema()

root
 |-- department: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- employees: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstName: string (nullable = true)
 |    |    |-- lastName: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- salary: integer (nullable = false)



In [5]:
// Print df2's schema; it is built from the same case class as df1, so the tree matches df1's.
df2.printSchema()

root
 |-- department: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- employees: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstName: string (nullable = true)
 |    |    |-- lastName: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- salary: integer (nullable = false)



In [6]:
// Display df1 without truncating long cell values.
df1.show(truncate = false)

+--------------------------------+-----------------------------------------------------------------------------------------------------+
|department                      |employees                                                                                            |
+--------------------------------+-----------------------------------------------------------------------------------------------------+
|{123456, Computer Science}      |[{michael, armbrust, no-reply@berkeley.edu, 100000}, {xiangrui, meng, no-reply@stanford.edu, 120000}]|
|{789012, Mechanical Engineering}|[{matei, null, no-reply@waterloo.edu, 140000}, {null, wendell, no-reply@princeton.edu, 160000}]      |
+--------------------------------+-----------------------------------------------------------------------------------------------------+



In [7]:
// Display df2 without truncating long cell values.
df2.show(truncate = false)

+---------------------------+-------------------------------------------------------------------------------------------------+
|department                 |employees                                                                                        |
+---------------------------+-------------------------------------------------------------------------------------------------+
|{345678, Theater and Drama}|[{michael, jackson, no-reply@neverla.nd, 80000}, {null, wendell, no-reply@princeton.edu, 160000}]|
|{901234, Indoor Recreation}|[{xiangrui, meng, no-reply@stanford.edu, 120000}, {matei, null, no-reply@waterloo.edu, 140000}]  |
+---------------------------+-------------------------------------------------------------------------------------------------+



## Work with DataFrames


### Union two DataFrames

In [8]:
// Append df2's rows to df1's. Both DataFrames share the same schema, and all four
// department rows are kept (no de-duplication happens here).
val unionDF = df1.union(df2)

unionDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>>]


In [9]:
// Display the combined DataFrame without truncating long cell values.
unionDF.show(truncate = false)

+--------------------------------+-----------------------------------------------------------------------------------------------------+
|department                      |employees                                                                                            |
+--------------------------------+-----------------------------------------------------------------------------------------------------+
|{123456, Computer Science}      |[{michael, armbrust, no-reply@berkeley.edu, 100000}, {xiangrui, meng, no-reply@stanford.edu, 120000}]|
|{789012, Mechanical Engineering}|[{matei, null, no-reply@waterloo.edu, 140000}, {null, wendell, no-reply@princeton.edu, 160000}]      |
|{345678, Theater and Drama}     |[{michael, jackson, no-reply@neverla.nd, 80000}, {null, wendell, no-reply@princeton.edu, 160000}]    |
|{901234, Indoor Recreation}     |[{xiangrui, meng, no-reply@stanford.edu, 120000}, {matei, null, no-reply@waterloo.edu, 140000}]      |
+--------------------------------+-----------------------------------------------------------------------------------------------------+

### Explode the employees column

In [10]:
import org.apache.spark.sql.functions._

// Turn every element of the `employees` array into its own row; the generated
// struct column is named `col`.
val explodeDF = unionDF.select(explode(col("employees")))

import org.apache.spark.sql.functions._
explodeDF: org.apache.spark.sql.DataFrame = [col: struct<firstName: string, lastName: string ... 2 more fields>]


In [11]:
// Display one employee struct per row, untruncated.
explodeDF.show(truncate = false)

+--------------------------------------------------+
|col                                               |
+--------------------------------------------------+
|{michael, armbrust, no-reply@berkeley.edu, 100000}|
|{xiangrui, meng, no-reply@stanford.edu, 120000}   |
|{matei, null, no-reply@waterloo.edu, 140000}      |
|{null, wendell, no-reply@princeton.edu, 160000}   |
|{michael, jackson, no-reply@neverla.nd, 80000}    |
|{null, wendell, no-reply@princeton.edu, 160000}   |
|{xiangrui, meng, no-reply@stanford.edu, 120000}   |
|{matei, null, no-reply@waterloo.edu, 140000}      |
+--------------------------------------------------+



### Flatten the fields of the employee class into columns

In [12]:
// Promote every field of the `col` struct to a top-level column, then display the result.
val flattenDF = explodeDF.select(col("col.*"))
flattenDF.show(truncate = false)

+---------+--------+----------------------+------+
|firstName|lastName|email                 |salary|
+---------+--------+----------------------+------+
|michael  |armbrust|no-reply@berkeley.edu |100000|
|xiangrui |meng    |no-reply@stanford.edu |120000|
|matei    |null    |no-reply@waterloo.edu |140000|
|null     |wendell |no-reply@princeton.edu|160000|
|michael  |jackson |no-reply@neverla.nd   |80000 |
|null     |wendell |no-reply@princeton.edu|160000|
|xiangrui |meng    |no-reply@stanford.edu |120000|
|matei    |null    |no-reply@waterloo.edu |140000|
+---------+--------+----------------------+------+



flattenDF: org.apache.spark.sql.DataFrame = [firstName: string, lastName: string ... 2 more fields]


### Use filter() to return the rows that match a predicate

In [13]:
// Keep employees whose first name is "xiangrui" or "michael" (isin is the
// membership form of OR-ing the two equality tests), ordered by last name.
val filterDF = flattenDF
  .filter($"firstName".isin("xiangrui", "michael"))
  .orderBy($"lastName".asc)

filterDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [firstName: string, lastName: string ... 2 more fields]


In [14]:
// Display the filtered rows without truncation.
filterDF.show(truncate = false)

+---------+--------+---------------------+------+
|firstName|lastName|email                |salary|
+---------+--------+---------------------+------+
|michael  |armbrust|no-reply@berkeley.edu|100000|
|michael  |jackson |no-reply@neverla.nd  |80000 |
|xiangrui |meng    |no-reply@stanford.edu|120000|
|xiangrui |meng    |no-reply@stanford.edu|120000|
+---------+--------+---------------------+------+



### The where() clause is equivalent to filter()

In [15]:
// Same predicate as the filter() example, expressed through where(); rows are
// ordered by last name.
val whereDF = flattenDF
  .where($"firstName" === "xiangrui" || $"firstName" === "michael")
  .orderBy($"lastName".asc)

whereDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [firstName: string, lastName: string ... 2 more fields]


In [16]:
// Display the where() result without truncation.
whereDF.show(truncate = false)

+---------+--------+---------------------+------+
|firstName|lastName|email                |salary|
+---------+--------+---------------------+------+
|michael  |armbrust|no-reply@berkeley.edu|100000|
|michael  |jackson |no-reply@neverla.nd  |80000 |
|xiangrui |meng    |no-reply@stanford.edu|120000|
|xiangrui |meng    |no-reply@stanford.edu|120000|
+---------+--------+---------------------+------+



### Replace null values with `--` using the DataFrame `na` functions

In [17]:
// Show the flattened data again as a "before" picture: note the null first and last names.
flattenDF.show(truncate = false)

+---------+--------+----------------------+------+
|firstName|lastName|email                 |salary|
+---------+--------+----------------------+------+
|michael  |armbrust|no-reply@berkeley.edu |100000|
|xiangrui |meng    |no-reply@stanford.edu |120000|
|matei    |null    |no-reply@waterloo.edu |140000|
|null     |wendell |no-reply@princeton.edu|160000|
|michael  |jackson |no-reply@neverla.nd   |80000 |
|null     |wendell |no-reply@princeton.edu|160000|
|xiangrui |meng    |no-reply@stanford.edu |120000|
|matei    |null    |no-reply@waterloo.edu |140000|
+---------+--------+----------------------+------+



In [18]:
// Replace nulls with the "--" placeholder. The fill value is a String, so this
// targets the string columns; the integer `salary` column is left as-is.
val nonNullDF = flattenDF.na.fill("--")

nonNullDF: org.apache.spark.sql.DataFrame = [firstName: string, lastName: string ... 2 more fields]


In [19]:
// Display the data after null replacement, without truncation.
nonNullDF.show(truncate = false)

+---------+--------+----------------------+------+
|firstName|lastName|email                 |salary|
+---------+--------+----------------------+------+
|michael  |armbrust|no-reply@berkeley.edu |100000|
|xiangrui |meng    |no-reply@stanford.edu |120000|
|matei    |--      |no-reply@waterloo.edu |140000|
|--       |wendell |no-reply@princeton.edu|160000|
|michael  |jackson |no-reply@neverla.nd   |80000 |
|--       |wendell |no-reply@princeton.edu|160000|
|xiangrui |meng    |no-reply@stanford.edu |120000|
|matei    |--      |no-reply@waterloo.edu |140000|
+---------+--------+----------------------+------+



### Retrieve rows with missing firstName or lastName

In [20]:
// Rows where the "--" placeholder stands in for a missing first or last name,
// ordered by email.
val filterNonNullDF = nonNullDF
  .filter($"firstName" === "--" || $"lastName" === "--")
  .orderBy($"email".asc)

filterNonNullDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [firstName: string, lastName: string ... 2 more fields]


In [21]:
// Display the rows that had a missing name component, without truncation.
filterNonNullDF.show(truncate = false)

+---------+--------+----------------------+------+
|firstName|lastName|email                 |salary|
+---------+--------+----------------------+------+
|--       |wendell |no-reply@princeton.edu|160000|
|--       |wendell |no-reply@princeton.edu|160000|
|matei    |--      |no-reply@waterloo.edu |140000|
|matei    |--      |no-reply@waterloo.edu |140000|
+---------+--------+----------------------+------+



### Example aggregations using agg() and countDistinct()

In [22]:
// Find the distinct last names for each first name
// Count how many distinct last names occur for each first name.
val countDistinctDF = nonNullDF
  .select($"firstName", $"lastName")
  .groupBy($"firstName")
  .agg(countDistinct($"lastName").as("distinct_last_names"))

countDistinctDF: org.apache.spark.sql.DataFrame = [firstName: string, distinct_last_names: bigint]


In [23]:
// Display the per-first-name distinct last-name counts, without truncation.
countDistinctDF.show(truncate = false)

+---------+-------------------+
|firstName|distinct_last_names|
+---------+-------------------+
|xiangrui |1                  |
|matei    |1                  |
|michael  |2                  |
|--       |1                  |
+---------+-------------------+



### Compare the DataFrame and SQL query physical plans

In [24]:
// Print the physical plan of the DataFrame-API aggregation so it can be compared
// with the plan of the equivalent SQL query below.
countDistinctDF.explain()

== Physical Plan ==
*(3) HashAggregate(keys=[firstName#136], functions=[count(distinct lastName#137)])
+- Exchange hashpartitioning(firstName#136, 200), ENSURE_REQUIREMENTS, [id=#332]
   +- *(2) HashAggregate(keys=[firstName#136], functions=[partial_count(distinct lastName#137)])
      +- *(2) HashAggregate(keys=[firstName#136, lastName#137], functions=[])
         +- Exchange hashpartitioning(firstName#136, lastName#137, 200), ENSURE_REQUIREMENTS, [id=#327]
            +- *(1) HashAggregate(keys=[firstName#136, lastName#137], functions=[])
               +- *(1) Project [coalesce(col#49.firstName, --) AS firstName#136, coalesce(col#49.lastName, --) AS lastName#137]
                  +- Generate explode(employees#3), false, [col#49]
                     +- Union
                        :- LocalTableScan [employees#3]
                        +- LocalTableScan [employees#12]




In [25]:
// Expose nonNullDF to Spark SQL under a temporary view name, then print the
// physical plan of the same aggregation written as a SQL query.
nonNullDF.createOrReplaceTempView("databricks_df_example")

spark.sql("""
  SELECT firstName, count(distinct lastName) as distinct_last_names
  FROM databricks_df_example
  GROUP BY firstName
""").explain()

== Physical Plan ==
*(3) HashAggregate(keys=[firstName#136], functions=[count(distinct lastName#137)])
+- Exchange hashpartitioning(firstName#136, 200), ENSURE_REQUIREMENTS, [id=#386]
   +- *(2) HashAggregate(keys=[firstName#136], functions=[partial_count(distinct lastName#137)])
      +- *(2) HashAggregate(keys=[firstName#136, lastName#137], functions=[])
         +- Exchange hashpartitioning(firstName#136, lastName#137, 200), ENSURE_REQUIREMENTS, [id=#381]
            +- *(1) HashAggregate(keys=[firstName#136, lastName#137], functions=[])
               +- *(1) Project [coalesce(col#49.firstName, --) AS firstName#136, coalesce(col#49.lastName, --) AS lastName#137]
                  +- Generate explode(employees#3), false, [col#49]
                     +- Union
                        :- LocalTableScan [employees#3]
                        +- LocalTableScan [employees#12]




### Sum up all the salaries

In [26]:
// Total salary over every row of nonNullDF, using the column-name -> aggregate-name
// form of agg. Employees listed under two departments are counted twice.
val salarySumDF = nonNullDF.agg(Map("salary" -> "sum"))

salarySumDF: org.apache.spark.sql.DataFrame = [sum(salary): bigint]


In [27]:
// Display the single-row salary total, without truncation.
salarySumDF.show(truncate = false)

+-----------+
|sum(salary)|
+-----------+
|1020000    |
+-----------+



### Print the summary statistics for the salaries

In [28]:
// Print count, mean, stddev, min and max for the `salary` column.
nonNullDF.describe("salary").show()

+-------+------------------+
|summary|            salary|
+-------+------------------+
|  count|                 8|
|   mean|          127500.0|
| stddev|28157.719063467175|
|    min|             80000|
|    max|            160000|
+-------+------------------+



# The END