https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html

In [1]:
# Import everything exported by pyspark.sql; the cells below rely on Row and SQLContext from this wildcard import
from pyspark.sql import *

# Create Example Data - Departments and Employees

# Create the Departments: each one is a Row with an `id` and a `name` field.
(
    department1,
    department2,
    department3,
    department4,
) = (
    Row(id=dept_id, name=dept_name)
    for dept_id, dept_name in [
        ('123456', 'Computer Science'),
        ('789012', 'Mechanical Engineering'),
        ('345678', 'Theater and Drama'),
        ('901234', 'Indoor Recreation'),
    ]
)


In [2]:
# Create the Employees.
# `Employee` is a Row template: calling it with positional values yields a Row
# whose fields are firstName, lastName, email and salary, in that order.
Employee = Row("firstName", "lastName", "email", "salary")
(
    employee1,
    employee2,
    employee3,
    employee4,
) = (
    Employee(*values)
    for values in [
        ('michael', 'armbrust', 'no-reply@berkeley.edu', 100000),
        ('xiangrui', 'meng', 'no-reply@stanford.edu', 120000),
        ('matei', None, 'no-reply@waterloo.edu', 140000),      # lastName deliberately missing
        (None, 'wendell', 'no-reply@berkeley.edu', 160000),    # firstName deliberately missing
    ]
)

# Create the DepartmentWithEmployees instances: a department Row paired with
# the list of employee Rows that belong to it.
(
    departmentWithEmployees1,
    departmentWithEmployees2,
    departmentWithEmployees3,
    departmentWithEmployees4,
) = (
    Row(department=dept, employees=staff)
    for dept, staff in [
        (department1, [employee1, employee2]),
        (department2, [employee3, employee4]),
        (department3, [employee1, employee4]),
        (department4, [employee2, employee3]),
    ]
)

print(department1)


Row(id='123456', name='Computer Science')


In [3]:
# Show one employee Row; the field names come from the Employee template.
print(employee2)

Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)


In [4]:
# Nested Rows are reached by attribute access: take the first employee of the
# first department, then read that employee's email field.
first_employee = departmentWithEmployees1.employees[0]
print(first_employee.email)

no-reply@berkeley.edu


In [5]:
# Set up the Spark entry points used by the rest of the notebook.
from pyspark import SparkContext
# getOrCreate() avoids constructing a second SparkContext when this cell is re-run.
sc = SparkContext.getOrCreate()
# SQLContext is not imported by name in this cell; it is expected to come from
# the earlier `from pyspark.sql import *`.
sqlContext = SQLContext(sc)
# Last expression of the cell: echoes the Spark version string.
sc.version

'2.2.0'

In [6]:
# First DataFrame: departments 1 and 2. No schema is passed, so Spark infers
# it from the Row objects.
departmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]
df1 = sqlContext.createDataFrame(departmentsWithEmployeesSeq1)

In [7]:
# In the output recorded below, display() prints only the DataFrame's schema
# summary; first() returns the first row as a Row object.
display(df1)
df1.first()

DataFrame[department: struct<id:string,name:string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:bigint>>]

Row(department=Row(id='123456', name='Computer Science'), employees=[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)])

In [8]:
# Second DataFrame: departments 3 and 4, built the same way as df1.
departmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]
df2 = sqlContext.createDataFrame(departmentsWithEmployeesSeq2)


In [9]:
# df2 should report the same schema as df1, which the union below requires.
display(df2)

DataFrame[department: struct<id:string,name:string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:bigint>>]

In [10]:
# Append the rows of df2 to the rows of df1 (duplicates are kept).
# union() is the canonical name for this from Spark 2.0 on; unionAll() is an
# alias that the Spark 2.x API docs mark as deprecated, so union() is used here.
unionDF = df1.union(df2)
display(unionDF)

DataFrame[department: struct<id:string,name:string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:bigint>>]

In [11]:
# First row of the combined DataFrame.
unionDF.first()

Row(department=Row(id='123456', name='Computer Science'), employees=[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)])

In [12]:
# Fetch up to 5 rows as a list of Row objects; only 4 rows exist (2 from df1 + 2 from df2).
unionDF.take(5)

[Row(department=Row(id='123456', name='Computer Science'), employees=[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)]),
 Row(department=Row(id='789012', name='Mechanical Engineering'), employees=[Row(firstName='matei', lastName=None, email='no-reply@waterloo.edu', salary=140000), Row(firstName=None, lastName='wendell', email='no-reply@berkeley.edu', salary=160000)]),
 Row(department=Row(id='345678', name='Theater and Drama'), employees=[Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000), Row(firstName=None, lastName='wendell', email='no-reply@berkeley.edu', salary=160000)]),
 Row(department=Row(id='901234', name='Indoor Recreation'), employees=[Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000), Row(firstName='matei', lastName=None, email='no-reply@waterloo.edu', salary=14000

In [13]:
from pyspark.sql import Row


In [14]:
# One-row DataFrame with an integer column, an array column and a map column,
# used below to demonstrate explode().
eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a":"b"})])

In [15]:
from pyspark.sql.functions import explode
# Exploding the array column yields one output row per element, here named an_int.
eDF.select(explode(eDF.intlist).alias("an_int")).collect()

[Row(an_int=1), Row(an_int=2), Row(an_int=3)]

In [16]:
# Exploding the map column yields two columns per entry, so alias() takes two
# names: one for the key and one for the value.
eDF.select(explode(eDF.mapfield).alias("key","value")).show()

+---+-----+
|key|value|
+---+-----+
|  a|    b|
+---+-----+



In [17]:
# Show the inferred column types of eDF.
display(eDF)

DataFrame[a: bigint, intlist: array<bigint>, mapfield: map<string,string>]

In [18]:
# Flatten the employees array: one output row per employee, as a single struct column "e".
df = unionDF.select(explode("employees").alias("e"))

In [19]:
# Confirm that df now has a single struct column named e.
display(df)

DataFrame[e: struct<firstName:string,lastName:string,email:string,salary:bigint>]

In [21]:
# Pull each field out of the "e" struct so that the result has one column per
# employee attribute.
employee_columns = ["e.firstName", "e.lastName", "e.email", "e.salary"]
explodeDF = df.selectExpr(*employee_columns)