In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession that the rest of the notebook refers to as `spark`.
spark = (
    SparkSession.builder
    .appName("spark-by-example")
    .getOrCreate()
)

creates an empty RDD

In [5]:
# Empty RDD obtained directly from the session's SparkContext.
emptyRDD = spark.sparkContext.emptyRDD()

In [6]:
# Printing an RDD shows its description string (see the output below), not its elements.
print(emptyRDD)

EmptyRDD[0] at emptyRDD at NativeMethodAccessorImpl.java:0


Alternatively, an empty RDD can be created by calling `parallelize` with an empty list

In [8]:
# Parallelizing an empty Python list is another way to get an RDD with no elements.
rdd2 = spark.sparkContext.parallelize([])

In [9]:
# The printed description reads ParallelCollectionRDD because this RDD was built with parallelize().
print(rdd2)

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:287


create schema 

In [10]:
from pyspark.sql.types import StructType, StructField, StringType

In [11]:
# Schema with three string columns for a person's name; every column is nullable.
schema = StructType([
    StructField(column_name, StringType(), True)  # True -> nullable
    for column_name in ('firstname', 'middlename', 'lastname')
])

In [12]:
# Pair the element-less RDD with the schema: the DataFrame has columns but no rows.
df = spark.createDataFrame(data=emptyRDD, schema=schema)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)



In [13]:
# Build the empty DataFrame from the RDD side, via its toDF() method.
df1 = emptyRDD.toDF(schema=schema)
df1.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)



In [14]:
# No RDD required: an empty Python list plus the schema is enough.
df2 = spark.createDataFrame(data=[], schema=schema)
df2.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)



In [15]:
# An empty StructType (a schema with zero fields) gives a DataFrame with no columns.
df3 = spark.createDataFrame(data=[], schema=StructType([]))
df3.printSchema()

root



In [16]:
# Create a PySpark RDD from a Python list.
# A plain Python list is a collection held in the PySpark driver's memory;
# passing it to `sparkContext.parallelize()` turns it into an RDD, i.e. the
# collection gets parallelized.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('spark-by-example')
    .getOrCreate()
)

# (department name, department id) pairs
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
rdd = spark.sparkContext.parallelize(dept)


In [19]:
# Called without arguments, `toDF()` names the columns "_1" and "_2" by default
# (visible in the schema and table printed below).
df = rdd.toDF()
df.printSchema()
df.show(truncate=False)

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+---------+---+
|_1       |_2 |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|IT       |40 |
+---------+---+



In [20]:
# `toDF()` also accepts a list of column names, replacing the default "_1"/"_2".
deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(schema=deptColumns)
df2.printSchema()
df2.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



2.2 Using PySpark `createDataFrame()` function

The `SparkSession` class provides the `createDataFrame()` method, which creates a DataFrame and accepts an RDD object as its data argument

In [24]:
# Go through the session instead of the RDD; the column names come from deptColumns.
deptDF = spark.createDataFrame(data=rdd, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



2.3 Using `createDataFrame()` with StructType schema

When you infer the schema, by default the data type of each column is derived from the data and nullable is set to true
for all columns. We can change this behavior by supplying a schema using StructType, where we can specify a column name,
data type, and nullable flag for each field/column

In [28]:
from pyspark.sql.types import StructType, StructField, StringType
# Explicit schema: name, type and nullability are stated per column instead of inferred.
# Note that dept_id is declared as a string even though the source tuples hold
# integers; the printed schema below reports it as string accordingly.
deptSchema = (
    StructType()
    .add('dept_name', StringType(), True)
    .add('dept_id', StringType(), True)
)

deptDF1 = spark.createDataFrame(data=rdd, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



3. Complete Example

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session for the example.
spark = (
    SparkSession.builder
    .appName('spark-by-example')
    .getOrCreate()
)

# Department (name, id) pairs in a plain Python list, then turned into an RDD.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
rdd = spark.sparkContext.parallelize(dept)

# Convert the RDD to a DataFrame without naming the columns, then inspect it.
df = rdd.toDF()
df.printSchema()
df.show(truncate=False)

