In [3]:
import os
import sys

# Make both the PySpark workers and the driver use the interpreter that is
# running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import SparkSession



# Build the Spark session used by every later cell (an already running
# session is reused by getOrCreate()).
spark = (
    SparkSession.builder
    .appName('SparkLearning')
    .getOrCreate()
)


# Create Empty RDD, DataFrame with Schema (StructType)

In [3]:
# An RDD that contains no records
emptyRDD = spark.sparkContext.emptyRDD()

# Schema: three string columns, only the middle name may be null
schema = StructType([
    StructField(field_name, StringType(), is_nullable)
    for field_name, is_nullable in (
        ('firstName', False),
        ('middleName', True),
        ('lastName', False),
    )
])

# Option 1: empty DataFrame from the empty RDD via createDataFrame()
df = spark.createDataFrame(emptyRDD, schema)
df.printSchema()

# Option 2: convert the empty RDD itself with toDF()
df1 = emptyRDD.toDF(schema=schema)
df1.printSchema()

# Option 3: no RDD at all, just an empty list of rows
df2 = spark.createDataFrame([], schema)
df2.printSchema()

root
 |-- firstName: string (nullable = false)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = false)

root
 |-- firstName: string (nullable = false)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = false)

root
 |-- firstName: string (nullable = false)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = false)



# Convert PySpark RDD to DataFrame

In [4]:
# sparkContext.parallelize() turns a Python list that lives in driver memory
# into an RDD.
dept = [("Finance", 10), ("Marketing", 20), ("Sales", 30), ("IT", 40)]
rdd = spark.sparkContext.parallelize(dept)

# Explicit schema: both columns are declared non-nullable
schema = (
    StructType()
    .add('Dept', StringType(), nullable=False)
    .add('Id', IntegerType(), nullable=False)
)

# Convert the RDD to a DataFrame using that schema
df = rdd.toDF(schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- Dept: string (nullable = false)
 |-- Id: integer (nullable = false)

+---------+---+
|Dept     |Id |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|IT       |40 |
+---------+---+



In [27]:
# createDataFrame() with only column names: the column types are inferred
# from the RDD's data (see the printed schema).
dept_columns = ["DeptName", "Id"]
df = spark.createDataFrame(rdd, dept_columns)
df.printSchema()
df.show(n=1)

root
 |-- DeptName: string (nullable = true)
 |-- Id: long (nullable = true)

+--------+---+
|DeptName| Id|
+--------+---+
| Finance| 10|
+--------+---+
only showing top 1 row



# Convert PySpark DataFrame to Pandas

In [5]:

# Sample person records; several fields are intentionally empty strings.
data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Marria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
]
columns = [
    "first_name", "middle_name", "last_name",
    "dob", "gender", "salary",
]

# Only column names are given, so the column types are inferred from the data
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|  dob|gender|salary|
+----------+-----------+---------+-----+------+------+
|     James|           |    Smith|36636|     M| 60000|
|   Michael|       Rose|         |40288|     M| 70000|
|    Robert|           | Williams|42114|      |400000|
|    Marria|       Anne|    Jones|39192|     F|500000|
|       Jen|       Mary|    Brown|     |     F|     0|
+----------+-----------+---------+-----+------+------+



In [7]:
# Convert the PySpark DataFrame `df` (built in the previous cell) to a pandas
# DataFrame and print its plain-text representation.
pandasDF = df.toPandas()
print(pandasDF)

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3     Marria        Anne     Jones  39192      F  500000
4        Jen        Mary     Brown             F       0


In [34]:
# Convert a Spark DataFrame with a nested struct column to pandas
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Each row: (name triple, dob, gender, salary)
dataStruct = [
    (("James", "", "Smith"), "36636", "M", 60000),
    (("Michael", "Rose", ""), "40288", "M", 70000),
    (("Robert", "", "Williams"), "42114", "", 400000),
    (("Marria", "Anne", "Jones"), "39192", "F", 500000),
    (("Jen", "Mary", "Brown"), "", "F", 0),
]

# "name" is itself a struct of three nullable strings
schemaStruct = (
    StructType()
    .add(
        "name",
        StructType()
        .add("firstname", StringType(), True)
        .add("middlename", StringType(), True)
        .add("lastname", StringType(), True),
    )
    .add("dob", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

df = spark.createDataFrame(dataStruct, schemaStruct)
df.printSchema()

# The struct column comes back as a single pandas column (see printed output)
pdDF = df.toPandas()
print(pdDF)


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

                    name    dob gender  salary
0       (James, , Smith)  36636      M   60000
1      (Michael, Rose, )  40288      M   70000
2   (Robert, , Williams)  42114         400000
3  (Marria, Anne, Jones)  39192      F  500000
4     (Jen, Mary, Brown)             F       0


# PySpark show() – Display DataFrame Contents in Table

In [12]:
# Defaults: up to 20 rows, column values cut to 20 characters
df.show()

# Full column contents, no truncation
df.show(truncate=False)

# First 2 rows, full column contents
df.show(n=2, truncate=False)

# First 2 rows, column values cut to 25 characters
df.show(n=2, truncate=25)

# First 3 rows printed vertically (one field per line)
df.show(n=3, truncate=25, vertical=True)

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M| 60000|
|   {Michael, Rose, }|40288|     M| 70000|
|{Robert, , Williams}|42114|      |400000|
|{Marria, Anne, Jo...|39192|     F|500000|
|  {Jen, Mary, Brown}|     |     F|     0|
+--------------------+-----+------+------+

+---------------------+-----+------+------+
|name                 |dob  |gender|salary|
+---------------------+-----+------+------+
|{James, , Smith}     |36636|M     |60000 |
|{Michael, Rose, }    |40288|M     |70000 |
|{Robert, , Williams} |42114|      |400000|
|{Marria, Anne, Jones}|39192|F     |500000|
|{Jen, Mary, Brown}   |     |F     |0     |
+---------------------+-----+------+------+

+-----------------+-----+------+------+
|name             |dob  |gender|salary|
+-----------------+-----+------+------+
|{James, , Smith} |36636|M     |60000 |
|{Michael, Rose, }|40288|M     |70000 |
+--------------

Syntax

def show(self, n=20, truncate=True, vertical=False):
    """Signature stub for DataFrame.show(); illustrative only, does nothing.

    n        -- number of rows to display (20 by default).
    truncate -- True cuts column values to 20 characters, False shows them
                in full, an integer sets the cut-off width.
    vertical -- True prints the rows vertically.
    """

# PySpark StructType & StructField
1. PySpark provides `StructType` class from `pyspark.sql.types` to define the structure of the DataFrame.
2. StructType is a `collection` or list of `StructField` objects.

Refer to cells 4 & 11 for how to create a StructType & StructField with a DataFrame, and also for nesting of StructType.

In [14]:
# Schema definition with a nested StructType
# Each row: (name triple, id, gender, salary)
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

structureSchema = StructType([
    # "name" is a struct made of three nullable string fields
    StructField('name', StructType([
        StructField(part, StringType(), True)
        for part in ('firstname', 'middlename', 'lastname')
    ])),
    StructField('id', StringType(), nullable=True),
    StructField('gender', StringType(), nullable=True),
    StructField('salary', IntegerType(), nullable=True),
])

df2 = spark.createDataFrame(structureData, structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



Adding & Changing struct of the DataFrame

Using PySpark SQL function `struct()`, we can change the struct of the existing DataFrame and add a new StructType to it


In [16]:
from pyspark.sql.functions import struct, col, when

# Fold id / gender / salary into a single nested "OtherInfo" struct, add a
# derived "Salary_grade" field, then drop the original flat columns.
#
# `salary` is already IntegerType in df2's schema, so it is compared directly;
# casting it to IntegerType again would be a no-op.
#
# NOTE(review): the -1 salary in the sample data is graded "Low" by the first
# branch. It looks like a placeholder value; confirm whether it should be
# handled separately.
updatedDF = df2.withColumn(
    "OtherInfo",
    struct(
        col('id').alias("identifier"),
        col('gender').alias("gender"),
        col('salary').alias("salary"),
        # Branches are evaluated in order: < 2000, then < 4000, else High
        when(col('salary') < 2000, "Low")
            .when(col('salary') < 4000, "Medium")
            .otherwise('High').alias("Salary_grade")
    )
).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_grade: string (nullable = false)

+--------------------+--------------------+
|                name|           OtherInfo|
+--------------------+--------------------+
|    {James, , Smith}|{36636, M, 3100, ...|
|   {Michael, Rose, }|{40288, M, 4300, ...|
|{Robert, , Williams}|{42114, M, 1400, ...|
|{Maria, Anne, Jones}|{39192, F, 5500, ...|
|  {Jen, Mary, Brown}|      {, F, -1, Low}|
+--------------------+--------------------+



Using SQL ArrayType and MapType

In [23]:
# Using SQL ArrayType and MapType
from pyspark.sql.types import ArrayType, MapType, BooleanType

# One row: (name struct, list of hobbies, map of boolean properties)
structData = [
    (
        ("Bob", "harley", "Martin"),
        ["Cricket", "Volley"],
        {"isCricketer": False, "isVolleyPlayer": True},
    )
]

arrayStructureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), nullable=True),
        StructField('middlename', StringType(), nullable=True),
        StructField('lastname', StringType(), nullable=True),
    ])),
    # Array of strings
    StructField('hobbies', ArrayType(elementType=StringType()), nullable=True),
    # Map from string keys to boolean values
    StructField(
        'properties',
        MapType(keyType=StringType(), valueType=BooleanType()),
        nullable=True,
    ),
])

df3 = spark.createDataFrame(structData, arrayStructureSchema)
df3.printSchema()
df3.show(truncate=60, vertical=True)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: boolean (valueContainsNull = true)

-RECORD 0----------------------------------------------------
 name       | {Bob, harley, Martin}                          
 hobbies    | [Cricket, Volley]                              
 properties | {isVolleyPlayer -> true, isCricketer -> false} 



# Creating a StructType object from JSON

 You can get the schema as a JSON string by using `df3.schema.json()`
 

In [26]:
# Keep df3's schema as a JSON string; a later cell rebuilds a StructType from it.
schema_json = df3.schema.json()
# Last expression of the cell: the compact one-line form of the same schema.
df3.schema.simpleString()

'struct<name:struct<firstname:string,middlename:string,lastname:string>,hobbies:array<string>,properties:map<string,boolean>>'

In [30]:
# Rebuild the StructType from the JSON string held in `schema_json` (no file
# is read) and use it to create the DataFrame again.
import json

schemaFromJson = StructType.fromJson(json.loads(schema_json))
df3 = spark.createDataFrame(
    data=spark.sparkContext.parallelize(structData),
    schema=schemaFromJson,
)
df3.printSchema()
df3.show(vertical=True, truncate=60)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: boolean (valueContainsNull = true)

-RECORD 0----------------------------------------------------
 name       | {Bob, harley, Martin}                          
 hobbies    | [Cricket, Volley]                              
 properties | {isVolleyPlayer -> true, isCricketer -> false} 



Checking if a Column Exists in a DataFrame

In [54]:
# Membership test against the schema's field names; prints True because df3
# has a "name" column.
print("name" in df3.schema.fieldNames())

True


# PySpark Column Class

1. One of the simplest ways to create a Column class object is by using the PySpark `lit()` SQL function, which takes a literal value and returns a Column object.

In [56]:
from pyspark.sql.functions import lit

# lit() wraps a literal value in a Column object
colObj = lit("Hello World")
# Last expression of the cell: shows the resulting type (pyspark.sql.column.Column)
type(colObj)


pyspark.sql.column.Column

In [66]:
data = [("Jeams", 40), ("Anna", 36)]
df = (
    spark.createDataFrame(data)
    .toDF("name.fname", "age")
)
df.printSchema()

# Through the DataFrame object: attribute access and item access select the
# same column.
df.select(df.age).show()
df.select(df["age"]).show()

# A column whose name contains a dot has to be wrapped in backticks
df.select(df["`name.fname`"]).show()

root
 |-- name.fname: string (nullable = true)
 |-- age: long (nullable = true)

+---+
|age|
+---+
| 40|
| 36|
+---+

+---+
|age|
+---+
| 40|
| 36|
+---+

+----------+
|name.fname|
+----------+
|     Jeams|
|      Anna|
+----------+



In [68]:
# Using the SQL col() function: columns are referenced by name only, without
# going through the DataFrame object.
# (col was already imported in the struct()/when() cell; importing it again is harmless.)
from pyspark.sql.functions import col
df.select(col('age')).show()
# The dotted column name is again wrapped in backticks
df.select(col("`name.fname`")).show()

+---+
|age|
+---+
| 40|
| 36|
+---+

+----------+
|name.fname|
+----------+
|     Jeams|
|      Anna|
+----------+



# Create DataFrame with struct using Row class

In [69]:
from pyspark.sql import Row

# A Row nested inside a Row becomes a struct column ("prop") in the DataFrame
data = [
    Row(
        name="Jeams",
        prop=Row(hair="Black", eye="Blue"),
    ),
    Row(
        name="Ann",
        prop=Row(hair="Grey", eye="Black"),
    ),
]

# No schema is passed: it is inferred from the Row objects
df = spark.createDataFrame(data=data)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)



In [70]:
# Access struct column
# Top-level column via attribute access
df.select(df.name).show()
# Field inside the "prop" struct; the result column is labelled "prop.eye"
df.select(df.prop.eye).show()

+-----+
| name|
+-----+
|Jeams|
|  Ann|
+-----+

+--------+
|prop.eye|
+--------+
|    Blue|
|   Black|
+--------+



In [72]:
# Dotted path inside col(): selects the "hair" field of the "prop" struct
df.select(col("prop.hair")).show()
# "prop.*" expands every field of the struct into its own column
df.select(col("prop.*")).show()

+-----+
| hair|
+-----+
|Black|
| Grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|Black| Blue|
| Grey|Black|
+-----+-----+



: 

PySpark column also provides a way to do arithmetic operations on columns using operators.

In [8]:
data = [(200, 1, 2), (300, 40, 20), (500, 91, 42)]

df = (
    spark.createDataFrame(data)
    .toDF("col1", "col2", "col3")
)
df.printSchema()

# Arithmetic operators between two columns
df.select(df["col1"] + df["col2"]).show()
df.select(df["col1"] - df["col2"]).show()
df.select(df["col1"] * df["col2"]).show()
df.select(df["col1"] / df["col2"]).show()
df.select(df["col1"] % df["col2"]).show()

# Comparison operators produce boolean columns
df.select(df["col2"] > df["col3"]).show()
df.select(df["col2"] < df["col3"]).show()
df.select(df["col2"] == df["col3"]).show()


root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)
 |-- col3: long (nullable = true)

+-------------+
|(col1 + col2)|
+-------------+
|          201|
|          340|
|          591|
+-------------+

+-------------+
|(col1 - col2)|
+-------------+
|          199|
|          260|
|          409|
+-------------+

+-------------+
|(col1 * col2)|
+-------------+
|          200|
|        12000|
|        45500|
+-------------+

+------------------+
|     (col1 / col2)|
+------------------+
|             200.0|
|               7.5|
|5.4945054945054945|
+------------------+

+-------------+
|(col1 % col2)|
+-------------+
|            0|
|           20|
|           45|
+-------------+

+-------------+
|(col2 > col3)|
+-------------+
|        false|
|         true|
|         true|
+-------------+

+-------------+
|(col2 < col3)|
+-------------+
|         true|
|        false|
|        false|
+-------------+

+-------------+
|(col2 = col3)|
+-------------+
|        false|
| 

# PySpark Column Functions