In [2]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import SparkSession



# Creating a spark session
spark = SparkSession.builder.appName('SparkLearning').getOrCreate()


# Create Empty RDD, DataFrame with Schema (StructType)

In [3]:
# Creates Empty RDD
emptyRDD = spark.sparkContext.emptyRDD()

# Create Schema
#from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField('firstName', StringType(), nullable=False),
    StructField('middleName', StringType(), nullable=True),
    StructField('lastName', StringType(), nullable=False)
])


# Create empty DataFrame from empty RDD
df = spark.createDataFrame(emptyRDD, schema=schema)
df.printSchema()

# Convert empty RDD to Dataframe
df1 = emptyRDD.toDF(schema)
df1.printSchema()

# Create empty DataFrame directly.
df2 = spark.createDataFrame([], schema=schema)
df2.printSchema()

root
 |-- firstName: string (nullable = false)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = false)

root
 |-- firstName: string (nullable = false)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = false)

root
 |-- firstName: string (nullable = false)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = false)



# Convert PySpark RDD to DataFrame

In [4]:
#  create an RDD by passing Python list object to sparkContext.parallelize() function
# In PySpark, when you have a collection of data in a PySpark driver memory when we create an RDD, "parallelized" is going to help

dept = [("Finance", 10), ("Marketing", 20),("Sales", 30), ("IT", 40)]
# Creating an RDD
rdd = spark.sparkContext.parallelize(dept)
schema = StructType([
    StructField('Dept', StringType(), nullable=False),
    StructField('Id', IntegerType(), nullable=False)
])
df = rdd.toDF(schema)
df.printSchema()
df.show(truncate=False)

root
 |-- Dept: string (nullable = false)
 |-- Id: integer (nullable = false)

+---------+---+
|Dept     |Id |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|IT       |40 |
+---------+---+



In [27]:
# Using PySpark createDataFrame() function
dept_columns = ["DeptName", "Id"]
df = spark.createDataFrame(rdd, schema=dept_columns)
df.printSchema()
df.show(1)

root
 |-- DeptName: string (nullable = true)
 |-- Id: long (nullable = true)

+--------+---+
|DeptName| Id|
+--------+---+
| Finance| 10|
+--------+---+
only showing top 1 row



# Convert PySpark DataFrame to Pandas

In [5]:

data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Marria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0)
]

columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]

df = spark.createDataFrame(data, schema=columns)
df.printSchema()
df.show()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|  dob|gender|salary|
+----------+-----------+---------+-----+------+------+
|     James|           |    Smith|36636|     M| 60000|
|   Michael|       Rose|         |40288|     M| 70000|
|    Robert|           | Williams|42114|      |400000|
|    Marria|       Anne|    Jones|39192|     F|500000|
|       Jen|       Mary|    Brown|     |     F|     0|
+----------+-----------+---------+-----+------+------+



In [7]:
# Convert PySpark Dataframe to Pandas DataFrame
pandasDF = df.toPandas()
print(pandasDF)

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3     Marria        Anne     Jones  39192      F  500000
4        Jen        Mary     Brown             F       0


In [34]:
# Convert Spark Nested Struct DataFrame to Pandas
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

dataStruct = [
    (("James", "", "Smith"), "36636", "M", 60000),
    (("Michael", "Rose", ""), "40288", "M", 70000),
    (("Robert", "", "Williams"), "42114", "", 400000),
    (("Marria", "Anne", "Jones"), "39192", "F", 500000),
    (("Jen", "Mary", "Brown"), "", "F", 0)
]

schemaStruct = StructType([
    StructField("name", StructType([
        StructField("firstname", StringType(), nullable=True),
        StructField("middlename", StringType(), nullable=True),
        StructField("lastname", StringType(), nullable=True)
    ])),
    StructField("dob", StringType(), nullable=True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True)
])


df = spark.createDataFrame(dataStruct, schema=schemaStruct)
df.printSchema()
pdDF = df.toPandas()
print(pdDF)


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

                    name    dob gender  salary
0       (James, , Smith)  36636      M   60000
1      (Michael, Rose, )  40288      M   70000
2   (Robert, , Williams)  42114         400000
3  (Marria, Anne, Jones)  39192      F  500000
4     (Jen, Mary, Brown)             F       0


# PySpark show() – Display DataFrame Contents in Table

In [12]:
# Default - displays 20 rows and 
# 20 charactes from column value 
df.show()

#Display full column contents
df.show(truncate=False)

# Display 2 rows and full column contents
df.show(2,truncate=False) 

# Display 2 rows & column values 25 characters
df.show(2,truncate=25) 

# Display DataFrame rows & columns vertically
df.show(n=3,truncate=25,vertical=True)

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M| 60000|
|   {Michael, Rose, }|40288|     M| 70000|
|{Robert, , Williams}|42114|      |400000|
|{Marria, Anne, Jo...|39192|     F|500000|
|  {Jen, Mary, Brown}|     |     F|     0|
+--------------------+-----+------+------+

+---------------------+-----+------+------+
|name                 |dob  |gender|salary|
+---------------------+-----+------+------+
|{James, , Smith}     |36636|M     |60000 |
|{Michael, Rose, }    |40288|M     |70000 |
|{Robert, , Williams} |42114|      |400000|
|{Marria, Anne, Jones}|39192|F     |500000|
|{Jen, Mary, Brown}   |     |F     |0     |
+---------------------+-----+------+------+

+-----------------+-----+------+------+
|name             |dob  |gender|salary|
+-----------------+-----+------+------+
|{James, , Smith} |36636|M     |60000 |
|{Michael, Rose, }|40288|M     |70000 |
+--------------

Syntax

def show(self, n=20, truncate=True, vertical=False):
    pass

# PySpark StructType & StructField
1. PySpark provides `StructType` class from `pyspark.sql.types` to define the structure of the DataFrame.
2. StructType is a `collection` or list of `StructField` objects.

Refer cell 4 & 11 for how to create StructType & StructField with DataFrame. And also nasting of StructType

In [14]:
# Defining schema using nested StructType
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df2 = spark.createDataFrame(data=structureData,schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



Adding & Changing struct of the DataFrame

Using PySpark SQL function `struct()`, we can change the struct of the existing DataFrame and add a new StructType to it


In [16]:
from pyspark.sql.functions import struct, col, when

updatedDF = df2.withColumn(
    "OtherInfo",
    struct(
        col('id').alias("identifier"),
        col('gender').alias("gender"),
        col('salary').alias("salary"),
        when(col('salary').cast(IntegerType()) < 2000, "Low")
            .when(col('salary').cast(IntegerType()) < 4000, "Medium")
            .otherwise('High').alias("Salary_grade")
    )
).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_grade: string (nullable = false)

+--------------------+--------------------+
|                name|           OtherInfo|
+--------------------+--------------------+
|    {James, , Smith}|{36636, M, 3100, ...|
|   {Michael, Rose, }|{40288, M, 4300, ...|
|{Robert, , Williams}|{42114, M, 1400, ...|
|{Maria, Anne, Jones}|{39192, F, 5500, ...|
|  {Jen, Mary, Brown}|      {, F, -1, Low}|
+--------------------+--------------------+



Using SQL ArrayType and MapType

In [23]:
# Using SQL ArrayType and MapType
from pyspark.sql.types import ArrayType, MapType, BooleanType

structData = [
    (("Bob", "harley", "Martin"), ["Cricket", "Volley"], {"isCricketer": False, "isVolleyPlayer": True})
]

arrayStructureSchema = StructType([
    StructField('name', StructType([
       StructField('firstname', StringType(), True),
       StructField('middlename', StringType(), True),
       StructField('lastname', StringType(), True)
       ])),
       StructField('hobbies', ArrayType(StringType()), True),
       StructField('properties', MapType(StringType(),BooleanType()), True)
    ])

df3 = spark.createDataFrame(structData, schema=arrayStructureSchema)
df3.printSchema()
df3.show(vertical=True, truncate=60)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: boolean (valueContainsNull = true)

-RECORD 0----------------------------------------------------
 name       | {Bob, harley, Martin}                          
 hobbies    | [Cricket, Volley]                              
 properties | {isVolleyPlayer -> true, isCricketer -> false} 



# Creating StructType object struct from JSON file

 You can get the schema by using `df3.schema.json()`
 

In [26]:
schema_json = df3.schema.json()
df3.schema.simpleString()

'struct<name:struct<firstname:string,middlename:string,lastname:string>,hobbies:array<string>,properties:map<string,boolean>>'

In [30]:
# Now let’s load the json file and use it to create a DataFrame.
import json
schemaFromJson = StructType.fromJson(json.loads(schema_json))
df3 = spark.createDataFrame(
        spark.sparkContext.parallelize(structData),schemaFromJson)
df3.printSchema()
df3.show(truncate=60, vertical=True)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: boolean (valueContainsNull = true)

-RECORD 0----------------------------------------------------
 name       | {Bob, harley, Martin}                          
 hobbies    | [Cricket, Volley]                              
 properties | {isVolleyPlayer -> true, isCricketer -> false} 



Checking if a Column Exists in a DataFrame

In [54]:
print("name" in df3.schema.fieldNames())

True


# PySpark Column Class

1. One of the simplest ways to create a Column class object is by using PySpark `lit()` SQL function, this takes a literal value and returns a Column object

In [56]:
from pyspark.sql.functions import lit

colObj = lit("Hello World")
type(colObj)


pyspark.sql.column.Column

In [66]:
data = [("Jeams", 40), ("Anna", 36)]
df = spark.createDataFrame(data).toDF("name.fname", "age")
df.printSchema()

# Using DataFrame object (df)
df.select(df.age).show()
df.select(df["age"]).show()

#Accessing column name with dot (with backticks)
df.select(df["`name.fname`"]).show()

root
 |-- name.fname: string (nullable = true)
 |-- age: long (nullable = true)

+---+
|age|
+---+
| 40|
| 36|
+---+

+---+
|age|
+---+
| 40|
| 36|
+---+

+----------+
|name.fname|
+----------+
|     Jeams|
|      Anna|
+----------+



In [68]:
# Using SQL col() function
from pyspark.sql.functions import col
df.select(col('age')).show()
df.select(col("`name.fname`")).show()

+---+
|age|
+---+
| 40|
| 36|
+---+

+----------+
|name.fname|
+----------+
|     Jeams|
|      Anna|
+----------+



# Create DataFrame with struct using Row class

In [69]:
from pyspark.sql import Row

data = [
    Row(name="Jeams", prop=Row(hair="Black", eye="Blue")),
    Row(name="Ann", prop=Row(hair="Grey", eye="Black"))
]

df = spark.createDataFrame(data)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)



In [70]:
# Access struct column
df.select(df.name).show()
df.select(df.prop.eye).show()

+-----+
| name|
+-----+
|Jeams|
|  Ann|
+-----+

+--------+
|prop.eye|
+--------+
|    Blue|
|   Black|
+--------+



In [72]:
df.select(col("prop.hair")).show()
df.select(col("prop.*")).show()

+-----+
| hair|
+-----+
|Black|
| Grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|Black| Blue|
| Grey|Black|
+-----+-----+



: 

PySpark column also provides a way to do arithmetic operations on columns using operators.

In [8]:
data = [
    (200, 1, 2),
    (300, 40, 20),
    (500, 91, 42)
]

df = spark.createDataFrame(data).toDF("col1", "col2", "col3")
df.printSchema()

# Arithematic operations
df.select(df.col1 + df.col2).show()
df.select(df.col1 - df.col2).show()
df.select(df.col1 * df.col2).show()
df.select(df.col1 / df.col2).show()
df.select(df.col1 % df.col2).show()

df.select(df.col2 > df.col3).show()
df.select(df.col2 < df.col3).show()
df.select(df.col2 == df.col3).show()


root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)
 |-- col3: long (nullable = true)

+-------------+
|(col1 + col2)|
+-------------+
|          201|
|          340|
|          591|
+-------------+

+-------------+
|(col1 - col2)|
+-------------+
|          199|
|          260|
|          409|
+-------------+

+-------------+
|(col1 * col2)|
+-------------+
|          200|
|        12000|
|        45500|
+-------------+

+------------------+
|     (col1 / col2)|
+------------------+
|             200.0|
|               7.5|
|5.4945054945054945|
+------------------+

+-------------+
|(col1 % col2)|
+-------------+
|            0|
|           20|
|           45|
+-------------+

+-------------+
|(col2 > col3)|
+-------------+
|        false|
|         true|
|         true|
+-------------+

+-------------+
|(col2 < col3)|
+-------------+
|         true|
|        false|
|        false|
+-------------+

+-------------+
|(col2 = col3)|
+-------------+
|        false|
| 

# PySpark Column Functions

In [9]:
data = [
    ("James","Bond","100",None),
    ("Ann","Varsa","200",'F'),
    ("Tom Cruise","XXX","400",''),
    ("Tom Brand",None,"400",'M')
] 

columns = ["fname","lname","id","gender"]
columnFuncDF = spark.createDataFrame(data,schema=columns)
columnFuncDF.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)



`alias()` – Set’s name to Column

In [12]:
from pyspark.sql.functions import expr

# using alias()
columnFuncDF.select(
    columnFuncDF.fname.alias("first_name"),
    columnFuncDF.lname.alias("last_name")
).show()

# Using expr()
columnFuncDF.select(expr(" fname ||','|| lname").alias("full_name")).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     James|     Bond|
|       Ann|    Varsa|
|Tom Cruise|      XXX|
| Tom Brand|     NULL|
+----------+---------+

+--------------+
|     full_name|
+--------------+
|    James,Bond|
|     Ann,Varsa|
|Tom Cruise,XXX|
|          NULL|
+--------------+



`asc()` & `desc()` – Sort the DataFrame columns by Ascending or Descending order.

In [16]:
# asc, desc to sort ascending and descending order repsectively.
print("Sort fname asc:")
columnFuncDF.sort(columnFuncDF.fname.asc()).show()

print("Sort fname desc:")
columnFuncDF.sort(columnFuncDF.fname.desc()).show()

Sort fname asc:
+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  NULL|
| Tom Brand| NULL|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

Sort fname desc:
+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



`cast()` & `astype()` – Used to convert the data Type

In [22]:
from pyspark.sql.types import IntegerType
columnFuncDF.select(columnFuncDF.fname, columnFuncDF.id.cast(IntegerType())).printSchema()
columnFuncDF.select(columnFuncDF.fname, columnFuncDF.id.cast('int')).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



`between()` – Returns a Boolean expression when a column values in between lower and upper bound

In [26]:
columnFuncDF.filter(columnFuncDF.id.between(50, 150)).show()
columnFuncDF.select(columnFuncDF.id.between(50, 150)).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
+-----+-----+---+------+

+----------------------------+
|((id >= 50) AND (id <= 150))|
+----------------------------+
|                        true|
|                       false|
|                       false|
|                       false|
+----------------------------+



`contains()` – Check if a PySpark DataFrame column value contains a string value specified in this function.

In [27]:
columnFuncDF.filter(columnFuncDF.fname.contains("Tom")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+



`startswith()` & `endswith()` –
Checks if the value of the DataFrame Column startsWith() and endsWith() a String. startsWith() filters rows where a specified substring exists at the beginning while endsWith() filter rows where the specified substring presents at the end.

In [28]:
columnFuncDF.filter(columnFuncDF.fname.startswith("T")).show()
columnFuncDF.filter(columnFuncDF.fname.endswith("and")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| NULL|400|     M|
+---------+-----+---+------+



`isNull` & `isNotNull()` – Checks if the DataFrame column has NULL or non NULL values.

In [30]:
columnFuncDF.filter(columnFuncDF.gender.isNull()).show()
columnFuncDF.filter(columnFuncDF.gender.isNotNull()).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
+-----+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+



`like()` & `rlike()` – Similar to SQL LIKE expression


In [35]:
columnFuncDF.select(columnFuncDF.fname, columnFuncDF.lname).filter(columnFuncDF.fname.like("%nd")).show()

+---------+-----+
|    fname|lname|
+---------+-----+
|Tom Brand| NULL|
+---------+-----+



`substr()` – Returns a Column after getting sub string from the Column

In [40]:
columnFuncDF.select(columnFuncDF.fname.substr(1, 3).alias("substr_name"))\
            .show()

+-----------+
|substr_name|
+-----------+
|        Jam|
|        Ann|
|        Tom|
|        Tom|
+-----------+



`when()` & `otherwise()` – It is similar to SQL Case When, executes sequence of expressions until it matches the condition and returns a value when match

In [46]:
from pyspark.sql.functions import when

columnFuncDF.select(
    columnFuncDF.fname, 
    columnFuncDF.lname,
    when(columnFuncDF.gender == 'M', 'Male') \
    .when(columnFuncDF.gender == 'F', 'Female') \
    .when(columnFuncDF.gender == None, '')
    .otherwise(columnFuncDF.gender) \
    .alias("derived_gender"),
    columnFuncDF.id
).show()

+----------+-----+--------------+---+
|     fname|lname|derived_gender| id|
+----------+-----+--------------+---+
|     James| Bond|          NULL|100|
|       Ann|Varsa|        Female|200|
|Tom Cruise|  XXX|              |400|
| Tom Brand| NULL|          Male|400|
+----------+-----+--------------+---+



`isin()` – Check if value presents in a List

In [48]:
columnFuncDF.select(columnFuncDF.id, columnFuncDF.fname, columnFuncDF.lname, columnFuncDF.gender) \
            .filter(columnFuncDF.id.isin([100, 200])) \
            .show()

+---+-----+-----+------+
| id|fname|lname|gender|
+---+-----+-----+------+
|100|James| Bond|  NULL|
|200|  Ann|Varsa|     F|
+---+-----+-----+------+



`getField()` – To get the value by key from MapType column and by stuct child name from StructType column

In [49]:
#Create DataFrame with struct, array & map
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,MapType

data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
        StructField('name', StructType([
            StructField('fname', StringType(), True),
            StructField('lname', StringType(), True)])),
        StructField('languages', ArrayType(StringType()),True),
        StructField('properties', MapType(StringType(),StringType()),True)
     ])

df=spark.createDataFrame(data,schema)
df.printSchema()


root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [52]:
# getField from MapType
df.select(df.properties.getField("hair")).show()

# getField from Struct
df.select(df.name.getField("fname")).show()

# from ArrayType
df.select(df.languages).show()
df.select(df.languages[0]).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+

+---------------+
|      languages|
+---------------+
|     [Java, C#]|
| [.NET, Python]|
|[Python, Scala]|
|   [Perl, Ruby]|
+---------------+

+------------+
|languages[0]|
+------------+
|        Java|
|        .NET|
|      Python|
|        Perl|
+------------+



`getItem()` – To get the value by index from MapType or ArrayTupe & ny key for MapType column.

In [53]:
#getItem() used with ArrayType
df.select(df.languages.getItem(1)).show()

#getItem() used with MapType
df.select(df.properties.getItem("hair")).show()

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+



# PySpark Select Columns From DataFrame

In PySpark, `select()` function is used to select single, multiple, column by index, all columns from the list and the nested columns from a DataFrame, PySpark `select()` is a transformation function hence it returns a new DataFrame with the selected columns.

In [4]:
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL")
]

columns = ["firstname", "lastname", "country", "state"]

df = spark.createDataFrame(data = data, schema = columns)
df.show(truncate=False)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+



Select Single & Multiple Columns From PySpark

In [7]:
df.select("firstname", "lastname", "country", "state").show()
df.select(df.firstname, df.lastname).show()
df.select(df['firstname'], df['lastname']).show()

#By using col() function
from pyspark.sql.functions import col
df.select(col("firstname"),col("lastname")).show()

#Select columns by regular expression
df.select(df.colRegex("`^.*name*`")).show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



Select All Columns From List

In [9]:
# Select All columns from List
df.select(*columns).show()

# Select All columns
df.select([col for col in df.columns]).show()
df.select("*").show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [10]:
#Selects first 3 columns and top 3 rows
df.select(df.columns[:3]).show(3)

#Selects columns 2 to 4  and top 3 rows
df.select(df.columns[2:4]).show(3)

+---------+--------+-------+
|firstname|lastname|country|
+---------+--------+-------+
|    James|   Smith|    USA|
|  Michael|    Rose|    USA|
|   Robert|Williams|    USA|
+---------+--------+-------+
only showing top 3 rows

+-------+-----+
|country|state|
+-------+-----+
|    USA|   CA|
|    USA|   NY|
|    USA|   CA|
+-------+-----+
only showing top 3 rows



Select Nested Struct Columns from PySpark

In [11]:
data = [
        (("James",None,"Smith"),"OH","M"),
        (("Anna","Rose",""),"NY","F"),
        (("Julia","","Williams"),"OH","F"),
        (("Maria","Anne","Jones"),"NY","M"),
        (("Jen","Mary","Brown"),"NY","M"),
        (("Mike","Mary","Williams"),"OH","M")
        ]

from pyspark.sql.types import StructType,StructField, StringType    

schema = StructType([
    StructField('name', StructType([
         StructField('firstname', StringType(), True),
         StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
         ])),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
     ])

df2 = spark.createDataFrame(data = data, schema = schema)

df2.printSchema()
df2.show(truncate=False) # shows all columns

df2.select("name").show(truncate=False)
df2.select("name.firstname","name.lastname").show(truncate=False)
df2.select("name.*").show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+-----+------+
|name                  |state|gender|
+----------------------+-----+------+
|{James, NULL, Smith}  |OH   |M     |
|{Anna, Rose, }        |NY   |F     |
|{Julia, , Williams}   |OH   |F     |
|{Maria, Anne, Jones}  |NY   |M     |
|{Jen, Mary, Brown}    |NY   |M     |
|{Mike, Mary, Williams}|OH   |M     |
+----------------------+-----+------+

+----------------------+
|name                  |
+----------------------+
|{James, NULL, Smith}  |
|{Anna, Rose, }        |
|{Julia, , Williams}   |
|{Maria, Anne, Jones}  |
|{Jen, Mary, Brown}    |
|{Mike, Mary, Williams}|
+----------------------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|James    |Smith   |
|Anna     |        |
|Julia  

# PySpark `Collect()` – Retrieve data from DataFrame

PySpark RDD/DataFrame `collect()` is an action operation that is used to retrieve all the elements of the dataset (from all nodes) to the driver node. We should use the `collect()` on smaller dataset usually after `filter()`, `group()` e.t.c. Retrieving larger datasets results in OutOfMemory error.

In [12]:
dept = [
    ("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
]

deptColumns = ["dept_name", "dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.show(truncate=False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [13]:
# deptDF.collect() retrieves all elements in a DataFrame as an Array of Row type to the driver node
dataCollect = deptDF.collect()
print(dataCollect)

[Row(dept_name='Finance', dept_id=10), Row(dept_name='Marketing', dept_id=20), Row(dept_name='Sales', dept_id=30), Row(dept_name='IT', dept_id=40)]


Note that `collect()` is an action hence it does not return a DataFrame instead, it returns data in an Array to the driver. Once the data is in an array, we can use python for loop to process it further

In [15]:
for row in dataCollect:
    print(row['dept_name'] + "," +str(row['dept_id']))


#Returns value of First Row, First Column which is "Finance"
deptDF.collect()[0][0]

Finance,10
Marketing,20
Sales,30
IT,40


'Finance'

Note: In case we want to just return certain elements of a DataFrame, we should call PySpark `select()` transformation first.


### When to avoid Collect():
 Usually, `collect()` is used to retrieve the action output when you have very small result set and calling `collect()` on an `RDD/DataFrame` with a bigger result set causes `out of memory` as it returns the entire dataset (from all workers) to the driver hence we should avoid calling `collect()` on a larger dataset.

### collect () vs select ():
`select()` is a transformation that returns a new DataFrame and holds the columns that are selected whereas `collect()` is an action that returns the entire data set in an Array to the driver.

# PySpark `withColumn()`

PySpark `withColumn()` is a transformation function of DataFrame which is used to change the value, convert the datatype of an existing column, create a new column, and many more

In [6]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df = spark.createDataFrame(data, schema=columns)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



### Change DataType using PySpark withColumn()
By using PySpark `withColumn()` on a DataFrame, we can cast or change the data type of a column. In order to change data type, you would also need to use `cast()` function along with `withColumn()`

In [10]:
from pyspark.sql.functions import col, lit

# Change DataType using PySpark withColumn()
print("Change DataType using PySpark withColumn(): ")
df2 = df.withColumn("salary",col("salary").cast("Integer"))
df2.printSchema()
df2.show(truncate=False)

# Update The Value of an Existing Column
print("Update The Value of an Existing Column: ")
df3 = df.withColumn("salary",col("salary")*100)
df3.printSchema()
df3.show(truncate=False) 

# Create a Column from an Existing
print("Create a Column from an Existing: ")
df4 = df.withColumn("CopiedColumn",col("salary")* -1)
df4.printSchema()

# Add a New Column using withColumn()
print("Add a New Column using withColumn(): ")
df5 = df.withColumn("Country", lit("USA"))
df5.printSchema()
df5.show(truncate=False)

df6 = df.withColumn("Country", lit("USA")) \
   .withColumn("anotherColumn",lit("anotherValue"))
df6.printSchema()


# Rename Column Name
print("Rename Column Name: ")
df.withColumnRenamed("gender","sex") \
  .show(truncate=False) 
  
# Drop Column From PySpark DataFrame
print("Drop Column From PySpark DataFrame: ")
df4.drop("CopiedColumn") \
.show(truncate=False) 

Change DataType using PySpark withColumn(): 
root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+

Update The Value of an Existing Column: 
root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = tr

# PySpark `withColumnRenamed()`

PySpark `withColumnRenamed()` to rename a DataFrame column, we often need to rename one column or multiple (or all) columns on PySpark DataFrame

In [11]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]

schema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('dob', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()

# Syntax: withColumnRenamed(existingName, newNam)

# To rename DataFrame column name
print("To rename DataFrame column name: ")
df.withColumnRenamed("dob","DateOfBirth").printSchema()

# To rename multiple columns
print("To rename multiple columns:")
df2 = df.withColumnRenamed("dob","DateOfBirth") \
    .withColumnRenamed("salary","salary_amount")
df2.printSchema()

# To rename a nested column in Dataframe
print("To rename a nested column in Dataframe:")
schema2 = StructType([
    StructField("fname",StringType()),
    StructField("middlename",StringType()),
    StructField("lname",StringType())])
    
df.select(col("name").cast(schema2),
  col("dob"),
  col("gender"),
  col("salary")) \
    .printSchema()    

# To rename nested elements
print("To rename nested elements: ")
df.select(col("name.firstname").alias("fname"),
  col("name.middlename").alias("mname"),
  col("name.lastname").alias("lname"),
  col("dob"),col("gender"),col("salary")) \
  .printSchema()
  
# To rename nested columns
print("To rename nested columns: ")
df4 = df.withColumn("fname",col("name.firstname")) \
      .withColumn("mname",col("name.middlename")) \
      .withColumn("lname",col("name.lastname")) \
      .drop("name")
df4.printSchema()

# To change all columns in a PySpark DataFrame
print("To change all columns in a PySpark DataFrame: ")
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
df.toDF(*newColumns).printSchema()



root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

To rename DataFrame column name: 
root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

To rename multiple columns:
root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)

To rename a nested column in Dataframe:

# PySpark `where()` & `filter()` Function

PySpark `filter()` function is used to filter the rows from RDD/DataFrame based on the given condition or SQL expression, you can also use `where()` clause instead of the `filter()` if you are coming from an SQL background, both these functions operate exactly the same. `filter()` function returns a new DataFrame or RDD with only the rows that meet the condition specified.

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col,array_contains

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

arrayStructureData = [
        (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
        (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
        (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
        (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
        (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
        (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
        ]
        
arrayStructureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('languages', ArrayType(StringType()), True),
         StructField('state', StringType(), True),
         StructField('gender', StringType(), True)
         ])


df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema)
df.printSchema()
df.show(truncate=False)

df.filter(df.state == "OH") \
    .show(truncate=False)

df.filter(col("state") == "OH") \
    .show(truncate=False)    
    
df.filter("gender  == 'M'") \
    .show(truncate=False)    

df.filter( (df.state  == "OH") & (df.gender  == "M") ) \
    .show(truncate=False)        

df.filter(array_contains(df.languages,"Java")) \
    .show(truncate=False)        

df.filter(df.name.lastname == "Williams") \
    .show(truncate=False) 

df.filter( (df.state == "OH") & (df.gender == "M") ).show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+----------------------+------------------+-----+------+
|name                  |langu

# PySpark `distinct()` and `dropDuplicates()`

PySpark `distinct()` transformation is used to drop/remove the duplicate rows (all columns) from DataFrame and `dropDuplicates()` is used to drop rows based on selected (one or multiple) columns. `distinct()` and `dropDuplicates()` returns a new DataFrame.

PySpark doesn’t have a distinct method that takes columns that should run distinct (drop duplicate rows on selected multiple columns) however, `dropDuplicates()` transformation which takes multiple columns to eliminate duplicates

In [14]:
from pyspark.sql.functions import expr
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

data = [("James", "Sales", 3000), \
    ("Michael", "Sales", 4600), \
    ("Robert", "Sales", 4100), \
    ("Maria", "Finance", 3000), \
    ("James", "Sales", 3000), \
    ("Scott", "Finance", 3300), \
    ("Jen", "Finance", 3900), \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000), \
    ("Saif", "Sales", 4100) \
  ]
columns= ["employee_name", "department", "salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)

# Distinct
distinctDF = df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)

# Drop duplicates
df2 = df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
df2.show(truncate=False)

# Drop duplicates on selected columns
dropDisDF = df.dropDuplicates(["department","salary"])
print("Distinct count of department salary : "+str(dropDisDF.count()))
dropDisDF.show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+

Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Kumar        |Marketing |2000  |
|Jeff         |Marketing |3000  |
|S

# PySpark `orderBy()` and `sort()`

You can use either `sort()` or `orderBy()` function of PySpark DataFrame to sort DataFrame by ascending or descending order based on single or multiple columns. Both methods take one or more columns as arguments and return a new DataFrame after sorting. You can also do sorting using PySpark SQL sorting functions.

In [3]:
import pyspark
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.appName('PySparkLearning').getOrCreate()

In [4]:
simpleData = [
    ("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]

df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



### DataFrame sorting using the `sort()` function:

PySpark DataFrame class provides `sort()` function to sort on one or more columns. `sort()` takes a Boolean argument for ascending or descending order. To specify different sorting orders for different columns, you can use the parameter as a list. 

Syntax:

DataFrame.sort(*cols, **kwargs)

Parameters: cols: str, list, or Column, 

Other Parameters: ascending: bool or list, optional

e.g. df.sort("column1", "column2", ascending=[True, False]) 


### DataFrame sorting using `orderBy()` function

DataFrame.orderBy(*cols, **kwargs)

Other Parameters: ascending: bool or list, optional

In [13]:
from pyspark.sql.functions import col

df.sort("department","state").show(truncate=False)
# df.sort(col("department"),col("state")).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [12]:
# Sorting DataFrame using orderBy()

df.orderBy("department","state").show(truncate=False)
# df.orderBy(col("department"),col("state")).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [11]:
# Sort DataFrame with asc

df.sort(df.department.asc(),df.state.asc()).show(truncate=False)
# df.sort(col("department").asc(),col("state").asc()).show(truncate=False)
# df.orderBy(col("department").asc(),col("state").asc()).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [10]:
# Sort DataFrame with desc

df.sort(df.department.asc(),df.state.desc()).show(truncate=False)
# df.sort(col("department").asc(),col("state").desc()).show(truncate=False)
# df.orderBy(col("department").asc(),col("state").desc()).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+



In [14]:
# Sort using spark SQL

df.createOrReplaceTempView("EMP")
spark.sql("select employee_name,department,state,salary,age,bonus from EMP ORDER BY department asc").show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



# PySpark `groupBy()`


Similar to SQL GROUP BY clause, PySpark groupBy() function is used to collect the identical data into groups on DataFrame and perform count, sum, avg, min, max functions on the grouped data.

Syntax:

DataFrame.groupBy(*cols)
#or 
DataFrame.groupby(*cols)

When we perform `groupBy()` on PySpark Dataframe, it returns GroupedData object which contains below aggregate functions.

`count()`:  Return the number of rows for each group. 

`mean()`:	Returns the mean of values for each group.

`max()`:`	Returns the maximum of values for each group.

`min()`:	Returns the minimum of values for each group.

`sum()`:	Returns the total for values for each group.

`avg()`:	Returns the average for values for each group.

`agg()`:	Using groupBy() agg() function, we can calculate more than one aggregate at a time.

`pivot()`:	This function is used to Pivot the DataFrame

In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum,avg,max

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)


print("Calculate the total number of each department using count():")
df.groupBy("department").count().show()

print("Calculate the minimum salary of each department using min():")
df.groupBy("department").min("salary").show()

print("Calculate the maximum salary of each department using max():")
df.groupBy("department").max("salary").show()

print("Calculate the average salary of each department using avg():")
df.groupBy("department").avg( "salary").show()

print("Calculate the mean salary of each department using mean():")
df.groupBy("department").mean( "salary").show()


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

Calculate the total number of each department using count():
+----------+-----+
|department|count|
+----------+----

In [17]:
# df.groupBy("department").sum("salary").show(truncate=False)
# df.groupBy("department").count().show(truncate=False)


print("GroupBy with multiple columns: ")
df.groupBy("department","state") \
    .sum("salary","bonus") \
   .show(truncate=False)

df.groupBy("department") \
    .agg(sum("salary").alias("sum_salary"), \
         avg("salary").alias("avg_salary"), \
         sum("bonus").alias("sum_bonus"), \
         max("bonus").alias("max_bonus") \
     ) \
    .show(truncate=False)
    
df.groupBy("department") \
    .agg(sum("salary").alias("sum_salary"), \
      avg("salary").alias("avg_salary"), \
      sum("bonus").alias("sum_bonus"), \
      max("bonus").alias("max_bonus")) \
    .where(col("sum_bonus") >= 50000) \
    .show(truncate=False)

GroupBy with multiple columns: 
+----------+-----+-----------+----------+
|department|state|sum(salary)|sum(bonus)|
+----------+-----+-----------+----------+
|Sales     |NY   |176000     |30000     |
|Sales     |CA   |81000      |23000     |
|Finance   |CA   |189000     |47000     |
|Finance   |NY   |162000     |34000     |
|Marketing |NY   |91000      |21000     |
|Marketing |CA   |80000      |18000     |
+----------+-----+-----------+----------+

+----------+----------+-----------------+---------+---------+
|department|sum_salary|avg_salary       |sum_bonus|max_bonus|
+----------+----------+-----------------+---------+---------+
|Sales     |257000    |85666.66666666667|53000    |23000    |
|Finance   |351000    |87750.0          |81000    |24000    |
|Marketing |171000    |85500.0          |39000    |21000    |
+----------+----------+-----------------+---------+---------+

+----------+----------+-----------------+---------+---------+
|department|sum_salary|avg_salary       |sum_bonus