## PySpark Where/Filter Function

PySpark's `filter()` function filters the rows of an RDD/DataFrame based on a given condition or SQL expression. You can also use `where()`, which is an alias for `filter()`.

In [0]:
# Restart the Python process for this notebook. Per the original note, this removes
# Python state, but some libraries might not work without calling this command.
# NOTE(review): `dbutils` is neither imported nor defined in this notebook; presumably
# it is supplied by the Databricks runtime. Confirm before running elsewhere.
dbutils.library.restartPython()

#### Load libraries

In [0]:
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import array_contains, col, expr, lit, when
from pyspark.sql.types import (
    ArrayType,
    DateType,
    DoubleType,
    IntegerType,
    MapType,
    StringType,
    StructField,
    StructType,
)

#### Create Spark session

In [0]:
# Session shared by every cell below; getOrCreate() reuses an active session if one exists.
spark = (
    SparkSession.builder
    .appName('PySpark Where/Filter Function')
    .getOrCreate()
)

In [0]:
# Column layout of the flat sample table: five string columns plus a double
# `salary`, all declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('firstname', StringType()),
        ('middlename', StringType()),
        ('lastname', StringType()),
        ('zip', StringType()),
        ('gender', StringType()),
        ('salary', DoubleType()),
    )
])

# Field names in schema order.
columns = schema.fieldNames()

# One tuple per row, in the same order as the schema fields. Missing values are
# stored as empty strings.
data = [
    ('John', '', 'Smith', '36636', 'M', 2500.0),
    ('Jane', '', 'Doe', '42114', 'F', 500.0),
    ('Richard', 'Laurence', 'Marquette', '97086', 'M', 1500.0),
    ('Israel', '', 'Israeli', '', 'M', 3000.0),
    ('Edward', 'III', '', 'SL4', 'M', 5000.0),
]

# Build the DataFrame, then print its schema and its rows without truncating values.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

In [0]:
# DataFrame with nested data: `name` is a struct of three strings and
# `languages` is an array of strings.
scheman = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('languages', ArrayType(StringType()), True)
    .add('state', StringType(), True)
    .add('gender', StringType(), True)
)

# Each row is (name struct, languages, state, gender).
datan = [
    (('James', '', 'Smith'), ['Java', 'Scala', 'C++'], 'AZ', 'M'),
    (('Anna', 'Rose', ''), ['Spark', 'Java', 'C++'], 'NY', 'F'),
    (('Julia', '', 'Williams'), ['C#', 'JavaScript'], 'AZ', 'F'),
    (('Maria', 'Anne', 'Jones'), ['C#', 'JavaScript'], 'NY', 'M'),
    (('Jen', 'Mary', 'Brown'), ['C#', 'JavaScript'], 'NY', 'M'),
    (('Mike', 'Mary', 'Williams'), ['Python', 'JavaScript'], 'AZ', 'M'),
]

# Build the nested DataFrame, then print its schema and its rows untruncated.
dfn = spark.createDataFrame(datan, scheman)
dfn.printSchema()
dfn.show(truncate=False)

#### DataFrame filter() with Column Condition

In [0]:
# Rows whose salary is exactly 5000, selected with a Column expression.
(
    df
    .filter(df['salary'] == 5000)
    .show(truncate=False)
)

In [0]:
# Rows whose salary is strictly greater than 2000.
(
    df
    .filter(df['salary'] > 2000)
    .show(truncate=False)
)

#### DataFrame filter() with SQL Expression

In [0]:
# Same equality filter, written as a SQL expression string instead of a Column.
(
    df
    .filter("salary == 5000")
    .show()
)

In [0]:
# SQL expression string: rows whose salary is at least 2000.
(
    df
    .filter("salary >= 2000")
    .show()
)

#### Filter with Multiple Conditions

In [0]:
# Combine two Column conditions with `&`; each one is wrapped in its own parentheses.
(
    df
    .filter((df['gender'] == "M") & (df['salary'] >= 3000))
    .show(truncate=False)
)

In [0]:
# '~' is used as 'not': gender is "M" and the salary condition does not hold.
(
    df
    .filter((df['gender'] == "M") & ~(df['salary'] >= 3000))
    .show(truncate=False)
)

In [0]:
# Negate the salary condition by comparing it with False rather than using '~'.
(
    df
    .filter((df['gender'] == "M") & ((df['salary'] >= 3000) == False))
    .show(truncate=False)
)

In [0]:
# Build the combined condition once and keep it in a variable, then pass it to filter().
query = (df['gender'] == "M") & ~(df['salary'] >= 3000)
(
    df
    .filter(query)
    .show(truncate=False)
)

#### Filter Based on List Values

In [0]:
# First names to keep; isin() selects the rows whose value appears in the list.
li = ['John', 'Edward']
(
    df
    .filter(df['firstname'].isin(li))
    .show()
)

#### Filter Based on Starts With, Ends With, Contains

In [0]:
# startswith: rows whose last name begins with 'D'.
(
    df
    .filter(df['lastname'].startswith('D'))
    .show()
)

In [0]:
# endswith: rows whose last name ends with 'e'.
(
    df
    .filter(df['lastname'].endswith('e'))
    .show()
)

In [0]:
# contains: rows whose last name includes the substring 't'.
(
    df
    .filter(df['lastname'].contains('t'))
    .show()
)

#### Filter with `like` and `rlike`

In [0]:
# SQL LIKE pattern on the first name: starts with 'J' and has an 'n' somewhere after it.
(
    df
    .filter(df['firstname'].like("J%n%"))
    .show()
)

In [0]:
# rlike - LIKE with a regular expression.
# `(?i)` makes the match case-insensitive; `rd$` keeps first names that end with 'rd'.
# The previous pattern "(?i)^*rd$" applied `*` to the `^` anchor, which adds nothing
# and is rejected by many regex engines. rlike matches anywhere in the value, so no
# leading anchor or wildcard is needed.
df.filter(df.firstname.rlike("(?i)rd$")).show()

#### Filter on an Array column

In [0]:
# Keep the rows whose `languages` array contains 'Java'.
# `array_contains` is imported with the other pyspark.sql.functions names in the
# import cell at the top of the notebook.
dfn.filter(array_contains(dfn.languages, 'Java')).show(truncate=False)

#### Filtering on Nested Struct columns

In [0]:
# Reach into the `name` struct and keep the rows whose nested `lastname` is 'Williams'.
(
    dfn
    .filter(dfn['name']['lastname'] == 'Williams')
    .show(truncate=False)
)

#### The end of the notebook