# Filter Operation

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("csnnnn")
    .getOrCreate()
)

# Read the tab-separated file (first row is the header, column types inferred)
# and keep only the three columns used by the filter examples.
df = (
    spark.read
    .csv("data/testv.csv", inferSchema=True, header=True, sep="\t")
    .select("MSISDN", "EMAIL", "NID")
)

# Inspect the result: inferred schema, first 10 rows, and total row count.
df.printSchema()
df.show(10)
df.count()

root
 |-- MSISDN: integer (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- NID: string (nullable = true)

+--------+--------------------+--------------+
|  MSISDN|               EMAIL|           NID|
+--------+--------------------+--------------+
|54924133|JEANPOMPIER0517@G...|J0711874907942|
|54846497|                NULL|B160985220041C|
|57369115|                NULL|      S0987933|
|57113437|                NULL|D090290300772B|
|58468805|                NULL|N110561330075F|
|57228141|                NULL|     A03228083|
|57119226|                NULL|P0502893808515|
|57118074|                NULL|M1503654401680|
|54881304|                NULL|J080295290173A|
|57320678|                NULL|L100783300094F|
+--------+--------------------+--------------+
only showing top 10 rows



18

In [60]:
# Method 1: hand filter() the whole predicate as one SQL expression string.
sql_predicate = "MSISDN > 57320678 and NID like 'S%'"
df.filter(sql_predicate).show()

+--------+-----+--------+
|  MSISDN|EMAIL|     NID|
+--------+-----+--------+
|57369115| NULL|S0987933|
+--------+-----+--------+



In [62]:
# Method 2: build the predicate from Column expressions.
# Combine conditions with & (and), | (or), ~ (not), and wrap each condition in
# parentheses: in Python these operators bind more tightly than comparisons
# such as >= and ==, so an unparenthesised comparison would be grouped wrongly.

df.filter(
    (df['MSISDN'] >= 70430321) &
    (df['NID'] == 'NYRL44185')
).show()


# Alternative way to reference columns: col('name') instead of df['name'].
# Note this second example uses a different predicate (both columns non-null),
# so its output is not the same as the filter above.
# `col` comes from pyspark.sql.functions, imported with the notebook's other imports.

df.filter(
    (col('MSISDN').isNotNull()) &
    (col('NID').isNotNull())
).show()

+--------+-----+---------+
|  MSISDN|EMAIL|      NID|
+--------+-----+---------+
|70430321| NULL|NYRL44185|
+--------+-----+---------+

+--------+--------------------+--------------+
|  MSISDN|               EMAIL|           NID|
+--------+--------------------+--------------+
|54924133|JEANPOMPIER0517@G...|J0711874907942|
|54846497|                NULL|B160985220041C|
|57369115|                NULL|      S0987933|
|57113437|                NULL|D090290300772B|
|58468805|                NULL|N110561330075F|
|57228141|                NULL|     A03228083|
|57119226|                NULL|P0502893808515|
|57118074|                NULL|M1503654401680|
|54881304|                NULL|J080295290173A|
|57320678|                NULL|L100783300094F|
|70419962|                NULL|     PAG898953|
|54882497|                NULL|J0204981200574|
|57432871|                NULL|P0407853002537|
|54599755|                NULL|G0602930101634|
|54521567|                NULL|L141089380213G|
|54537767|        

In [70]:
# Column objects returned by col() carry helper predicates; between() keeps rows
# whose value lies in the given range (both bounds appear in the output, so the
# range is inclusive).
lower_msisdn, upper_msisdn = 57119226, 57432871
df.filter(col("MSISDN").between(lower_msisdn, upper_msisdn)).show()

# The same range filter written as a SQL expression string.
df.filter("MSISDN between 57119226 and 57432871").show()

+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57369115| NULL|      S0987933|
|57228141| NULL|     A03228083|
|57119226| NULL|P0502893808515|
|57320678| NULL|L100783300094F|
|57432871| NULL|P0407853002537|
+--------+-----+--------------+

+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57369115| NULL|      S0987933|
|57228141| NULL|     A03228083|
|57119226| NULL|P0502893808515|
|57320678| NULL|L100783300094F|
|57432871| NULL|P0407853002537|
+--------+-----+--------------+



In [82]:
# Membership filter with Column.isin().
# 57228141 is listed twice; the repeat is redundant and does not change the result.
my_list = [57228141, 57228141, 57320678]
msisdn_is_listed = col("MSISDN").isin(my_list)
df.filter(msisdn_is_listed).show()

# The same membership filter as a SQL expression string.
df.filter("MSISDN in (57228141, 57228141, 57320678)").show()  # SQL-LIKE

# Negated membership: ~ on the Column expression, NOT IN in the SQL string.
# NOTE(review): rows with a NULL MSISDN are presumably dropped by both forms
# (SQL three-valued logic) — confirm if NULLs can occur in this column.
df.filter(~msisdn_is_listed).show()
df.filter("MSISDN not in (57228141, 57228141, 57320678)").show()  # SQL-LIKE



+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57228141| NULL|     A03228083|
|57320678| NULL|L100783300094F|
+--------+-----+--------------+

+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57228141| NULL|     A03228083|
|57320678| NULL|L100783300094F|
+--------+-----+--------------+

+--------+--------------------+--------------+
|  MSISDN|               EMAIL|           NID|
+--------+--------------------+--------------+
|54924133|JEANPOMPIER0517@G...|J0711874907942|
|54846497|                NULL|B160985220041C|
|57369115|                NULL|      S0987933|
|57113437|                NULL|D090290300772B|
|58468805|                NULL|N110561330075F|
|57119226|                NULL|P0502893808515|
|57118074|                NULL|M1503654401680|
|54881304|                NULL|J080295290173A|
|70419962|                NULL|     PAG898953|
|54882497|                NULL|J0204981200574|
|57