# Filter Operation

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csnnnn").getOrCreate()

df = spark.read.csv("./data/testdata.csv", inferSchema=True, header=True, sep=",").select("MSISDN", "EMAIL")

df.printSchema()
df.show(10)
df.count()

root
 |-- MSISDN: integer (nullable = true)
 |-- EMAIL: string (nullable = true)

+--------+--------------------+
|  MSISDN|               EMAIL|
+--------+--------------------+
|54924133|JEANPOMPIER0517@G...|
|54846497|                NULL|
|57369115|                NULL|
|57113437|                NULL|
|58468805|                NULL|
|57228141|                NULL|
|57119226|                NULL|
|57118074|                NULL|
|54881304|                NULL|
|57320678|                NULL|
+--------+--------------------+
only showing top 10 rows



99999

In [3]:
# Method 1:
df.filter("MSISDN > 57320678 and EMAIL like 'S%'").show()

+--------+--------------------+
|  MSISDN|               EMAIL|
+--------+--------------------+
|59376589|SYLVESTREJENNIFER...|
|59793875|S_hosenally@hotma...|
|58554292|SHIVANEE07@YAHOO.COM|
|59373621|SABRINACAS@GMAIL.COM|
|58564580|SEEDEEALSOUDESH@G...|
|59776903|SKANDA-GUHA@YAHOO...|
|58578443|SIMLABOODHOO@HOTM...|
|58558411|SHYAM.ABACOUSNAC@...|
|58533718|  SARAH.293@LIVE.COM|
|59359909|SURVANASUMPUTH@YA...|
|59832131|SHOBNADILMAHOMED@...|
|57487043|SAIFALIKHOD777@GM...|
|57443148|S.HUSSENBOCUS@NKM...|
|57339476|STEEVEBEGUE@GMAIL...|
|58430756| STEEVENPK@GMAIL.COM|
|57381291|SULLEIMAN.VAWDA@G...|
|59836789|SHAHANAJHUMMUN@GM...|
|57473693|    SSCOTT@INICIA.MU|
|57470860|STEPHANEJIO@GMAIL...|
|58528826| STELLIO.J@GMAIL.COM|
+--------+--------------------+
only showing top 20 rows



In [4]:
# Method 2: Using multiple conditions (Use & (and), | (or), ~ (not), and wrap each condition in parentheses)

df.filter(
    (df['MSISDN'] >= 70430321) &
    (df['EMAIL'] == 'NYRL44185')
).show()


# OR


from pyspark.sql.functions import col

df.filter(
    col('MSISDN').isNotNull() &
    col('NID').isNotNull()
).show()

+------+-----+
|MSISDN|EMAIL|
+------+-----+
+------+-----+

+--------+--------------------+
|  MSISDN|               EMAIL|
+--------+--------------------+
|54924133|JEANPOMPIER0517@G...|
|54846497|                NULL|
|57369115|                NULL|
|57113437|                NULL|
|58468805|                NULL|
|57228141|                NULL|
|57119226|                NULL|
|57118074|                NULL|
|54881304|                NULL|
|57320678|                NULL|
|70419962|                NULL|
|54882497|                NULL|
|57432871|                NULL|
|54599755|                NULL|
|54521567|                NULL|
|54537767|                NULL|
|54591599|                NULL|
|70430321|                NULL|
|54236101|                NULL|
|59855287|                NULL|
+--------+--------------------+
only showing top 20 rows



In [70]:
# There are methods in col(). when you can use:
df.filter(col("MSISDN").between(57119226, 57432871)).show()

# OR 

df.filter("MSISDN between 57119226 and 57432871").show()

+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57369115| NULL|      S0987933|
|57228141| NULL|     A03228083|
|57119226| NULL|P0502893808515|
|57320678| NULL|L100783300094F|
|57432871| NULL|P0407853002537|
+--------+-----+--------------+

+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57369115| NULL|      S0987933|
|57228141| NULL|     A03228083|
|57119226| NULL|P0502893808515|
|57320678| NULL|L100783300094F|
|57432871| NULL|P0407853002537|
+--------+-----+--------------+



In [82]:
my_list = [57228141, 57228141, 57320678]
df.filter(col("MSISDN").isin(my_list)).show() 

# OR

df.filter("MSISDN in (57228141, 57228141, 57320678)").show() # SQL-LIKE


df.filter(~(col("MSISDN").isin(my_list))).show()
df.filter("MSISDN not in (57228141, 57228141, 57320678)").show() # SQL-LIKE



+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57228141| NULL|     A03228083|
|57320678| NULL|L100783300094F|
+--------+-----+--------------+

+--------+-----+--------------+
|  MSISDN|EMAIL|           NID|
+--------+-----+--------------+
|57228141| NULL|     A03228083|
|57320678| NULL|L100783300094F|
+--------+-----+--------------+

+--------+--------------------+--------------+
|  MSISDN|               EMAIL|           NID|
+--------+--------------------+--------------+
|54924133|JEANPOMPIER0517@G...|J0711874907942|
|54846497|                NULL|B160985220041C|
|57369115|                NULL|      S0987933|
|57113437|                NULL|D090290300772B|
|58468805|                NULL|N110561330075F|
|57119226|                NULL|P0502893808515|
|57118074|                NULL|M1503654401680|
|54881304|                NULL|J080295290173A|
|70419962|                NULL|     PAG898953|
|54882497|                NULL|J0204981200574|
|57