### Library Imports

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from datetime import datetime

Create a `SparkSession`. There is no need to create a `SparkContext` separately: it comes with the session and is available as `spark.sparkContext`.

In [3]:
# Build the session for this notebook, or pick up one that already exists
# (getOrCreate). The chain is wrapped in parentheses so no line-continuation
# backslashes are needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exploring Joins")
    # NOTE(review): this looks like the placeholder option from the Spark
    # docs example -- confirm whether it is needed at all.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Handle to the context owned by the session.
sc = spark.sparkContext

In [32]:
# Tiny fixture: one row per interesting value of `ua_form_factor`
# (a non-bot string, the literal "Bot", and a missing value).
sample_columns = ['id', 'ua_form_factor', "user_agent", "ua_details"]
sample_rows = [
    (1, "Not bot", "", ""),
    (2, "Bot", "", ""),
    (3, None, "", ""),
]

df = spark.createDataFrame(sample_rows, sample_columns)

# Last expression of the cell: render the frame as a pandas table.
df.toPandas()

Unnamed: 0,id,ua_form_factor,user_agent,ua_details
0,1,Not bot,,
1,2,Bot,,
2,3,,,


In [40]:
# Approach 1: spell out the NULL case explicitly.
# Keep a row when its form factor is missing OR is anything other than 'Bot'.
# The isNull() branch matters because, under SQL three-valued logic, a
# comparison against NULL yields NULL rather than True.
ua_form_factor = F.col("ua_form_factor")
missing_or_not_bot = ua_form_factor.isNull() | (ua_form_factor != 'Bot')

result = df.where(missing_or_not_bot).drop("user_agent", "ua_details")

# Print the physical plan, then display the surviving rows.
result.explain()
result.toPandas()

== Physical Plan ==
*(1) Project [id#166L, ua_form_factor#167]
+- *(1) Filter (isnull(ua_form_factor#167) || NOT (ua_form_factor#167 = Bot))
   +- Scan ExistingRDD[id#166L,ua_form_factor#167,user_agent#168,ua_details#169]


Unnamed: 0,id,ua_form_factor
0,1,Not bot
1,3,


In [38]:
# Approach 2: replace a missing form factor with the empty string first, so a
# single `!= 'Bot'` comparison covers both the NULL and the non-bot rows.
form_factor_or_blank = F.coalesce(F.col("ua_form_factor"), F.lit(""))

result = (
    df.where(form_factor_or_blank != 'Bot')
    .drop("user_agent", "ua_details")
)

# Print the physical plan, then display the surviving rows.
result.explain()
result.toPandas()

== Physical Plan ==
*(1) Project [id#166L, ua_form_factor#167]
+- *(1) Filter NOT (coalesce(ua_form_factor#167, ) = Bot)
   +- Scan ExistingRDD[id#166L,ua_form_factor#167,user_agent#168,ua_details#169]


Unnamed: 0,id,ua_form_factor
0,1,Not bot
1,3,
