In [2]:
from pyspark.sql import SparkSession

# Start the Spark session for this notebook (getOrCreate reuses one if it
# already exists).
spark = (
    SparkSession.builder
    .appName("missingData")
    .getOrCreate()
)

# Load the sample CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv("ContainsNull.csv", header=True, inferSchema=True)

# Show the inferred schema first, then the rows themselves.
df.printSchema()
df.show()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [5]:
# Drop every row that contains at least one null value.
df.na.drop().show()

# `thresh` counts NON-null values: a row is kept only if it has at least
# `thresh` non-null columns. With thresh=2, emp2 (only Id is set) is dropped,
# while emp1 and emp3 (two non-null values each) are kept.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# how="all": drop a row only when every one of its columns is null.
# No row in this data is entirely null, so nothing is removed.
df.dropna(how="all").show()


# how="any": drop a row as soon as any one of its columns is null.
df.dropna(how="any").show()


+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [7]:
# Only inspect the listed columns when looking for nulls: rows whose Sales
# value is null are dropped, nulls in other columns are ignored.
df.dropna(subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [9]:
# Filling missing data.
# A fill value is only applied to columns of a matching type: the string
# below replaces nulls in Name (a string column), while the nulls in the
# double column Sales are left as they are.
df.fillna("No Data").show()

# Restrict the fill to specific columns with `subset`.
df.fillna("No Name", subset=["Name"]).show()


+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Data| null|
|emp3|No Data|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [11]:
# fill missing sales with avg value
from pyspark.sql.functions import mean

# Aggregate the average of Sales; collect() returns the result as a list
# holding a single Row. Null Sales values do not contribute to the average.
mean_val_row = df.select(mean("Sales")).collect()
print(mean_val_row)

# That Row has exactly one column, so unpack it to get the bare number.
(mean_val,) = mean_val_row[0]
mean_val

[Row(avg(Sales)=400.5)]


400.5

In [12]:
# Replace missing Sales values with the column average computed earlier.
df.fillna(mean_val, subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

