In [1]:
import findspark
# Run before the `pyspark` import in the following cell. findspark is
# presumably what makes the local Spark installation importable here --
# confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [6]:
# Start a Spark session named "miss", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("miss")
    .getOrCreate()
)

In [7]:
# Show the session object followed by its class, one per line.
print(spark, type(spark), sep='\n')

<pyspark.sql.session.SparkSession object at 0x108d84358>
<class 'pyspark.sql.session.SparkSession'>


In [15]:
# Load the sample CSV: the first line is the header and column types are inferred.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
df = spark.read.csv(
    'file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files_JosePortilla/Spark_DataFrames/ContainsNull.csv',
    header=True,
    inferSchema=True,
)

In [16]:
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

None


In [17]:
# Now we will look at how to deal with missing data.

In [18]:
# df.na is a DataFrameNaFunctions object; print it and its class on separate lines.
print('{}\n{}'.format(df.na, type(df.na)))

<pyspark.sql.dataframe.DataFrameNaFunctions object at 0x108e260f0>
<class 'pyspark.sql.dataframe.DataFrameNaFunctions'>


In [19]:
# Let's look at dropping the missing data.

In [21]:
# With no arguments, drop() removes every row that has a null in any column.
rows_without_nulls = df.na.drop()
rows_without_nulls.show()
print(rows_without_nulls)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

DataFrame[Id: string, Name: string, Sales: double]


In [23]:
# Keep only the rows that have at least 2 non-null values.
# When `thresh` is supplied it takes precedence over `how`.
min_two_values = df.na.drop(thresh=2)
min_two_values.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [26]:
# 'any' is the default for `how`: a single null is enough to drop the row.
df.na.drop('any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [28]:
# With 'all', a row is dropped only when every one of its values is null.
only_fully_null_dropped = df.na.drop(how='all')
only_fully_null_dropped.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [32]:
# Only the listed columns are checked for nulls; here that is just Sales.
sales_present = df.na.drop(subset=['Sales'])
sales_present.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [33]:
# Let's see how to fill in missing data.

In [34]:
# printSchema() prints the schema tree itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)

None


In [37]:
# A string fill value is applied to string columns only; the numeric Sales
# column keeps its nulls.
filled_strings = df.na.fill('Fill Value')
print(filled_strings)
filled_strings.show()

DataFrame[Id: string, Name: string, Sales: double]
+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|Fill Value| null|
|emp3|Fill Value|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [39]:
# A numeric fill value is applied to numeric columns only; Name keeps its nulls.
zero_filled = df.na.fill(0)
zero_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [47]:
# Naming the target columns explicitly makes the intent of the fill easier to read.
names_filled = df.na.fill('No Name', subset=['Name'])
names_filled.show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [64]:
from pyspark.sql.functions import mean,max

In [65]:
# Average of the Sales column; the null Sales entries do not count towards it.
sales_average = mean('Sales')
df.select(sales_average).show()

+----------+
|avg(Sales)|
+----------+
|     400.5|
+----------+



In [66]:
# lets fill the mean value in Sales column
# Compute the mean of Sales via agg(); this value is used later to fill the nulls.
mean_df = df.agg({'Sales': 'mean'})
mean_df.show()
print(mean_df.collect())

+----------+
|avg(Sales)|
+----------+
|     400.5|
+----------+

[Row(avg(Sales)=400.5)]


In [67]:
# The aggregate has a single row; take it out of the collected list.
first_row = mean_df.collect()[0]
print(first_row)

Row(avg(Sales)=400.5)


In [68]:
# Index into the single row to get the bare mean value.
collected_rows = mean_df.collect()
print(collected_rows[0][0])

400.5


In [71]:
# Pull the mean out of the single result row by column name. collect() runs a
# Spark job on every call, so collect once and reuse the value for printing.
mean_sales_value = mean_df.collect()[0].asDict()['avg(Sales)']
print(mean_sales_value)

400.5


In [72]:
# Replace the nulls in Sales with the previously computed column mean.
sales_mean_filled = df.na.fill(mean_sales_value, subset=['Sales'])
sales_mean_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [None]:
# END