In [2]:
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook runs on other
# machines; fall back to the original local install path when it is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/appzop2/spark-2.4.0-bin-hadoop2.7/'))

In [4]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

In [8]:
# Load the sample CSV: the first row holds column names, types are inferred.
df = spark.read.csv(
    'ContainsNull.csv',
    header=True,
    inferSchema=True,
)

In [9]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [10]:
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## na
`df.na` exposes the DataFrame's helpers for handling missing data, such as `drop` and `fill`.

In [11]:
# Drop every row that contains at least one null, no matter how many nulls it has
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



or, equivalently, pass `how='any'` explicitly (it is the default):

In [16]:
df.na.drop(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [13]:
# thresh is the minimum number of NON-null values a row must have to be kept:
# with thresh=2, rows with fewer than 2 non-null values (emp2 here) are dropped
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# Drop a row only if every column in it is null
# (no such row in this data, so nothing is removed)
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Restrict the null check to selected columns

In [19]:
# Drop rows whose Sales value is null; nulls in other columns are ignored
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Fill nulls with a value instead of dropping rows

In [23]:
# Replace nulls in the Name column with a placeholder string.
df.na.fill(value='no name', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|no name| null|
|emp3|no name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



<b>Spark matches the fill value's type against each column's datatype and updates only the columns of a compatible type</b>

In [26]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [28]:
df.na.fill(0).show() # a numeric fill value only touches numeric columns, so only Sales (double) is updated; Name stays null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Replacing a column's null values with its average value

In [29]:
from pyspark.sql.functions import mean

In [30]:
# collect() returns the aggregate as a list of Row objects (a single Row here)
mean_value = df.select(mean(df['Sales'])).collect()

In [31]:
mean_value[0]

Row(avg(Sales)=400.5)

In [33]:
# Unwrap the aggregate: first Row, first field -> the plain numeric mean
mean_sales = mean_value[0][0]
mean_sales

400.5

In [41]:
df.na.fill(mean_sales,['Sales']).show()

# The subset list limits which columns are filled (here only Sales); it may
# name several columns, which then all receive the same fill value.
# To give different columns different fill values in one call, pass a dict
# of {column_name: value} to fill() -- no loop is needed.

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [44]:
# Build a small demo DataFrame: one tuple per row, column names given as the schema.
source_df = spark.createDataFrame(
    data=[
        ("h!o!c!k!e!y", "rangers", "new york"),
        ("soccer", "??nacional!!", "medellin"),
    ],
    schema=["sport", "team", "city"],
)

source_df.show()

+-----------+------------+--------+
|      sport|        team|    city|
+-----------+------------+--------+
|h!o!c!k!e!y|     rangers|new york|
|     soccer|??nacional!!|medellin|
+-----------+------------+--------+

