### Pyspark Handling Missing Values
- Dropping Columns
- Dropping Rows
- Various Parameter In Dropping functionalities
- Handling Missing values by Mean, MEdian And Mode

In [16]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Practise').getOrCreate()

In [17]:
df_pyspark=spark.read.csv('test2.csv',header=True,inferSchema=True)

In [18]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [19]:
df_pyspark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|   John|  31|        10| 30000|
|  James|  30|         8| 25000|
|   Anne|  29|         4| 20000|
|  Paula|  24|         3| 20000|
|  Lizzy|  21|         1| 15000|
|   Alex|  23|         2| 18000|
|Michael|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [20]:
##drop the columns
df_pyspark.drop('Name').show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|NULL|      NULL| 40000|
|  34|        10| 38000|
|  36|      NULL|  NULL|
+----+----------+------+



In [21]:
df_pyspark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|   John|  31|        10| 30000|
|  James|  30|         8| 25000|
|   Anne|  29|         4| 20000|
|  Paula|  24|         3| 20000|
|  Lizzy|  21|         1| 15000|
|   Alex|  23|         2| 18000|
|Michael|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [22]:
df_pyspark.na.drop().show()

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
| John| 31|        10| 30000|
|James| 30|         8| 25000|
| Anne| 29|         4| 20000|
|Paula| 24|         3| 20000|
|Lizzy| 21|         1| 15000|
| Alex| 23|         2| 18000|
+-----+---+----------+------+



In [23]:
### any==how
df_pyspark.na.drop(how="any").show()

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
| John| 31|        10| 30000|
|James| 30|         8| 25000|
| Anne| 29|         4| 20000|
|Paula| 24|         3| 20000|
|Lizzy| 21|         1| 15000|
| Alex| 23|         2| 18000|
+-----+---+----------+------+



In [24]:
##threshold
df_pyspark.na.drop(how="any",thresh=3).show()

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
| John| 31|        10| 30000|
|James| 30|         8| 25000|
| Anne| 29|         4| 20000|
|Paula| 24|         3| 20000|
|Lizzy| 21|         1| 15000|
| Alex| 23|         2| 18000|
| NULL| 34|        10| 38000|
+-----+---+----------+------+



In [25]:
##Subset
df_pyspark.na.drop(how="any",subset=['Experience']).show()

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
| John| 31|        10| 30000|
|James| 30|         8| 25000|
| Anne| 29|         4| 20000|
|Paula| 24|         3| 20000|
|Lizzy| 21|         1| 15000|
| Alex| 23|         2| 18000|
| NULL| 34|        10| 38000|
+-----+---+----------+------+



In [26]:
### Filling the Missing Value
df_pyspark.na.fill('Missing Values',['Experience','age']).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|   John|  31|        10| 30000|
|  James|  30|         8| 25000|
|   Anne|  29|         4| 20000|
|  Paula|  24|         3| 20000|
|  Lizzy|  21|         1| 15000|
|   Alex|  23|         2| 18000|
|Michael|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [27]:
df_pyspark.show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|   John|  31|        10| 30000|
|  James|  30|         8| 25000|
|   Anne|  29|         4| 20000|
|  Paula|  24|         3| 20000|
|  Lizzy|  21|         1| 15000|
|   Alex|  23|         2| 18000|
|Michael|NULL|      NULL| 40000|
|   NULL|  34|        10| 38000|
|   NULL|  36|      NULL|  NULL|
+-------+----+----------+------+



In [28]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [29]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'], 
    outputCols=["{}_imputed".format(c) for c in ['age', 'Experience', 'Salary']]
    ).setStrategy("median")

In [30]:
# Add imputation cols to df
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|   John|  31|        10| 30000|         31|                10|         30000|
|  James|  30|         8| 25000|         30|                 8|         25000|
|   Anne|  29|         4| 20000|         29|                 4|         20000|
|  Paula|  24|         3| 20000|         24|                 3|         20000|
|  Lizzy|  21|         1| 15000|         21|                 1|         15000|
|   Alex|  23|         2| 18000|         23|                 2|         18000|
|Michael|NULL|      NULL| 40000|         29|                 4|         40000|
|   NULL|  34|        10| 38000|         34|                10|         38000|
|   NULL|  36|      NULL|  NULL|         36|                 4|         20000|
+-------+----+----------+------+-----------+--------