<a href="https://colab.research.google.com/github/chandan3324/PySpark/blob/main/2_Pyspark_Handling_Missing_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark: Handling Missing Values
1. Dropping Columns
2. Dropping Rows
3. Parameters of the drop functionality (`how`, `thresh`, `subset`)
4. Handling missing values by imputing the mean, median, or mode

In [10]:
from pyspark.sql import SparkSession
# Build the SparkSession (reusing one if it already exists) that every later
# cell uses to read and process data.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [11]:
# Load the sample CSV: the first row supplies the column names and Spark infers
# each column's type. Two headers carry a trailing space ('Name ' and 'Age '),
# so the cells below must spell those column names exactly that way.
df_pyspark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [12]:
# Display the loaded data; NULL marks the missing values handled in the cells below.
df_pyspark.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
|Sandeep|NULL|         2|  NULL|
|   NULL|  25|      NULL| 62222|
|   Ajay|  23|         2|  NULL|
+-------+----+----------+------+



In [33]:
# Dropping a column: drop() returns a new DataFrame without the named column.
# The column key is 'Name ' (with a trailing space), matching the CSV header.
without_name = df_pyspark.drop('Name ')
without_name.show()

+----+----------+------+
|Age |Experience|Salary|
+----+----------+------+
|  22|         2| 50000|
|  22|         3| 35000|
|  24|         4| 50000|
|NULL|         2|  NULL|
|  25|      NULL| 62222|
|  23|         2|  NULL|
+----+----------+------+



In [22]:
# Dropping rows: with no arguments, na.drop() removes every row that contains
# at least one null value.
complete_rows = df_pyspark.na.drop()
complete_rows.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
+-------+----+----------+------+



In [23]:
# how='any': a row is dropped as soon as any one of its columns is null
# (same result as the argument-free call).
any_null_dropped = df_pyspark.na.drop(how='any')
any_null_dropped.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
+-------+----+----------+------+



In [24]:
 # how='all': a row is dropped only when every one of its columns is null,
 # so none of the rows in this dataset is removed.
 # NOTE(review): this cell is indented by one space. It ran in the notebook, but
 # the indent looks accidental and may not survive export to a plain .py script.
 all_null_dropped = df_pyspark.na.drop(how='all')
 all_null_dropped.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
|Sandeep|NULL|         2|  NULL|
|   NULL|  25|      NULL| 62222|
|   Ajay|  23|         2|  NULL|
+-------+----+----------+------+



In [28]:
# Threshold: keep only the rows that have at least 3 non-null values.
# `thresh` overrides `how`, so `how` is deliberately not passed here; combining
# the two would suggest an "any null" rule that Spark does not actually apply.
df_pyspark.na.drop(thresh=3).show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
|   Ajay|  23|         2|  NULL|
+-------+----+----------+------+



In [35]:
# subset: only the listed columns are checked for nulls. Rows whose 'Name ' is
# null are dropped; nulls in the other columns are left in place.
named_rows = df_pyspark.na.drop(how='any', subset=['Name '])
named_rows.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
|Sandeep|NULL|         2|  NULL|
|   Ajay|  23|         2|  NULL|
+-------+----+----------+------+



In [47]:
# Filling missing values: replace nulls with 0, but only in the listed columns.
# Columns that are not listed ('Name ', 'Salary') keep their nulls.
zero_filled = df_pyspark.na.fill(value=0, subset=['Experience', 'Age '])
zero_filled.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
|Sandeep|   0|         2|  NULL|
|   NULL|  25|         0| 62222|
|   Ajay|  23|         2|  NULL|
+-------+----+----------+------+



In [49]:
# Show the source DataFrame again: the drop/fill calls above returned new
# DataFrames and were never assigned back, so df_pyspark still holds the nulls.
df_pyspark.show()

+-------+----+----------+------+
|  Name |Age |Experience|Salary|
+-------+----+----------+------+
|Chandan|  22|         2| 50000|
|  Akhil|  22|         3| 35000|
| Aditya|  24|         4| 50000|
|Sandeep|NULL|         2|  NULL|
|   NULL|  25|      NULL| 62222|
|   Ajay|  23|         2|  NULL|
+-------+----+----------+------+



In [52]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute. Defined once so the input and output column lists
# cannot drift apart when the selection is edited.
impute_cols = ['Age ', 'Experience', 'Salary']

# Imputer configured with the "median" strategy. Each input column gets a
# matching "<column>_imputed" output column, so the original columns are kept.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
).setStrategy("median")

In [53]:
# Fit the imputer on the data, then apply the fitted model to add the
# "<column>_imputed" columns alongside the originals.
imputer_model = imputer.fit(df_pyspark)
df_imputed = imputer_model.transform(df_pyspark)
df_imputed.show()

+-------+----+----------+------+------------+------------------+--------------+
|  Name |Age |Experience|Salary|Age _imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+------------+------------------+--------------+
|Chandan|  22|         2| 50000|          22|                 2|         50000|
|  Akhil|  22|         3| 35000|          22|                 3|         35000|
| Aditya|  24|         4| 50000|          24|                 4|         50000|
|Sandeep|NULL|         2|  NULL|          23|                 2|         50000|
|   NULL|  25|      NULL| 62222|          25|                 2|         62222|
|   Ajay|  23|         2|  NULL|          23|                 2|         50000|
+-------+----+----------+------+------------+------------------+--------------+

