### Pyspark Tutorials
- Dropping Columns
- Dropping Rows

In [82]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("pyspark3")
    .getOrCreate()
)
spark

In [84]:
# Load the sample CSV: the first line is the header, and column types are
# inferred from the data so that Age and Roll_no come back as integers.
df = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)
df

DataFrame[Name: string, Age: int, Roll_no: int]

In [85]:
# Print the DataFrame as a text table; note that every column contains nulls.
df.show()

+-------+----+-------+
|   Name| Age|Roll_no|
+-------+----+-------+
| Gaurav|  26|     25|
|  Swami|  25|     12|
|    Dep|  22|   null|
|  Sunny|null|     15|
|Bhawesh|  32|     18|
|   null|  15|   null|
|   null|null|     16|
+-------+----+-------+



In [88]:
# With no arguments, na.drop() uses how='any': a row is removed as soon as
# one of its columns is null.
df.na.drop().show()

+-------+---+-------+
|   Name|Age|Roll_no|
+-------+---+-------+
| Gaurav| 26|     25|
|  Swami| 25|     12|
|Bhawesh| 32|     18|
+-------+---+-------+



In [89]:
# how='all' removes a row only when every column in it is null. No row in
# this data is entirely null, so the output still contains all rows.
df.na.drop(how='all').show()

+-------+----+-------+
|   Name| Age|Roll_no|
+-------+----+-------+
| Gaurav|  26|     25|
|  Swami|  25|     12|
|    Dep|  22|   null|
|  Sunny|null|     15|
|Bhawesh|  32|     18|
|   null|  15|   null|
|   null|null|     16|
+-------+----+-------+



In [90]:
# how='any' (the default) removes a row if at least one of its columns is null.
df.na.drop(how='any').show()

+-------+---+-------+
|   Name|Age|Roll_no|
+-------+---+-------+
| Gaurav| 26|     25|
|  Swami| 25|     12|
|Bhawesh| 32|     18|
+-------+---+-------+



In [91]:
# Threshold: keep only the rows that have at least `thresh` non-null values.
# `thresh` overrides `how`, so the how='any' that used to be passed here had
# no effect; it is omitted to avoid suggesting the two conditions combine.
df.na.drop(thresh=2).show()

+-------+----+-------+
|   Name| Age|Roll_no|
+-------+----+-------+
| Gaurav|  26|     25|
|  Swami|  25|     12|
|    Dep|  22|   null|
|  Sunny|null|     15|
|Bhawesh|  32|     18|
+-------+----+-------+



In [92]:
## Subset: only the listed columns are checked for nulls, so a row with a
## null Name or Roll_no is kept as long as Age is present.
(
    df.na
    .drop(subset=['Age'], how='any')
    .show()
)

+-------+---+-------+
|   Name|Age|Roll_no|
+-------+---+-------+
| Gaurav| 26|     25|
|  Swami| 25|     12|
|    Dep| 22|   null|
|Bhawesh| 32|     18|
|   null| 15|   null|
+-------+---+-------+



In [93]:
# Check two columns: a row is dropped when Age or Roll_no is null
# (how defaults to 'any', so it does not need to be spelled out).
df.na.drop(subset=['Age', 'Roll_no']).show()

+-------+---+-------+
|   Name|Age|Roll_no|
+-------+---+-------+
| Gaurav| 26|     25|
|  Swami| 25|     12|
|Bhawesh| 32|     18|
+-------+---+-------+



In [94]:
### Re-read the file without inferSchema so every column is loaded as a string.
### na.fill() with a string value skips non-string columns, so Age and Roll_no
### must be strings for the string fills below to apply to them.
df = spark.read.csv('test2.csv',header=True)

In [95]:
### Filling missing values
# Replace nulls in the Name column only; nulls in the other columns are left alone.
df.na.fill(value='Missing Values', subset=['Name']).show()

+--------------+----+-------+
|          Name| Age|Roll_no|
+--------------+----+-------+
|        Gaurav|  26|     25|
|         Swami|  25|     12|
|           Dep|  22|   null|
|         Sunny|null|     15|
|       Bhawesh|  32|     18|
|Missing Values|  15|   null|
|Missing Values|null|     16|
+--------------+----+-------+



In [96]:
# Fill nulls in two columns at once with the same placeholder.
(
    df.na
    .fill(value='Missing Values', subset=['Name', 'Age'])
    .show()
)

+--------------+--------------+-------+
|          Name|           Age|Roll_no|
+--------------+--------------+-------+
|        Gaurav|            26|     25|
|         Swami|            25|     12|
|           Dep|            22|   null|
|         Sunny|Missing Values|     15|
|       Bhawesh|            32|     18|
|Missing Values|            15|   null|
|Missing Values|Missing Values|     16|
+--------------+--------------+-------+



In [97]:
# Fill nulls in all three columns with the same placeholder.
(
    df.na
    .fill(value='Missing Values', subset=['Name', 'Age', 'Roll_no'])
    .show()
)

+--------------+--------------+--------------+
|          Name|           Age|       Roll_no|
+--------------+--------------+--------------+
|        Gaurav|            26|            25|
|         Swami|            25|            12|
|           Dep|            22|Missing Values|
|         Sunny|Missing Values|            15|
|       Bhawesh|            32|            18|
|Missing Values|            15|Missing Values|
|Missing Values|Missing Values|            16|
+--------------+--------------+--------------+



In [98]:
# na.fill() returns a new DataFrame each time; df itself was never reassigned
# by the fills, so it still shows the original nulls.
df.show()

+-------+----+-------+
|   Name| Age|Roll_no|
+-------+----+-------+
| Gaurav|  26|     25|
|  Swami|  25|     12|
|    Dep|  22|   null|
|  Sunny|null|     15|
|Bhawesh|  32|     18|
|   null|  15|   null|
|   null|null|     16|
+-------+----+-------+



In [99]:
### Re-read the file with inferSchema=True so Age and Roll_no are numeric again;
### Imputer only accepts numeric input columns.
df = spark.read.csv('test2.csv',header=True,inferSchema=True)


### Imputer System

from pyspark.ml.feature import Imputer

# Mean imputer: each input column gets a companion "<column>_imputed" output
# column in which nulls are replaced by that column's mean.
imputer = Imputer(
    strategy='mean',
    inputCols=['Age', 'Roll_no'],
    outputCols=['{}_imputed'.format(col_name) for col_name in ['Age', 'Roll_no']],
)

In [100]:
# Fit the imputer on df, then display df with the imputed columns appended.
# The result is only shown here; df itself is not modified.
(
    imputer
    .fit(df)
    .transform(df)
    .show()
)

+-------+----+-------+-----------+---------------+
|   Name| Age|Roll_no|Age_imputed|Roll_no_imputed|
+-------+----+-------+-----------+---------------+
| Gaurav|  26|     25|         26|             25|
|  Swami|  25|     12|         25|             12|
|    Dep|  22|   null|         22|             17|
|  Sunny|null|     15|         24|             15|
|Bhawesh|  32|     18|         32|             18|
|   null|  15|   null|         15|             17|
|   null|null|     16|         24|             16|
+-------+----+-------+-----------+---------------+



In [101]:
### Median strategy: same columns as before, but nulls are replaced by each
### column's median instead of its mean.
imputer = Imputer(
    strategy='median',
    inputCols=['Age', 'Roll_no'],
    outputCols=['{}_imputed'.format(col_name) for col_name in ['Age', 'Roll_no']],
)
(
    imputer
    .fit(df)
    .transform(df)
    .show()
)

+-------+----+-------+-----------+---------------+
|   Name| Age|Roll_no|Age_imputed|Roll_no_imputed|
+-------+----+-------+-----------+---------------+
| Gaurav|  26|     25|         26|             25|
|  Swami|  25|     12|         25|             12|
|    Dep|  22|   null|         22|             16|
|  Sunny|null|     15|         25|             15|
|Bhawesh|  32|     18|         32|             18|
|   null|  15|   null|         15|             16|
|   null|null|     16|         25|             16|
+-------+----+-------+-----------+---------------+

