In [1]:
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.functions import col, desc

In [4]:
# Start a Spark session named 'dataframe', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('dataframe')
    .getOrCreate()
)

In [5]:
# Load test1.csv: the first line is the header and column types are inferred.
# NOTE: the header names 'Job Title ' and 'Salary ' come through with a trailing
# space, so later cells must spell them exactly that way.
data = (
    spark.read
    .format('csv')
    .options(inferSchema='true', header='true')
    .load('test1.csv')
)

In [6]:
# Print the column names and the types Spark inferred while reading the CSV.
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Qualifications: string (nullable = true)
 |-- Job Title : string (nullable = true)
 |-- Salary : integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [7]:
# Render the loaded DataFrame as a text table (show() prints at most 20 rows by default).
data.show()

+----+----+--------------+------------+-------+----------+
|Name| Age|Qualifications|  Job Title |Salary |Experience|
+----+----+--------------+------------+-------+----------+
|   a|  35|           mba|Asst manager| 200000|        10|
|   b|  45|           mba|     Manager| 150000|        15|
|   c|  27|          null|        null|  55000|         4|
|   d|  36|            pg|        null|  80000|         9|
|   e|  33|          null|          TL|  65000|         5|
|   f|null|           phd|    Director| 400000|        25|
|   g|null|            ma|          TL|  75000|         7|
+----+----+--------------+------------+-------+----------+



In [8]:
# Deleting a column: drop() returns a new DataFrame without 'Age'.
# `data` itself is not modified, so 'Age' is still available in later cells.
data.drop('Age').show()

+----+--------------+------------+-------+----------+
|Name|Qualifications|  Job Title |Salary |Experience|
+----+--------------+------------+-------+----------+
|   a|           mba|Asst manager| 200000|        10|
|   b|           mba|     Manager| 150000|        15|
|   c|          null|        null|  55000|         4|
|   d|            pg|        null|  80000|         9|
|   e|          null|          TL|  65000|         5|
|   f|           phd|    Director| 400000|        25|
|   g|            ma|          TL|  75000|         7|
+----+--------------+------------+-------+----------+



In [9]:
# Deletes every row that has at least one null value (the default, how='any').
data.na.drop().show()

+----+---+--------------+------------+-------+----------+
|Name|Age|Qualifications|  Job Title |Salary |Experience|
+----+---+--------------+------------+-------+----------+
|   a| 35|           mba|Asst manager| 200000|        10|
|   b| 45|           mba|     Manager| 150000|        15|
+----+---+--------------+------------+-------+----------+



In [10]:
# The default is how='any'; how='all' instead deletes a row only when
# ALL of its values are null.
data.na.drop(how='all').show()
# No row in this data set is entirely null, so all 7 rows are kept.

+----+----+--------------+------------+-------+----------+
|Name| Age|Qualifications|  Job Title |Salary |Experience|
+----+----+--------------+------------+-------+----------+
|   a|  35|           mba|Asst manager| 200000|        10|
|   b|  45|           mba|     Manager| 150000|        15|
|   c|  27|          null|        null|  55000|         4|
|   d|  36|            pg|        null|  80000|         9|
|   e|  33|          null|          TL|  65000|         5|
|   f|null|           phd|    Director| 400000|        25|
|   g|null|            ma|          TL|  75000|         7|
+----+----+--------------+------------+-------+----------+



In [11]:
# thresh = the minimum number of NON-NULL values a row needs in order to be kept.
# When thresh is given it overrides `how`, so `how` is not passed here.
# thresh=5 deletes rows that have fewer than 5 (i.e. 0-4) non-null values:
# row 'c' has only 4 non-null values, hence it is the only row deleted.
data.na.drop(thresh=5).show()

+----+----+--------------+------------+-------+----------+
|Name| Age|Qualifications|  Job Title |Salary |Experience|
+----+----+--------------+------------+-------+----------+
|   a|  35|           mba|Asst manager| 200000|        10|
|   b|  45|           mba|     Manager| 150000|        15|
|   d|  36|            pg|        null|  80000|         9|
|   e|  33|          null|          TL|  65000|         5|
|   f|null|           phd|    Director| 400000|        25|
|   g|null|            ma|          TL|  75000|         7|
+----+----+--------------+------------+-------+----------+



In [12]:
# thresh=6 deletes rows that have fewer than 6 (i.e. 0-5) non-null values.
# (thresh overrides `how`, so `how` is not passed here.)
# Row c has 4 non-null values and rows d, e, f, g have 5 each, so all of them
# are deleted; only rows a and b, which have no nulls, remain.
data.na.drop(thresh=6).show()

+----+---+--------------+------------+-------+----------+
|Name|Age|Qualifications|  Job Title |Salary |Experience|
+----+---+--------------+------------+-------+----------+
|   a| 35|           mba|Asst manager| 200000|        10|
|   b| 45|           mba|     Manager| 150000|        15|
+----+---+--------------+------------+-------+----------+



In [13]:
data.na.drop(thresh = 7).show()
#if thresh = 7 then delete rows which have at most 7-1=6 non-null values.
#the table has only 6 columns, so no row can reach 7 non-null values:
#even rows a,b (6 non-null values each) get deleted and the result is empty.

+----+---+--------------+----------+-------+----------+
|Name|Age|Qualifications|Job Title |Salary |Experience|
+----+---+--------------+----------+-------+----------+
+----+---+--------------+----------+-------+----------+



In [14]:
data.na.drop(subset = ['Age']).show()
#removes rows that have a null in the listed column(s) only;
#nulls in other columns (e.g. Qualifications) do not cause a row to be dropped.

+----+---+--------------+------------+-------+----------+
|Name|Age|Qualifications|  Job Title |Salary |Experience|
+----+---+--------------+------------+-------+----------+
|   a| 35|           mba|Asst manager| 200000|        10|
|   b| 45|           mba|     Manager| 150000|        15|
|   c| 27|          null|        null|  55000|         4|
|   d| 36|            pg|        null|  80000|         9|
|   e| 33|          null|          TL|  65000|         5|
+----+---+--------------+------------+-------+----------+



In [15]:
# Remove rows whose 'Job Title ' value is null. The column name carries a
# trailing space (see the printed schema), so it must be spelled exactly so.
data.na.drop(subset = ['Job Title ']).show()

+----+----+--------------+------------+-------+----------+
|Name| Age|Qualifications|  Job Title |Salary |Experience|
+----+----+--------------+------------+-------+----------+
|   a|  35|           mba|Asst manager| 200000|        10|
|   b|  45|           mba|     Manager| 150000|        15|
|   e|  33|          null|          TL|  65000|         5|
|   f|null|           phd|    Director| 400000|        25|
|   g|null|            ma|          TL|  75000|         7|
+----+----+--------------+------------+-------+----------+



In [16]:
data.na.fill('Missing Value').show()
#a string fill value replaces nulls in string columns only;
#the nulls in the integer 'Age' column are left as they are.

+----+----+--------------+-------------+-------+----------+
|Name| Age|Qualifications|   Job Title |Salary |Experience|
+----+----+--------------+-------------+-------+----------+
|   a|  35|           mba| Asst manager| 200000|        10|
|   b|  45|           mba|      Manager| 150000|        15|
|   c|  27| Missing Value|Missing Value|  55000|         4|
|   d|  36|            pg|Missing Value|  80000|         9|
|   e|  33| Missing Value|           TL|  65000|         5|
|   f|null|           phd|     Director| 400000|        25|
|   g|null|            ma|           TL|  75000|         7|
+----+----+--------------+-------------+-------+----------+



In [17]:
data.na.fill('Missing Value', 'Job Title ').show()
#fills null values in the specified column only (second argument = subset);
#nulls in 'Qualifications' are therefore left untouched.

+----+----+--------------+-------------+-------+----------+
|Name| Age|Qualifications|   Job Title |Salary |Experience|
+----+----+--------------+-------------+-------+----------+
|   a|  35|           mba| Asst manager| 200000|        10|
|   b|  45|           mba|      Manager| 150000|        15|
|   c|  27|          null|Missing Value|  55000|         4|
|   d|  36|            pg|Missing Value|  80000|         9|
|   e|  33|          null|           TL|  65000|         5|
|   f|null|           phd|     Director| 400000|        25|
|   g|null|            ma|           TL|  75000|         7|
+----+----+--------------+-------------+-------+----------+



In [18]:
from pyspark.ml.feature import Imputer

In [19]:
# Configure an imputer that fills the nulls in 'Age' with the column mean and
# writes the result to a new 'Age_imputed' column.
impute = Imputer(
    inputCols=['Age'],
    outputCols=['Age_imputed'],
    strategy='mean',
)

In [21]:
# Fit the imputer on `data`, then apply it to the same DataFrame; the result
# gains an 'Age_imputed' column in which the null ages are filled in.
impute.fit(data).transform(data).show()

+----+----+--------------+------------+-------+----------+-----------+
|Name| Age|Qualifications|  Job Title |Salary |Experience|Age_imputed|
+----+----+--------------+------------+-------+----------+-----------+
|   a|  35|           mba|Asst manager| 200000|        10|         35|
|   b|  45|           mba|     Manager| 150000|        15|         45|
|   c|  27|          null|        null|  55000|         4|         27|
|   d|  36|            pg|        null|  80000|         9|         36|
|   e|  33|          null|          TL|  65000|         5|         33|
|   f|null|           phd|    Director| 400000|        25|         35|
|   g|null|            ma|          TL|  75000|         7|         35|
+----+----+--------------+------------+-------+----------+-----------+

