# Creating a Spark session

In [1]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Obtain the notebook-wide SparkSession: master is set to "local", a default
# SparkConf is supplied, and getOrCreate() is used so that an already-running
# session is reused rather than a second one being started.
spark = (
    SparkSession.builder
    .master("local")
    .config(conf=SparkConf())
    .getOrCreate()
)

## Dropping duplicates

In [10]:
# Removing duplicate rows with dropDuplicates()

from pyspark.sql import Row

# Sample data: the two 'king' rows are identical in every column, while the
# two 'queen' rows share name and gender but differ in age.
df = spark.createDataFrame([
    Row(name='king', age=5, gender='M'),
    Row(name='king', age=5, gender='M'),
    Row(name='queen', age=10, gender='F'),
    Row(name='queen', age=100, gender='F'),
])

'''
For spark 1.6 use this code to create the above dataframe

df = sc.parallelize([ \
                    Row(name='king', age=5,gender='M'), \
                    Row(name='king', age=5,gender='M'), \
                    Row(name='queen', age=10,gender='F'),\
                    Row(name='queen', age=100,gender='F')]).toDF()
                    
'''

# 1) The untouched frame, duplicates included.
df.show()

# 2) No column list: rows are compared on every column, so only the repeated
#    'king' row collapses; both 'queen' rows survive because their ages differ.
df.dropDuplicates().select("name", "age", "gender").show()

# 3) Column list ["name", "gender"]: rows are compared on those two columns
#    only, leaving a single row for each (name, gender) combination.
df.dropDuplicates(["name", "gender"]).select("name", "age", "gender").show()

+---+------+-----+
|age|gender| name|
+---+------+-----+
|  5|     M| king|
|  5|     M| king|
| 10|     F|queen|
|100|     F|queen|
+---+------+-----+

+-----+---+------+
| name|age|gender|
+-----+---+------+
| king|  5|     M|
|queen| 10|     F|
|queen|100|     F|
+-----+---+------+

+-----+---+------+
| name|age|gender|
+-----+---+------+
| king|  5|     M|
|queen| 10|     F|
+-----+---+------+



# Replacing values

In [28]:
# Substituting values inside one column with DataFrame.replace()

from pyspark.sql import Row

from pyspark.sql import functions as f

# Sample data with lower-case names in column a.
df = spark.createDataFrame([
    Row(a='king', b=5, c='m'),
    Row(a='king', b=5, c='m'),
    Row(a='queen', b=15, c='F'),
    Row(a='queen', b=25, c='F'),
    Row(a='raj', b=30, c='m'),
])

# Frame before any replacement.
df.show()

# The two lists are paired up by position (king -> KING, queen -> QUEEN,
# raj -> RAJ); the trailing "a" restricts the substitution to column a.
df.replace(
    ["king", "queen", "raj"],
    ["KING", "QUEEN", "RAJ"],
    "a",
).show()


+-----+---+---+
|    a|  b|  c|
+-----+---+---+
| KING|  5|  m|
| KING|  5|  m|
|QUEEN| 15|  F|
|QUEEN| 25|  F|
|  RAJ| 30|  m|
+-----+---+---+

+-----+---+---+
|    a|  b|  c|
+-----+---+---+
| king|  5|  m|
| king|  5|  m|
|queen| 15|  F|
|queen| 25|  F|
|  raj| 30|  m|
+-----+---+---+



## Filling null values

In [25]:
# Filling null values with DataFrame.na.fill()

from pyspark.sql import Row

from pyspark.sql import functions as f

# Base data; the nulls are introduced by the select() further down.
df = spark.createDataFrame([
    Row(a='king', b=5, c='m'),
    Row(a='king', b=5, c='m'),
    Row(a='queen', b=15, c='F'),
    Row(a='queen', b=25, c='F'),
    Row(a='raj', b=30, c='m'),
])

'''
To get this data frame in spark 1.6

df = sc.parallelize([ \
                    Row(a='king',b=5,c='m'), \
                    Row(a='king', b=5,c='m'), \
                    Row(a='queen', b=15,c='F'),\
                    Row(a='queen',b=25,c='F'),\
                    Row(a='raj',b=30,c='m')]).toDF()

'''

# Derive three columns that contain nulls: a when() chain with no otherwise()
# yields null for every row that matches none of its conditions.
df = df.select(
    f.when(df.c == "m", "M").alias("gender"),
    f.when(df.b < 20, 10).alias("age"),
    f.when(df.a == "king", "KING").when(df.a == "queen", "QUEEN").alias("name"),
)

# Frame with nulls in gender, age and name.
df.show()

# Dict form: one replacement value per column. The replacement for the integer
# column 'age' is given as an int so it matches the column type directly
# instead of relying on an implicit string-to-int cast. 'name' is not listed,
# so its nulls are left alone.
df.na.fill({'gender': 'unknown', 'age': 0}).show()

# Scalar form with a subset: a string replacement is applied only to string
# columns, so 'gender' is filled while the integer column 'age' keeps its
# nulls even though it is named in the subset.
# NOTE(review): "unkown" looks like a typo for "unknown"; it is left as-is so
# the recorded output of this cell still matches.
df.na.fill("unkown", subset=["gender", "age"]).show()

+------+----+-----+
|gender| age| name|
+------+----+-----+
|     M|  10| KING|
|     M|  10| KING|
|  null|  10|QUEEN|
|  null|null|QUEEN|
|     M|null| null|
+------+----+-----+

+-------+---+-----+
| gender|age| name|
+-------+---+-----+
|      M| 10| KING|
|      M| 10| KING|
|unknown| 10|QUEEN|
|unknown|  0|QUEEN|
|      M|  0| null|
+-------+---+-----+

+------+----+-----+
|gender| age| name|
+------+----+-----+
|     M|  10| KING|
|     M|  10| KING|
|unkown|  10|QUEEN|
|unkown|null|QUEEN|
|     M|null| null|
+------+----+-----+



# Dropping null values

In [32]:
# Removing rows that contain nulls with dropna()

from pyspark.sql import Row

from pyspark.sql import functions as f

# Base data; the last row uses c='g' so that none of the when() conditions
# below match it and it ends up null in every derived column.
df = spark.createDataFrame([
    Row(a='king', b=5, c='m'),
    Row(a='king', b=5, c='m'),
    Row(a='queen', b=15, c='F'),
    Row(a='queen', b=25, c='F'),
    Row(a='raj', b=30, c='g'),
])

# when() without otherwise() leaves null wherever no condition matches.
df = df.select(
    f.when(df["c"] == "m", "M").alias("gender"),
    f.when(df["b"] < 20, 10).alias("age"),
    f.when(df["a"] == "king", "KING").when(df["a"] == "queen", "QUEEN").alias("name"),
)

# Frame with nulls, before dropping anything.
df.show()

# how='any': a row is discarded as soon as one of its columns is null.
df.dropna(how='any').show()

# how='all': a row is discarded only when every one of its columns is null.
df.dropna(how='all').show()

+------+----+-----+
|gender| age| name|
+------+----+-----+
|     M|  10| KING|
|     M|  10| KING|
|  null|  10|QUEEN|
|  null|null|QUEEN|
|  null|null| null|
+------+----+-----+

+------+---+----+
|gender|age|name|
+------+---+----+
|     M| 10|KING|
|     M| 10|KING|
+------+---+----+

+------+----+-----+
|gender| age| name|
+------+----+-----+
|     M|  10| KING|
|     M|  10| KING|
|  null|  10|QUEEN|
|  null|null|QUEEN|
+------+----+-----+

