In [None]:
import pyspark
# Reuse the active SparkContext if this cell is re-run in a live kernel;
# constructing a second one raises "Cannot run multiple SparkContexts at once".
# A fresh context is still created on the first run (or after sc.stop()).
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
# Remove leftover *.lck files under metastore_db/ before the SQL context is
# created (presumably stale locks from an earlier session -- confirm).
# Done in plain Python instead of a `!rm` shell escape so the cell also runs
# outside IPython and on platforms without `rm`. A missing directory simply
# yields no matches.
from pathlib import Path

for lock_file in Path('metastore_db').glob('*.lck'):
    if lock_file.is_file():
        lock_file.unlink()

from pyspark.sql import SQLContext
# SQL entry point bound to the running SparkContext; `sqlc` is what the later
# cells use to build DataFrames.
sqlc = SQLContext(sparkContext=sc)

## Using User Defined Functions

In [None]:
# `numpy.NaN` was removed in NumPy 2.0; `numpy.nan` exists in every release,
# so import it under the name the rest of the notebook already uses.
from numpy import nan as NaN
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType


def _nan_above(threshold):
    """Build a Spark UDF that maps values greater than ``threshold`` to NaN.

    Values at or below ``threshold`` are returned unchanged. ``None`` (a SQL
    NULL) is passed through as-is instead of raising ``TypeError`` on the
    comparison. The UDF is declared to return ``DoubleType``.
    """
    def _mask(x):
        if x is None:
            return None
        return NaN if x > threshold else x

    return UserDefinedFunction(_mask, DoubleType())


udf1 = _nan_above(0.5)  # NaN for values > 0.5
udf2 = _nan_above(1.0)  # NaN for values > 1.0

In [None]:
from pyspark.sql.functions import rand, randn

# Ids 0..4 plus two random columns, both generated with seed 7.
df = (
    sqlc.range(0, 5)
    .withColumn("uniform", rand(7))
    .withColumn("normal", randn(7))
)

# `withColumn` with an existing column name replaces that column in place, so
# the UDF outputs can overwrite "uniform" and "normal" directly. No temporary
# columns, drops or renames are needed, and the column order stays
# id, uniform, normal.
dfnan = (
    df.withColumn("uniform", udf1("uniform"))
      .withColumn("normal", udf2("normal"))
)

In [None]:
# Print the frame after udf1/udf2 have been applied to its two value columns.
dfnan.show()

## NA Functions

### Drop

In [None]:
# how='all': a row is removed only when every column listed in `subset` is
# missing; the surviving rows are printed.
(
    dfnan.na
    .drop(how="all", subset=["uniform", "normal"])
    .show()
)

### Replace

In [None]:
# Substitute 0.0 for NaN, restricted to the "uniform" column, and print the result.
dfnan.na.replace(
    to_replace=[NaN],
    value=[0.0],
    subset='uniform',
).show()

### Fill

In [None]:
# Per-column fill values: 0.0 for "uniform", 1.0 for "normal".
dfnan.na.fill(
    value={'uniform': 0.0, 'normal': 1.0},
).show()

In [None]:
# Summary statistics for the value columns only: drop "id", discard every row
# that has a missing value in any remaining column, then describe().
dfsummary = (
    dfnan.drop("id")
    .na.drop(how='any')
    .describe()
)
dfsummary.show()

In [None]:
# Look the value up by row label and column name rather than by position:
# `collect()[1][1]` silently reads the wrong cell if the summary's rows or
# columns are ever ordered differently, and the `.rdd` round-trip is not needed.
# float() converts the value taken from the summary row, as the original did.
mean_row = dfsummary.filter(dfsummary["summary"] == "mean").first()
uniformMean = float(mean_row["uniform"])
print(uniformMean)

In [None]:
# Convert the summary to a pandas DataFrame, then index it by the "summary"
# column so individual statistics can be looked up by label.
pandasSummary = dfsummary.toPandas()
pandasSummary = pandasSummary.set_index("summary")
pandasSummary

In [None]:
# Label-based lookup: row "mean", column "uniform", converted to a float.
uniformMean = float(pandasSummary.loc['mean', 'uniform'])
print(uniformMean)

In [None]:
# Label-based lookup: row "mean", column "normal", converted to a float.
normalMean = float(pandasSummary.loc['mean', 'normal'])
print(normalMean)

In [None]:
# Fill each value column's missing entries with that column's mean
# (uniformMean / normalMean from the cells above).
dfnan.na.fill(
    value=dict(uniform=uniformMean, normal=normalMean),
).show()

## Duplicates

In [None]:
# Append two rows (ids 5 and 6) that share the same (uniform, normal) pair so
# there are duplicates to remove later. The rows use float literals and reuse
# dfnan's own schema, so both sides of the union have identical column names
# and types instead of relying on Spark widening integer columns named
# _1/_2/_3 to doubles.
dfDuplicates = dfnan.union(
    sqlc.createDataFrame([(5, 1.0, 1.0), (6, 1.0, 1.0)], schema=dfnan.schema)
)

In [None]:
# Print the frame including the two appended rows (ids 5 and 6).
dfDuplicates.show()

In [None]:
# De-duplicate on the (uniform, normal) pair only, so "id" is ignored when
# deciding whether two rows are the same.
dfDuplicates.dropDuplicates(
    subset=["uniform", "normal"],
).show()

In [None]:
# Shut down the SparkContext created in the first cell; cells that use `sc`
# cannot be re-run until that first cell has been executed again.
sc.stop()