In [1]:
import pyspark

# Local Spark context using every available core ('local[*]').
# getOrCreate returns the already-running context when there is one, so this
# cell can be re-executed without the "multiple SparkContexts" error that the
# bare constructor raises. Note: an existing context is reused as-is, whatever
# master it was started with.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Delete any *.lck files under metastore_db/ before the SQL context is created
# (presumably stale Derby metastore lock files from an earlier session -- TODO confirm).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; later cells call sqlc.range(...) on it.
sqlc = SQLContext(sc)

## Using User Defined Functions

In [3]:
# `numpy.NaN` was removed in NumPy 2.0; `numpy.nan` exists in every release.
# The alias keeps the name `NaN` available to the cells below that use it.
from numpy import nan as NaN
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType


def _nan_above(threshold):
    """Build a function that maps values greater than `threshold` to NaN.

    Values at or below the threshold are returned unchanged. None (SQL null)
    is passed through untouched, because comparing None with a float raises
    TypeError on Python 3.
    """
    def mask(x):
        if x is None:
            return None
        return NaN if x > threshold else x
    return mask


# Column UDFs returning doubles: udf1 blanks values above 0.5, udf2 above 1.0.
udf1 = UserDefinedFunction(_nan_above(0.5), DoubleType())
udf2 = UserDefinedFunction(_nan_above(1.0), DoubleType())

In [4]:
from pyspark.sql.functions import rand, randn

# Five rows (id 0..4) with two random columns, both seeded with 7:
# `uniform` comes from rand(), `normal` from randn().
df = sqlc.range(0, 5).withColumn("uniform", rand(7)).withColumn("normal", randn(7))

# withColumn replaces an existing column that has the same name and keeps its
# position, so the UDF results can overwrite the originals directly. No
# temporary nanUniform/nanNormal columns or drop/rename steps are needed, and
# the resulting schema is still (id, uniform, normal).
dfnan = (
    df.withColumn("uniform", udf1("uniform"))
      .withColumn("normal", udf2("normal"))
)

In [5]:
# Print dfnan as a text table; entries masked by udf1/udf2 show up as NaN.
dfnan.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|                NaN| -1.2904230199480902|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                NaN|                 NaN|
|  3|0.41371264720975787| -0.5877482396744728|
|  4|                NaN|                 NaN|
+---+-------------------+--------------------+



## NA Functions

### Drop

In [6]:
# how='all' removes a row only when every column listed in `subset` is missing;
# rows with a single NaN are kept.
bothColumns = ['uniform', 'normal']
dfnan.na.drop(how='all', subset=bothColumns).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|                NaN| -1.2904230199480902|
|  1|0.06498948189958098|-0.05248092572410684|
|  3|0.41371264720975787| -0.5877482396744728|
+---+-------------------+--------------------+



### Replace

In [7]:
# Swap NaN for 0.0, restricted to the 'uniform' column; 'normal' is left as is.
uniformZeroed = dfnan.na.replace([NaN], [0.0], subset='uniform')
uniformZeroed.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|                0.0| -1.2904230199480902|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                0.0|                 NaN|
|  3|0.41371264720975787| -0.5877482396744728|
|  4|                0.0|                 NaN|
+---+-------------------+--------------------+



### Fill

In [8]:
# Per-column replacement values for missing entries.
fillByColumn = {'uniform': 0.0, 'normal': 1.0}
dfnan.na.fill(fillByColumn).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|                0.0| -1.2904230199480902|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                0.0|                 1.0|
|  3|0.41371264720975787| -0.5877482396744728|
|  4|                0.0|                 1.0|
+---+-------------------+--------------------+



In [9]:
# Ignore the id column and keep only rows with no missing value at all,
# then compute summary statistics over what remains.
completeRows = dfnan.drop("id").na.drop(how='any')
dfsummary = completeRows.describe()
dfsummary.show()

+-------+-------------------+--------------------+
|summary|            uniform|              normal|
+-------+-------------------+--------------------+
|  count|                  2|                   2|
|   mean|0.23935106455466942|-0.32011458269928983|
| stddev|0.24658451494766348|  0.3784911474418124|
|    min|0.06498948189958098| -0.5877482396744728|
|    max|0.41371264720975787|-0.05248092572410684|
+-------+-------------------+--------------------+



In [10]:
# Pick the 'mean' row by its label and the value by column name rather than
# through positional indices ([1][1]) into the collected rows, so the lookup
# does not depend on row or column order.
meanRow = dfsummary.filter(dfsummary["summary"] == "mean").first()
uniformMean = float(meanRow["uniform"])
print(uniformMean)

0.23935106455466942


In [11]:
# Bring the summary to the driver as a pandas frame and index it by the
# statistic name so rows can be looked up with .loc.
summaryFrame = dfsummary.toPandas()
pandasSummary = summaryFrame.set_index("summary")
pandasSummary

Unnamed: 0_level_0,uniform,normal
summary,Unnamed: 1_level_1,Unnamed: 2_level_1
count,2.0,2.0
mean,0.2393510645546694,-0.3201145826992898
stddev,0.2465845149476634,0.3784911474418124
min,0.0649894818995809,-0.5877482396744728
max,0.4137126472097578,-0.0524809257241068


In [12]:
# Label-based lookup of the 'uniform' mean, converted to a Python float.
uniformMean = float(pandasSummary.loc['mean', 'uniform'])
print(uniformMean)

0.23935106455466942


In [13]:
# Label-based lookup of the 'normal' mean, converted to a Python float.
normalMean = float(pandasSummary.loc['mean', 'normal'])
print(normalMean)

-0.32011458269928983


In [14]:
# Impute each column's missing entries with the mean computed above.
meanByColumn = {"uniform": uniformMean, "normal": normalMean}
dfnan.na.fill(meanByColumn).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.23935106455466942| -1.2904230199480902|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|0.23935106455466942|-0.32011458269928983|
|  3|0.41371264720975787| -0.5877482396744728|
|  4|0.23935106455466942|-0.32011458269928983|
+---+-------------------+--------------------+



## Duplicates

In [None]:
# Two extra rows (ids 5 and 6) that share the same values in the last two
# columns; union appends them to dfnan by column position.
extraRows = sc.parallelize([(5, 1, 1), (6, 1, 1)]).toDF()
dfDuplicates = dfnan.union(extraRows)

In [None]:
# Print the combined frame, including the two appended rows (ids 5 and 6).
dfDuplicates.show()

In [None]:
# Rows count as duplicates when they agree on these columns; id is ignored.
dedupeColumns = ["uniform", "normal"]
dfDuplicates.dropDuplicates(dedupeColumns).show()

In [None]:
# Shut down the SparkContext created in the first cell.
sc.stop()