In [None]:
import pyspark
# Reuse the active SparkContext when this cell is executed again; calling the
# SparkContext constructor a second time in the same kernel raises an error.
# On a fresh kernel this still starts a local context using all cores.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
from pyspark.sql import SQLContext
# SQL entry point bound to the notebook's SparkContext.
sqlc = SQLContext(sparkContext=sc)

## Hypothesis Testing

### Goodness of Fit

In [None]:
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.stat import Statistics

# Observed values passed to the goodness-of-fit test (seven categories).
vec = Vectors.dense(0.3, 0.2, 0.15, 0.1, 0.1, 0.1, 0.05)

In [None]:
# No expected distribution is supplied, so only the observed vector is tested.
goodnessOfFitTestResult = Statistics.chiSqTest(observed=vec)

In [None]:
# Test statistic of the chi-squared goodness-of-fit test on `vec`.
goodnessOfFitTestResult.statistic

In [None]:
# p-value of the chi-squared goodness-of-fit test on `vec`.
goodnessOfFitTestResult.pValue

In [None]:
# Null hypothesis reported by the goodness-of-fit test result.
goodnessOfFitTestResult.nullHypothesis

### Independence

In [None]:
from pyspark.mllib.linalg import Matrices

# 3x2 contingency table; Matrices.dense takes the values column by column.
mat = Matrices.dense(
    numRows=3,
    numCols=2,
    values=[13.0, 47.0, 40.0,   # first column
            80.0, 11.0, 9.0],   # second column
)

In [None]:
# Passing a matrix to chiSqTest runs the test on the contingency table `mat`.
independenceTestResult = Statistics.chiSqTest(observed=mat)

In [None]:
# Test statistic of the chi-squared test on the contingency matrix `mat`.
independenceTestResult.statistic

In [None]:
# p-value of the chi-squared test on the contingency matrix `mat`.
independenceTestResult.pValue

In [None]:
# Null hypothesis reported by the test result for `mat`.
independenceTestResult.nullHypothesis

### Independence - Labeled Points

In [None]:
from pyspark.mllib.regression import LabeledPoint

# Three labeled observations, written as (label, feature values) pairs and
# expanded into LabeledPoint objects before being distributed as an RDD.
obs = sc.parallelize([
    LabeledPoint(label, Vectors.dense(features))
    for label, features in [
        (0, [1.0, 2.0]),
        (0, [0.5, 1.5]),
        (1, [1.0, 8.0]),
    ]
])

In [None]:
# chiSqTest on the RDD of labeled points; the result is iterated over below.
featTestResults = Statistics.chiSqTest(observed=obs)

In [None]:
# Show the raw chiSqTest output for the labeled-point RDD `obs`.
featTestResults

In [None]:
# Summarize each test result in `featTestResults`.
# A dict keyed by field name is used instead of a set literal: a set has no
# defined order, drops the association between value and field, and would
# silently collapse two fields that happen to hold equal values.
[
    {
        "statistic": r.statistic,
        "pValue": r.pValue,
        "nullHypothesis": r.nullHypothesis,
    }
    for r in featTestResults
]

### Distribution

In [None]:
from pyspark.mllib.random import RandomRDDs

# Sample of 100 values from RandomRDDs.normalRDD in a single partition;
# the fixed seed keeps the sample the same across runs.
data = RandomRDDs.normalRDD(
    sc,
    size=100,
    numPartitions=1,
    seed=13,
)

In [None]:
# Kolmogorov-Smirnov test of `data` against the "norm" distribution with
# parameters 0 and 1 (mean and standard deviation per the PySpark docs).
ks_result = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0)

In [None]:
# Test statistic of the Kolmogorov-Smirnov test on `data`.
ks_result.statistic

In [None]:
# p-value of the Kolmogorov-Smirnov test on `data`.
ks_result.pValue

In [None]:
# Null hypothesis reported by the Kolmogorov-Smirnov test result.
ks_result.nullHypothesis

### Kernel Density Estimation

In [None]:
from pyspark.mllib.stat import KernelDensity

In [None]:
# Configure the kernel density estimator: bandwidth 0.1, fitted on the
# sample in `data`. The two setters are independent of each other.
kd = KernelDensity()
kd.setBandwidth(0.1)
kd.setSample(data)

In [None]:
# Evaluate the density estimate on the grid -1.5, -1.0, ..., 1.5 (step 0.5).
kd.estimate([half_step / 2.0 for half_step in range(-3, 4)])

In [None]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()