In [1]:
import pyspark
# Reuse an already-running SparkContext when one exists so this cell can be
# re-executed without raising "Cannot run multiple SparkContexts at once";
# otherwise start a new context with the 'local[*]' master.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext above. No cell visible in this
# notebook references `sqlc` afterwards.
sqlc = SQLContext(sparkContext=sc)

## Hypothesis Testing

### Goodness of Fit

In [3]:
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.stat import Statistics

# Seven observed values (they sum to 1.0) fed to the goodness-of-fit test.
vec = Vectors.dense(0.3, 0.2, 0.15, 0.1, 0.1, 0.1, 0.05)

In [4]:
# Chi-squared test on the observed vector alone; no `expected` vector is given.
goodnessOfFitTestResult = Statistics.chiSqTest(observed=vec)

In [5]:
# Show the chi-squared statistic computed for the goodness-of-fit test.
goodnessOfFitTestResult.statistic

0.295

In [6]:
# Show the p-value reported for the goodness-of-fit test.
goodnessOfFitTestResult.pValue

0.999520973435643

In [7]:
# Show the null-hypothesis description attached to the goodness-of-fit result.
goodnessOfFitTestResult.nullHypothesis

'observed follows the same distribution as expected.'

### Independence

In [8]:
from pyspark.mllib.linalg import Matrices

# 3x2 table of observed counts for the independence test.
# NOTE(review): MLlib dense matrices are documented as column-major, which
# would make the columns [13, 47, 40] and [80, 11, 9] -- confirm.
mat = Matrices.dense(
    numRows=3,
    numCols=2,
    values=[13.0, 47.0, 40.0,
            80.0, 11.0, 9.0],
)

In [9]:
# Chi-squared test applied to the matrix of observed counts.
independenceTestResult = Statistics.chiSqTest(observed=mat)

In [10]:
# Show the chi-squared statistic computed for the independence test.
independenceTestResult.statistic

90.22588968846716

In [11]:
# Show the p-value reported for the independence test.
independenceTestResult.pValue

0.0

In [12]:
# Show the null-hypothesis description attached to the independence result.
independenceTestResult.nullHypothesis

'the occurrence of the outcomes is statistically independent.'

### Independence - Labeled Points

In [13]:
from pyspark.mllib.regression import LabeledPoint

# Three labeled observations (labels 0, 0, 1), each carrying two feature
# values, distributed as an RDD for the labeled-point chi-squared test.
obs = sc.parallelize([
    LabeledPoint(label=0, features=Vectors.dense(1.0, 2.0)),
    LabeledPoint(label=0, features=Vectors.dense(0.5, 1.5)),
    LabeledPoint(label=1, features=Vectors.dense(1.0, 8.0)),
])

In [14]:
# Chi-squared test over the RDD of labeled points; the result is a list
# (displayed in the next cell).
featTestResults = Statistics.chiSqTest(observed=obs)

In [15]:
# Display the raw list of result objects (default reprs only).
featTestResults

[<pyspark.mllib.stat.test.ChiSqTestResult at 0x7f84dd6f09b0>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x7f84dd6f0b38>]

In [16]:
# Summarize each result as an ordered (statistic, pValue, nullHypothesis)
# tuple. A set literal would lose the field order and would collapse the
# statistic and p-value into a single element if they happened to be equal.
[(r.statistic, r.pValue, r.nullHypothesis) for r in featTestResults]

[(0.75,
  0.3864762307712326,
  'the occurrence of the outcomes is statistically independent.'),
 (3.0000000000000004,
  0.22313016014843035,
  'the occurrence of the outcomes is statistically independent.')]

### Distribution

In [17]:
from pyspark.mllib.random import RandomRDDs

# 100 values from RandomRDDs.normalRDD in a single partition, with a fixed
# seed so the sample is repeatable.
data = RandomRDDs.normalRDD(
    sc,
    size=100,
    numPartitions=1,
    seed=13,
)

In [18]:
# Kolmogorov-Smirnov test of the sample against the "norm" distribution.
# NOTE(review): the trailing 0.0 and 1.0 are the distribution parameters,
# presumably mean and standard deviation -- confirm against the MLlib docs.
ks_result = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0)

In [19]:
# Show the Kolmogorov-Smirnov test statistic.
ks_result.statistic

0.12019890461912125

In [20]:
# Show the p-value reported for the Kolmogorov-Smirnov test.
ks_result.pValue

0.10230385223938121

In [21]:
# Show the null-hypothesis description attached to the KS result.
ks_result.nullHypothesis

'Sample follows theoretical distribution'

### Kernel Density Estimation

In [22]:
from pyspark.mllib.stat import KernelDensity

In [23]:
# Kernel density estimator over the sampled RDD, using a bandwidth of 0.1.
kd = KernelDensity()
kd.setBandwidth(0.1)
kd.setSample(data)

In [24]:
# Evaluate the density estimate at seven evenly spaced points from -1.5 to 1.5.
kd.estimate([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5])

array([ 0.1023487 ,  0.15699217,  0.2957955 ,  0.51760411,  0.38091952,
        0.30242779,  0.1841904 ])

In [25]:
# Stop the SparkContext now that the notebook has finished using it.
sc.stop()