In [1]:
sc

<pyspark.context.SparkContext at 0x7fdcda9bd790>

In [2]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Hypothesis Testing

### Goodness of Fit

In [4]:
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.stat import Statistics

vec = Vectors.dense([0.3, 0.2, 0.15, 0.1, 0.1, 0.1, 0.05])

In [6]:
goodnessOfFitTestResult = Statistics.chiSqTest(vec)

In [8]:
goodnessOfFitTestResult.statistic

0.295

In [9]:
goodnessOfFitTestResult.pValue

0.999520973435643

In [10]:
goodnessOfFitTestResult.nullHypothesis

u'observed follows the same distribution as expected.'

### Independence

In [11]:
from pyspark.mllib.linalg import Matrices

mat = Matrices.dense(3, 2, [13.0, 47.0, 40.0, 80.0, 11.0, 9.0])

In [12]:
independenceTestResult = Statistics.chiSqTest(mat)

In [13]:
independenceTestResult.statistic

90.22588968846716

In [14]:
independenceTestResult.pValue

0.0

In [15]:
independenceTestResult.nullHypothesis

u'the occurrence of the outcomes is statistically independent.'

### Independence - Labeled Points

In [16]:
from pyspark.mllib.regression import LabeledPoint

obs = sc.parallelize([LabeledPoint(0,Vectors.dense(1.0,2.0)),
                      LabeledPoint(0,Vectors.dense(0.5,1.5)),
                      LabeledPoint(1,Vectors.dense(1.0,8.0))])

In [17]:
featTestResults = Statistics.chiSqTest(obs)

In [19]:
featTestResults

[<pyspark.mllib.stat.test.ChiSqTestResult at 0x7fdcc89c0550>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x7fdcc89d01d0>]

In [20]:
map(lambda r: {r.statistic, r.pValue, r.nullHypothesis}, featTestResults)

[{0.3864762307712326,
  0.75,
  u'the occurrence of the outcomes is statistically independent.'},
 {0.22313016014843035,
  3.0000000000000004,
  u'the occurrence of the outcomes is statistically independent.'}]

### Distribution

In [25]:
from pyspark.mllib.random import RandomRDDs

data = RandomRDDs.normalRDD(sc, size=100, numPartitions=1, seed=13)

In [26]:
ks_result = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)

In [27]:
ks_result.statistic

0.12019890461912125

In [28]:
ks_result.pValue

0.10230385223938121

In [29]:
ks_result.nullHypothesis

u'Sample follows theoretical distribution'

### Kernel Density Estimation

In [30]:
from pyspark.mllib.stat import KernelDensity

In [33]:
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(0.1)

In [34]:
kd.estimate([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5])

array([ 0.1023487 ,  0.15699217,  0.2957955 ,  0.51760411,  0.38091952,
        0.30242779,  0.1841904 ])