In [1]:
# Display the SparkContext. `sc` is not created in any visible cell, so it is
# presumably injected by the PySpark notebook launcher (TODO confirm).
sc

<pyspark.context.SparkContext at 0x7f51bdaf67d0>

In [2]:
from pyspark.sql import SQLContext
# Wrap the SparkContext in a SQLContext.
# NOTE(review): `sqlc` is not referenced by any cell visible in this notebook.
sqlc = SQLContext(sc)

## Summary Statistics

In [3]:
from pyspark.mllib.linalg import Vectors, Vector
from pyspark.mllib.stat import Statistics

In [4]:
# Three 2-dimensional observations, distributed as an RDD of dense vectors.
observations = sc.parallelize([
    Vectors.dense(2.0, 1.0),
    Vectors.dense(9.0, 3.0),
    Vectors.dense(-7.0, 5.0),
])

In [5]:
# Column-wise summary statistics over the RDD of observation vectors.
summary = Statistics.colStats(observations)

In [6]:
# Mean of each column.
summary.mean()

array([ 1.33333333,  3.        ])

In [7]:
# Variance of each column (the values shown match the unbiased n-1 estimator).
summary.variance()

array([ 64.33333333,   4.        ])

In [8]:
# Count of non-zero entries in each column.
summary.numNonzeros()

array([ 3.,  3.])

In [9]:
# L1 norm of each column (sum of absolute values).
summary.normL1()

array([ 18.,   9.])

In [10]:
# L2 (Euclidean) norm of each column.
summary.normL2()

array([ 11.5758369 ,   5.91607978])

In [11]:
# Project the 2-D observations onto their individual columns so the two
# series can be correlated against each other.
X = observations.map(lambda vec: vec[0])  # first column
Y = observations.map(lambda vec: vec[1])  # second column

In [12]:
# Pearson correlation coefficient between the two column series.
Statistics.corr(X, Y, method="pearson")

-0.5610408535732833

In [13]:
# Passing the RDD of vectors (instead of two series) yields the full pairwise
# correlation matrix between columns.
Statistics.corr(observations, method="pearson")

array([[ 1.        , -0.56104085],
       [-0.56104085,  1.        ]])

## Random RDDs

In [14]:
from pyspark.mllib.random import RandomRDDs

In [15]:
# One million Poisson(mean=1.0) draws spread over 10 partitions.
# The size is a plain int literal: the Python 2-only `L` suffix is a syntax
# error on Python 3 and is unnecessary on Python 2.
# NOTE(review): no seed is passed, so the sampled values (and the statistics
# computed from them below) will differ from run to run.
million = RandomRDDs.poissonRDD(sc, mean=1.0, size=1000000, numPartitions=10)
million.take(10)

[0.0, 4.0, 2.0, 0.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0]

In [16]:
# Sample mean; expected to be close to the Poisson mean of 1.0 requested above.
million.mean()

0.9988870000000032

In [17]:
# Sample variance; a Poisson distribution's variance equals its mean, so ~1.0.
million.variance()

0.9985597612310049

In [18]:
# 10000 random rows of 3 columns each, spread over 10 partitions.
# The row count is a plain int literal: the Python 2-only `L` suffix is a
# syntax error on Python 3 and is unnecessary on Python 2.
# NOTE(review): no seed is passed, so the generated values differ between runs.
data = RandomRDDs.normalVectorRDD(sc, numRows=10000, numCols=3, numPartitions=10)

In [19]:
# Peek at a single generated row (an array with one value per column).
data.take(1)

[array([-0.90797185, -1.28129607, -0.93291205])]

In [20]:
# Column-wise summary statistics of the randomly generated vectors.
stats = Statistics.colStats(data)

In [21]:
# Column means; close to 0 for the generated normal samples.
stats.mean()

array([-0.01685616, -0.01293579,  0.01474321])

In [22]:
# Column variances; close to 1 for the generated normal samples.
stats.variance()

array([ 1.0041775 ,  1.00101702,  0.99997092])

In [23]:
# Non-zero count per column; continuous samples are essentially never exactly
# zero, so this equals the number of rows.
stats.numNonzeros()

array([ 10000.,  10000.,  10000.])

In [24]:
# L1 norm per column (sum of absolute values over all rows).
stats.normL1()

array([ 7993.96645454,  7964.51516656,  8008.43209757])

In [25]:
# L2 norm per column; about sqrt(10000) = 100 for unit-variance samples.
stats.normL2()

array([ 100.21782351,  100.05419811,  100.00441409])

In [27]:
from pyspark.sql import SQLContext
# NOTE(review): repeats the SQLContext creation from the second cell of the
# notebook; `sqlc` is not used by any visible cell, so this looks redundant.
sqlc = SQLContext(sc)

## Sampling

In [28]:
from pyspark.mllib.linalg import Vectors, Vector

# Three 3-dimensional dense vectors used as the population to sample from.
elements = sc.parallelize([
    Vectors.dense(4.0, 7.0, 13.0),
    Vectors.dense(-2.0, 8.0, 4.0),
    Vectors.dense(3.0, -11.0, 19.0),
])

In [29]:
# Sample without replacement: each element is kept with probability 0.5.
# The seed is a plain int (same value as before); the Python 2-only `L`
# suffix would be a syntax error on Python 3.
elements.sample(withReplacement=False, fraction=0.5, seed=10).collect()

[DenseVector([4.0, 7.0, 13.0])]

In [30]:
# Sample with replacement: here `fraction` is the expected number of times
# each element is drawn, so roughly 3 copies of every vector are returned.
# The seed is a plain int (same value as before); the Python 2-only `L`
# suffix would be a syntax error on Python 3.
elements.sample(withReplacement=True, fraction=3.0, seed=10).collect()

[DenseVector([4.0, 7.0, 13.0]),
 DenseVector([4.0, 7.0, 13.0]),
 DenseVector([4.0, 7.0, 13.0]),
 DenseVector([-2.0, 8.0, 4.0]),
 DenseVector([-2.0, 8.0, 4.0]),
 DenseVector([-2.0, 8.0, 4.0]),
 DenseVector([-2.0, 8.0, 4.0]),
 DenseVector([3.0, -11.0, 19.0]),
 DenseVector([3.0, -11.0, 19.0]),
 DenseVector([3.0, -11.0, 19.0])]

In [31]:
# RDD of the integers 1..999999 (the upper bound of range() is exclusive).
# NOTE(review): this rebinds `data`, which earlier held the random vector RDD.
data = sc.parallelize(range(1,1000000))

In [32]:
# Randomly partition the data into three RDDs with weights 60% / 20% / 20%.
# The seed is a plain int (same value as before); the Python 2-only `L`
# suffix would be a syntax error on Python 3.
splits = data.randomSplit([0.6, 0.2, 0.2], seed=13)
# Size of each split. A list comprehension is used instead of map() so the
# counts are displayed as a list on Python 3 as well (map() is lazy there).
[split.count() for split in splits]

[601108, 199675, 199216]

In [33]:
# Unpack in the same order as the weights given to randomSplit:
# training is the 0.6 split, test and validation are the two 0.2 splits.
training, test, validation = splits

## Stratified Sampling

In [34]:
from pyspark.mllib.linalg.distributed import IndexedRow

# Indexed rows whose index serves as the stratum key: one row under key 0
# and two rows under key 1.
rows = sc.parallelize([
    IndexedRow(0, Vectors.dense(1.0, 2.0)),
    IndexedRow(1, Vectors.dense(4.0, 5.0)),
    IndexedRow(1, Vectors.dense(7.0, 8.0)),
])

In [35]:
# Per-key sampling fractions passed to sampleByKey below: key 0 -> 4.0,
# key 1 -> 0.5.
fractions = {0: 4.0, 1: 0.5}

In [36]:
# Turn each IndexedRow into an (index, vector) pair, then draw a
# with-replacement stratified sample using the per-key fractions.
approxSample = (
    rows.map(lambda r: (r.index, r.vector))
        .sampleByKey(withReplacement=True, fractions=fractions, seed=2)
)

In [37]:
# Materialize the stratified sample. Per-key counts are only approximately
# proportional to the requested fractions, not exact.
approxSample.collect()

[(0L, DenseVector([1.0, 2.0])),
 (0L, DenseVector([1.0, 2.0])),
 (0L, DenseVector([1.0, 2.0])),
 (1L, DenseVector([4.0, 5.0])),
 (1L, DenseVector([4.0, 5.0])),
 (1L, DenseVector([4.0, 5.0]))]

In [38]:
# NOTE(review): redundant echo of the SparkContext; the output shows the same
# object as the first cell of the notebook.
sc

<pyspark.context.SparkContext at 0x7f51bdaf67d0>

In [39]:
from pyspark.sql import SQLContext
# NOTE(review): another repeat of the SQLContext creation from the second cell
# of the notebook; `sqlc` is not used by any visible cell.
sqlc = SQLContext(sc)

## Hypothesis Testing

### Goodness of Fit

In [40]:
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.stat import Statistics

# Observed relative frequencies of seven categories (they sum to 1.0).
vec = Vectors.dense([0.3, 0.2, 0.15, 0.1, 0.1, 0.1, 0.05])

In [41]:
# Chi-squared goodness-of-fit test. No expected vector is supplied, so the
# observed frequencies are compared against a uniform distribution (the
# statistic of 0.295 shown below matches that).
goodnessOfFitTestResult = Statistics.chiSqTest(vec)

In [42]:
# Chi-squared test statistic.
goodnessOfFitTestResult.statistic

0.295

In [43]:
# p-value of the test; a value near 1 gives no evidence against the null
# hypothesis.
goodnessOfFitTestResult.pValue

0.999520973435643

In [44]:
# Human-readable statement of the null hypothesis being tested.
goodnessOfFitTestResult.nullHypothesis

u'observed follows the same distribution as expected.'

### Independence

In [45]:
from pyspark.mllib.linalg import Matrices

# 3x2 contingency table. Values are supplied in column-major order, so the
# first three numbers form column 0 and the last three form column 1.
mat = Matrices.dense(
    3, 2,
    [13.0, 47.0, 40.0,   # column 0
     80.0, 11.0, 9.0],   # column 1
)

In [46]:
# Given a matrix, chiSqTest treats it as a contingency table and runs a
# chi-squared test of independence.
independenceTestResult = Statistics.chiSqTest(mat)

In [47]:
# Chi-squared test statistic for the contingency table.
independenceTestResult.statistic

90.22588968846716

In [48]:
# p-value of the test; a value this close to 0 is strong evidence against
# independence.
independenceTestResult.pValue

0.0

In [49]:
# Human-readable statement of the null hypothesis being tested.
independenceTestResult.nullHypothesis

u'the occurrence of the outcomes is statistically independent.'

### Independence - Labeled Points

In [50]:
from pyspark.mllib.regression import LabeledPoint

# Three labeled points: a 0/1 label paired with a 2-feature dense vector.
obs = sc.parallelize([
    LabeledPoint(0, Vectors.dense(1.0, 2.0)),
    LabeledPoint(0, Vectors.dense(0.5, 1.5)),
    LabeledPoint(1, Vectors.dense(1.0, 8.0)),
])

In [51]:
# Given an RDD of LabeledPoint, chiSqTest tests each feature for independence
# from the label and returns one result per feature (two here).
featTestResults = Statistics.chiSqTest(obs)

In [52]:
# The result objects only show a default repr here; their fields are
# extracted in the following cell.
featTestResults

[<pyspark.mllib.stat.test.ChiSqTestResult at 0x7f5195225b90>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x7f51951bab10>]

In [53]:
# Summarize each feature's test as an ordered (statistic, pValue,
# nullHypothesis) tuple. A set literal must not be used here: sets are
# unordered and drop duplicate values, so the statistic and the p-value could
# not be told apart in the displayed output.
[(r.statistic, r.pValue, r.nullHypothesis) for r in featTestResults]

[(0.75,
  0.3864762307712326,
  u'the occurrence of the outcomes is statistically independent.'),
 (3.0000000000000004,
  0.22313016014843035,
  u'the occurrence of the outcomes is statistically independent.')]

### Distribution

In [54]:
from pyspark.mllib.random import RandomRDDs

# 100 normally distributed samples in a single partition; the fixed seed makes
# the draw repeatable.
# NOTE(review): this rebinds `data` once more (it previously held the integer
# range RDD used for the random split).
data = RandomRDDs.normalRDD(sc, size=100, numPartitions=1, seed=13)

In [55]:
# Kolmogorov-Smirnov test of the samples against a normal distribution. The
# trailing 0 and 1 are the distribution parameters (presumably mean and
# standard deviation - confirm against the API docs).
ks_result = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)

In [56]:
# Kolmogorov-Smirnov test statistic.
ks_result.statistic

0.12019890461912125

In [57]:
# p-value of the test; at about 0.10 the null hypothesis is not rejected at
# the usual 5% level.
ks_result.pValue

0.10230385223938121

In [58]:
# Human-readable statement of the null hypothesis being tested.
ks_result.nullHypothesis

u'Sample follows theoretical distribution'

### Kernel Density Estimation

In [59]:
from pyspark.mllib.stat import KernelDensity

In [60]:
# Set up a kernel density estimator over the normal samples held in `data`.
kd = KernelDensity()
kd.setSample(data)
# Kernel bandwidth; smaller values give a less smoothed density estimate.
kd.setBandwidth(0.1)

In [61]:
# Evaluate the estimated density at seven evenly spaced points in [-1.5, 1.5].
kd.estimate([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5])

array([ 0.1023487 ,  0.15699217,  0.2957955 ,  0.51760411,  0.38091952,
        0.30242779,  0.1841904 ])