In [1]:
import pyspark

# Obtain the SparkContext for this notebook, running locally on all cores
# ('local[*]'). getOrCreate() returns the already-running context when this
# cell is executed again, instead of failing because a second SparkContext
# cannot be started in the same process; on a fresh kernel it creates one
# with the configuration given here.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Delete any *.lck files under metastore_db/ before the SQLContext is created.
# NOTE(review): presumably these are stale lock files left behind by an
# earlier session that would otherwise block start-up -- confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point built on top of the SparkContext `sc` from the first cell.
sqlc = SQLContext(sc)

## Summary Statistics

In [3]:
from collections import namedtuple
# Row type for the example data: a text label followed by two numeric values.
Record = namedtuple('Record', 'desc value1 value2')

In [4]:
# Three example records, distributed as an RDD and converted to a DataFrame
# whose columns come from the Record fields.
sampleRecords = [
    Record("first", 1, 3.7),
    Record("second", -2, 2.1),
    Record("third", 6, 0.7),
]
recDF = sc.parallelize(sampleRecords).toDF()

In [None]:
# Summary statistics of recDF, returned as another DataFrame and printed.
recStats = recDF.describe()
recStats.show()

In [None]:
# Bring the (small) statistics table into pandas and index it by the
# 'summary' column so individual statistics can be looked up by name.
recStatsPandas = recStats.toPandas().set_index('summary')
recStatsPandas

In [None]:
# Single-step label lookup: row 'mean', column 'value1'.
recStatsPandas.loc['mean', 'value1']

In [None]:
# Correlation between the two numeric columns of recDF.
recDF.stat.corr(col1='value1', col2='value2')

In [None]:
# Covariance between the two numeric columns of recDF.
recDF.stat.cov(col1='value1', col2='value2')

In [None]:
# Frequent items of each numeric column, shown as a pandas table.
recDF.stat.freqItems(cols=['value1', 'value2']).toPandas()

## Sampling

In [None]:
# (key, value) pairs used by the sampling examples; the column names are
# supplied directly as the schema.
df = sqlc.createDataFrame(
    [(1, 10), (1, 20),
     (2, 10), (2, 20), (2, 30),
     (3, 20), (3, 30)],
    ["key", "value"],
)

In [None]:
# Print the full key/value frame before any sampling is applied.
df.show()

In [None]:
# Sample df without replacement using fraction 0.3; the seed is fixed at 11.
dfSampled = df.sample(withReplacement=False, fraction=0.3, seed=11)
dfSampled.show()

In [None]:
# Split df into two frames with weights 0.3 and 0.7, using a fixed seed.
# NOTE(review): `training` receives the smaller weight (0.3) and `testing`
# the larger (0.7) -- confirm this ordering is intended.
training, testing = df.randomSplit(weights=[0.3, 0.7], seed=11)

In [None]:
# Rows that landed in the first (weight 0.3) part of the split.
training.show()

In [None]:
# Rows that landed in the second (weight 0.7) part of the split.
testing.show()

## Stratified Sampling

In [None]:
# Stratify on the "key" column, applying the same 0.7 fraction to each of
# the keys 1, 2 and 3, with a fixed seed.
strataFractions = dict.fromkeys((1, 2, 3), 0.7)
dfStrat = df.stat.sampleBy(col="key", fractions=strataFractions, seed=11)
dfStrat.show()

## Random Data Generation

In [None]:
from pyspark.sql.functions import rand, randn

In [None]:
# Frame built from the range 0..5 as the base for the random columns below.
# NOTE(review): this rebinds `df`, which the sampling cells above use for the
# key/value frame; re-running those cells after this one will operate on
# this frame instead. Consider a distinct name.
df = sqlc.range(0, 5)

In [None]:
# Print whatever frame `df` is currently bound to.
df.show()

In [None]:
# Add a "uniform" column from rand() and a "normal" column from randn() next
# to "id". The 5 is the seed given to each generator, not a row count.
df2 = (
    df.select("id")
      .withColumn("uniform", rand(seed=5))
      .withColumn("normal", randn(seed=5))
)

In [None]:
# Print the id column together with the generated random columns.
df2.show()

In [None]:
# Stop the SparkContext created in the first cell now that the notebook is done.
sc.stop()