In [1]:
import pyspark

# Use getOrCreate so this cell is safe to re-run: calling the SparkContext
# constructor again while a context is already active in the kernel raises an
# error, whereas getOrCreate hands back the running context. On a fresh kernel
# it starts a new context with the same 'local[*]' master as before.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Shell command: delete any *.lck files under ./metastore_db before the SQL
# context is created.
# NOTE(review): presumably these are stale lock files left by an earlier
# session that did not shut down cleanly — confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; used below for
# createDataFrame() and range().
sqlc = SQLContext(sc)

## Summary Statistics

In [3]:
from collections import namedtuple

# Row type for the summary-statistics examples: fields are, in order,
# desc, value1 and value2.
Record = namedtuple(
    typename='Record',
    field_names=['desc', 'value1', 'value2'],
)

In [4]:
# Distribute three Record rows and convert them to a DataFrame; the column
# names (desc, value1, value2) come from the Record fields.
recDF = sc.parallelize([
    Record(desc="first", value1=1, value2=3.7),
    Record(desc="second", value1=-2, value2=2.1),
    Record(desc="third", value1=6, value2=0.7),
]).toDF()

In [5]:
# describe() returns a new DataFrame with one row per statistic
# (count, mean, stddev, min, max) and one column per input column.
recStats = recDF.describe()
# Print the statistics table.
recStats.show()

+-------+-----+------------------+------------------+
|summary| desc|            value1|            value2|
+-------+-----+------------------+------------------+
|  count|    3|                 3|                 3|
|   mean| null|1.6666666666666667| 2.166666666666667|
| stddev| null| 4.041451884327381|1.5011106998930273|
|    min|first|                -2|               0.7|
|    max|third|                 6|               3.7|
+-------+-----+------------------+------------------+



In [6]:
# Collect the statistics to pandas and index the rows by statistic name so
# they can be looked up with .loc['mean'], .loc['min'], etc.
recStatsPandas = (
    recStats
    .toPandas()
    .set_index('summary')
)
recStatsPandas

Unnamed: 0_level_0,desc,value1,value2
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
count,3,3.0,3.0
mean,,1.6666666666666667,2.166666666666667
stddev,,4.041451884327381,1.5011106998930273
min,first,-2.0,0.7
max,third,6.0,3.7


In [7]:
# Look up the mean of value1. Note the result is the *string*
# '1.6666666666666667', not a float — wrap it in float(...) before doing
# arithmetic with it.
recStatsPandas.loc['mean', 'value1']

'1.6666666666666667'

In [8]:
# Correlation between the two numeric columns, returned as a Python float.
recDF.stat.corr(col1='value1', col2='value2')

-0.5879120879120878

In [9]:
# Covariance between the two numeric columns, returned as a Python float.
recDF.stat.cov(col1='value1', col2='value2')

-3.566666666666667

In [10]:
# Frequent items for each listed column; the one-row result is collected to
# pandas for display (columns are named <col>_freqItems).
(
    recDF.stat
    .freqItems(cols=['value1', 'value2'])
    .toPandas()
)

Unnamed: 0,value1_freqItems,value2_freqItems
0,"[-2, 1, 6]","[0.7, 2.1, 3.7]"


## Sampling

In [11]:
# (key, value) pairs used by the sampling examples. The column names are given
# directly as the schema rather than renaming the columns afterwards.
df = sqlc.createDataFrame(
    [(1, 10), (1, 20),
     (2, 10), (2, 20), (2, 30),
     (3, 20), (3, 30)],
    ["key", "value"],
)

In [12]:
# Print all seven key/value rows so the samples drawn below can be compared
# against the full data.
df.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  2|   20|
|  2|   30|
|  3|   20|
|  3|   30|
+---+-----+



In [13]:
# Random sample without replacement, seeded for repeatability.
# `fraction` is not an exact share of the rows: with this seed the result
# holds 3 of the 7 rows even though fraction is 0.3.
dfSampled = df.sample(
    withReplacement=False,
    fraction=0.3,
    seed=11,
)
dfSampled.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  2|   10|
|  2|   30|
+---+-----+



In [14]:
# Split the rows into two disjoint DataFrames with relative weights 0.3 / 0.7;
# the fixed seed keeps the assignment repeatable.
training, testing = df.randomSplit([0.3, 0.7], seed=11)

In [15]:
# Rows assigned to the first (weight 0.3) split.
training.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  2|   10|
|  2|   30|
+---+-----+



In [16]:
# Rows assigned to the second (weight 0.7) split — the rows of df that are
# not in `training`.
testing.show()

+---+-----+
|key|value|
+---+-----+
|  1|   20|
|  2|   20|
|  3|   20|
|  3|   30|
+---+-----+



## Stratified Sampling

In [17]:
# Stratified sample: rows are sampled per distinct value of "key", each
# stratum with its own fraction. Here all three keys use the same 0.7.
strataFractions = dict.fromkeys((1, 2, 3), 0.7)
dfStrat = df.stat.sampleBy(col="key", fractions=strataFractions, seed=11)
dfStrat.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  2|   30|
|  3|   20|
|  3|   30|
+---+-----+



## Random Data Generation

In [18]:
from pyspark.sql.functions import rand, randn

In [19]:
# Single-column ("id") frame holding 0..4.
# Note: this rebinds `df`, replacing the key/value frame from the Sampling
# section — re-running earlier cells after this one will see the new frame.
df = sqlc.range(start=0, end=5)

In [20]:
# Print the id column (0 through 4) that the random columns are attached to.
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [21]:
# Attach two generated columns to each id: "uniform" from rand() and "normal"
# from randn(). Both generators are given the same seed (5).
df2 = (
    df.select("id")
      .withColumn("uniform", rand(5))
      .withColumn("normal", randn(5))
)

In [22]:
# Print the ids together with their generated uniform and normal columns.
df2.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2| 0.7069655052310547|  1.3682472758997855|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



In [23]:
# Shut down the SparkContext created in the first cell; DataFrames defined
# above can no longer be evaluated after this point.
sc.stop()