In [117]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark import SparkContext
from pyspark.sql import SQLContext


In [118]:
spark = SparkSession \
        .builder \
        .appName("PythonWordCount") \
        .getOrCreate() 

In [119]:
sqlc = SQLContext(spark.sparkContext)

In [120]:
df = (sqlc.range(0, 100 * 100)
    .withColumn('uniform', rand(seed=10))
    .withColumn('normal', randn(seed=27)))

print('# rows: ', df.count())
df.show()

# rows:  10000
+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0| 0.1709497137955568| -0.8664700627108758|
|  1| 0.8051143958005459| -0.5970491018333267|
|  2| 0.5775925576589018| 0.18267161219540898|
|  3| 0.9476047869880925| -1.8497305679917546|
|  4|    0.2093704977577|  0.9410417279045351|
|  5|0.36664222617947817| -0.6516475674670159|
|  6| 0.8078688178371882|  0.5901002135239671|
|  7| 0.7135143433452461|  -1.850241871360443|
|  8| 0.7195325566306053| 0.09176896733073023|
|  9|0.31335292311175456|-0.38605118617831075|
| 10| 0.8062503712025726|  1.2134544166783332|
| 11|0.10814914646176654| -1.0757702531630617|
| 12| 0.3362232980701172| 0.04961226872064977|
| 13| 0.8133304803837667|  -0.768259602441542|
| 14|0.47649428738170896|  0.2911293146907403|
| 15|  0.524728096293865|-0.33406080411047484|
| 16| 0.9701253460019921|  1.3607097640771781|
| 17| 0.6232167713919952|  0.5986772981082732

In [121]:
df.describe().show()

+-------+------------------+--------------------+--------------------+
|summary|                id|             uniform|              normal|
+-------+------------------+--------------------+--------------------+
|  count|             10000|               10000|               10000|
|   mean|            4999.5|  0.5024351587579093|-0.00202023947322...|
| stddev|2886.8956799071675|  0.2888572701169501|  1.0026324939995948|
|    min|                 0|5.225599964608918E-5|  -3.905213826109845|
|    max|              9999|  0.9998135527087725|  3.7776553456547246|
+-------+------------------+--------------------+--------------------+



In [122]:
df.describe('uniform', 'normal').show()

+-------+--------------------+--------------------+
|summary|             uniform|              normal|
+-------+--------------------+--------------------+
|  count|               10000|               10000|
|   mean|  0.5024351587579093|-0.00202023947322...|
| stddev|  0.2888572701169501|  1.0026324939995948|
|    min|5.225599964608918E-5|  -3.905213826109845|
|    max|  0.9998135527087725|  3.7776553456547246|
+-------+--------------------+--------------------+



In [123]:
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

+------------------+--------------------+------------------+
|      avg(uniform)|        min(uniform)|      max(uniform)|
+------------------+--------------------+------------------+
|0.5024351587579093|5.225599964608918E-5|0.9998135527087725|
+------------------+--------------------+------------------+



In [124]:
# Covariance
print('Cov uniform vs uniform: ', df.stat.cov('uniform', 'uniform'))
print('Cov uniform vs uniform: ', df.stat.cov('uniform', 'normal'))

df.agg({'uniform' : 'variance'}).show()

Cov uniform vs uniform:  0.08343852249941668
Cov uniform vs uniform:  -0.0013170595610058967
+-------------------+
|  variance(uniform)|
+-------------------+
|0.08343852249941668|
+-------------------+



In [125]:
# Correlation
print('Cor uniform vs uniform: ', df.stat.corr('uniform', 'uniform'))
print('Cor uniform x normal: ', df.stat.corr('uniform', 'normal'))

Cor uniform vs uniform:  1.0
Cor uniform x normal:  -0.004547579890834926


In [126]:
# Tabela de contingência (cross tab)

names = ['Pedro', 'Maria', 'João']
colors = ['verde', 'amarelo', 'rosa', 'vermelho', 'preto']

#Gera tabela misturada
df = SQLContext.createDataFrame([(names[i % 3], colors[i % 5]) for i in range(100)], ["name", "color"])

df.show(10)


AttributeError: 'list' object has no attribute 'sparkSession'

In [None]:
df.stat.crosstab("name", "color").show

In [None]:
spark.stop()