In [None]:
from pyspark.sql.functions import rand, randn
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession

In [7]:
# Obtain the Spark session used by every later cell (an existing session is
# reused if one is already running).
spark = (
    SparkSession.builder
    .appName("PythonWordCount")
    .getOrCreate()
)

In [11]:
# SQL entry point wrapped around the session's SparkContext.
sqlc = SQLContext(spark.sparkContext)

# Ids 0 .. 999,999 plus two seeded random columns: 'uniform' from rand()
# and 'normal' from randn().
id_range = sqlc.range(0, 1000 * 1000)
df = id_range.withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))

# Report the row count, then preview the first rows.
row_total = df.count()
print('# rows: ', row_total)
df.show()

# rows:  1000000
+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0| 0.1709497137955568| -0.8664700627108758|
|  1| 0.8051143958005459| -0.5970491018333267|
|  2| 0.5775925576589018| 0.18267161219540898|
|  3| 0.9476047869880925| -1.8497305679917546|
|  4|    0.2093704977577|  0.9410417279045351|
|  5|0.36664222617947817| -0.6516475674670159|
|  6| 0.8078688178371882|  0.5901002135239671|
|  7| 0.7135143433452461|  -1.850241871360443|
|  8| 0.7195325566306053| 0.09176896733073023|
|  9|0.31335292311175456|-0.38605118617831075|
| 10| 0.8062503712025726|  1.2134544166783332|
| 11|0.10814914646176654| -1.0757702531630617|
| 12| 0.3362232980701172| 0.04961226872064977|
| 13| 0.8133304803837667|  -0.768259602441542|
| 14|0.47649428738170896|  0.2911293146907403|
| 15|  0.524728096293865|-0.33406080411047484|
| 16| 0.9701253460019921|  1.3607097640771781|
| 17| 0.6232167713919952|  0.59867729810827

In [13]:
# Summary statistics (count, mean, stddev, min, max) for the columns of df.
summary_all = df.describe()
summary_all.show()

+-------+------------------+--------------------+--------------------+
|summary|                id|             uniform|              normal|
+-------+------------------+--------------------+--------------------+
|  count|           1000000|             1000000|             1000000|
|   mean|          499999.5|  0.4997785318606761|6.545992003465573E-4|
| stddev|288675.27893234405|  0.2887560412263698|  1.0003498848232582|
|    min|                 0|2.710561290975022E-7|  -4.949492960499273|
|    max|            999999|  0.9999998822463074|   4.474351963425938|
+-------+------------------+--------------------+--------------------+



In [14]:
# The same summary statistics, restricted to the two random columns.
summary_random = df.describe('uniform', 'normal')
summary_random.show()

+-------+--------------------+--------------------+
|summary|             uniform|              normal|
+-------+--------------------+--------------------+
|  count|             1000000|             1000000|
|   mean|  0.4997785318606761|6.545992003465573E-4|
| stddev|  0.2887560412263698|  1.0003498848232582|
|    min|2.710561290975022E-7|  -4.949492960499273|
|    max|  0.9999998822463074|   4.474351963425938|
+-------+--------------------+--------------------+



In [16]:
# Import the functions module under an alias instead of binding `min` and
# `max` directly: a bare import would replace Python's built-in min()/max()
# in the notebook namespace for every cell that runs afterwards.
from pyspark.sql import functions as F

# Reduce the 'uniform' column to a single row: average, minimum and maximum.
df.select(F.mean('uniform'), F.min('uniform'), F.max('uniform')).show()

+------------------+--------------------+------------------+
|      avg(uniform)|        min(uniform)|      max(uniform)|
+------------------+--------------------+------------------+
|0.4997785318606761|2.710561290975022E-7|0.9999998822463074|
+------------------+--------------------+------------------+



In [18]:
# Covariance
sqlContext = SQLContext(spark.sparkContext)

# Ids 0 .. 999,999 with two random columns generated from different seeds.
id_range = sqlContext.range(0, 1000 * 1000)
df = id_range.withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))

# Covariance between the two columns; displayed as the cell's result.
df.cov('rand1', 'rand2')

0.00011441526053927191

In [26]:
# Covariance of a column with itself, compared with its variance
sqlContext = SQLContext(spark.sparkContext)

# Rebuild the same two-column random frame used for the covariance example.
id_range = sqlContext.range(0, 1000 * 1000)
df = id_range.withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))

self_covariance = df.cov('rand2', 'rand2')
print('cov(rand2, rand2): ', self_covariance)

# Variance of the same column through a dict-style aggregation.
variance_frame = df.agg({'rand2': 'variance'})
variance_frame.show()

cov(rand2, rand2):  0.08339799711775965
+-------------------+
|    variance(rand2)|
+-------------------+
|0.08339799711775966|
+-------------------+



In [27]:
# Correlation
sqlContext = SQLContext(spark.sparkContext)

# Rebuild the same two-column random frame used in the covariance cells.
id_range = sqlContext.range(0, 1000 * 1000)
df = id_range.withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))

# A column correlated with itself.
self_correlation = df.corr('rand2', 'rand2')
print('cor(rand2, rand2): ', self_correlation)

# The two differently seeded columns correlated with each other.
cross_correlation = df.corr('rand1', 'rand2')
print('cor(rand1, rand2): ', cross_correlation)

cor(rand2, rand2):  1.0
cor(rand1, rand2):  0.00137206619523886


In [29]:
# Contingency table
names = ["Pedro", "Maria", "João"]
colors = ["verde", "amarelo", "rosa", "vermelho", "preto"]

# 100 (name, color) pairs: each list is cycled through by row index.
name_color_pairs = [
    (names[k % len(names)], colors[k % len(colors)])
    for k in range(100)
]
df = sqlContext.createDataFrame(name_color_pairs, ["name", "color"])

df.show(10)

+-----+--------+
| name|   color|
+-----+--------+
|Pedro|   verde|
|Maria| amarelo|
| João|    rosa|
|Pedro|vermelho|
|Maria|   preto|
| João|   verde|
|Pedro| amarelo|
|Maria|    rosa|
| João|vermelho|
|Pedro|   preto|
+-----+--------+
only showing top 10 rows



In [30]:
# Frequency count of each (name, color) value pair across the two columns.
pair_counts = df.crosstab("name", "color")
pair_counts.show()

+----------+-------+-----+----+-----+--------+
|name_color|amarelo|preto|rosa|verde|vermelho|
+----------+-------+-----+----+-----+--------+
|      João|      6|    6|   7|    7|       7|
|     Maria|      7|    7|   7|    6|       6|
|     Pedro|      7|    7|   6|    7|       7|
+----------+-------+-----+----+-----+--------+



In [32]:
# Frequent items
# Odd indices produce (k, 2k, k mod 4); even indices produce the fixed
# triple (1, 2, 3).
triples = [
    (k, 2 * k, k % 4) if k % 2 != 0 else (1, 2, 3)
    for k in range(100)
]
df = sqlContext.createDataFrame(triples, ["a", "b", "c"])

df.show(10)

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  1|
|  1|  2|  3|
|  3|  6|  3|
|  1|  2|  3|
|  5| 10|  1|
|  1|  2|  3|
|  7| 14|  3|
|  1|  2|  3|
|  9| 18|  1|
+---+---+---+
only showing top 10 rows



In [33]:
# Frequent-item candidates for each of the three columns, using a support
# threshold of 0.4.
freq = df.freqItems(["a", "b", "c"], support=0.4)

# Display the first collected row as the cell's result.
freq_rows = freq.collect()
freq_rows[0]

Row(a_freqItems=[11, 1], b_freqItems=[2, 22], c_freqItems=[1, 3])

In [34]:
from pyspark.sql.functions import struct

# Pack 'a' and 'b' into one struct column so frequent items are computed
# over (a, b) combinations instead of over each column separately.
df_with_pair = df.withColumn('ab', struct('a', 'b'))
freq = df_with_pair.freqItems(['ab'], support=0.4)

# Display the first collected row as the cell's result.
freq_rows = freq.collect()
freq_rows[0]

Row(ab_freqItems=[Row(a=11, b=22), Row(a=1, b=2)])

In [37]:
# Math functions
# Alias the functions module instead of using `import *`: the wildcard form
# overwrites Python built-ins such as pow, min, max, sum, abs and round in the
# notebook namespace for every cell that runs afterwards.
from pyspark.sql import functions as F

# Ten rows holding a seeded random value scaled by 3.14.
df = sqlContext.range(0, 10).withColumn('uniform', F.rand(seed=10) * 3.14)

angle = df['uniform']
df.select(
    'uniform',
    # degrees() replaces toDegrees(), which was deprecated in Spark 2.1 and
    # removed in Spark 3.0.
    F.degrees('uniform'),
    # cos^2 + sin^2 should equal 1 up to floating-point rounding.
    (F.pow(F.cos(angle), 2) + F.pow(F.sin(angle), 2)).alias("cos^2 + sin^2"),
).show()

+-------------------+------------------+------------------+
|            uniform|  DEGREES(uniform)|     cos^2 + sin^2|
+-------------------+------------------+------------------+
| 0.5367821013180484|30.755348923687915|               1.0|
|0.10747087445354876| 6.157627526768682|               1.0|
| 1.1475525508626785| 65.74991793390322|               1.0|
|  1.310955978808693|  75.1122447131799|               1.0|
| 3.1083266315458262|178.09399733569154|0.9999999999999999|
| 0.5165986402305565|29.598921787408102|               1.0|
| 0.5696528438969835| 32.63870374292187|0.9999999999999999|
| 1.5573024855692674| 89.22685984835181|0.9999999999999999|
| 3.0450071328478523| 174.4660572994135|               1.0|
|0.23646103537894467|13.548219346507173|               1.0|
+-------------------+------------------+------------------+



