In [1]:
sc

<pyspark.context.SparkContext at 0x7f51e3da4710>

In [2]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Summary Statistics

In [3]:
from collections import namedtuple
Record = namedtuple('Record',['desc','value1','value2'])

In [4]:
recDF = sc.parallelize([Record("first",1,3.7),
                        Record("second",-2,2.1),
                        Record("third",6,0.7)]).toDF()

In [5]:
recStats = recDF.describe()
recStats.show()

+-------+-----+------------------+------------------+
|summary| desc|            value1|            value2|
+-------+-----+------------------+------------------+
|  count|    3|                 3|                 3|
|   mean| null|1.6666666666666667| 2.166666666666667|
| stddev| null| 4.041451884327381|1.5011106998930273|
|    min|first|                -2|               0.7|
|    max|third|                 6|               3.7|
+-------+-----+------------------+------------------+



In [6]:
recStatsPandas = recStats.toPandas().set_index('summary')
recStatsPandas

Unnamed: 0_level_0,desc,value1,value2
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
count,3,3.0,3.0
mean,,1.6666666666666667,2.166666666666667
stddev,,4.041451884327381,1.5011106998930273
min,first,-2.0,0.7
max,third,6.0,3.7


In [7]:
recStatsPandas.loc['mean'].value1

u'1.6666666666666667'

In [8]:
recDF.stat.corr('value1','value2')

-0.5879120879120878

In [9]:
recDF.stat.cov('value1','value2')

-3.5666666666666664

In [10]:
recDF.stat.freqItems(['value1','value2']).toPandas()

Unnamed: 0,value1_freqItems,value2_freqItems
0,"[1, -2, 6]","[0.7, 2.1, 3.7]"


## Sampling

In [11]:
df = sqlc.createDataFrame([(1, 10), (1, 20), (2, 10),(2, 20), (2, 30), (3, 20), (3, 30)]).toDF("key", "value")

In [12]:
df.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  2|   20|
|  2|   30|
|  3|   20|
|  3|   30|
+---+-----+



In [13]:
dfSampled = df.sample(withReplacement=False, fraction=0.3, seed=11)
dfSampled.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  3|   20|
+---+-----+



In [14]:
training, testing = df.randomSplit(weights=[0.3, 0.7], seed=11)

In [15]:
training.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  3|   20|
+---+-----+



In [16]:
testing.show()

+---+-----+
|key|value|
+---+-----+
|  2|   20|
|  2|   30|
|  3|   30|
+---+-----+



## Stratified Sampling

In [17]:
dfStrat = df.stat.sampleBy(col="key", fractions={1: 0.7, 2: 0.7, 3: 0.7}, seed=11)
dfStrat.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  2|   20|
|  3|   20|
+---+-----+



## Random Data Generation

In [18]:
from pyspark.sql.functions import rand, randn

In [19]:
df = sqlc.range(0, 5)

In [20]:
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [21]:
df2 = df.select("id").withColumn("uniform", rand(5)).withColumn("normal", randn(5))

In [22]:
df2.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2| 0.9147159860432812| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
+---+-------------------+--------------------+



In [23]:
sc

<pyspark.context.SparkContext at 0x7f51e3da4710>

In [24]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Using User Defined Functions

In [25]:
from numpy import NaN
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType

udf1 = UserDefinedFunction(lambda x: NaN if x > 0.5 else x, DoubleType())
udf2 = UserDefinedFunction(lambda x: NaN if x > 1.0 else x, DoubleType())

In [26]:
from pyspark.sql.functions import rand, randn

df = sqlc.range(0, 5).withColumn("uniform", rand(5)).withColumn("normal", randn(5))

dfnan = df.withColumn("nanUniform", udf1("uniform")) \
          .withColumn("nanNormal", udf2("normal")) \
          .drop("uniform").withColumnRenamed("nanUniform", "uniform") \
          .drop("normal").withColumnRenamed("nanNormal", "normal")

In [27]:
dfnan.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2|                NaN| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
+---+-------------------+--------------------+



## NA Functions

### Drop

In [28]:
dfnan.na.drop(how='all',subset=['uniform','normal']).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2|                NaN| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
+---+-------------------+--------------------+



### Replace

In [29]:
dfnan.na.replace([NaN],[0.0], 'uniform').show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2|                0.0| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
+---+-------------------+--------------------+



### Fill

In [30]:
dfnan.na.fill({'uniform': 0.0, 'normal': 1.0}).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2|                0.0| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
+---+-------------------+--------------------+



In [31]:
dfsummary = dfnan.drop("id").na.drop(how='all').describe()
dfsummary.show()

+-------+-------------------+--------------------+
|summary|            uniform|              normal|
+-------+-------------------+--------------------+
|  count|                  5|                   5|
|   mean|                NaN| -0.4809715209113115|
| stddev|                NaN|  0.5542285793533528|
|    min|0.06498948189958098| -1.2904230199480902|
|    max|                NaN|-0.02962814769606...|
+-------+-------------------+--------------------+



In [32]:
uniformMean = float(dfsummary.rdd.collect()[1][1])
print uniformMean

nan


In [33]:
pandasSummary = dfsummary.toPandas().set_index("summary")
pandasSummary

Unnamed: 0_level_0,uniform,normal
summary,Unnamed: 1_level_1,Unnamed: 2_level_1
count,5.0,5.0
mean,,-0.4809715209113115
stddev,,0.5542285793533528
min,0.0649894818995809,-1.2904230199480902
max,,-0.0296281476960677


In [34]:
uniformMean = float(pandasSummary.loc['mean'].uniform)
print uniformMean

nan


In [35]:
normalMean = float(pandasSummary.loc['mean'].normal)
print normalMean

-0.480971520911


In [36]:
dfnan.na.fill({"uniform": uniformMean, "normal": normalMean}).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2|                NaN| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
+---+-------------------+--------------------+



## Duplicates

In [37]:
dfDuplicates = dfnan.union(sc.parallelize([(5,1,1),(6,1,1)]).toDF())

In [38]:
dfDuplicates.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  1|0.47611851579756026|-0.21311682946326227|
|  2|                NaN| -1.2904230199480902|
|  3|0.06498948189958098|-0.05248092572410684|
|  4|0.25441447150956253|-0.02962814769606...|
|  5|                1.0|                 1.0|
|  6|                1.0|                 1.0|
+---+-------------------+--------------------+



In [39]:
dfDuplicates.dropDuplicates(["uniform","normal"]).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|  0.087440518337355| -0.8192086817250301|
|  3|0.06498948189958098|-0.05248092572410684|
|  2|                NaN| -1.2904230199480902|
|  5|                1.0|                 1.0|
|  4|0.25441447150956253|-0.02962814769606...|
|  1|0.47611851579756026|-0.21311682946326227|
+---+-------------------+--------------------+



In [40]:
sc

<pyspark.context.SparkContext at 0x7f51e3da4710>

In [41]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Transformers and Estimators

### Transformers - Tokenizer

In [42]:
from pyspark.ml.feature import Tokenizer

In [43]:
!rm -rf metastore_db/

sentenceDataFrame = sqlc.createDataFrame([(0, "Hi I heard about Spark"),
                                          (1, "I wish Java could use case classes"),
                                          (2, "Logistic,regression,models,are,neat")]) \
                    .toDF("label", "sentence")

In [44]:
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
tokenized = tokenizer.transform(sentenceDataFrame)

In [45]:
tokenized.toPandas()

Unnamed: 0,label,sentence,words
0,0,Hi I heard about Spark,"[hi, i, heard, about, spark]"
1,1,I wish Java could use case classes,"[i, wish, java, could, use, case, classes]"
2,2,"Logistic,regression,models,are,neat","[logistic,regression,models,are,neat]"


### Transformers - Vector Assembler

In [46]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

dfRandom = sqlc.range(0, 10).select("id") \
            .withColumn("uniform", rand(10)) \
            .withColumn("normal1", randn(10)) \
            .withColumn("normal2", randn(11))

In [47]:
assembler = VectorAssembler(inputCols = ["uniform","normal1","normal2"], outputCol = "features")

dfVec = assembler.transform(dfRandom)

In [48]:
dfVec.select("id","features").toPandas()

Unnamed: 0,id,features
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]"
1,1,"[0.73117192819, 1.57463277597, -0.317032643347]"
2,2,"[0.198291963821, -0.256535324205, -0.506853671..."
3,3,"[0.127141811658, -0.317032643347, 1.42509038959]"
4,4,"[0.760431815341, 0.497762942531, 0.147884030486]"
5,5,"[0.120307152585, -0.506853671746, -0.141369919..."
6,6,"[0.121313639104, 1.42509038959, 0.965766508876]"
7,7,"[0.442929185213, -0.141369919356, -0.726587521..."
8,8,"[0.889878425389, 0.965766508876, 0.891697335754]"
9,9,"[0.0365070771727, -0.502100908234, -1.21126271..."


### Estimator - Logistic Regression

In [49]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

In [50]:
training = sqlc.createDataFrame([(1.0, Vectors.dense(0.0, 1.1, 0.1)),
                                       (0.0, Vectors.dense(2.0, 1.0, -1.0)),
                                       (0.0, Vectors.dense(2.0, 1.3, 1.0)),
                                       (1.0, Vectors.dense(0.0, 1.2, -0.5))]) \
            .toDF("label", "features")

In [51]:
lr = LogisticRegression()

In [52]:
lr.setMaxIter(10).setRegParam(0.01)

LogisticRegression_48b1b2637ebee62132d1

In [53]:
model1 = lr.fit(training, {'maxIter': 10, 'regParam': 0.01})
model1.coefficients

DenseVector([-3.1009, 2.6082, -0.3802])

In [54]:
model1.transform(training).toPandas()

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,1.0,"[0.0, 1.1, 0.1]","[-2.89919489464, 2.89919489464]","[0.052193376663, 0.947806623337]",1.0
1,0.0,"[2.0, 1.0, -1.0]","[3.14530074644, -3.14530074644]","[0.95872315829, 0.04127684171]",0.0
2,0.0,"[2.0, 1.3, 1.0]","[3.12319457003, -3.12319457003]","[0.95783942353, 0.0421605764704]",0.0
3,1.0,"[0.0, 1.2, -0.5]","[-3.388123842, 3.388123842]","[0.0326686926626, 0.967331307337]",1.0


In [55]:
sc

<pyspark.context.SparkContext at 0x7f51e3da4710>

In [56]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

In [57]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

dfRandom = sqlc.range(0, 10).select("id") \
            .withColumn("uniform", rand(10)) \
            .withColumn("normal1", randn(10)) \
            .withColumn("normal2", randn(11))
            
assembler = VectorAssembler(inputCols = ["uniform","normal1","normal2"], outputCol = "features")

dfVec = assembler.transform(dfRandom)

## Data Normalization

### Normalizer

In [58]:
from pyspark.ml.feature import Normalizer

In [59]:
scaler1 = Normalizer().setInputCol("features").setOutputCol("scaledFeat").setP(1.0)

In [60]:
scaler1.transform(dfVec.select("id","features")).show(5)

+---+--------------------+--------------------+
| id|            features|          scaledFeat|
+---+--------------------+--------------------+
|  0|[0.41371264720975...|[0.32886636983701...|
|  1|[0.73117192818966...|[0.27877135762286...|
|  2|[0.19829196382083...|[0.20619308493718...|
|  3|[0.12714181165849...|[0.06801701322638...|
|  4|[0.76043181534066...|[0.54081735791552...|
+---+--------------------+--------------------+
only showing top 5 rows



### Standard Scaler

In [61]:
from pyspark.ml.feature import StandardScaler

In [62]:
scaler2 = StandardScaler().setInputCol("features").setOutputCol("scaledFeat").setWithStd(True).setWithMean(True)

In [63]:
scaler2Model = scaler2.fit(dfVec.select("id","features"))

In [64]:
scaler2Model.transform(dfVec.select("id","features")).toPandas()[:5]

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[0.0943617151947, -0.962057764653, -0.34138993..."
1,1,"[0.73117192819, 1.57463277597, -0.317032643347]","[1.1083042573, 1.62893863751, -0.414211116325]"
2,2,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.593676746506, -0.565193629184, -0.64270042..."
3,3,"[0.127141811658, -0.317032643347, 1.42509038959]","[-0.82092531184, -0.637682397399, 1.68279834122]"
4,4,"[0.760431815341, 0.497762942531, 0.147884030486]","[1.20175827215, 0.338617531349, 0.145413343926]"


### MinMax Scaler

In [65]:
from pyspark.ml.feature import MinMaxScaler

In [66]:
scaler3 = MinMaxScaler().setInputCol("features").setOutputCol("scaledFeat").setMin(-1.0).setMax(1.0)

In [67]:
scaler3Model = scaler3.fit(dfVec.select("id","features"))

In [68]:
scaler3Model.transform(dfVec.select("id","features")).toPandas()[:5]

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[-0.115963827879, -1.0, -0.275721153]"
1,1,"[0.73117192819, 1.57463277597, -0.317032643347]","[0.628048217155, 1.0, -0.321615854237]"
2,2,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.620833563287, -0.693659060941, -0.46561858..."
3,3,"[0.127141811658, -0.317032643347, 1.42509038959]","[-0.78758430389, -0.749613417461, 1.0]"
4,4,"[0.760431815341, 0.497762942531, 0.147884030486]","[0.696623022747, 0.00399621930629, 0.031080961..."
