In [1]:
# Display the SparkContext handle. `sc` is not created anywhere in this notebook;
# presumably it is injected by the PySpark kernel/shell — verify before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7fa7263fc790>

In [6]:
# Shell escape: force-remove any *.lck files under metastore_db/.
# NOTE(review): presumably clears stale metastore lock files left by an earlier session — confirm.
!rm -rf metastore_db/*.lck

# Wrap the existing SparkContext `sc` in a SQLContext, referred to below as `sqlc`.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Summary Statistics

In [8]:
from collections import namedtuple

# Lightweight row type: a text label followed by two numeric fields.
Record = namedtuple('Record', 'desc value1 value2')

In [9]:
# Three hand-written records, distributed via the SparkContext and converted
# to a DataFrame; column names come from the Record field names.
recDF = sc.parallelize([
    Record(desc="first", value1=1, value2=3.7),
    Record(desc="second", value1=-2, value2=2.1),
    Record(desc="third", value1=6, value2=0.7),
]).toDF()

In [11]:
# Summary statistics for recDF. The result is itself a DataFrame with a
# `summary` column (count/mean/stddev/min/max) and one column per numeric
# input column; the string column `desc` does not appear in the output below.
recStats = recDF.describe()
recStats.show()

+-------+------------------+------------------+
|summary|            value1|            value2|
+-------+------------------+------------------+
|  count|                 3|                 3|
|   mean|1.6666666666666667| 2.166666666666667|
| stddev| 4.041451884327381|1.5011106998930273|
|    min|                -2|               0.7|
|    max|                 6|               3.7|
+-------+------------------+------------------+



In [12]:
# Pull the (small) statistics table to the driver as a pandas DataFrame and
# index it by the statistic name so individual rows can be looked up by label.
recStatsPandas = recStats.toPandas().set_index('summary')
recStatsPandas

Unnamed: 0_level_0,value1,value2
summary,Unnamed: 1_level_1,Unnamed: 2_level_1
count,3.0,3.0
mean,1.6666666666666667,2.166666666666667
stddev,4.041451884327381,1.5011106998930273
min,-2.0,0.7
max,6.0,3.7


In [16]:
# Label-based lookup of the mean of value1. As the output below shows, the
# value comes back as a string (u'...'), not a float.
recStatsPandas.loc['mean', 'value1']

u'1.6666666666666667'

In [13]:
# Correlation coefficient between the two numeric columns (about -0.588 here).
recDF.stat.corr('value1','value2')

-0.5879120879120878

In [14]:
# Covariance between the two numeric columns (about -3.567 here).
recDF.stat.cov('value1','value2')

-3.566666666666667

In [15]:
# Frequent items per column, shown through pandas. The result has one
# `<column>_freqItems` array column per requested column; with only three
# distinct rows every value is listed.
recDF.stat.freqItems(['value1','value2']).toPandas()

Unnamed: 0,value1_freqItems,value2_freqItems
0,"[-2, 1, 6]","[0.7, 2.1, 3.7]"


## Sampling

In [17]:
# Seven (key, value) pairs spread over three keys; used by the sampling examples below.
df = sqlc.createDataFrame([
    (1, 10), (1, 20),
    (2, 10), (2, 20), (2, 30),
    (3, 20), (3, 30),
]).toDF("key", "value")

In [18]:
# Print the full key/value frame (7 rows) for reference.
df.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  2|   20|
|  2|   30|
|  3|   20|
|  3|   30|
+---+-----+



In [20]:
# Random sample without replacement at fraction 0.3, with a fixed seed so the
# result is repeatable. The output below shows 3 of the 7 rows being kept.
dfSampled = df.sample(withReplacement=False, fraction=0.3, seed=11)
dfSampled.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  2|   10|
|  2|   30|
+---+-----+



In [21]:
# Split df into two DataFrames with weights 0.3 / 0.7 and a fixed seed.
# The next two cells show the resulting 3-row and 4-row frames.
training, testing = df.randomSplit(weights=[0.3, 0.7], seed=11)

In [22]:
# Rows that landed in the 0.3-weight split.
training.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  2|   10|
|  2|   30|
+---+-----+



In [23]:
# Rows that landed in the 0.7-weight split.
testing.show()

+---+-----+
|key|value|
+---+-----+
|  1|   20|
|  2|   20|
|  3|   20|
|  3|   30|
+---+-----+



## Stratified Sampling

In [24]:
# Stratified sample on `key`: each of the three keys gets the same sampling
# fraction (0.7), with a fixed seed for repeatability.
dfStrat = df.stat.sampleBy(
    col="key",
    fractions=dict.fromkeys([1, 2, 3], 0.7),
    seed=11,
)
dfStrat.show()

+---+-----+
|key|value|
+---+-----+
|  1|   10|
|  1|   20|
|  2|   10|
|  2|   30|
|  3|   20|
|  3|   30|
+---+-----+



## Random Data Generation

In [25]:
from pyspark.sql.functions import rand, randn

In [26]:
# Single-column (`id`) DataFrame with ids 0..4, as shown by df.show() below.
# Note: this rebinds `df`, which previously held the key/value frame.
df = sqlc.range(0, 5)

In [27]:
# Print the id-only frame.
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [28]:
# Attach two generated columns to the id frame: `uniform` from rand and
# `normal` from randn, both seeded with 5 so the values are repeatable.
df2 = (
    df.select("id")
      .withColumn("uniform", rand(5))
      .withColumn("normal", randn(5))
)

In [29]:
# Print the generated uniform/normal columns.
df2.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2| 0.7069655052310547|  1.3682472758997855|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



In [1]:
# Display the SparkContext handle. `sc` is not created anywhere in this notebook;
# presumably it is injected by the PySpark kernel/shell — verify before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7fa910da1790>

In [5]:
# Shell escape: force-remove any *.lck files under metastore_db/.
# NOTE(review): presumably clears stale metastore lock files left by an earlier session — confirm.
!rm -rf metastore_db/*.lck

# Wrap the existing SparkContext `sc` in a SQLContext, referred to below as `sqlc`.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Using User Defined Functions

In [6]:
# `numpy.NaN` (capitalised alias) was removed in NumPy 2.0; `numpy.nan` exists in
# every NumPy version. The name `NaN` is kept because later cells refer to it.
from numpy import nan as NaN
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType


def nan_above(threshold):
    """Build a Double-typed UDF that replaces values greater than `threshold` with NaN.

    Values at or below the threshold are returned unchanged. Nulls (None) are
    passed through untouched instead of being compared, so the UDF also works
    on Python 3, where `None > number` raises TypeError.
    """
    def mask(x):
        return NaN if x is not None and x > threshold else x
    return UserDefinedFunction(mask, DoubleType())


# Same two UDFs as before, now produced by one parameterised factory.
udf1 = nan_above(0.5)
udf2 = nan_above(1.0)

In [7]:
from pyspark.sql.functions import rand, randn

# Base frame: ids 0..4 plus seeded uniform and normal columns.
df = (
    sqlc.range(0, 5)
        .withColumn("uniform", rand(5))
        .withColumn("normal", randn(5))
)

# Compute the NaN-masked versions under temporary names, then swap them in
# for the originals so the final frame keeps the columns id/uniform/normal.
dfnan = (
    df.withColumn("nanUniform", udf1("uniform"))
      .withColumn("nanNormal", udf2("normal"))
      .drop("uniform")
      .withColumnRenamed("nanUniform", "uniform")
      .drop("normal")
      .withColumnRenamed("nanNormal", "normal")
)

In [8]:
# Print the masked frame; row id=2 ends up NaN in both columns.
dfnan.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                NaN|                 NaN|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



## NA Functions

### Drop

In [9]:
# Drop rows in which every column listed in `subset` is missing; here that
# removes row id=2, whose uniform and normal values are both NaN.
dfnan.na.drop(how='all',subset=['uniform','normal']).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



### Replace

In [10]:
# Replace NaN with 0.0, restricted to the `uniform` column; `normal` keeps its
# NaN, as the output below shows.
dfnan.na.replace([NaN],[0.0], 'uniform').show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                0.0|                 NaN|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



### Fill

In [11]:
# Fill missing values with a per-column constant given as {column: value};
# row id=2 becomes 0.0 / 1.0 in the output below.
dfnan.na.fill({'uniform': 0.0, 'normal': 1.0}).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                0.0|                 1.0|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



In [50]:
# Summary statistics over the data columns only: drop `id`, discard rows whose
# remaining columns are all missing (row 2), then describe. The count of 4 in
# the output reflects the removed row.
dfsummary = dfnan.drop("id").na.drop(how='all').describe()
dfsummary.show()

+-------+-------------------+--------------------+
|summary|            uniform|              normal|
+-------+-------------------+--------------------+
|  count|                  4|                   4|
|   mean|0.21492677852573508|-0.25724668778474724|
| stddev|0.18251201532325845| 0.18813170403403176|
|    min|0.06498948189958098|  -0.506853671746243|
|    max|0.47611851579756026|-0.05248092572410684|
+-------+-------------------+--------------------+



In [54]:
# Look the mean up by its `summary` label and by column name rather than by
# position (collect()[1][1]), so the result does not depend on the row or
# column order of the describe() output. describe() yields strings, hence float().
uniformMean = float(dfsummary.where(dfsummary["summary"] == "mean").first()["uniform"])
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(uniformMean)

0.214926778526


In [52]:
# Same statistics as a pandas DataFrame indexed by statistic name, for
# label-based lookups in the following cells.
pandasSummary = dfsummary.toPandas().set_index("summary")
pandasSummary

Unnamed: 0_level_0,uniform,normal
summary,Unnamed: 1_level_1,Unnamed: 2_level_1
count,4.0,4.0
mean,0.214926778525735,-0.2572466877847472
stddev,0.1825120153232584,0.1881317040340317
min,0.0649894818995809,-0.506853671746243
max,0.4761185157975602,-0.0524809257241068


In [55]:
# Mean of `uniform` via a (row label, column label) lookup; float() converts
# the value taken from the describe() table into a number.
uniformMean = float(pandasSummary.loc['mean', 'uniform'])
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(uniformMean)

0.214926778526


In [56]:
# Mean of `normal` via a (row label, column label) lookup; float() converts
# the value taken from the describe() table into a number.
normalMean = float(pandasSummary.loc['mean', 'normal'])
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(normalMean)

-0.257246687785


In [57]:
# Mean imputation: fill each column's missing values with that column's mean
# computed above (row id=2 in the output below).
dfnan.na.fill({"uniform": uniformMean, "normal": normalMean}).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|     0.214926778526|     -0.257246687785|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
+---+-------------------+--------------------+



## Duplicates

In [62]:
# Append two rows that share identical uniform/normal values (ids 5 and 6).
# The extra rows are built with dfnan's own column names and with float
# literals, so the union does not rely on positional matching of auto-named
# columns or on implicit integer-to-double coercion.
dfDuplicates = dfnan.union(
    sqlc.createDataFrame([(5, 1.0, 1.0), (6, 1.0, 1.0)], dfnan.columns)
)

In [63]:
# Print the frame including the two appended rows (ids 5 and 6).
dfDuplicates.show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.47611851579756026|-0.21311682946326227|
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                NaN|                 NaN|
|  3| 0.1982919638208397|  -0.256535324205377|
|  4|0.12030715258495939|  -0.506853671746243|
|  5|                1.0|                 1.0|
|  6|                1.0|                 1.0|
+---+-------------------+--------------------+



In [64]:
# De-duplicate on the (uniform, normal) pair only, ignoring `id`. In the output
# below one of the two 1.0/1.0 rows is gone (id=5 remains) and the rows come
# back in a different order than the input.
dfDuplicates.dropDuplicates(["uniform","normal"]).show()

+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  1|0.06498948189958098|-0.05248092572410684|
|  2|                NaN|                 NaN|
|  5|                1.0|                 1.0|
|  4|0.12030715258495939|  -0.506853671746243|
|  0|0.47611851579756026|-0.21311682946326227|
|  3| 0.1982919638208397|  -0.256535324205377|
+---+-------------------+--------------------+



In [1]:
# Display the SparkContext handle. `sc` is not created anywhere in this notebook;
# presumably it is injected by the PySpark kernel/shell — verify before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7fd1c5d89790>

In [2]:
# Wrap the existing SparkContext `sc` in a SQLContext, referred to below as `sqlc`.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Transformers and Estimators

### Transformers - Tokenizer

In [3]:
from pyspark.ml.feature import Tokenizer

In [4]:
# Three labelled sentences; the last one is comma-separated with no spaces.
sentenceDataFrame = sqlc.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat"),
]).toDF("label", "sentence")

In [5]:
# Tokenizer is a Transformer: it needs no fitting and is applied directly.
# It reads `sentence` and writes the token list to a new `words` column.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)

In [6]:
# Show the tokenized result. In the output the tokens are lower-cased, and the
# comma-separated sentence stays as a single token.
tokenized.toPandas()

Unnamed: 0,label,sentence,words
0,0,Hi I heard about Spark,"[hi, i, heard, about, spark]"
1,1,I wish Java could use case classes,"[i, wish, java, could, use, case, classes]"
2,2,"Logistic,regression,models,are,neat","[logistic,regression,models,are,neat]"


### Transformers - Vector Assembler

In [17]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

# Ten ids with one uniform and two normal columns, each generated from a fixed seed.
# NOTE(review): in the output further down, normal2 of one row equals normal1 of
# the next row for several rows, so the adjacent seeds 10 and 11 appear to
# produce shifted copies of the same sequence — consider seeds that are far apart.
dfRandom = (
    sqlc.range(0, 10)
        .select("id")
        .withColumn("uniform", rand(10))
        .withColumn("normal1", randn(10))
        .withColumn("normal2", randn(11))
)

In [19]:
# Combine the three numeric columns into a single vector column `features`.
assembler = (
    VectorAssembler()
    .setInputCols(["uniform", "normal1", "normal2"])
    .setOutputCol("features")
)

# VectorAssembler is a Transformer, so it is applied without a fit step.
dfVec = assembler.transform(dfRandom)

In [20]:
# Show only the id and the assembled 3-element feature vector, via pandas.
dfVec.select("id","features").toPandas()

Unnamed: 0,id,features
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]"
1,1,"[0.198291963821, -0.256535324205, -0.506853671..."
2,2,"[0.120307152585, -0.506853671746, -0.141369919..."
3,3,"[0.442929185213, -0.141369919356, -0.726587521..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]"
5,5,"[0.273107306848, -0.726587521995, -1.19853855262]"
6,6,"[0.870793547001, -1.19853855262, -0.117110926001]"
7,7,"[0.271493317932, -0.117110926001, 0.304945613282]"
8,8,"[0.603714357844, 0.304945613282, 0.0393394905131]"
9,9,"[0.143566883898, -1.04800065723, -0.963546696012]"


### Estimator - Logistic Regression

In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

In [9]:
# Tiny labelled training set: a 0.0/1.0 label and a dense 3-element feature vector per row.
training = sqlc.createDataFrame([
    (1.0, Vectors.dense(0.0, 1.1, 0.1)),
    (0.0, Vectors.dense(2.0, 1.0, -1.0)),
    (0.0, Vectors.dense(2.0, 1.3, 1.0)),
    (1.0, Vectors.dense(0.0, 1.2, -0.5)),
]).toDF("label", "features")

In [10]:
# Create the estimator with its default parameters; it is configured and fitted below.
lr = LogisticRegression()

In [11]:
# Set the iteration cap and regularisation strength on `lr`. The setters chain,
# and the cell's displayed value (below) is the estimator itself.
lr.setMaxIter(10).setRegParam(0.01)

LogisticRegression_43f698800b101f959c6d

In [12]:
# Fit the estimator, passing per-call overrides as a param map. The map must be
# keyed by the estimator's Param objects (lr.maxIter, lr.regParam); the original
# used plain strings ('maxIter', 'regParam') as keys.
# NOTE(review): string keys appear not to be matched against the estimator's
# Params and would be silently ignored — confirm against the installed Spark version.
# The values equal those already set on `lr` above, so the fitted model is unchanged.
model1 = lr.fit(training, {lr.maxIter: 10, lr.regParam: 0.01})
model1.coefficients

DenseVector([-3.1009, 2.6082, -0.3802])

In [13]:
# Score the training rows with the fitted model. The output adds the columns
# rawPrediction, probability and prediction next to label and features.
model1.transform(training).toPandas()

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,1.0,"[0.0, 1.1, 0.1]","[-2.89919489464, 2.89919489464]","[0.052193376663, 0.947806623337]",1.0
1,0.0,"[2.0, 1.0, -1.0]","[3.14530074644, -3.14530074644]","[0.95872315829, 0.04127684171]",0.0
2,0.0,"[2.0, 1.3, 1.0]","[3.12319457003, -3.12319457003]","[0.95783942353, 0.0421605764704]",0.0
3,1.0,"[0.0, 1.2, -0.5]","[-3.388123842, 3.388123842]","[0.0326686926626, 0.967331307337]",1.0


In [1]:
# Display the SparkContext handle. `sc` is not created anywhere in this notebook;
# presumably it is injected by the PySpark kernel/shell — verify before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7f2a9e685790>

In [4]:
# Shell escape: force-remove any *.lck files under metastore_db/.
# NOTE(review): presumably clears stale metastore lock files left by an earlier session — confirm.
!rm -rf metastore_db/*.lck

# Wrap the existing SparkContext `sc` in a SQLContext, referred to below as `sqlc`.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

In [5]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

# Ten ids with one uniform and two normal columns, each generated from a fixed seed.
# NOTE(review): the feature vectors printed further down show normal2 of one row
# equal to normal1 of the next row, so the adjacent seeds 10 and 11 appear to
# produce shifted copies of the same sequence — consider seeds that are far apart.
dfRandom = (
    sqlc.range(0, 10)
        .select("id")
        .withColumn("uniform", rand(10))
        .withColumn("normal1", randn(10))
        .withColumn("normal2", randn(11))
)

# Pack the three numeric columns into one vector column for the scalers below.
assembler = VectorAssembler(
    inputCols=["uniform", "normal1", "normal2"],
    outputCol="features",
)

dfVec = assembler.transform(dfRandom)

## Data Normalization

### Normalizer

In [6]:
from pyspark.ml.feature import Normalizer

In [7]:
# Normalizer with p = 1.0, reading `features` and writing `scaledFeat`.
scaler1 = Normalizer(p=1.0, inputCol="features", outputCol="scaledFeat")

In [8]:
# Normalizer is applied directly with transform (no fit step); show the first 5 rows.
scaler1.transform(dfVec.select("id","features")).show(5)

+---+--------------------+--------------------+
| id|            features|          scaledFeat|
+---+--------------------+--------------------+
|  0|[0.41371264720975...|[0.32886636983701...|
|  1|[0.19829196382083...|[0.20619308493718...|
|  2|[0.12030715258495...|[0.15654175655718...|
|  3|[0.44292918521277...|[0.33788519635286...|
|  4|[0.88987842538862...|[0.32390519197407...|
+---+--------------------+--------------------+
only showing top 5 rows



### Standard Scaler

In [9]:
from pyspark.ml.feature import StandardScaler

In [10]:
# StandardScaler with both mean-centering and std-scaling switched on,
# reading `features` and writing `scaledFeat`.
scaler2 = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeat",
)

In [11]:
# StandardScaler is an Estimator: fit it on the data to obtain a model that
# can then transform DataFrames.
scaler2Model = scaler2.fit(dfVec.select("id","features"))

In [12]:
# Apply the fitted scaler and show the first 5 rows via pandas.
scaler2Model.transform(dfVec.select("id","features")).toPandas()[:5]

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[-0.0321173330608, -0.400475290492, 0.01770640..."
1,1,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.795199539636, 0.116559162465, -0.38815037517]"
2,2,"[0.120307152585, -0.506853671746, -0.141369919...","[-1.07144423862, -0.274196160897, 0.204431268542]"
3,3,"[0.442929185213, -0.141369919356, -0.726587521...","[0.0713760731298, 0.296336216182, -0.744418595..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]","[1.65459922076, 2.02461325728, 1.87940775681]"


### MinMax Scaler

In [13]:
from pyspark.ml.feature import MinMaxScaler

In [14]:
# MinMaxScaler with target range [-1.0, 1.0], reading `features` and writing `scaledFeat`.
scaler3 = MinMaxScaler(
    min=-1.0,
    max=1.0,
    inputCol="features",
    outputCol="scaledFeat",
)

In [15]:
# MinMaxScaler is an Estimator: fit it on the data to obtain a model that
# can then transform DataFrames.
scaler3Model = scaler3.fit(dfVec.select("id","features"))

In [16]:
# Apply the fitted scaler and show the first 5 rows via pandas; the output
# values lie within the configured [-1.0, 1.0] range.
scaler3Model.transform(dfVec.select("id","features")).toPandas()[:5]

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[-0.237483245559, -0.435578353707, -0.09866323..."
1,1,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.797329203956, -0.12950974872, -0.338175289..."
2,2,"[0.120307152585, -0.506853671746, -0.141369919...","[-1.0, -0.360824965779, 0.0115304584941]"
3,3,"[0.442929185213, -0.141369919356, -0.726587521...","[-0.161553857247, -0.0230872236347, -0.5484231..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]","[1.0, 1.0, 1.0]"
