In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [2]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Transformers and Estimators

### Transformers - Tokenizer

In [3]:
from pyspark.ml.feature import Tokenizer

In [4]:
sentenceDataFrame = sqlc.createDataFrame([(0, "Hi I heard about Spark"),
                                          (1, "I wish Java could use case classes"),
                                          (2, "Logistic,regression,models,are,neat")]) \
                    .toDF("label", "sentence")

In [5]:
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
tokenized = tokenizer.transform(sentenceDataFrame)

In [6]:
tokenized.toPandas()

Unnamed: 0,label,sentence,words
0,0,Hi I heard about Spark,"[hi, i, heard, about, spark]"
1,1,I wish Java could use case classes,"[i, wish, java, could, use, case, classes]"
2,2,"Logistic,regression,models,are,neat","[logistic,regression,models,are,neat]"


### Transformers - Vector Assembler

In [7]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

dfRandom = sqlc.range(0, 10).select("id") \
            .withColumn("uniform", rand(10)) \
            .withColumn("normal1", randn(10)) \
            .withColumn("normal2", randn(11))

In [None]:
assembler = VectorAssembler(inputCols = ["uniform","normal1","normal2"], outputCol = "features")

dfVec = assembler.transform(dfRandom)


In [10]:
dfRandom.toPandas() 

Unnamed: 0,id,uniform,normal1,normal2
0,0,0.413713,-0.587748,-0.256535
1,1,0.198292,-0.256535,-0.506854
2,2,0.120307,-0.506854,-0.14137
3,3,0.442929,-0.14137,-0.726588
4,4,0.889878,0.965767,0.891697
5,5,0.273107,-0.726588,-1.198539
6,6,0.870794,-1.198539,-0.117111
7,7,0.271493,-0.117111,0.304946
8,8,0.603714,0.304946,0.039339
9,9,0.143567,-1.048001,-0.963547


In [11]:
dfVec.select("id","features").toPandas()

Unnamed: 0,id,features
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]"
1,1,"[0.198291963821, -0.256535324205, -0.506853671..."
2,2,"[0.120307152585, -0.506853671746, -0.141369919..."
3,3,"[0.442929185213, -0.141369919356, -0.726587521..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]"
5,5,"[0.273107306848, -0.726587521995, -1.19853855262]"
6,6,"[0.870793547001, -1.19853855262, -0.117110926001]"
7,7,"[0.271493317932, -0.117110926001, 0.304945613282]"
8,8,"[0.603714357844, 0.304945613282, 0.0393394905131]"
9,9,"[0.143566883898, -1.04800065723, -0.963546696012]"


### Estimator - Logistic Regression

In [12]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

In [13]:
training = sqlc.createDataFrame([(1.0, Vectors.dense(0.0, 1.1, 0.1)),
                                       (0.0, Vectors.dense(2.0, 1.0, -1.0)),
                                       (0.0, Vectors.dense(2.0, 1.3, 1.0)),
                                       (1.0, Vectors.dense(0.0, 1.2, -0.5))]) \
            .toDF("label", "features")

In [15]:
training.toPandas()

Unnamed: 0,label,features
0,1.0,"[0.0, 1.1, 0.1]"
1,0.0,"[2.0, 1.0, -1.0]"
2,0.0,"[2.0, 1.3, 1.0]"
3,1.0,"[0.0, 1.2, -0.5]"


In [16]:
lr = LogisticRegression()

In [17]:
lr.setMaxIter(10).setRegParam(0.01)

LogisticRegression_4ca4bb0c62e993c179a4

In [18]:
model1 = lr.fit(training, {'maxIter': 10, 'regParam': 0.01})
model1.coefficients

DenseVector([-3.1121, 2.6485, -0.3956])

In [19]:
model1.transform(training).toPandas()

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,1.0,"[0.0, 1.1, 0.1]","[-2.92337480745, 2.92337480745]","[0.0510100852832, 0.948989914717]",1.0
1,0.0,"[2.0, 1.0, -1.0]","[3.13047375344, -3.13047375344]","[0.958132401546, 0.041867598454]",0.0
2,0.0,"[2.0, 1.3, 1.0]","[3.12704527654, -3.12704527654]","[0.957994652988, 0.0420053470116]",0.0
3,1.0,"[0.0, 1.2, -0.5]","[-3.42555867628, 3.42555867628]","[0.0315061711047, 0.968493828895]",1.0


In [20]:
sc.stop()