In [1]:
sc

<pyspark.context.SparkContext at 0x7fd1c5d89790>

In [2]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Transformers and Estimators

### Transformers - Tokenizer

In [3]:
from pyspark.ml.feature import Tokenizer

In [4]:
sentenceDataFrame = sqlc.createDataFrame([(0, "Hi I heard about Spark"),
                                          (1, "I wish Java could use case classes"),
                                          (2, "Logistic,regression,models,are,neat")]) \
                    .toDF("label", "sentence")

In [5]:
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
tokenized = tokenizer.transform(sentenceDataFrame)

In [6]:
tokenized.toPandas()

Unnamed: 0,label,sentence,words
0,0,Hi I heard about Spark,"[hi, i, heard, about, spark]"
1,1,I wish Java could use case classes,"[i, wish, java, could, use, case, classes]"
2,2,"Logistic,regression,models,are,neat","[logistic,regression,models,are,neat]"


### Transformers - Vector Assembler

In [17]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

dfRandom = sqlc.range(0, 10).select("id") \
            .withColumn("uniform", rand(10)) \
            .withColumn("normal1", randn(10)) \
            .withColumn("normal2", randn(11))

In [19]:
assembler = VectorAssembler(inputCols = ["uniform","normal1","normal2"], outputCol = "features")

dfVec = assembler.transform(dfRandom)

In [20]:
dfVec.select("id","features").toPandas()

Unnamed: 0,id,features
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]"
1,1,"[0.198291963821, -0.256535324205, -0.506853671..."
2,2,"[0.120307152585, -0.506853671746, -0.141369919..."
3,3,"[0.442929185213, -0.141369919356, -0.726587521..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]"
5,5,"[0.273107306848, -0.726587521995, -1.19853855262]"
6,6,"[0.870793547001, -1.19853855262, -0.117110926001]"
7,7,"[0.271493317932, -0.117110926001, 0.304945613282]"
8,8,"[0.603714357844, 0.304945613282, 0.0393394905131]"
9,9,"[0.143566883898, -1.04800065723, -0.963546696012]"


### Estimator - Logistic Regression

In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

In [9]:
training = sqlc.createDataFrame([(1.0, Vectors.dense(0.0, 1.1, 0.1)),
                                       (0.0, Vectors.dense(2.0, 1.0, -1.0)),
                                       (0.0, Vectors.dense(2.0, 1.3, 1.0)),
                                       (1.0, Vectors.dense(0.0, 1.2, -0.5))]) \
            .toDF("label", "features")

In [10]:
lr = LogisticRegression()

In [11]:
lr.setMaxIter(10).setRegParam(0.01)

LogisticRegression_43f698800b101f959c6d

In [12]:
model1 = lr.fit(training, {'maxIter': 10, 'regParam': 0.01})
model1.coefficients

DenseVector([-3.1009, 2.6082, -0.3802])

In [13]:
model1.transform(training).toPandas()

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,1.0,"[0.0, 1.1, 0.1]","[-2.89919489464, 2.89919489464]","[0.052193376663, 0.947806623337]",1.0
1,0.0,"[2.0, 1.0, -1.0]","[3.14530074644, -3.14530074644]","[0.95872315829, 0.04127684171]",0.0
2,0.0,"[2.0, 1.3, 1.0]","[3.12319457003, -3.12319457003]","[0.95783942353, 0.0421605764704]",0.0
3,1.0,"[0.0, 1.2, -0.5]","[-3.388123842, 3.388123842]","[0.0326686926626, 0.967331307337]",1.0
