# MLlib Overview (24)

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# local meansthat driver and executers run on your individual
# computer instead of a cluster. 
spark = SparkSession.builder.master("local").appName("practice") \
        .getOrCreate()

All features passed to a ml model must be Double in Spark. We can have dense or sparse vectors:

In [7]:
# pyspark.ml refers to the modern MLlib package, used with the structured API
# don't use pyspark.mllib (this one is used with RDDs and is older)
from pyspark.ml.linalg import Vectors

denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)

Simple synthetic dataset:

In [11]:
DATA_PATH = "../../Spark-The-Definitive-Guide/data/simple-ml"
df = spark.read.json(DATA_PATH)

In [13]:
df.show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green|good|    15| 38.97187133755819|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



Basic transformations syntax:

In [64]:
from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab~ . + color:value1 + color:value2")
# in this case, we need to use fit
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show(5)

+-----+----+------+------------------+--------------------+-----+
|color| lab|value1|            value2|            features|label|
+-----+----+------+------------------+--------------------+-----+
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue| bad|    12|14.386294994851129|(10,[2,3,6,9],[12...|  0.0|
|green|good|    15| 38.97187133755819|(10,[1,2,3,5,8],[...|  1.0|
|green|good|    12|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
+-----+----+------+------------------+--------------------+-----+
only showing top 5 rows



**creating a test set**

In [65]:
train, test = preparedDF.randomSplit([0.7, 0.3])

**basic model**

In [71]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label", featuresCol="features")
# fitting is eager and is performed immediately
fittedLR = lr.fit(train)

In [72]:
## help for every algorithm on MLlib
#print(lr.explainParams())

In [76]:
fittedLR.transform(train).select("label", "prediction").show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 5 rows



**pipeline**

In [80]:
# split of the original dataset this time
train, test = df.randomSplit([0.7, 0.3])

# stages for the pipeline 
rForm = RFormula() # open so we can test different configurations
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

# creating the pipeline
from pyspark.ml import Pipeline

stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

Building a parameter grid using the pipeline:

In [83]:
from pyspark.ml.tuning import ParamGridBuilder

params = ParamGridBuilder()\
    .addGrid(rForm.formula, ["lab ~ . + color:value1",
                             "lab ~ . + color:value1 + color:value2"])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()

Defining evalutation metrics:

In [88]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")

Training and validation strategy (we use a simple split in this case, but cross validation is also an option):

In [89]:
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)

In [116]:
tvsFitted = tvs.fit(train)

In [117]:
evaluator.evaluate(tvsFitted.transform(test))

0.8913043478260869

# Preprocessing and FE (25)

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# local meansthat driver and executers run on your individual
# computer instead of a cluster. 
spark = SparkSession.builder.master("local").appName("practice") \
        .getOrCreate()