In [None]:
import pyspark
# Reuse a live SparkContext when this cell is re-run: constructing a second
# context in the same kernel raises an error. When none exists, a new local
# context is started with master 'local[*]'.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext above; later cells use it for
# `createDataFrame` and `read`.
sqlc = SQLContext(sparkContext=sc)

## Spark ML Tuning

### Elements:
- Model (Estimator or Pipeline)
- Set of ParamMaps (to perform the grid search) - you should use the ***ParamGridBuilder*** utility
- Evaluator (to assess the fitness of the model)

## Cross-Validation

- Splits the dataset into K folds
- Each fold is used once as the test set while the remaining K-1 folds form the training set (with K=3: 2/3 training, 1/3 test)
- It will fit K models and compute the average of the K evaluation metrics (according to the Evaluator)
- Based on the metrics, it will determine the best set of parameters
- Then it will fit the model one final time, using this set of parameters and the whole dataset
- This is VERY computationally expensive

In [None]:
!rm -rf metastore_db/*.lck

training = sqlc.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0)
], ["id", "text", "label"])

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer

# Stage 1: split the raw "text" column into a "words" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: hash the tokenizer's output column into a "features" vector.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: the classifier; regParam is left for the grid search to set.
lr = LogisticRegression(maxIter=10)

# Chain the three stages; `hashingTF` and `lr` are referenced again when
# building the parameter grid.
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# 3 feature-space sizes x 2 regularisation strengths = 6 candidate ParamMaps.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

In [None]:
# Show the ParamMaps produced by the builder (one per numFeatures/regParam combination).
paramGrid

In [None]:
from pyspark.ml.tuning import CrossValidator

# 3-fold cross-validation of the whole pipeline over every ParamMap in
# `paramGrid`, scored with a default-configured BinaryClassificationEvaluator.
# A fixed seed keeps the fold assignment -- and therefore the metrics and the
# selected model -- reproducible across Restart & Run All.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=42)

# Fits one model per (ParamMap, fold) pair, then the final best model.
cvModel = crossval.fit(training)

In [None]:
# Cross-validated evaluator metrics (averaged over the folds, per the notes above);
# presumably one entry per ParamMap, in `paramGrid` order -- confirm.
cvModel.avgMetrics

In [None]:
# The model selected by cross-validation.
cvModel.bestModel

In [None]:
# Stages of the selected model, in the order the pipeline was built:
# tokenizer, hashingTF, lr.
cvModel.bestModel.stages

In [None]:
# The logistic regression is the last of the three pipeline stages.
lr_best = cvModel.bestModel.stages[-1]

In [None]:
# Coefficients of the selected logistic regression stage.
lr_best.coefficients

In [None]:
# Keep the selected model's summary object for the metric lookup in the next cell.
lr_summary = lr_best.summary

In [None]:
# Area under the ROC curve as reported by the selected model's summary.
lr_summary.areaUnderROC

In [None]:
# Unlabelled documents to score with the cross-validated model.
test = sqlc.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "mapreduce spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [None]:
# Score the unlabelled documents with the cross-validated model.
prediction = cvModel.transform(test)

# Keep only the identifying columns plus the model outputs.
output_columns = ["id", "text", "probability", "prediction"]
selected = prediction.select(*output_columns)

# Bring the rows to the driver and print them one per line.
for scored_row in selected.collect():
    print(scored_row)

## Train-Validation Split

- It uses the entire dataset
- The dataset is split once into a training set and a test (validation) set according to the ***trainRatio*** parameter
- It will fit a model for each set of parameters and evaluate its metrics (according to the Evaluator)
- Based on the metrics, it will determine the best set of parameters
- Then it will fit the model one final time, using this set of parameters and the whole dataset
- This is much less expensive than cross-validation, but it may not yield good results if the dataset is not large enough

In [None]:
from pyspark.ml.regression import LinearRegression

# Sample regression data in libsvm format, read from a fixed Spark install path.
data = sqlc.read.format("libsvm").load("/usr/local/spark/data/mllib/sample_linear_regression_data.txt")

# 70/30 hold-out split; the seed makes the split (and every downstream number)
# reproducible across Restart & Run All.
# NOTE: `test` and `lr` deliberately rebind names used in the cross-validation
# section above.
train, test = data.randomSplit([0.7, 0.3], seed=42)

# Base estimator; regParam here is overridden by the values in the parameter grid.
lr = LinearRegression(maxIter=20, regParam=0.1)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# 2 regularisation strengths x 3 elastic-net mixing values = 6 candidate ParamMaps.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [None]:
# Show the ParamMaps produced by the builder (one per regParam/elasticNetParam combination).
paramGrid

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit

# Single train/validation split (80% train, 20% validation) over every ParamMap
# in `paramGrid`, scored with a default-configured RegressionEvaluator.
# A fixed seed keeps the internal split -- and therefore the selected
# parameters -- reproducible across Restart & Run All.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8,
                           seed=42)

# Tuning only sees `train`; `test` stays held out for the prediction cell.
model = tvs.fit(train)

In [None]:
# The linear regression model selected by the train-validation split.
model.bestModel

In [None]:
# Coefficients of the selected linear regression model.
model.bestModel.coefficients

In [None]:
# R^2 as reported by the selected model's summary.
model.bestModel.summary.r2

In [None]:
# Score the held-out 30% split with the tuned model (rebinds `prediction`
# from the cross-validation section).
prediction = model.transform(test)

In [None]:
# Preview the first 5 predictions. Limiting on the Spark side means only those
# rows are collected to the driver, instead of converting the whole DataFrame
# to pandas and slicing afterwards.
prediction.limit(5).toPandas()

In [None]:
# Shut down the SparkContext created in the first cell; cells above must be
# re-run from the top after this.
sc.stop()