## Pipeline & Model Selection
#### Agenda
<hr>
* Introduction to Pipeline
* Connecting Transformers & Estimators
* Persistence of Pipeline
* Hyper-parameter Tuning
* CrossValidation
* Splitting data for Training & Validation

<hr>

### Introduction to Pipeline
<hr>
* Data needs to be transformed, i.e. converted to a format that machine learning algorithms can understand.
* Once data is converted, it needs to be fed to machine learning models.
* With many possible configurations, this entire process can be tiresome if not automated.
* Pipelines are a solution to automate this entire process.

<img src="https://github.com/awantik/machine-learning-slides/blob/master/pipeline-ml2.png?raw=true" width="800px">

<hr>

In [3]:
# Load the adult-income CSV: the first row supplies the column names and
# column types are inferred from the data.
income_df = spark.read.csv(
    '/FileStore/tables/adult.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Render the loaded income DataFrame in the notebook.
display(income_df)

age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [5]:
# Names of the string-typed (categorical) columns of income_df.
# Built as a real list instead of with map(): on Python 3 a map object is a
# one-shot iterator, so it would be exhausted after the first loop over it
# and could not be reused by later cells.
cat_cols = [name for name, dtype in income_df.dtypes if dtype == 'string']

In [6]:
# Names of the non-string (numeric) columns of income_df.
# Built as a real list instead of with map(): on Python 3 a map object cannot
# be concatenated with a list or iterated more than once.
num_cols = [name for name, dtype in income_df.dtypes if dtype != 'string']

In [7]:
# Inspect the collection of non-string column names.
num_cols

In [8]:
# Inspect the collection of string-typed column names.
cat_cols

In [9]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [10]:
# One StringIndexer per categorical column; each writes its indices to a
# new column named "<column>_tf".
stringIndexers = [
    StringIndexer(inputCol=name, outputCol=name + '_tf')
    for name in cat_cols
]

In [11]:
# Feature columns for the VectorAssembler: every indexed categorical column
# except the label column 'income_tf', followed by the numeric columns.
# The indexed names are read back from the StringIndexers and everything is
# combined as plain lists, because `map(...) + list` raises TypeError on
# Python 3. Filtering (rather than an in-place .remove) keeps the cell
# safe to re-run.
indexed_cols = [indexer.getOutputCol() for indexer in stringIndexers]
vec_cols = [name for name in indexed_cols if name != 'income_tf'] + list(num_cols)

In [12]:
# Inspect the final list of feature column names.
vec_cols

In [13]:
# Combine all feature columns into a single vector column called 'features'.
va = VectorAssembler(
    outputCol='features',
    inputCols=vec_cols,
)

In [14]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.pipeline import Pipeline

In [15]:
# Two candidate classifiers wired to the same feature / label / prediction columns.
# NOTE(review): maxBins=45 is presumably sized to cover the highest-cardinality
# indexed categorical column — confirm against the data.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='income_tf',
    predictionCol='prediction',
    numTrees=10,
    maxBins=45,
)
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='income_tf',
    predictionCol='prediction',
    maxBins=45,
)

In [16]:
# One pipeline per classifier, in [gbt, rf] order:
# string indexers -> vector assembler -> classifier.
pipelines = [
    Pipeline(stages=stringIndexers + [va, classifier])
    for classifier in [gbt, rf]
]

In [17]:
# 80/20 train/test split. The seed is fixed so that re-running the notebook
# yields the same split, and therefore comparable models and metrics.
train_data, test_data = income_df.randomSplit([.8, .2], seed=42)

In [18]:
# Fit every pipeline on the training split; the result order matches `pipelines`.
pipeline_models = [candidate.fit(train_data) for candidate in pipelines]

In [19]:
# Score the held-out split with the first two fitted pipelines
# (index 0 -> pred_gbt, index 1 -> pred_rf).
pred_gbt, pred_rf = [fitted.transform(test_data) for fitted in pipeline_models[:2]]

In [20]:
# Render the predictions produced by the first fitted pipeline on the test split.
display(pred_gbt)

age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,workclass_tf,education_tf,marital-status_tf,occupation_tf,relationship_tf,race_tf,gender_tf,native-country_tf,income_tf,features,rawPrediction,probability,prediction
17,?,27251,11th,7,Widowed,?,Own-child,White,Male,0,0,40,United-States,<=50K,3.0,5.0,4.0,7.0,2.0,0.0,0.0,0.0,0.0,"List(1, 14, List(), List(3.0, 5.0, 4.0, 7.0, 2.0, 0.0, 0.0, 0.0, 17.0, 27251.0, 7.0, 0.0, 0.0, 40.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,34088,12th,8,Never-married,?,Own-child,White,Female,0,0,25,United-States,<=50K,3.0,11.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,"List(1, 14, List(), List(3.0, 11.0, 1.0, 7.0, 2.0, 0.0, 1.0, 0.0, 17.0, 34088.0, 8.0, 0.0, 0.0, 25.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,34505,11th,7,Never-married,?,Own-child,White,Male,0,0,50,United-States,<=50K,3.0,5.0,1.0,7.0,2.0,0.0,0.0,0.0,0.0,"List(1, 14, List(), List(3.0, 5.0, 1.0, 7.0, 2.0, 0.0, 0.0, 0.0, 17.0, 34505.0, 7.0, 0.0, 0.0, 50.0))","List(1, 2, List(), List(1.3848440202239471, -1.3848440202239471))","List(1, 2, List(), List(0.9410156735475067, 0.058984326452493296))",0.0
17,?,35603,11th,7,Never-married,?,Own-child,White,Female,0,0,16,United-States,<=50K,3.0,5.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,"List(1, 14, List(), List(3.0, 5.0, 1.0, 7.0, 2.0, 0.0, 1.0, 0.0, 17.0, 35603.0, 7.0, 0.0, 0.0, 16.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,45037,10th,6,Never-married,?,Own-child,White,Female,0,0,25,United-States,<=50K,3.0,7.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,"List(1, 14, List(), List(3.0, 7.0, 1.0, 7.0, 2.0, 0.0, 1.0, 0.0, 17.0, 45037.0, 6.0, 0.0, 0.0, 25.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,112942,10th,6,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K,3.0,7.0,1.0,7.0,2.0,0.0,0.0,0.0,0.0,"List(1, 14, List(), List(3.0, 7.0, 1.0, 7.0, 2.0, 0.0, 0.0, 0.0, 17.0, 112942.0, 6.0, 0.0, 0.0, 40.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,114798,11th,7,Never-married,?,Own-child,White,Female,0,0,18,United-States,<=50K,3.0,5.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,"List(1, 14, List(), List(3.0, 5.0, 1.0, 7.0, 2.0, 0.0, 1.0, 0.0, 17.0, 114798.0, 7.0, 0.0, 0.0, 18.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,127003,9th,5,Never-married,?,Own-child,Black,Male,0,0,40,United-States,<=50K,3.0,10.0,1.0,7.0,2.0,1.0,0.0,0.0,0.0,"List(1, 14, List(), List(3.0, 10.0, 1.0, 7.0, 2.0, 1.0, 0.0, 0.0, 17.0, 127003.0, 5.0, 0.0, 0.0, 40.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,143331,11th,7,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K,3.0,5.0,1.0,7.0,2.0,0.0,0.0,0.0,0.0,"List(1, 14, List(), List(3.0, 5.0, 1.0, 7.0, 2.0, 0.0, 0.0, 0.0, 17.0, 143331.0, 7.0, 0.0, 0.0, 40.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0
17,?,158762,10th,6,Never-married,?,Own-child,White,Female,0,0,20,United-States,<=50K,3.0,7.0,1.0,7.0,2.0,0.0,1.0,0.0,0.0,"List(1, 14, List(), List(3.0, 7.0, 1.0, 7.0, 2.0, 0.0, 1.0, 0.0, 17.0, 158762.0, 6.0, 0.0, 0.0, 20.0))","List(1, 2, List(), List(1.5206539415526015, -1.5206539415526015))","List(1, 2, List(), List(0.9544057760957669, 0.04559422390423307))",0.0


### Persisting the Pipeline
<hr>
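* On fitting a pipeline, all the estimators in it (feature estimators and the final model) are trained.
* Training uses high compute power.
* Trained model can be shipped to customer or validation team.
* Trained models can be loaded from Spark's other language APIs (Scala, Java, Python), because Spark ML persistence uses a language-neutral on-disk format (JSON metadata plus Parquet data) rather than PMML.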

In [22]:
# Pick the first fitted pipeline (the list was built in [gbt, rf] order).
selected_model = pipeline_models[0]

In [23]:
# Persist the fitted pipeline. A plain save() raises if the target path
# already exists, so go through the writer with overwrite() to keep this
# cell re-runnable.
selected_model.write().overwrite().save('GBT-Model-saved')

* Loading saved model

In [25]:
from pyspark.ml import PipelineModel
# Restore a fitted pipeline from the 'GBT-Model-saved' path.
gbt_model = PipelineModel.load('GBT-Model-saved')

### Hyper-parameter Tuning
<hr>
* Models need the right parameters to be configured.
* Finding the right parameters can be automated.
* The `pyspark.ml.tuning` module provides the tools for this.

In [27]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [28]:
# Load the author-attribution text data set: the first row supplies the
# column names and column types are inferred.
horror_data = spark.read.csv(
    '/FileStore/tables/horror_train-a88b4.csv',
    header=True,
    inferSchema=True,
)

In [29]:
# Render the loaded text DataFrame in the notebook.
display(horror_data)

id,text,author
id26305,"This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.",EAP
id17569,It never once occurred to me that the fumbling might be a mere mistake.,HPL
id11008,"In his left hand was a gold snuff box, from which, as he capered down the hill, cutting all manner of fantastic steps, he took snuff incessantly with an air of the greatest possible self satisfaction.",EAP
id27763,"How lovely is spring As we looked from Windsor Terrace on the sixteen fertile counties spread beneath, speckled by happy cottages and wealthier towns, all looked as in former years, heart cheering and fair.",MWS
id12958,"Finding nothing else, not even gold, the Superintendent abandoned his attempts; but a perplexed look occasionally steals over his countenance as he sits thinking at his desk.",HPL
id22965,"A youth passed in solitude, my best years spent under your gentle and feminine fosterage, has so refined the groundwork of my character that I cannot overcome an intense distaste to the usual brutality exercised on board ship: I have never believed it to be necessary, and when I heard of a mariner equally noted for his kindliness of heart and the respect and obedience paid to him by his crew, I felt myself peculiarly fortunate in being able to secure his services.",MWS
id09674,"The astronomer, perhaps, at this point, took refuge in the suggestion of non luminosity; and here analogy was suddenly let fall.",EAP
id13515,The surcingle hung in ribands from my body.,EAP
id19322,"I knew that you could not say to yourself 'stereotomy' without being brought to think of atomies, and thus of the theories of Epicurus; and since, when we discussed this subject not very long ago, I mentioned to you how singularly, yet with how little notice, the vague guesses of that noble Greek had met with confirmation in the late nebular cosmogony, I felt that you could not avoid casting your eyes upward to the great nebula in Orion, and I certainly expected that you would do so.",EAP
id00912,"I confess that neither the structure of languages, nor the code of governments, nor the politics of various states possessed attractions for me.",MWS


In [30]:
from pyspark.ml.feature import HashingTF, Tokenizer, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline

In [31]:
# Split the raw `text` column into a `words` column.
tokenizer = Tokenizer(
    outputCol="words",
    inputCol="text",
)

In [32]:
# Hash the tokenizer's output column into a term-frequency vector column `features`.
hashingTF = HashingTF(
    outputCol="features",
    inputCol=tokenizer.getOutputCol(),
)

In [33]:
# Index the `author` string column into a numeric `label` column.
# NOTE(review): handleInvalid="keep" presumably assigns unseen/invalid author
# values to an extra index instead of failing — confirm this is wanted for a
# label column.
labeler = StringIndexer(inputCol="author", outputCol="label", handleInvalid="keep")

In [34]:
# Logistic regression over the hashed features, limited to 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    labelCol="label",
    featuresCol="features",
)

In [36]:
# Search grid: 3 hashing sizes x 2 regularisation strengths = 6 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [38]:
# Text classification pipeline: tokenize -> hash -> index label -> logistic regression.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        labeler,
        lr,
    ]
)

In [39]:
# 5-fold cross-validation of the pipeline over every combination in paramGrid,
# scored by a default-configured multiclass evaluator.
crossval = CrossValidator(
    numFolds=5,
    evaluator=MulticlassClassificationEvaluator(),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)

In [40]:
# 80/20 train/test split. The seed is fixed so that re-running the notebook
# yields the same split, keeping tuning results and metrics comparable.
train, test = horror_data.randomSplit([.8, .2], seed=42)

In [41]:
# Run the cross-validated grid search on the training split.
crossval_model = crossval.fit(train)

In [42]:
# The pipeline model selected by cross-validation.
best_model = crossval_model.bestModel

In [43]:
# Score the cross-validated best model on the held-out test split.
# NOTE(review): the name `eval` shadows Python's built-in eval(); it is kept
# because later cells refer to it.
pred = best_model.transform(test)
eval = MulticlassClassificationEvaluator()
eval.evaluate(pred)

##### Extracting Hyper-Parameters

In [45]:
# numFeatures chosen by cross-validation for the HashingTF stage (index 1 in
# the pipeline). Read through the stage's public getter rather than the
# private _java_obj handle.
best_model.stages[1].getNumFeatures()

In [46]:
from pyspark.ml.tuning import TrainValidationSplit

In [47]:
# Single train/validation split tuner over the same pipeline and grid:
# 80% of the data is used for training, 20% for validation.
tvs = TrainValidationSplit(
    trainRatio=0.8,
    evaluator=MulticlassClassificationEvaluator(),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)

In [48]:
# Run the train/validation-split grid search on the training split.
tvs_model = tvs.fit(train)

In [49]:
# regParam of the stage at index 3 (the LogisticRegression position in the pipeline).
# NOTE(review): this goes through the private _java_obj handle, presumably
# because the Python model wrapper has no getter in the Spark version used —
# confirm, and prefer a public getter if one is available.
tvs_model.bestModel.stages[3]._java_obj.getRegParam()

In [50]:
# numFeatures chosen by the train/validation split for the HashingTF stage
# (index 1 in the pipeline). Read through the stage's public getter rather
# than the private _java_obj handle.
tvs_model.bestModel.stages[1].getNumFeatures()

In [51]:
# Score on the held-out test split only. Transforming the full horror_data
# would include the rows the model was trained on and inflate the metric.
pred = tvs_model.bestModel.transform(test)

In [52]:
# Default-configured multiclass evaluator.
# NOTE(review): the name `eval` shadows Python's built-in eval().
eval = MulticlassClassificationEvaluator()

In [53]:
# Compute the evaluator's metric for the current predictions.
eval.evaluate(pred)

##### Changing LogisticRegression to Multinomial Naive Bayes

In [55]:
from pyspark.ml.classification import NaiveBayes

In [56]:
# Multinomial Naive Bayes over the hashed features, with smoothing set to 1.0.
mnb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
    smoothing=1.0,
)

In [57]:
# Same text pipeline, with Multinomial Naive Bayes as the final estimator.
pipeline = Pipeline(stages=[tokenizer, hashingTF, labeler, mnb])
# Tune only parameters of stages that are part of this pipeline.
# lr.regParam belongs to LogisticRegression, which is not a stage here, so
# varying it would only repeat identical fits; tune NaiveBayes smoothing instead.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10000, 50000])
    .addGrid(mnb.smoothing, [0.1, 1.0])
    .build()
)

In [58]:
# Train/validation split tuner for the current pipeline and grid:
# 80% of the data is used for training, 20% for validation.
tvs = TrainValidationSplit(
    trainRatio=0.8,
    evaluator=MulticlassClassificationEvaluator(),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)

In [59]:
# Run the grid search for the current pipeline on the training split.
tvs_model = tvs.fit(train)

In [60]:
# Score the selected model on the held-out test split.
pred = tvs_model.bestModel.transform(test)

In [61]:
# Compute the evaluator's metric for the test-split predictions.
eval.evaluate(pred)