<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px"> 
# Spark MLlib Lab
---

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
 
# Select matplotlib's 'ggplot' style sheet and set seaborn's font scale to 1.5.
plt.style.use('ggplot')
sns.set(font_scale=1.5)
# Ask the inline backend for 'retina' figures and enable inline plotting.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Create the spark context

In [3]:
import pyspark as ps    # for the pyspark suite
import warnings         # for displaying warning
from pyspark.sql import SQLContext

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StandardScaler

In [4]:
# Obtain a SparkContext and an SQLContext that the rest of the notebook relies on.
try:
    # Start a local SparkContext with 4 worker threads ('local[4]').
    sc = ps.SparkContext('local[4]')
    print("Just created a SparkContext")
except ValueError:
    # A context is already running (e.g. inside the pyspark shell or after
    # re-running this cell): warn, then reuse it so that `sc` is always bound.
    warnings.warn("SparkContext already exists in this scope")
    sc = ps.SparkContext.getOrCreate()

# Build the SQLContext on both paths; later cells call sqlContext.createDataFrame
# and would otherwise fail with a NameError after the except branch.
sqlContext = SQLContext(sc)

Just created a SparkContext


## Label encoding categorical features

Often we have categorical features with values given as strings which we would like to transform to numerical values. The analogue of sklearn's `LabelEncoder` is the `StringIndexer`.

In [8]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

In [10]:
# Toy data: an integer id and a string-valued categorical column to be encoded.
ex_1_rows = [
    (4, "high"),
    (5, "low"),
    (6, "high"),
    (7, "high"),
    (8, 'medium'),
    (9, 'mega'),
]
ex_1 = sqlContext.createDataFrame(ex_1_rows, schema=["id", "label"])

In [11]:
# Indexer that reads the string column 'label' and writes 'label_index'.
string_indexer = StringIndexer(inputCol='label', outputCol='label_index')

In [12]:
# Fit the indexer on the toy data, then append the index column to it.
label_indexer_model = string_indexer.fit(ex_1)
ex_2 = label_indexer_model.transform(ex_1)
ex_2.show()

+---+------+-----------+
| id| label|label_index|
+---+------+-----------+
|  4|  high|        0.0|
|  5|   low|        1.0|
|  6|  high|        0.0|
|  7|  high|        0.0|
|  8|medium|        2.0|
|  9|  mega|        3.0|
+---+------+-----------+



In [13]:
# One-hot encoder for the indexed label; dropLast=True omits the last category.
onehot = OneHotEncoderEstimator(
    inputCols=['label_index'],
    outputCols=['label_index_1'],
    dropLast=True,
)

In [14]:
# Fit the encoder on the indexed toy data and print the encoded rows.
onehot_example_model = onehot.fit(ex_2)
onehot_example_model.transform(ex_2).show()

+---+------+-----------+-------------+
| id| label|label_index|label_index_1|
+---+------+-----------+-------------+
|  4|  high|        0.0|(3,[0],[1.0])|
|  5|   low|        1.0|(3,[1],[1.0])|
|  6|  high|        0.0|(3,[0],[1.0])|
|  7|  high|        0.0|(3,[0],[1.0])|
|  8|medium|        2.0|(3,[2],[1.0])|
|  9|  mega|        3.0|    (3,[],[])|
+---+------+-----------+-------------+



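The one-hot-encoded values are given as a sparse vector for each observation. The first number is the length of the sparse vector, the first bracketed list gives the positions of the non-zero entries, and the second bracketed list gives the values stored at those positions. As you can see from the last entry shown, the redundant last category is dropped (it is encoded as an all-zero vector); this is controlled by `dropLast`, which is `True` by default and is also set explicitly above.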

## Read in the car evaluation dataset 

```python
df = pd.read_csv('../../../../resource-datasets/car_evaluation/car.csv')
```

Use `acceptability` as target.

In [21]:
# Load the car evaluation data with pandas.
# The path is relative to the notebook (as given in the lab instructions) instead of
# an absolute path under one user's home directory, so the notebook runs on other
# machines. Adjust CAR_CSV_PATH if your checkout is laid out differently.
CAR_CSV_PATH = '../../../../resource-datasets/car_evaluation/car.csv'

df = pd.read_csv(CAR_CSV_PATH)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [22]:
# Convert the pandas frame into a Spark DataFrame and peek at its first row.
spark_df = sqlContext.createDataFrame(data=df)
spark_df.first()

Row(buying='vhigh', maint='vhigh', doors='2', persons='2', lug_boot='small', safety='low', acceptability='unacc')

In [23]:
# List (column name, type) pairs of the Spark DataFrame.
spark_df.dtypes

[('buying', 'string'),
 ('maint', 'string'),
 ('doors', 'string'),
 ('persons', 'string'),
 ('lug_boot', 'string'),
 ('safety', 'string'),
 ('acceptability', 'string')]

In [24]:
# Type information for the single column 'buying'.
buying_only = spark_df.select('buying')
buying_only.dtypes

[('buying', 'string')]

In [27]:
# Names of all columns whose Spark type is 'string'.
[col_name for col_name, col_type in spark_df.dtypes if col_type == 'string']

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability']

In [28]:
# Display the DataFrame's repr, which lists each column with its type.
spark_df

DataFrame[buying: string, maint: string, doors: string, persons: string, lug_boot: string, safety: string, acceptability: string]

## Dummify the categorical variables.

First use the `StringIndexer`, then the `OneHotEncoderEstimator`, to create the dummified variables. Be careful not to one-hot encode the target variable (`acceptability`).

In [108]:
# Append a numeric '<column>_index' column for every string column of spark_df
# (the target 'acceptability' included), accumulating the result in spark_2.
cols = [col_name for col_name, col_type in spark_df.dtypes if col_type == 'string']

spark_2 = spark_df
for col_name in cols:
    string_indexer = StringIndexer(inputCol=col_name, outputCol=col_name + '_index')
    # Each indexer is fitted on the frame produced by the previous iteration.
    indexer_model = string_indexer.fit(spark_2)
    spark_2 = indexer_model.transform(spark_2)

In [109]:
# Indexed feature columns to one-hot encode, selected by name instead of by
# hard-coded positions (7..12), so the selection does not depend on the column count.
# The target's index column is excluded because it must not be one-hot encoded.
new_cols = [
    col_name for col_name, _col_type in spark_2.dtypes
    if col_name.endswith('_index') and col_name != 'acceptability_index'
]

In [110]:
# One-hot encode every indexed feature column; each output column is named after
# its input column with the suffix '_index_'.
encoded_names = [col_name + '_index_' for col_name in new_cols]
onehot = OneHotEncoderEstimator(
    inputCols=new_cols,
    outputCols=encoded_names,
    dropLast=True,
)

onehot_model = onehot.fit(spark_2)
spark_2 = onehot_model.transform(spark_2)

## Prepare your feature columns with `VectorAssembler`

In [111]:
from pyspark.ml.feature import VectorAssembler

In [112]:
# One-hot (vector-typed) columns of spark_2, selected by type instead of by the
# hard-coded positions -1..-6. Iterating in reverse keeps the same order as the
# original positional selection (last column first).
dummy_cols = [
    col_name for col_name, col_type in reversed(spark_2.dtypes)
    if col_type == 'vector'
]

In [113]:
# Inspect the names of the columns that will be assembled into the feature vector.
dummy_cols

['persons_index_index_',
 'doors_index_index_',
 'lug_boot_index_index_',
 'buying_index_index_',
 'safety_index_index_',
 'maint_index_index_']

In [114]:
# Concatenate the one-hot columns into a single 'features' vector column.
vectorAssembler = VectorAssembler(
    inputCols=dummy_cols,
    outputCol='features',
)

# Persist the assembled frame; the call's return value is the cell's output.
vector_df = vectorAssembler.transform(spark_2)
vector_df.persist()

DataFrame[buying: string, maint: string, doors: string, persons: string, lug_boot: string, safety: string, acceptability: string, buying_index: double, maint_index: double, doors_index: double, persons_index: double, lug_boot_index: double, safety_index: double, acceptability_index: double, maint_index_index_: vector, safety_index_index_: vector, buying_index_index_: vector, lug_boot_index_index_: vector, doors_index_index_: vector, persons_index_index_: vector, features: vector]

In [115]:
# Print the first five assembled feature vectors.
features_preview = vector_df.select('features')
features_preview.show(5)

+--------------------+
|            features|
+--------------------+
|(15,[4,9,11,14],[...|
|(15,[4,9,10,14],[...|
|(15,[4,9,14],[1.0...|
|(15,[4,5,9,11,14]...|
|(15,[4,5,9,10,14]...|
+--------------------+
only showing top 5 rows



## Fit and evaluate a spark decision tree model and tune with grid search

Once done, also try other models.

In [116]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [117]:
# Column names of spark_2 picked by fixed positions.
# NOTE(review): final_var is not referenced in any other visible cell, and the index
# list stops at 18 although spark_2 appears to have columns up to position 19
# (judging by the schema printed for vector_df) — confirm whether this is leftover code.
final_var = [spark_2.dtypes[x][0] for x in [6,13,14,15,16,17,18]]

In [124]:
# Decision tree that reads the assembled 'features' vector and predicts the
# indexed target column.
model = DecisionTreeClassifier(
    labelCol='acceptability_index',
    featuresCol='features',
)

In [122]:
# 70/30 train/test split with a fixed seed.
split_weights = [0.7, 0.3]
data_train, data_test = vector_df.randomSplit(split_weights, seed=1)

In [133]:
# Print the first five feature vectors, truncating cells at 50 characters.
features_wide = vector_df.select('features')
features_wide.show(5, truncate=50)


+----------------------------------------+
|                                features|
+----------------------------------------+
|      (15,[4,9,11,14],[1.0,1.0,1.0,1.0])|
|      (15,[4,9,10,14],[1.0,1.0,1.0,1.0])|
|             (15,[4,9,14],[1.0,1.0,1.0])|
|(15,[4,5,9,11,14],[1.0,1.0,1.0,1.0,1.0])|
|(15,[4,5,9,10,14],[1.0,1.0,1.0,1.0,1.0])|
+----------------------------------------+
only showing top 5 rows



In [134]:
# Print the first five values of the indexed target column.
label_preview = vector_df.select('acceptability_index')
label_preview.show(5)

+-------------------+
|acceptability_index|
+-------------------+
|                0.0|
|                0.0|
|                0.0|
|                0.0|
|                0.0|
+-------------------+
only showing top 5 rows



In [137]:
# Single-stage pipeline wrapping the decision tree.
pipeline = Pipeline(stages=[model])

# Accuracy of 'prediction' against the indexed target column.
evaluator = MulticlassClassificationEvaluator(
                    predictionCol='prediction',
                    labelCol='acceptability_index',
                    metricName='accuracy')

# Candidate tree depths to compare.
paramGrid = ParamGridBuilder() \
    .addGrid(model.maxDepth, [3, 4, 5]) \
    .build()

# the actual gridsearch
# A fixed seed makes the fold assignment, and therefore the reported scores,
# reproducible across kernel restarts.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=1)

# Run cross-validation, and choose the best set of parameters.
model_fit = crossval.fit(data_train)

# avgMetrics holds one average score per entry of paramGrid, in the same order.
print('Average cv scores:')
print(np.around(np.array(model_fit.avgMetrics),4))

Average cv scores:
[0.7811 0.7996 0.8155]


In [140]:
# The first (and only) pipeline stage of the best model; its parameter values are
# read from the underlying Java object.
best_stage = model_fit.bestModel.stages[0]
java_model = best_stage._java_obj

print('Best model parameters:')
# Report only the parameters that were part of the grid.
best_params = {}
for param in paramGrid[0]:
    java_param = java_model.getParam(param.name)
    best_params[param.name] = java_model.getOrDefault(java_param)
print(best_params)
print()
#print(java_model.explainParams())

# Score the held-out split with the best model found by cross-validation.
predictions = model_fit.transform(data_test)
test_accuracy = evaluator.evaluate(predictions)

print('Best model test accuracy:')
print(test_accuracy)

Best model parameters:
{'maxDepth': 5}

Best model test accuracy:
0.8287937743190662
