Links to other notebooks in the same folder:
<a href='http://pivotal.io/data-science'><img src='https://raw.githubusercontent.com/crawles/Logos/master/Pivotal_TealOnWhite.png' width='200px' align='right'></a>

<nav class="navbar navbar-light bg-faded">
    <ul class="nav navbar-nav">
        <li class="">
            <a class="nav-link" href="MLlib Example.ipynb">MLlib Example</a>
        </li>
        <li class="">
            <a class="nav-link">ML Example</a>
        </li>
    </ul>
</nav>


# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Import-useful-libraries" data-toc-modified-id="Import-useful-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import useful libraries</a></div><div class="lev1 toc-item"><a href="#Data" data-toc-modified-id="Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data</a></div><div class="lev2 toc-item"><a href="#Spark-DataFrames" data-toc-modified-id="Spark-DataFrames-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Spark DataFrames</a></div><div class="lev1 toc-item"><a href="#Operations-on-DataFrames" data-toc-modified-id="Operations-on-DataFrames-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Operations on DataFrames</a></div><div class="lev1 toc-item"><a href="#Modelling" data-toc-modified-id="Modelling-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelling</a></div>

# Import useful libraries

In [1]:
from contextlib import closing
from datetime import datetime
from dateutil.relativedelta import relativedelta
import getopt
import os
import sys
import urllib
try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

from IPython.core.display import display, HTML
from IPython.core.magic import register_cell_magic, register_line_cell_magic, register_line_magic
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import seaborn as sns

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql import SQLContext

In [2]:
# Changes logo to a Pivotal logo.
# `urlopen` is resolved by the imports cell for both Python 2 and Python 3,
# and `closing` releases the HTTP response once the script has been read.
# NOTE(review): this injects JavaScript fetched from a remote URL into the
# notebook page on every run -- confirm the source is trusted, or vendor the
# file next to the notebook.
with closing(urlopen("https://raw.githubusercontent.com/crawles/Logos/master/jupyterPrefs.js")) as response:
    # read() returns bytes on Python 3; decode so the script text (not a
    # bytes repr) is embedded in the <script> tag.
    jPrefs = response.read().decode('utf-8')
HTML(u'<script>{}</script>'.format(jPrefs))

In [3]:
# Set default cell width
display(HTML('<style>.container {width:80% !important;}</style>'))

# Set default matplotlib settings (applied in one call so the defaults are
# easy to scan and tweak together)
mpl.rcParams.update({
    'figure.figsize': (10, 7),
    'lines.linewidth': 3,
    'figure.titlesize': 26,
    'axes.labelsize': 18,
    'axes.titlesize': 22,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 16,
})

# Set seaborn colours.
# The unpacking below requires the current palette to hold exactly six colours.
blue, green, red, purple, yellow, cyan = sns.color_palette()

# Spark stuff.
# `SQLContext` comes from the imports cell. `sc` is not created anywhere in
# this notebook -- presumably the SparkContext is provided by the
# PySpark-launched kernel; confirm before running on a plain kernel.
sqlContext = SQLContext(sc)

# Data

In [4]:
# Column headers for the abalone data, supplied explicitly via `names=`
column_names = [
    "sex", "length", "diameter", "height", "whole weight",
    "shucked weight", "viscera weight", "shell weight", "rings",
]
# Read the CSV, then replace the single-letter sex codes with integers
abalone_df = pd.read_csv('abalone.csv', names=column_names).assign(
    sex=lambda frame: frame['sex'].map({'F': 0, 'I': 1, 'M': 2})
)
abalone_df.head()

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Spark DataFrames
Spark is moving towards DataFrames, rather than RDDs, as its primary API. Background reading:
- <a href="https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html">https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html</a>
- <a href="https://www.quora.com/Why-are-there-two-ML-implementations-in-Spark-ML-and-MLlib-and-what-are-their-different-features">https://www.quora.com/Why-are-there-two-ML-implementations-in-Spark-ML-and-MLlib-and-what-are-their-different-features</a>

In [5]:
# Build a Spark DataFrame from the pandas DataFrame `abalone_df`
spark_df = sqlContext.createDataFrame(abalone_df)
# Fetch the first five rows; the result is a list of Row objects
spark_df.take(5)

[Row(sex=2, length=0.455, diameter=0.365, height=0.095, whole weight=0.514, shucked weight=0.2245, viscera weight=0.10099999999999999, shell weight=0.15, rings=15),
 Row(sex=2, length=0.35, diameter=0.265, height=0.09, whole weight=0.2255, shucked weight=0.0995, viscera weight=0.0485, shell weight=0.07, rings=7),
 Row(sex=0, length=0.53, diameter=0.42, height=0.135, whole weight=0.677, shucked weight=0.2565, viscera weight=0.1415, shell weight=0.21, rings=9),
 Row(sex=2, length=0.44, diameter=0.365, height=0.125, whole weight=0.516, shucked weight=0.2155, viscera weight=0.114, shell weight=0.155, rings=10),
 Row(sex=1, length=0.33, diameter=0.255, height=0.08, whole weight=0.205, shucked weight=0.0895, viscera weight=0.0395, shell weight=0.055, rings=7)]

In [6]:
# Convert to Pandas DataFrame.
# Limit on the Spark side first: toPandas() loads every row of the DataFrame
# it is called on into driver memory, so only the five preview rows are
# collected instead of the full dataset.
spark_df.limit(5).toPandas()

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# Operations on DataFrames

In [7]:
# Apply filters
# The condition is a column expression: keep rows whose `length` exceeds 0.3,
# then fetch the first five of them
spark_df.filter(spark_df.length > 0.3).take(5)

[Row(sex=2, length=0.455, diameter=0.365, height=0.095, whole weight=0.514, shucked weight=0.2245, viscera weight=0.10099999999999999, shell weight=0.15, rings=15),
 Row(sex=2, length=0.35, diameter=0.265, height=0.09, whole weight=0.2255, shucked weight=0.0995, viscera weight=0.0485, shell weight=0.07, rings=7),
 Row(sex=0, length=0.53, diameter=0.42, height=0.135, whole weight=0.677, shucked weight=0.2565, viscera weight=0.1415, shell weight=0.21, rings=9),
 Row(sex=2, length=0.44, diameter=0.365, height=0.125, whole weight=0.516, shucked weight=0.2155, viscera weight=0.114, shell weight=0.155, rings=10),
 Row(sex=1, length=0.33, diameter=0.255, height=0.08, whole weight=0.205, shucked weight=0.0895, viscera weight=0.0395, shell weight=0.055, rings=7)]

In [8]:
# Apply filters using SQL syntax. Cannot put a ';' at the end or it will throw an error.
# Register the DataFrame under the name 'spark_df' so the SQL text below can
# refer to it as a table.
# NOTE(review): registerTempTable is deprecated from Spark 2.0 in favour of
# createOrReplaceTempView -- confirm the target Spark version before switching.
spark_df.registerTempTable('spark_df')
sql = '''
SELECT *
  FROM spark_df
 WHERE length > 0.3
'''
# Run the query and fetch the first five matching rows
sqlContext.sql(sql).take(5)

[Row(sex=2, length=0.455, diameter=0.365, height=0.095, whole weight=0.514, shucked weight=0.2245, viscera weight=0.10099999999999999, shell weight=0.15, rings=15),
 Row(sex=2, length=0.35, diameter=0.265, height=0.09, whole weight=0.2255, shucked weight=0.0995, viscera weight=0.0485, shell weight=0.07, rings=7),
 Row(sex=0, length=0.53, diameter=0.42, height=0.135, whole weight=0.677, shucked weight=0.2565, viscera weight=0.1415, shell weight=0.21, rings=9),
 Row(sex=2, length=0.44, diameter=0.365, height=0.125, whole weight=0.516, shucked weight=0.2155, viscera weight=0.114, shell weight=0.155, rings=10),
 Row(sex=1, length=0.33, diameter=0.255, height=0.08, whole weight=0.205, shucked weight=0.0895, viscera weight=0.0395, shell weight=0.055, rings=7)]

In [9]:
# Project a subset of columns by passing the column names as separate arguments
spark_df.select('sex', 'length').take(5)

[Row(sex=2, length=0.455),
 Row(sex=2, length=0.35),
 Row(sex=0, length=0.53),
 Row(sex=2, length=0.44),
 Row(sex=1, length=0.33)]

In [10]:
# select() also accepts the column names as a single list
spark_df.select(['sex', 'length']).take(5)

[Row(sex=2, length=0.455),
 Row(sex=2, length=0.35),
 Row(sex=0, length=0.53),
 Row(sex=2, length=0.44),
 Row(sex=1, length=0.33)]

In [11]:
# Split the rows roughly 80/20 into training and test sets.
# A fixed seed keeps the split -- and every result derived from it -- the same
# across Restart & Run All; without it each run trains and evaluates on
# different rows.
train_df, test_df = spark_df.randomSplit([0.8, 0.2], seed=42)
test_df.take(5)

[Row(sex=2, length=0.44, diameter=0.365, height=0.125, whole weight=0.516, shucked weight=0.2155, viscera weight=0.114, shell weight=0.155, rings=10),
 Row(sex=1, length=0.425, diameter=0.3, height=0.095, whole weight=0.3515, shucked weight=0.141, viscera weight=0.0775, shell weight=0.12, rings=8),
 Row(sex=0, length=0.53, diameter=0.415, height=0.15, whole weight=0.7775, shucked weight=0.237, viscera weight=0.1415, shell weight=0.33, rings=20),
 Row(sex=0, length=0.47, diameter=0.355, height=0.1, whole weight=0.4755, shucked weight=0.1675, viscera weight=0.0805, shell weight=0.185, rings=10),
 Row(sex=0, length=0.44, diameter=0.34, height=0.1, whole weight=0.451, shucked weight=0.188, viscera weight=0.087, shell weight=0.13, rings=10)]

# Modelling

Modelling requires two input columns: a label and a features column. We therefore need to combine all of our individual feature columns into a single Vector column.

In [12]:
# Combine every column except the label ('sex') into a single 'features'
# vector column. The label is excluded by name rather than by position so the
# cell does not silently depend on 'sex' being the first column.
assembler = VectorAssembler(
    inputCols=[col for col in train_df.columns if col != 'sex'],
    outputCol='features',
)
assembler

VectorAssembler_41b0a3e21d30c933b499

In [13]:
# Add the assembled 'features' vector to each split and keep only the label
# ('sex') and 'features' columns.
# NOTE(review): both names are overwritten with the reduced frames, so the raw
# feature columns are no longer available afterwards; re-running this cell
# without re-running the split is expected to fail because the assembler's
# input columns are gone -- confirm, and consider distinct output names.
train_df = assembler.transform(train_df).select('sex', 'features')

test_df = assembler.transform(test_df).select('sex', 'features')
test_df.take(5)

[Row(sex=2, features=DenseVector([0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155, 10.0])),
 Row(sex=1, features=DenseVector([0.425, 0.3, 0.095, 0.3515, 0.141, 0.0775, 0.12, 8.0])),
 Row(sex=0, features=DenseVector([0.53, 0.415, 0.15, 0.7775, 0.237, 0.1415, 0.33, 20.0])),
 Row(sex=0, features=DenseVector([0.47, 0.355, 0.1, 0.4755, 0.1675, 0.0805, 0.185, 10.0])),
 Row(sex=0, features=DenseVector([0.44, 0.34, 0.1, 0.451, 0.188, 0.087, 0.13, 10.0]))]

Next, we need to convert our label with StringIndexer, which maps each categorical value to a numeric index. Indices are assigned in order of descending frequency, so the most common label gets index 0.

In [14]:
# Index the 'sex' label into a numeric 'sex_label' column.
stringIndexer = StringIndexer(inputCol='sex', outputCol='sex_label')

# Fit the indexer ONCE, on the training data only. StringIndexer assigns
# indices by label frequency, so a second fit on the test split could order
# the classes differently and map the same 'sex' value to a different
# 'sex_label' than the model was trained on.
si_model = stringIndexer.fit(train_df)
train_input = si_model.transform(train_df)

# Reuse the training-fitted mapping so train and test labels are encoded
# identically.
test_input = si_model.transform(test_df)

test_input.take(5)

[Row(sex=2, features=DenseVector([0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155, 10.0]), sex_label=0.0),
 Row(sex=1, features=DenseVector([0.425, 0.3, 0.095, 0.3515, 0.141, 0.0775, 0.12, 8.0]), sex_label=1.0),
 Row(sex=0, features=DenseVector([0.53, 0.415, 0.15, 0.7775, 0.237, 0.1415, 0.33, 20.0]), sex_label=2.0),
 Row(sex=0, features=DenseVector([0.47, 0.355, 0.1, 0.4755, 0.1675, 0.0805, 0.185, 10.0]), sex_label=2.0),
 Row(sex=0, features=DenseVector([0.44, 0.34, 0.1, 0.451, 0.188, 0.087, 0.13, 10.0]), sex_label=2.0)]

Finally, we can set up and run our model.

In [15]:
# Set up model: a small random forest (3 trees, depth 2) that reads the
# assembled 'features' vector and predicts the indexed 'sex_label'.
# The seed fixes the forest's random sampling so the fitted model is the same
# on every run.
rf = RandomForestClassifier(
    numTrees=3,
    maxDepth=2,
    featuresCol='features',
    labelCol='sex_label',
    seed=42,
)
# Train the model
model = rf.fit(train_input)
model

RandomForestClassificationModel (uid=rfc_fd9333dd79c0) with 3 trees

Now, we apply the model to the test set.

In [16]:
# Score the test set with the trained model; the output rows shown below carry
# the added rawPrediction, probability and prediction columns
model_results = model.transform(test_input)
# Fetch the first five scored rows
model_results.take(5)

[Row(sex=2, features=DenseVector([0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155, 10.0]), sex_label=0.0, rawPrediction=DenseVector([0.8686, 0.8395, 1.2919]), probability=DenseVector([0.2895, 0.2798, 0.4306]), prediction=2.0),
 Row(sex=1, features=DenseVector([0.425, 0.3, 0.095, 0.3515, 0.141, 0.0775, 0.12, 8.0]), sex_label=1.0, rawPrediction=DenseVector([0.4699, 0.2069, 2.3232]), probability=DenseVector([0.1566, 0.069, 0.7744]), prediction=2.0),
 Row(sex=0, features=DenseVector([0.53, 0.415, 0.15, 0.7775, 0.237, 0.1415, 0.33, 20.0]), sex_label=2.0, rawPrediction=DenseVector([1.1247, 0.9742, 0.9012]), probability=DenseVector([0.3749, 0.3247, 0.3004]), prediction=0.0),
 Row(sex=0, features=DenseVector([0.47, 0.355, 0.1, 0.4755, 0.1675, 0.0805, 0.185, 10.0]), sex_label=2.0, rawPrediction=DenseVector([0.8885, 0.6398, 1.4716]), probability=DenseVector([0.2962, 0.2133, 0.4905]), prediction=2.0),
 Row(sex=0, features=DenseVector([0.44, 0.34, 0.1, 0.451, 0.188, 0.087, 0.13, 10.0]), sex_label=

In [17]:
# Preview the first five scored rows as a pandas DataFrame.
# Limit on the Spark side first: toPandas() loads every row of the DataFrame
# it is called on into driver memory, so only the preview rows are collected.
model_results.limit(5).toPandas()

Unnamed: 0,sex,features,sex_label,rawPrediction,probability,prediction
0,2,"[0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.1...",0.0,"[0.868608276782, 0.839513639863, 1.29187808336]","[0.289536092261, 0.279837879954, 0.430626027785]",2.0
1,1,"[0.425, 0.3, 0.095, 0.3515, 0.141, 0.0775, 0.1...",1.0,"[0.469944725414, 0.206875786535, 2.32317948805]","[0.156648241805, 0.0689585955116, 0.774393162684]",2.0
2,0,"[0.53, 0.415, 0.15, 0.7775, 0.237, 0.1415, 0.3...",2.0,"[1.12465877833, 0.974155782889, 0.901185438783]","[0.374886259443, 0.324718594296, 0.300395146261]",0.0
3,0,"[0.47, 0.355, 0.1, 0.4755, 0.1675, 0.0805, 0.1...",2.0,"[0.888513354988, 0.639838083869, 1.47164856114]","[0.296171118329, 0.21327936129, 0.490549520381]",2.0
4,0,"[0.44, 0.34, 0.1, 0.451, 0.188, 0.087, 0.13, 1...",2.0,"[0.888513354988, 0.639838083869, 1.47164856114]","[0.296171118329, 0.21327936129, 0.490549520381]",2.0
