Links to other notebooks in the same folder:
<a href='http://pivotal.io/data-science'><img src='https://raw.githubusercontent.com/crawles/Logos/master/Pivotal_TealOnWhite.png' width='200px' align='right'></a>

<nav class="navbar navbar-light bg-faded">
    <ul class="nav navbar-nav">
        <li class="">
            <a class="nav-link">MLlib Example</a>
        </li>
        <li class="">
            <a class="nav-link" href="ML Example.ipynb">ML Example</a>
        </li>

# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Import-useful-libraries" data-toc-modified-id="Import-useful-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import useful libraries</a></div><div class="lev1 toc-item"><a href="#Data" data-toc-modified-id="Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data</a></div><div class="lev2 toc-item"><a href="#Converting-our-DF-to-an-RDD" data-toc-modified-id="Converting-our-DF-to-an-RDD-21"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Converting our DF to an RDD</a></div><div class="lev1 toc-item"><a href="#Train/test-Split" data-toc-modified-id="Train/test-Split-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train/test Split</a></div><div class="lev1 toc-item"><a href="#Random-Forest" data-toc-modified-id="Random-Forest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Random Forest</a></div><div class="lev1 toc-item"><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Logistic Regression</a></div><div class="lev2 toc-item"><a href="#Convert-Using-SQLContext" data-toc-modified-id="Convert-Using-SQLContext-51"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Convert Using SQLContext</a></div>

# Import useful libraries

In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import getopt
import os
import sys
import urllib

from IPython.core.display import display, HTML
from IPython.core.magic import register_cell_magic, register_line_cell_magic, register_line_magic
# If we want to move the graph
# %matplotlib notebook
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2

from pyspark.mllib.classification import LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.sql import DataFrame, Row, SQLContext
from pyspark.sql.types import *

import seaborn as sns

In [2]:
# Changes logo to a Pivotal logo
jPrefs = urllib.urlopen("https://raw.githubusercontent.com/crawles/Logos/master/jupyterPrefs.js").read()
HTML('<script>{}</script>'.format(jPrefs))

In [3]:
# Set default cell width
display(HTML('<style>.container {width:80% !important;}</style>'))

# Set default matplotlib settings
mpl.rcParams['figure.figsize'] = (10, 7)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['figure.titlesize'] = 26
mpl.rcParams['axes.labelsize'] = 18
mpl.rcParams['axes.titlesize'] = 22
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14
mpl.rcParams['legend.fontsize'] = 16

# Set seaborn colours
blue, green, red, purple, yellow, cyan = sns.color_palette()

# Spark stuff
sqlContext = SQLContext(sc)

# Data

In [4]:
column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
abalone_df = pd.read_csv('abalone.csv', names=column_names)
abalone_df['sex'] = abalone_df['sex'].map({'F': 0, 'I': 1, 'M': 2})
abalone_df.head()

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Converting our DF to an RDD
Here, we take our DataFrame and convert it to a <code>np.array</code>. We can then run the <code>sc.parallelize</code> function on this to create our RDD.

In [5]:
ml_rdd = sc.parallelize(np.array(abalone_df).tolist())
ml_rdd.take(5)

[[2.0, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.10099999999999999, 0.15, 15.0],
 [2.0, 0.35, 0.265, 0.09, 0.2255, 0.0995, 0.0485, 0.07, 7.0],
 [0.0, 0.53, 0.42, 0.135, 0.677, 0.2565, 0.1415, 0.21, 9.0],
 [2.0, 0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155, 10.0],
 [1.0, 0.33, 0.255, 0.08, 0.205, 0.0895, 0.0395, 0.055, 7.0]]

Next, we map this to a <code>LabeledPoint</code>, where the first column, <code>sex</code>, is our label and the rest are features.

In [6]:
ml_rdd = ml_rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
ml_rdd.take(10)

[LabeledPoint(2.0, [0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15.0]),
 LabeledPoint(2.0, [0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7.0]),
 LabeledPoint(0.0, [0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9.0]),
 LabeledPoint(2.0, [0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10.0]),
 LabeledPoint(1.0, [0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7.0]),
 LabeledPoint(1.0, [0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8.0]),
 LabeledPoint(0.0, [0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20.0]),
 LabeledPoint(0.0, [0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16.0]),
 LabeledPoint(2.0, [0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9.0]),
 LabeledPoint(0.0, [0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19.0])]

# Train/test Split

Here, we split up our data into a training and test set. We can do this by creating a random column and filtering on it.

In [7]:
train_rdd, test_rdd = ml_rdd.randomSplit([0.8, 0.2])

In [8]:
train_rdd.take(5)

[LabeledPoint(2.0, [0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15.0]),
 LabeledPoint(2.0, [0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7.0]),
 LabeledPoint(0.0, [0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9.0]),
 LabeledPoint(1.0, [0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7.0]),
 LabeledPoint(1.0, [0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8.0])]

In [9]:
test_rdd.take(5)

[LabeledPoint(2.0, [0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10.0]),
 LabeledPoint(0.0, [0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16.0]),
 LabeledPoint(2.0, [0.49,0.38,0.135,0.5415,0.2175,0.095,0.19,11.0]),
 LabeledPoint(0.0, [0.535,0.405,0.145,0.6845,0.2725,0.171,0.205,10.0]),
 LabeledPoint(0.0, [0.44,0.34,0.1,0.451,0.188,0.087,0.13,10.0])]

# Random Forest

Now, we can plug in our ml_rdd into a random forest classifier.

In [10]:
rf_model = RandomForest.trainClassifier(train_rdd, 3, categoricalFeaturesInfo={}, numTrees=50)

Next, we use the <code>predict</code> function to get our mode predictions. We can also use the <code>zip</code> function to append the labels to the predictions.

In [11]:
preds = rf_model.predict(test_rdd.map(lambda x: x.features))
results_rdd =  test_rdd.map(lambda x: x.label).zip(preds)
results_rdd.take(10)

[(2.0, 1.0),
 (0.0, 2.0),
 (2.0, 1.0),
 (0.0, 2.0),
 (0.0, 1.0),
 (2.0, 1.0),
 (0.0, 2.0),
 (0.0, 2.0),
 (2.0, 1.0),
 (0.0, 2.0)]

In [12]:
results_rdd.map(lambda x: int(x[0] == x[1])).mean()

0.5266903914590748

# Logistic Regression
Now we will apply a logistic regresison model. We need a binary variable, however, so we will predict whether <code>rings</code> is greater than 14. This first requires us to transform the <code>rings</code> variable to a binary indicator.

In [13]:
log_rdd = ml_rdd.map(lambda x: LabeledPoint(x.label, x.features[1:-1].tolist() + [int(x.features[-1] > 14)]))
log_rdd = ml_rdd.map(lambda x: LabeledPoint(int(x.features[-1] > 14), [x.label] + x.features[1:-1].tolist()))

In [14]:
log_rdd.take(5)

[LabeledPoint(1.0, [2.0,0.365,0.095,0.514,0.2245,0.101,0.15]),
 LabeledPoint(0.0, [2.0,0.265,0.09,0.2255,0.0995,0.0485,0.07]),
 LabeledPoint(0.0, [0.0,0.42,0.135,0.677,0.2565,0.1415,0.21]),
 LabeledPoint(0.0, [2.0,0.365,0.125,0.516,0.2155,0.114,0.155]),
 LabeledPoint(0.0, [1.0,0.255,0.08,0.205,0.0895,0.0395,0.055])]

In [15]:
train_rdd, test_rdd = log_rdd.randomSplit([0.8, 0.2])

In [16]:
log_model = LogisticRegressionWithSGD.train(train_rdd, regType='l1', intercept=True)

In [17]:
preds = log_model.predict(test_rdd.map(lambda x: x.features))
results_rdd = test_rdd.map(lambda x: x.label).zip(preds)
results_rdd.take(10)

[(0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0),
 (0.0, 0)]

In [18]:
results_rdd.map(lambda x: int(x[0] == x[1])).mean()

0.9134045077105575

## Convert Using SQLContext
We can convert this into a DataFrame then convert it to an RDD of LabeledPoints

In [19]:
spark_df = sqlContext.createDataFrame(abalone_df)

In [20]:
spark_df.take(5)

[Row(sex=2, length=0.455, diameter=0.365, height=0.095, whole weight=0.514, shucked weight=0.2245, viscera weight=0.10099999999999999, shell weight=0.15, rings=15),
 Row(sex=2, length=0.35, diameter=0.265, height=0.09, whole weight=0.2255, shucked weight=0.0995, viscera weight=0.0485, shell weight=0.07, rings=7),
 Row(sex=0, length=0.53, diameter=0.42, height=0.135, whole weight=0.677, shucked weight=0.2565, viscera weight=0.1415, shell weight=0.21, rings=9),
 Row(sex=2, length=0.44, diameter=0.365, height=0.125, whole weight=0.516, shucked weight=0.2155, viscera weight=0.114, shell weight=0.155, rings=10),
 Row(sex=1, length=0.33, diameter=0.255, height=0.08, whole weight=0.205, shucked weight=0.0895, viscera weight=0.0395, shell weight=0.055, rings=7)]

In [21]:
# In Spark version 2.x, we must use the .rdd command.
# In earlier versions of spark, we can simply do spark_df.map()
spark_df.rdd.map(lambda x: LabeledPoint(x.sex, x[1:])).take(5)

[LabeledPoint(2.0, [0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15.0]),
 LabeledPoint(2.0, [0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7.0]),
 LabeledPoint(0.0, [0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9.0]),
 LabeledPoint(2.0, [0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10.0]),
 LabeledPoint(1.0, [0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7.0])]