In [1]:
sc
sqlContext = SQLContext(sc) 

In [2]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime
import math

In [3]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

from pyspark.ml import Pipeline

https://machinelearningmastery.com/simple-linear-regression-tutorial-for-machine-learning/ <br>
https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/ <br>
algorithm details: *Latent dirichlet allocation*, MI Jordan

**MLlib**
* includes RDD-based and DataFrame-based API
* As of Spark 2.0, the RDD-based APIs in the `spark.mllib` package have entered maintenance mode. The primary Machine Learning API for Spark is now the DataFrame-based API in the `spark.ml` package.

# Spark ML (MLlib DataFrame-based API)

* New (Spark v 1.2).
* Support Pipelines of estimators, transformer and evaluators. 
* Use DataFrame and Dataset.
    * Dataset : A strongly typed collection of objects (This includes DataFrame.).

# MainComponents
* Transformers<br>
`transform()`: takes DataFrame and optional parameters.
     * Convert a dataset to another. 
     * Types <br>
        1) Feature transformer – take a data frame output a data frame with new columns like feature vectors. <br>
        2) Learning model – take a data frame and output a data frame with predicted labels. <br>
* Estimators<br>
`fit()` : takes a DataFrame and parameters.
     * Algorithms that produce transformers by
     * Ex. Linear regression produces a linear regression model with fitted weights and an intercepts, which is a transformer.
* Evaluators<br>
`Evaluator()`
     * Evaluate the performance of a model.   
<br>
  <br>
* ML Parameters
     * Specify parameters for estimators and transformers.
     * Also can use `ParamGridBuilder()` for choosing the model produced by the best-performing set of parameters in `CrossValidator()`.
* ML Pipeline (`PipelineModel()`)
     * In machine learning, the same steps are often repeated with slightly different parameters to find the best results.
     * A Pipeline chains multiple Transformers and Estimators together to specify an ML workflow and runs in order.

# Algorithms

## Logistic Regression 
`LogisticRegression()`
* Input
    1. features - Feature vector.
    2. label - Label to predict.
* Output<br>
Coefficient and intercept of the model.


## Decision Tree
* Pros <br>
Do not require data normalization, can handle numerical/categorical values, and work with missing values.
* Cons <br>
Prone to overfitting and is sensitive to the input data.
  
`DecisionTreeClassifier()` - Binary Decision Tree classifier.
* Input
    1. features - Feature vector.
    2. label - Label to predict.
* Output
    1. prediction – Predicted label.
    2. rawPrediction - Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction.
    3. probability - Vector of length # classes equal to rawPrediction normalized to a multinomial distribution

## Random Forest (EX4)
* Ensemble learning method
    * Train a certain number of decision trees on data randomly sampled from the original data. 
    * Avoid overfitting and find a global optima that particular decision tree cannot find on
    their own.
* Performs well on high dimensional datasets.
  
`RandomForestClassifier`
* Parameters
    * numTrees : The number of trees to train. Default : 20 
    * featureSubsetStrategy
        * all – use all features.
        * onethird – randomly selects 1/3 of the features.
        * sqrt – randomly select sqrt(number of features).
        * log2 – randomly select log2(number of features).
        * auto – sqrt for classification and onethird for regression (default)
* Make sure to have enough driver memory (configuration)<br>
`pyspark --driver-memory=2g`
<br>
  <br>
* Input
    1. features - Feature vector.
    2. label - Label to predict.
* Output
    1. prediction – Predicted label.
    2. rawPrediction - Vector of length # classes, with the counts of training instance labels at
    the tree node which makes the prediction.
    3. probability - Vector of length # classes equal to rawPrediction normalized to a multinomial distribution.

## K-Mean Clustering (EX5)
* Unsupervised learning
* Dataset should be standardized.
* Example – partition data into groups, anomaly detection, text/topic categorization
<br>
  <br>
`Kmeans`
* Parameters
    * k : Number of clusters to find (default – 2).
    * maxIter : Maximum number of iterations (default – 20). 
    * tol : Convergence tolerance (default – 0.0001).
    * seed : Random seed value for cluster initialization.
<br>
  <br>
* Input<br>
features – feature vector 
* Output<br>
prediction – predicted cluster center

### ex1
Develop a Simple Linear Regression model(without stochastic gradient descent) to predict “petal_width” using “sepal_width” using “iris.csv”.

In [4]:
sqlContext.sql("DROP TABLE IF EXISTS test") #delete test table, if exsists.

DataFrame[]

In [5]:
irisSchema = StructType([StructField("sepal_length", DoubleType(), True), 
                         StructField("sepal_width", DoubleType(), True),
                         StructField("petal_length", DoubleType(), True), 
                         StructField("petal_width", DoubleType(), True),
                         StructField("class", StringType(), True)])

iris = sqlContext.read.format('com.databricks.spark.csv').options(header='false')\
    .load('2018-msan697-example/Data/iris.csv', schema = irisSchema)\
    .select("sepal_length", "petal_length")

In [6]:
train, test = iris.randomSplit([0.9, 0.1])
train.cache()
test.write.saveAsTable("test")

In [7]:
covariance = train.cov("sepal_length", "petal_length")
covariance

1.287408759124089

In [8]:
var = train.select(variance("sepal_length")).first()[0]
var  #Row(0.6608711484593822)

0.7042840274796065

In [9]:
coeff_0 = covariance / var
coeff_0

1.8279681334408335

In [10]:
coeff_1 = train.select(mean("petal_length")).first()[0] - coeff_0 * train.select(mean("sepal_length")).first()[0]
coeff_1

-6.873057760043645

In [11]:
test_output = sqlContext.sql("SELECT sepal_length, petal_length, sepal_length * {0} + {1} AS prediction FROM test".format(coeff_0, coeff_1))
test_output.show()

+------------+------------+------------------+
|sepal_length|petal_length|        prediction|
+------------+------------+------------------+
|         5.0|         1.6|2.2667829071599996|
|         5.2|         1.5|2.6323765338479994|
|         5.2|         1.5|2.6323765338479994|
|         5.4|         1.5| 2.997970160536001|
|         5.4|         1.5| 2.997970160536001|
|         5.4|         1.7| 2.997970160536001|
|         5.8|         4.0|3.7291574139119987|
|         6.1|         4.0| 4.277547853943998|
|         6.2|         4.3|    4.460344667288|
|         6.3|         4.7|    4.643141480632|
|         6.4|         5.6|    4.825938293976|
|         6.9|         5.4| 5.739922360696001|
|         7.4|         6.1|6.6539064274160005|
+------------+------------+------------------+



In [12]:
# rmse (root mean squre error) : https://en.wikipedia.org/wiki/Root-mean-square_deviation
rmse = math.sqrt(test_output.rdd.map(lambda x : (x["prediction"] - x["petal_length"])**2)\
                      .reduce(lambda x,y : x + y)/test_output.count())
rmse

0.8936774199917817

# Algorithms

## Logistic Regression 

### ex2
Adult Data Set : Prediction task is to determine whether a person makes over 50K a year.

1.Create an RDD

In [13]:
def toDoubleSafe(v):
    try:
        return float(v)
    except ValueError:
        return v #if it is not a float type, return a string

census_raw = sc.textFile('2018-msan697-example/Data/adult.raw').map(lambda x:x.split(", "))\
    .map(lambda row: [toDoubleSafe(x) for x in row])

In [14]:
census_raw.take(1)

[[39.0,
  u'State-gov',
  77516.0,
  u'Bachelors',
  u'Never-married',
  u'Adm-clerical',
  u'Not-in-family',
  u'White',
  u'Male',
  2174.0,
  0.0,
  40.0,
  u'United-States',
  u'<=50K']]

2.Convert the RDD to DataFrame

In [15]:
adultschema = StructType([
    StructField("age",DoubleType(),True),
    StructField("capital_gain",DoubleType(),True),
    StructField("capital_loss",DoubleType(),True),
    StructField("fnlwgt",DoubleType(),True),
    StructField("hours_per_week",DoubleType(),True),
    StructField("education",StringType(),True),
    StructField("income",StringType(),True),
    StructField("marital_status",StringType(),True),
    StructField("native_country",StringType(),True),
    StructField("occupation",StringType(),True),
    StructField("race",StringType(),True),
    StructField("relationship",StringType(),True),
    StructField("sex",StringType(),True),
    StructField("workclass",StringType(),True),
])

columns = ["age", "workclass", "fnlwgt", "education", "marital_status",
           "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
           "hours_per_week", "native_country", "income"]

dfraw = sqlContext.createDataFrame(census_raw.map(lambda row: Row(**{x[0]:x[1] for x in zip(columns, row)}), adultschema))

3.clean data <br>
Missing data imputation - Impute the most common row for "?".
* `.na` : returns a DataFrameNA Function for handling missing values. 
* `.replace(to_replace, value, subset=None)`
     * `to_replace` : Value to be replaced.
     * `value` : Value to use to replace holes.
     * `subset` : Optional list of column names to consider.

In [16]:
dfrawrp = dfraw.na.replace(["?"], ["Private"], ["workclass"])
dfrawrpl = dfrawrp.na.replace(["?"], ["Prof-specialty"], ["occupation"])
dfrawnona = dfrawrpl.na.replace(["?"], ["United-States"], ["native_country"])

Convert Strings to Categorical Values
  
1.String Indexer:
When male (0) and female (1), female > male? 
* Convert string categorical values into integer indexes.
* Takes a DataFrame and fits a `StringIndexerModel()` and used it for transformation.

In [17]:
#converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    #variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        #For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        #and then drops the original columns.
        #and drop the "-num" suffix. 
        newdf = si.fit(newdf).transform(newdf).drop(c).withColumnRenamed(c+"-num", c)
    return newdf

dfnumeric = indexStringColumns(dfrawnona, ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "income"])

In [18]:
dfnumeric.show(5)

+----+------------+------------+--------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
| age|capital_gain|capital_loss|  fnlwgt|hours_per_week|workclass|education|marital_status|occupation|relationship|race|sex|native_country|income|
+----+------------+------------+--------+--------------+---------+---------+--------------+----------+------------+----+---+--------------+------+
|39.0|      2174.0|         0.0| 77516.0|          40.0|      3.0|      2.0|           1.0|       3.0|         1.0| 0.0|0.0|           0.0|   0.0|
|50.0|         0.0|         0.0| 83311.0|          13.0|      1.0|      2.0|           0.0|       2.0|         0.0| 0.0|0.0|           0.0|   0.0|
|38.0|         0.0|         0.0|215646.0|          40.0|      0.0|      0.0|           2.0|       8.0|         1.0| 0.0|0.0|           0.0|   0.0|
|53.0|         0.0|         0.0|234721.0|          40.0|      0.0|      5.0|           0.0|       8.0|         0.0| 1.

2.One-hot encoding:
    * Expand a column to as many columns as there are distinct strings in it and only one column contains a 1 and others are 0.
    * Create a new column as a one-hot-encoded sparse vector. (Replace a column with a vector.)

In [19]:
from pyspark.ml.feature import OneHotEncoder

def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        #For each given colum, create OneHotEncoder. 
        #dropLast : Whether to drop the last category in the encoded vector (default: true)
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-onehot" 
        #and then drops the original columns.
        #and drop the "-onehot" suffix. 
        newdf = onehotenc.transform(newdf).drop(c).withColumnRenamed(c+"-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["workclass", "education", "marital_status", "occupation", "relationship", "race", "native_country"])

In [20]:
dfhot.show(5)

+----+------------+------------+--------+--------------+---+------+-------------+--------------+--------------+--------------+-------------+-------------+--------------+
| age|capital_gain|capital_loss|  fnlwgt|hours_per_week|sex|income|    workclass|     education|marital_status|    occupation| relationship|         race|native_country|
+----+------------+------------+--------+--------------+---+------+-------------+--------------+--------------+--------------+-------------+-------------+--------------+
|39.0|      2174.0|         0.0| 77516.0|          40.0|0.0|   0.0|(8,[3],[1.0])|(16,[2],[1.0])| (7,[1],[1.0])|(14,[3],[1.0])|(6,[1],[1.0])|(5,[0],[1.0])|(41,[0],[1.0])|
|50.0|         0.0|         0.0| 83311.0|          13.0|0.0|   0.0|(8,[1],[1.0])|(16,[2],[1.0])| (7,[0],[1.0])|(14,[2],[1.0])|(6,[0],[1.0])|(5,[0],[1.0])|(41,[0],[1.0])|
|38.0|         0.0|         0.0|215646.0|          40.0|0.0|   0.0|(8,[0],[1.0])|(16,[0],[1.0])| (7,[2],[1.0])|(14,[8],[1.0])|(6,[1],[1.0])|(5,[0],[1.

In [21]:
dfhot.select('workclass').show(5)

+-------------+
|    workclass|
+-------------+
|(8,[3],[1.0])|
|(8,[1],[1.0])|
|(8,[0],[1.0])|
|(8,[0],[1.0])|
|(8,[0],[1.0])|
+-------------+
only showing top 5 rows



3.`VectorAssembler()`
    * Merge all the new vectors and the original columns into a single vector.
    * Useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees.
    * ML algorithms work with two columns called **features** and **label** by default.

In [22]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler

input_cols=["age","capital_gain","capital_loss","fnlwgt","hours_per_week","sex","workclass","education","marital_status","occupation","relationship","native_country","race"]

#VectorAssembler takes a number of collumn names(inputCols) and output column name (outputCol)
#and transforms a DataFrame to assemble the values in inputCols into one single vector with outputCol.
va = VectorAssembler(outputCol="features", inputCols=input_cols)
#lpoints - labeled data.
lpoints = va.transform(dfhot).select("features", "income").withColumnRenamed("income", "label")

In [23]:
lpoints.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(103,[0,1,3,4,9,1...|  0.0|
|(103,[0,3,4,7,16,...|  0.0|
|(103,[0,3,4,6,14,...|  0.0|
|(103,[0,3,4,6,19,...|  0.0|
|(103,[0,3,4,5,6,1...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [24]:
lpoints.select("features").first()

Row(features=SparseVector(103, {0: 39.0, 1: 2174.0, 3: 77516.0, 4: 40.0, 9: 1.0, 16: 1.0, 31: 1.0, 40: 1.0, 52: 1.0, 57: 1.0, 98: 1.0}))

Divide the dataset into training and testing sets.

In [25]:
splits = lpoints.randomSplit([0.8, 0.2])

#cache() : the algorithm is interative and training and data sets are going to be reused many times.
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

4.train the model<br>
Use Spark ML’s LogisticRegression
* Set parameters - regParam, maxIter, fitIntercept. 
* Call `fit()` passing in a DataFrame.

In [26]:
#Train the model.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
#The above lines are same as..
#lr = LogisticRegression()
#lrmodel = lr.setParams(regParam=0.01, maxIter=1000, fitIntercept=True).fit(adulttrain)

5.Interpret the model parameters.

In [27]:
print(lrmodel.coefficients)
print(lrmodel.intercept)

[0.0203666857287,0.000140220907508,0.000554517771619,6.94529720864e-07,0.0278534727572,-0.507946988682,0.0198222944162,-0.360971401411,0.053705938659,-0.144947228877,0.264275465667,0.583707064995,-0.613487125557,-1.27081028531,-0.364152225981,-0.0098648845072,0.753315446034,1.11428853791,0.184552881965,-0.946854758573,0.217275533796,-1.11377811006,-1.36017731604,1.65935804497,-1.31927475606,-0.641114076647,1.66411894239,-1.16617532793,-1.48301831848,-1.76967194024,0.852228560536,-0.703211078991,-0.280467622288,-0.358005824639,-0.366563354955,-0.142635801563,0.852837814675,0.205469503478,0.0324821853417,0.649081542716,-0.0362816362082,0.20645767832,-0.723211930094,-0.29060233814,-0.115317262782,-0.604288538632,-0.89554312879,0.47082797088,0.316786449609,-0.874597667244,0.487395927952,0.439071849565,-0.0913080035768,-0.769163801196,-0.329072808892,1.29410003445,-0.511363843502,0.18826521387,-0.617537876063,0.347334208822,0.151961744465,-0.26541465854,0.424595235834,-0.26712102011,0.09718

6.Evaluate classification models.
* First use lrmodel (the developedlinear regression model) to transform the test dataset. 
* Then use `BinaryClassificationEvaluator()` to evaluate the performance.

In [28]:
#Evaluate models using test dataset.
#First, transform the validation set.
validpredicts = lrmodel.transform(adultvalid)
validpredicts.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(103,[0,1,3,4,5,6...|  1.0|[-0.8952527154192...|[0.29002704192954...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[0.58240439636611...|[0.64162047005141...|       0.0|
|(103,[0,1,3,4,5,6...|  1.0|[0.24099982801490...|[0.55996002643418...|       0.0|
|(103,[0,1,3,4,5,6...|  1.0|[-0.4505158580492...|[0.38923812317998...|       1.0|
|(103,[0,1,3,4,5,6...|  1.0|[-0.0728871902597...|[0.48178626515791...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[1.02311688947125...|[0.73557928693190...|       0.0|
|(103,[0,1,3,4,5,6...|  0.0|[0.45643230683104...|[0.61216747966224...|       0.0|
|(103,[0,1,3,4,5,6...|  1.0|[-0.9878836978227...|[0.27133028955709...|       1.0|
|(103,[0,1,3,4,5,6...|  0.0|[0.62570003326136...|[0.65151381956458...|       0.0|
|(103,[0,1,3,4,5

rawPrediction : log-odds that a sample doesn’t/does belong to the category. <br>
probability : the probability that the sample is not in the category. <br>
prediction : proability that the sample belongs to the category.

In [29]:
print validpredicts.select("rawPrediction").first()
print validpredicts.select("probability").first()

Row(rawPrediction=DenseVector([-0.8953, 0.8953]))
Row(probability=DenseVector([0.29, 0.71]))


In [30]:
#Evaluate the model. default metric : Area Under ROC(rate of correct)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

bceval = BinaryClassificationEvaluator()
print (bceval.getMetricName() +":" + str(bceval.evaluate(validpredicts)))

#Evaluate the model. metric : Area Under PR(precision recall)
bceval.setMetricName("areaUnderPR")
print (bceval.getMetricName() +":" + str(bceval.evaluate(validpredicts)))

areaUnderROC:0.900088781294
areaUnderPR:0.748368696343


n-fold cross-validation: 
* Validate the performance of the model more reliably.
* Divide the dataset into n subsets of equal sizes and train n models excluding a different subset each time and train all n models.
* Calculate the mean error for all n models.
* Choose the set of parameters with the smallest average error.
  
`CrossValidator` class : estimator.
* Takes estimator, evaluator and number of folds to use. 
* Takes several parameters in `setEstimatorParamMaps()`<br>
`ParamGridBuilder()` – combinations of parameters and their values.

In [31]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder()\
            .addGrid(lr.maxIter, [100, 1000])\
            .addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5])\
            .build()
            
#setEstimatorParamMaps() takes ParamGridBuilder().
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5).setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)

In [32]:
#You can access the selected model through `bestModel`.
print cvmodel.bestModel.coefficients
print cvmodel.bestModel.intercept
print cvmodel.bestModel._java_obj.getMaxIter()
print cvmodel.bestModel._java_obj.getRegParam()

[0.0228384155137,0.000308060135547,0.000650968290685,8.23615498754e-07,0.0314787008101,-0.696023913327,-0.438392164269,-0.896704263015,-0.42265891452,-0.643995661333,-0.245424314708,0.164968174463,-1.1579958258,-4.63564017146,-0.619089853,-0.219869083772,0.604249500722,0.992013944789,-0.0354232108078,-1.43228769298,0.0300974811123,-1.56142495592,-1.86313222762,1.63008484725,-1.8462056879,-1.00586573922,1.62504110907,-1.62437727519,-2.23146802747,-5.96026277956,1.33998491831,-1.51690279064,-1.03844090324,-1.12079846896,-1.14773616395,-0.867815600293,1.40486027677,-0.0281622218137,-0.158294630169,0.470408387375,-0.232423108134,0.0022364954838,-1.0777475676,-0.497423376013,-0.317349449207,-0.87755147686,-1.26256871455,0.303293442641,0.15372041142,-1.64944942298,0.368932088742,-0.472524045373,0.19112789408,-0.966526616356,-0.0234915370912,0.662285012638,-0.899442819457,-1.03768029307,-1.93334108735,-0.880638122059,-1.11435776535,-1.51953997346,-0.746047308625,-1.52848257843,-1.27909421785,

In [33]:
print BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))
print BinaryClassificationEvaluator(metricName="areaUnderPR").evaluate(cvmodel.bestModel.transform(adultvalid))

0.902927650786
0.761579460014


## Decision Tree
* Pros <br>
Do not require data normalization, can handle numerical/categorical values, and work with missing values.
* Cons <br>
Prone to overfitting and is sensitive to the input data.
  
`DecisionTreeClassifier()` - Binary Decision Tree classifier.
* Input
    1. features - Feature vector.
    2. label - Label to predict.
* Output
    1. prediction – Predicted label.
    2. rawPrediction - Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction.
    3. probability - Vector of length # classes equal to rawPrediction normalized to a multinomial distribution

### ex3
Optical Recognition of Handwritten Digits Data Sets. : Classify handwritten digits<br>
1.Create an RDD

In [34]:
#Load the data and create an RDD (16 pixels and label)
pen_raw = sc.textFile("2018-msan697-example/Data/penbased.dat", 4)\
    .map(lambda x:  x.split(", ")).map(lambda row: [float(x) for x in row])
pen_raw.first()

[47.0,
 100.0,
 27.0,
 81.0,
 57.0,
 37.0,
 26.0,
 0.0,
 0.0,
 23.0,
 56.0,
 53.0,
 100.0,
 90.0,
 40.0,
 98.0,
 8.0]

2.Convert the RDD to DataFrame and Create a feature vector.
* hard-coding way<br>

In [35]:
#Create a DataFrame
from pyspark.sql.types import *
from pyspark.sql import Row
penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

dfpen = sqlContext.createDataFrame(pen_raw.map(lambda x : Row(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15],x[16])), penschema)

# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler

va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) #except the last col
dfpen = va.transform(dfpen).select("features", "label")

In [36]:
dfpen.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
+--------------------+-----+
only showing top 5 rows



* easy way

In [37]:
#Create a DataFrame
from pyspark.sql.types import *
from pyspark.sql import Row
penschema = StructType([
   StructField("features", ArrayType(elementType=FloatType(),containsNull=False),True),
   StructField("label", DoubleType(),True)
])
dfpen = sqlContext.createDataFrame(pen_raw.map(lambda x : Row(x[0:-1],x[-1])), penschema)

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
dfpen = dfpen.select(list_to_vector_udf(dfpen["features"]).alias("features"),'label')

In [38]:
dfpen.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
+--------------------+-----+
only showing top 5 rows



split data

In [39]:
pendtsets = dfpen.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

4.Train the data.<br>
Parameters for `DecisionTreeClassifier`
* `maxDepth` : maximum tree depth (default : 5).
* `maxBins` : maximum number of bins when binning continuous features (default : 32).
* `minInstancesPerNode` : minimum number of dataset samples each branch needs to have after a split (default : 1).
* `minInfoGain` : minimum information gain for a split (default : 0)

In [40]:
# Train the data.
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(maxDepth=20, maxBins= 32, minInstancesPerNode=1, minInfoGain = 0)
dtmodel = dt.fit(pendttrain)

In [41]:
# See the trained model.
print dtmodel._call_java('toDebugString')

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4c8088e648581152a875) of depth 20 with 595 nodes
  If (feature 15 <= 51.0)
   If (feature 4 <= 41.0)
    If (feature 9 <= 17.0)
     If (feature 14 <= 71.0)
      If (feature 5 <= 31.0)
       If (feature 0 <= 29.0)
        Predict: 4.0
       Else (feature 0 > 29.0)
        Predict: 0.0
      Else (feature 5 > 31.0)
       Predict: 6.0
     Else (feature 14 > 71.0)
      If (feature 10 <= 12.0)
       If (feature 0 <= 35.0)
        Predict: 1.0
       Else (feature 0 > 35.0)
        Predict: 8.0
      Else (feature 10 > 12.0)
       If (feature 11 <= 15.0)
        Predict: 2.0
       Else (feature 11 > 15.0)
        If (feature 0 <= 0.0)
         Predict: 7.0
        Else (feature 0 > 0.0)
         Predict: 4.0
    Else (feature 9 > 17.0)
     If (feature 1 <= 99.0)
      If (feature 9 <= 62.0)
       If (feature 7 <= 18.0)
        Predict: 0.0
       Else (feature 7 > 18.0)
        If (feature 14 <= 21.0)
         If (featur

5.Evaluate the model.

In [42]:
#Test data.
dtpredicts = dtmodel.transform(pendtvalid)

In [43]:
dtpredicts.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,41.0,16....|  9.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       8.0|
|[0.0,0.0,51.0,9.0...|  9.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       9.0|
|[0.0,4.0,74.0,29....|  1.0|[0.0,407.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,28.0,22.0,49...|  1.0|[0.0,407.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,30.0,46.0,60...|  8.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       8.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [44]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")\
    .evaluate(dtpredicts)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0474525


In [45]:
dtpredicts.select('label','prediction').rdd.map(lambda x : (x,1)).countByKey() 

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 197,
             Row(label=0.0, prediction=4.0): 1,
             Row(label=0.0, prediction=6.0): 1,
             Row(label=1.0, prediction=1.0): 179,
             Row(label=1.0, prediction=2.0): 10,
             Row(label=1.0, prediction=3.0): 1,
             Row(label=1.0, prediction=6.0): 1,
             Row(label=1.0, prediction=7.0): 1,
             Row(label=1.0, prediction=8.0): 1,
             Row(label=1.0, prediction=9.0): 1,
             Row(label=2.0, prediction=1.0): 12,
             Row(label=2.0, prediction=2.0): 197,
             Row(label=2.0, prediction=3.0): 1,
             Row(label=2.0, prediction=7.0): 1,
             Row(label=3.0, prediction=0.0): 1,
             Row(label=3.0, prediction=1.0): 1,
             Row(label=3.0, prediction=3.0): 184,
             Row(label=3.0, prediction=5.0): 5,
             Row(label=3.0, prediction=9.0): 1,
             Row(label=4.0, prediction=0.0): 1,
             

In [46]:
#Depreciated in Spark 2.0 -- Use accuracy
from pyspark.mllib.evaluation import MulticlassMetrics

dtresrdd = dtpredicts.select("prediction", "label").rdd #convert DataFrame to RDD.
dtmm = MulticlassMetrics(dtresrdd) 
print dtmm.precision() 
print(dtmm.confusionMatrix())

0.952547452547
DenseMatrix([[ 197.,    0.,    0.,    0.,    1.,    0.,    1.,    0.,    0.,
                 0.],
             [   0.,  179.,   10.,    1.,    0.,    0.,    1.,    1.,    1.,
                 1.],
             [   0.,   12.,  197.,    1.,    0.,    0.,    0.,    1.,    0.,
                 0.],
             [   1.,    1.,    0.,  184.,    0.,    5.,    0.,    0.,    0.,
                 1.],
             [   1.,    1.,    0.,    0.,  193.,    1.,    3.,    1.,    0.,
                 3.],
             [   0.,    1.,    0.,    2.,    2.,  185.,    2.,    0.,    2.,
                 4.],
             [   1.,    1.,    0.,    1.,    0.,    1.,  194.,    1.,    0.,
                 0.],
             [   0.,    1.,    3.,    1.,    0.,    0.,    0.,  192.,    1.,
                 0.],
             [   3.,    0.,    0.,    0.,    0.,    2.,    0.,    2.,  195.,
                 2.],
             [   0.,    0.,    0.,    1.,    2.,    5.,    1.,    1.,    3.,
               19



Using 5 fold validation, find the best model’s maxDepth among 5,10,15,20,25 and 30.

In [47]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder()\
    .addGrid(dt.maxDepth, [5,10,15,20,25,30])\
    .build()

#setEstimatorParamMaps() takes ParamGridBuilder().
cv = CrossValidator().setEstimator(dt).setEvaluator(evaluator).setNumFolds(5).setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(pendttrain)

print cvmodel.bestModel._java_obj.getMaxDepth()
print "Accuracy : " +  str(MulticlassClassificationEvaluator().evaluate(cvmodel.bestModel.transform(pendtvalid)))

15
Accuracy : 0.952535081358


Create a pipeline of VectorAssembler and DecisionTreeClassifier to create a decision tree classifier model.