In [1]:
from pyspark import SparkContext
sc=SparkContext(master='local')

from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Naive bayes example').\
config('spark.some.config.option','some-value').getOrCreate()

## Import the data

In [2]:
iris=spark.read.csv('Iris-data.csv',inferSchema=True,header=True)
iris.show(5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [8]:
iris=iris.withColumnRenamed('SepalLengthCm','sepalLength')
iris=iris.withColumnRenamed('SepalWidthCm','sepalWidth')
iris=iris.withColumnRenamed('PetalLengthCm','petalLength')
iris=iris.withColumnRenamed('PetalWidthCm','petalWidth')

In [9]:
iris.show(5)

+---+-----------+----------+-----------+----------+-----------+
| Id|sepalLength|sepalWidth|petalLength|petalWidth|    Species|
+---+-----------+----------+-----------+----------+-----------+
|  1|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|  2|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|  3|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|  4|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|  5|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+---+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [16]:
iris.dtypes

[('Id', 'int'),
 ('sepalLength', 'double'),
 ('sepalWidth', 'double'),
 ('petalLength', 'double'),
 ('petalWidth', 'double'),
 ('Species', 'string')]

In [19]:
iris=iris.drop('Id')

In [20]:
iris.describe().show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|       sepalLength|         sepalWidth|       petalLength|        petalWidth|       Species|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          null|
|    min|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



## Merge features to create a feature column

In [21]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [23]:
iris2=iris.rdd.map(lambda x:Row(features=Vectors.dense(x[:-1]),species=x[-1])).toDF()
iris2.show(5)

+-----------------+-----------+
|         features|    species|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
+-----------------+-----------+
only showing top 5 rows



## Index label column with string indexer

In [24]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [25]:
strindexer=StringIndexer(inputCol='species',outputCol='label')
stages=[strindexer]
pipeline=Pipeline(stages=stages)

### Transform data

In [27]:
iris_df=pipeline.fit(iris2).transform(iris2)
iris_df.show(5)

+-----------------+-----------+-----+
|         features|    species|label|
+-----------------+-----------+-----+
|[5.1,3.5,1.4,0.2]|Iris-setosa|  0.0|
|[4.9,3.0,1.4,0.2]|Iris-setosa|  0.0|
|[4.7,3.2,1.3,0.2]|Iris-setosa|  0.0|
|[4.6,3.1,1.5,0.2]|Iris-setosa|  0.0|
|[5.0,3.6,1.4,0.2]|Iris-setosa|  0.0|
+-----------------+-----------+-----+
only showing top 5 rows



In [28]:
iris_df.groupBy('species','label').count().show()

+---------------+-----+-----+
|        species|label|count|
+---------------+-----+-----+
|    Iris-setosa|  0.0|   50|
| Iris-virginica|  2.0|   50|
|Iris-versicolor|  1.0|   50|
+---------------+-----+-----+



# Naive Bayes

## Split the data

In [29]:
train_df,test_df=iris_df.randomSplit([0.8,0.2],seed=1234)

## Build cross-validation model

### Estimator

In [30]:
from pyspark.ml.classification import NaiveBayes
nb=NaiveBayes(featuresCol='features',labelCol='label')

### Parameter Grid

In [31]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid=ParamGridBuilder().addGrid(nb.smoothing,[0,1,2,4,8]).build()

### Evaluator

In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator=MulticlassClassificationEvaluator()

### Cross validation model

In [34]:
from pyspark.ml.tuning import CrossValidator
cv=CrossValidator(estimator=nb,estimatorParamMaps=param_grid,evaluator=evaluator)

### Fit the model

In [35]:
cv_model=cv.fit(train_df)

### Prediction on the data

In [36]:
train_pred=cv_model.transform(train_df)
train_pred.show(5)

+-----------------+-----------+-----+--------------------+--------------------+----------+
|         features|    species|label|       rawPrediction|         probability|prediction|
+-----------------+-----------+-----+--------------------+--------------------+----------+
|[4.4,3.2,1.3,0.2]|Iris-setosa|  0.0|[-10.991182067744...|[0.69043491224626...|       0.0|
|[4.5,2.3,1.3,0.3]|Iris-setosa|  0.0|[-10.437996923451...|[0.53954099724437...|       0.0|
|[4.6,3.1,1.5,0.2]|Iris-setosa|  0.0|[-11.409627485465...|[0.65314888582843...|       0.0|
|[4.6,3.2,1.4,0.2]|Iris-setosa|  0.0|[-11.326047137369...|[0.68288612816400...|       0.0|
|[4.6,3.4,1.4,0.3]|Iris-setosa|  0.0|[-11.903399773384...|[0.68013704003242...|       0.0|
+-----------------+-----------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [37]:
test_pred=cv_model.transform(test_df)
test_pred.show(5)

+-----------------+-----------+-----+--------------------+--------------------+----------+
|         features|    species|label|       rawPrediction|         probability|prediction|
+-----------------+-----------+-----+--------------------+--------------------+----------+
|[4.3,3.0,1.1,0.1]|Iris-setosa|  0.0|[-9.9571767911548...|[0.72010736400660...|       0.0|
|[4.4,2.9,1.4,0.2]|Iris-setosa|  0.0|[-10.856296304349...|[0.63406956123413...|       0.0|
|[4.4,3.0,1.3,0.2]|Iris-setosa|  0.0|[-10.772715956253...|[0.66463053886253...|       0.0|
|[4.8,3.1,1.6,0.2]|Iris-setosa|  0.0|[-11.744492555091...|[0.64516268270226...|       0.0|
|[5.0,3.3,1.4,0.2]|Iris-setosa|  0.0|[-11.719383524683...|[0.71215569656321...|       0.0|
+-----------------+-----------+-----+--------------------+--------------------+----------+
only showing top 5 rows



### Accuracy

Four metrics are available:
    <ul>
    <li>f1</li>
    <li>weightedRecall</li>
    <li>weightedPrecision</li>
    <li>accuracy</li>
    </ul>

#### Training accuracy

In [43]:
print('accuracy of training(f1): ',evaluator.setMetricName('f1').evaluate(train_pred))
print('accuracy of training(weightedRecall): ',evaluator.setMetricName('weightedRecall').evaluate(train_pred))
print('accuracy of training(weightedPrecision): ',evaluator.setMetricName('weightedPrecision').evaluate(train_pred))
print('accuracy of training(accuracy): ',evaluator.setMetricName('accuracy').evaluate(train_pred))

accuracy of training(f1):  0.9523551844980417
accuracy of training(weightedRecall):  0.9523809523809523
accuracy of training(weightedPrecision):  0.9530175936680002
accuracy of training(accuracy):  0.9523809523809523


#### Testing accuracy

In [44]:
print('accuracy of test data: ',evaluator.setMetricName('f1').evaluate(test_pred))
print('accuracy of test data: ',evaluator.setMetricName('weightedRecall').evaluate(test_pred))
print('accuracy of test data: ',evaluator.setMetricName('weightedPrecision').evaluate(test_pred))
print('accuracy of test data: ',evaluator.setMetricName('accuracy').evaluate(test_pred))

accuracy of test data:  0.958119658119658
accuracy of test data:  0.9583333333333335
accuracy of test data:  0.9635416666666667
accuracy of test data:  0.9583333333333334


### Best model

In [42]:
cv_model.bestModel._java_obj.getSmoothing()

1.0

### confusion matrix on training data

In [46]:
train_conf_mat=train_pred.select('label','prediction')
train_conf_mat.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 40,
             Row(label=1.0, prediction=1.0): 39,
             Row(label=2.0, prediction=2.0): 41,
             Row(label=1.0, prediction=2.0): 4,
             Row(label=2.0, prediction=1.0): 2})

### confusion matrix on test data

In [49]:
test_conf_mat=test_pred.select('label','prediction')
test_conf_mat.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 10,
             Row(label=1.0, prediction=1.0): 7,
             Row(label=2.0, prediction=2.0): 6,
             Row(label=2.0, prediction=1.0): 1})

# Random Forest

In [50]:
from pyspark.ml.classification import RandomForestClassifier
rt=RandomForestClassifier(featuresCol='features',labelCol='label')

## parameter grid

In [55]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid=ParamGridBuilder().addGrid(rt.maxDepth,[2,3,4,5]).addGrid(rt.minInfoGain,[0.0,0.1,0.2,0.3]).build()

## evaluator

In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator=MulticlassClassificationEvaluator()

## Cross validation

In [57]:
from pyspark.ml.tuning import CrossValidator
cv=CrossValidator(estimator=rt,estimatorParamMaps=param_grid,evaluator=evaluator)

## Fit the model

In [58]:
cv_model=cv.fit(train_df)

## Prediction of data

### training data

In [59]:
rt_train_pred=cv_model.transform(train_df)
rt_train_pred.show(5)

+-----------------+-----------+-----+--------------+-------------+----------+
|         features|    species|label| rawPrediction|  probability|prediction|
+-----------------+-----------+-----+--------------+-------------+----------+
|[4.4,3.2,1.3,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.5,2.3,1.3,0.3]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.1,1.5,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.2,1.4,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.4,1.4,0.3]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+-----------------+-----------+-----+--------------+-------------+----------+
only showing top 5 rows



### testing data

In [60]:
rt_test_pred=cv_model.transform(test_df)
rt_test_pred.show(5)

+-----------------+-----------+-----+--------------+-------------+----------+
|         features|    species|label| rawPrediction|  probability|prediction|
+-----------------+-----------+-----+--------------+-------------+----------+
|[4.3,3.0,1.1,0.1]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.4,2.9,1.4,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.4,3.0,1.3,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.1,1.6,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[5.0,3.3,1.4,0.2]|Iris-setosa|  0.0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+-----------------+-----------+-----+--------------+-------------+----------+
only showing top 5 rows



## Accuracy

In [61]:
print('accuracy of training(f1): ',evaluator.setMetricName('f1').evaluate(rt_train_pred))
print('accuracy of training(weightedRecall): ',evaluator.setMetricName('weightedRecall').evaluate(rt_train_pred))
print('accuracy of training(weightedPrecision): ',evaluator.setMetricName('weightedPrecision').evaluate(rt_train_pred))
print('accuracy of training(accuracy): ',evaluator.setMetricName('accuracy').evaluate(rt_train_pred))

accuracy of training(f1):  0.9523809523809523
accuracy of training(weightedRecall):  0.9523809523809523
accuracy of training(weightedPrecision):  0.9523809523809523
accuracy of training(accuracy):  0.9523809523809523


In [62]:
print('accuracy of testing(f1): ',evaluator.setMetricName('f1').evaluate(rt_test_pred))
print('accuracy of testing(weightedRecall): ',evaluator.setMetricName('weightedRecall').evaluate(rt_test_pred))
print('accuracy of testing(weightedPrecision): ',evaluator.setMetricName('weightedPrecision').evaluate(rt_test_pred))
print('accuracy of testing(accuracy): ',evaluator.setMetricName('accuracy').evaluate(rt_test_pred))

accuracy of testing(f1):  0.958119658119658
accuracy of testing(weightedRecall):  0.9583333333333335
accuracy of testing(weightedPrecision):  0.9635416666666667
accuracy of testing(accuracy):  0.9583333333333334


## confusion matrix

### Training 

In [63]:
rt_train_confmatrix=rt_train_pred.select('label','prediction')
rt_train_confmatrix.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 40,
             Row(label=1.0, prediction=1.0): 40,
             Row(label=2.0, prediction=1.0): 3,
             Row(label=2.0, prediction=2.0): 40,
             Row(label=1.0, prediction=2.0): 3})

### Testing

In [64]:
rt_test_confmatrix=rt_test_pred.select('label','prediction')
rt_test_confmatrix.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 10,
             Row(label=1.0, prediction=1.0): 7,
             Row(label=2.0, prediction=2.0): 6,
             Row(label=2.0, prediction=1.0): 1})

## Best model`

In [65]:
cv_model.bestModel._java_obj.getMaxDepth()

3

In [66]:
cv_model.bestModel._java_obj.getMinInfoGain()

0.1