In [1]:
import os
import sys

# Point both PySpark interpreter variables at the interpreter running this
# kernel, so the same Python executable is configured for each.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# findspark is initialised here, before the `import pyspark` that follows.
import findspark
findspark.init()
# The value returned by find() is not used by this cell.
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

# Build (or reuse) the local Spark entry points.
#
# getOrCreate() keeps this cell idempotent: constructing pyspark.SparkContext
# directly fails with "Cannot run multiple SparkContexts at once" as soon as
# the cell is executed a second time in the same kernel.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Expose the underlying context under the name the rest of the notebook expects.
sc = spark.sparkContext
# SQLContext is a legacy entry point; it is kept only so that any existing
# reference to `sqlcontext` keeps working. New code should go through `spark`.
sqlcontext = SQLContext(sc)

In [156]:
# Training data: read df.csv, taking column names from the header row and
# letting Spark infer the column types.
trainingDf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("df.csv")
)

# A single unlabeled (Age, Salary) row to score with the fitted model.
test = spark.createDataFrame([(38, 71000)], ["Age", "Salary"])

In [157]:
# Preview both frames. show() prints the table and returns None, so the calls
# are written as separate statements; combining them into a tuple expression
# makes the cell echo a meaningless "(None, None)" as its result.
trainingDf.show()
test.show()

+---+------+---------+
|Age|Salary|Purchased|
+---+------+---------+
| 59| 88000|        0|
| 35| 61000|        1|
| 37| 70000|        0|
| 52| 21000|        0|
| 48|141000|        1|
| 37| 93000|        0|
| 37| 62000|        1|
| 48|138000|        0|
| 41| 79000|        1|
| 37| 78000|        0|
| 39|134000|        0|
| 49| 89000|        0|
| 55| 39000|        0|
| 37| 77000|        1|
| 35| 57000|        1|
| 36| 63000|        1|
| 42| 73000|        0|
+---+------+---------+

+---+------+
|Age|Salary|
+---+------+
| 38| 71000|
+---+------+



(None, None)

In [158]:
# Display the column names of the training DataFrame as the cell result.
trainingDf.columns

['Age', 'Salary', 'Purchased']

In [159]:
from pyspark.ml.feature import VectorAssembler

In [160]:
# Pack the two numeric input columns into a single vector column named
# "features"; the same assembler is applied to the training and test frames
# so both get an identically laid-out [Age, Salary] vector.
va = VectorAssembler(inputCols=["Age", "Salary"], outputCol="features")
indexedTrain = va.transform(trainingDf)
indexedTest = va.transform(test)

# show() returns None, so call it as separate statements instead of building
# a tuple, which would make the cell echo "(None, None)".
indexedTrain.show()
indexedTest.show()

+---+------+---------+---------------+
|Age|Salary|Purchased|       features|
+---+------+---------+---------------+
| 59| 88000|        0| [59.0,88000.0]|
| 35| 61000|        1| [35.0,61000.0]|
| 37| 70000|        0| [37.0,70000.0]|
| 52| 21000|        0| [52.0,21000.0]|
| 48|141000|        1|[48.0,141000.0]|
| 37| 93000|        0| [37.0,93000.0]|
| 37| 62000|        1| [37.0,62000.0]|
| 48|138000|        0|[48.0,138000.0]|
| 41| 79000|        1| [41.0,79000.0]|
| 37| 78000|        0| [37.0,78000.0]|
| 39|134000|        0|[39.0,134000.0]|
| 49| 89000|        0| [49.0,89000.0]|
| 55| 39000|        0| [55.0,39000.0]|
| 37| 77000|        1| [37.0,77000.0]|
| 35| 57000|        1| [35.0,57000.0]|
| 36| 63000|        1| [36.0,63000.0]|
| 42| 73000|        0| [42.0,73000.0]|
+---+------+---------+---------------+

+---+------+--------------+
|Age|Salary|      features|
+---+------+--------------+
| 38| 71000|[38.0,71000.0]|
+---+------+--------------+



(None, None)

In [161]:
from pyspark.ml.classification import NaiveBayes

In [162]:
# Naive Bayes estimator reading the assembled "features" vector and using the
# "Purchased" column as the label.
# NOTE(review): no modelType is specified, so the library default applies —
# confirm that default is appropriate for raw continuous Age/Salary values.
nb = (
    NaiveBayes()
    .setFeaturesCol("features")
    .setLabelCol("Purchased")
)

In [163]:
# Fit the Naive Bayes estimator on the assembled training rows; the fitted
# model is kept in `model` for the scoring cells that follow.
model=nb.fit(indexedTrain)

In [164]:
# Score the assembled test row with the fitted model. The result carries the
# input columns plus rawPrediction, probability and prediction columns.
pred_results=model.transform(indexedTest)

In [165]:
# Print the scored row as a table; long vector cells are truncated in this view.
pred_results.show()

+---+------+--------------+--------------------+--------------------+----------+
|Age|Salary|      features|       rawPrediction|         probability|prediction|
+---+------+--------------+--------------------+--------------------+----------+
| 38| 71000|[38.0,71000.0]|[-324.82800977208...|[0.59421288295250...|       0.0|
+---+------+--------------+--------------------+--------------------+----------+



In [166]:
# Return the scored rows as a list of Row objects so the full (untruncated)
# rawPrediction and probability vectors are visible in the cell result.
pred_results.collect()

[Row(Age=38, Salary=71000, features=DenseVector([38.0, 71000.0]), rawPrediction=DenseVector([-324.828, -325.2094]), probability=DenseVector([0.5942, 0.4058]), prediction=0.0)]

In [155]:
# Score one hand-built feature vector directly with the fitted model, without
# going through a DataFrame. The vector follows the assembler's input order,
# [Age, Salary]. Note that the Salary used here (77700) differs from the
# 71000 in the `test` row scored above.
# Only DenseVector is needed, so it is imported by name rather than with a
# wildcard import that would flood the notebook namespace.
from pyspark.ml.linalg import DenseVector

model.predict(DenseVector([38.0, 77700.0]))

0.0

## The model predicts that a person with Age=38 and Salary=71000 does not make the purchase (prediction = 0.0)

### Probability(YES, Purchased=1) = 0.4058, Probability(NO, Purchased=0) = 0.5942