In [1]:
import os
import sys

# Point both the Spark workers and the driver at the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

import findspark
# Locate the Spark installation and put its Python packages on sys.path so
# that the `import pyspark` that follows succeeds. init() already resolves
# SPARK_HOME through find() when no path is given, so a separate find() call
# (whose return value was discarded) is unnecessary.
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

# Spark configuration: application name 'appName', local master.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')

# getOrCreate() reuses a context/session that is already alive in this kernel,
# so re-running the cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Legacy SQL entry point, kept under its original name and bound explicitly
# to the session created above.
sqlcontext = SQLContext(sc, sparkSession=spark)

In [363]:
# Load the fruit data set: the first CSV line is the header, and column types
# are inferred from the data.
fruitDF = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("fruits.csv")
)

In [364]:
# Print the DataFrame contents as a text table.
fruitDF.show()
# Last expression of the cell: displays the list of column names.
fruitDF.columns

+---------+------+------+
|PulpColor| Taste|Edible|
+---------+------+------+
|      red|  sour|   yes|
|      red|  sour|    no|
|      red|bitter|    no|
|   yellow| sweet|   yes|
|   yellow|bitter|    no|
|    green|  sour|   yes|
|    green| sweet|   yes|
|    green|bitter|    no|
+---------+------+------+



['PulpColor', 'Taste', 'Edible']

In [365]:
from pyspark.ml.feature import RFormula

In [366]:
# R-style formula: `Edible` is the label, `PulpColor` and `Taste` are the
# predictors. The output column names are spelled out; they are the defaults.
rf = RFormula(
    formula="Edible ~ PulpColor + Taste",
    featuresCol="features",
    labelCol="label",
)

In [367]:
# Fit the formula on the fruit data, then apply the fitted model to the same
# frame to append the `features` and `label` columns.
formula_model = rf.fit(fruitDF)
finalizedDF = formula_model.transform(fruitDF)
finalizedDF.show()
# Last expression of the cell: displays every row as a Row object
# (the frame has 8 rows).
finalizedDF.collect()


+---------+------+------+-----------------+-----+
|PulpColor| Taste|Edible|         features|label|
+---------+------+------+-----------------+-----+
|      red|  sour|   yes|[0.0,1.0,0.0,1.0]|  1.0|
|      red|  sour|    no|[0.0,1.0,0.0,1.0]|  0.0|
|      red|bitter|    no|[0.0,1.0,1.0,0.0]|  0.0|
|   yellow| sweet|   yes|        (4,[],[])|  1.0|
|   yellow|bitter|    no|    (4,[2],[1.0])|  0.0|
|    green|  sour|   yes|[1.0,0.0,0.0,1.0]|  1.0|
|    green| sweet|   yes|    (4,[0],[1.0])|  1.0|
|    green|bitter|    no|[1.0,0.0,1.0,0.0]|  0.0|
+---------+------+------+-----------------+-----+



[Row(PulpColor='red', Taste='sour', Edible='yes', features=DenseVector([0.0, 1.0, 0.0, 1.0]), label=1.0),
 Row(PulpColor='red', Taste='sour', Edible='no', features=DenseVector([0.0, 1.0, 0.0, 1.0]), label=0.0),
 Row(PulpColor='red', Taste='bitter', Edible='no', features=DenseVector([0.0, 1.0, 1.0, 0.0]), label=0.0),
 Row(PulpColor='yellow', Taste='sweet', Edible='yes', features=SparseVector(4, {}), label=1.0),
 Row(PulpColor='yellow', Taste='bitter', Edible='no', features=SparseVector(4, {2: 1.0}), label=0.0),
 Row(PulpColor='green', Taste='sour', Edible='yes', features=DenseVector([1.0, 0.0, 0.0, 1.0]), label=1.0),
 Row(PulpColor='green', Taste='sweet', Edible='yes', features=SparseVector(4, {0: 1.0}), label=1.0),
 Row(PulpColor='green', Taste='bitter', Edible='no', features=DenseVector([1.0, 0.0, 1.0, 0.0]), label=0.0)]

In [368]:
#test_df=spark.createDataFrame(finalizedDF.tail(1))
#test_df.show()

In [369]:
#train,test=finalizedDF.randomSplit([0.7,0.3],1234)

In [2]:
# NaiveBayes is used below; LogisticRegression is imported but not used in the
# visible cells.
from pyspark.ml.classification import NaiveBayes, LogisticRegression
# Wildcard import; the cells below rely on it for DenseVector and SparseVector.
from pyspark.ml.linalg import *

In [371]:
# Naive Bayes classifier that reads the `features` vector and the `label`
# column of the encoded frame.
nb = NaiveBayes(labelCol="label", featuresCol="features")

In [372]:
# Train on the full encoded frame (no train/test split is applied).
model=nb.fit(finalizedDF)

In [373]:
# Feature order, as read off the encoded table shown earlier, is
# [green, red, bitter, sour]; yellow and sweet are the all-zero baselines.
# This vector therefore describes a yellow, sour fruit.
query_features = DenseVector([0.0, 0.0, 0.0, 1.0])
model.predict(query_features)

1.0

In [374]:
# Same yellow, sour fruit as in the prediction cell: [green, red, bitter, sour].
# Presumably entry i of the result is the probability of label i - confirm
# against the Spark ML documentation.
query_features = DenseVector([0.0, 0.0, 0.0, 1.0])
model.predictProbability(query_features)

DenseVector([0.3529, 0.6471])

In [None]:
# Conclusion: for a yellow, sour fruit the model predicts label 1.0 (Edible = yes) with probability of about 0.65.

In [3]:
# Sparse vector of size 4 with a single non-zero entry: index 0 holds 1.0.
sv = SparseVector(4, {0: 1.0})

In [4]:
# Expand the sparse vector into a dense NumPy array; indices that were not set
# show up as 0.0.
sv.toArray()

array([1., 0., 0., 0.])