In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
findspark.init()
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext=SQLContext(sc)

In [41]:
trainingDf=spark.read.csv("vehicle_stolen_dataset.csv",header=True,inferSchema=True)

In [42]:
trainingDf=trainingDf.select("brand","color","time","stoled")

In [43]:
trainingDf.show()
trainingDf.columns

+------+-----+-----+------+
| brand|color| time|stoled|
+------+-----+-----+------+
|   BMW|black|night|   yes|
|  Audi|black|night|    no|
|NISSAN|black|night|   yes|
|  VEGA|  red|  day|   yes|
|   BMW| blue|  day|    no|
|  Audi|black|  day|   yes|
|  VEGA|  red|night|    no|
|  Audi| blue|  day|   yes|
|  VEGA|black|  day|   yes|
|NISSAN| blue|  day|    no|
|   BMW|black|night|   yes|
|NISSAN|  red|  day|    no|
|  VEGA|black|night|   yes|
|   BMW|  red|  day|    no|
|  Audi|black|  day|   yes|
|  Audi| blue|night|   yes|
|  Audi|  red|  day|    no|
|NISSAN|black|  day|   yes|
|   BMW| blue|  day|   yes|
|   BMW|  red|night|   yes|
+------+-----+-----+------+



['brand', 'color', 'time', 'stoled']

In [39]:
#train,test=trainingDf.randomSplit([0.7,0.3])
#test=spark.createDataFrame([("Audi","blue","day","yes")]).toDF("brand","color","time","stoled")

In [40]:
from pyspark.ml.feature import RFormula

In [44]:
rf=RFormula(formula="stoled ~ brand + color + time")
indexed_trainingDF=rf.fit(trainingDf).transform(trainingDf)

In [45]:
indexed_trainingDF.show()

+------+-----+-----+------+--------------------+-----+
| brand|color| time|stoled|            features|label|
+------+-----+-----+------+--------------------+-----+
|   BMW|black|night|   yes| (6,[1,3],[1.0,1.0])|  0.0|
|  Audi|black|night|    no| (6,[0,3],[1.0,1.0])|  1.0|
|NISSAN|black|night|   yes| (6,[2,3],[1.0,1.0])|  0.0|
|  VEGA|  red|  day|   yes| (6,[4,5],[1.0,1.0])|  0.0|
|   BMW| blue|  day|    no| (6,[1,5],[1.0,1.0])|  1.0|
|  Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|
|  VEGA|  red|night|    no|       (6,[4],[1.0])|  1.0|
|  Audi| blue|  day|   yes| (6,[0,5],[1.0,1.0])|  0.0|
|  VEGA|black|  day|   yes| (6,[3,5],[1.0,1.0])|  0.0|
|NISSAN| blue|  day|    no| (6,[2,5],[1.0,1.0])|  1.0|
|   BMW|black|night|   yes| (6,[1,3],[1.0,1.0])|  0.0|
|NISSAN|  red|  day|    no|[0.0,0.0,1.0,0.0,...|  1.0|
|  VEGA|black|night|   yes|       (6,[3],[1.0])|  0.0|
|   BMW|  red|  day|    no|[0.0,1.0,0.0,0.0,...|  1.0|
|  Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|
|  Audi| b

In [14]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.linalg import *

In [55]:
train,test=indexed_trainingDF.randomSplit([0.7,0.3],1234)


In [56]:
nb=NaiveBayes(featuresCol="features",labelCol="label")

In [57]:
#stages=[rf,nb]

In [58]:
#from pyspark.ml import Pipeline

In [59]:
#pipeline=Pipeline().setStages(stages)

In [60]:
model=nb.fit(train)

In [61]:
pred_results=model.transform(test)

In [62]:
pred_results.show()

+-----+-----+-----+------+--------------------+-----+--------------------+--------------------+----------+
|brand|color| time|stoled|            features|label|       rawPrediction|         probability|prediction|
+-----+-----+-----+------+--------------------+-----+--------------------+--------------------+----------+
| Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|[-5.8900165215892...|[0.66029056470057...|       0.0|
| Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|[-5.8900165215892...|[0.66029056470057...|       0.0|
| Audi| blue|night|   yes|       (6,[0],[1.0])|  0.0|[-3.1114930638296...|[0.43682310469314...|       1.0|
+-----+-----+-----+------+--------------------+-----+--------------------+--------------------+----------+



In [63]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [64]:
evaluator=MulticlassClassificationEvaluator().setLabelCol("label")

In [65]:
accuracy=evaluator.evaluate(pred_results)

In [66]:
accuracy

0.8000000000000002