In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
findspark.init()
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext=SQLContext(sc)

In [101]:
trainingDf=spark.read.csv("vehicle_stolen_dataset.csv",header=True,inferSchema=True)

In [102]:
trainingDf=trainingDf.select("brand","color","time","stoled")

In [103]:
trainingDf.show()
trainingDf.columns

+------+-----+-----+------+
| brand|color| time|stoled|
+------+-----+-----+------+
|   BMW|black|night|   yes|
|  Audi|black|night|    no|
|NISSAN|black|night|   yes|
|  VEGA|  red|  day|   yes|
|   BMW| blue|  day|    no|
|  Audi|black|  day|   yes|
|  VEGA|  red|night|    no|
|  Audi| blue|  day|   yes|
|  VEGA|black|  day|   yes|
|NISSAN| blue|  day|    no|
|   BMW|black|night|   yes|
|NISSAN|  red|  day|    no|
|  VEGA|black|night|   yes|
|   BMW|  red|  day|    no|
|  Audi|black|  day|   yes|
|  Audi| blue|night|   yes|
|  Audi|  red|  day|    no|
|NISSAN|black|  day|   yes|
|   BMW| blue|  day|   yes|
|   BMW|  red|night|   yes|
+------+-----+-----+------+



['brand', 'color', 'time', 'stoled']

In [104]:
from pyspark.ml.feature import RFormula

In [105]:
rf=RFormula(formula="stoled ~ brand + color + time")


In [106]:
indexedDf=rf.fit(trainingDf).transform(trainingDf)

In [107]:
indexedDf.show()

+------+-----+-----+------+--------------------+-----+
| brand|color| time|stoled|            features|label|
+------+-----+-----+------+--------------------+-----+
|   BMW|black|night|   yes| (6,[1,3],[1.0,1.0])|  0.0|
|  Audi|black|night|    no| (6,[0,3],[1.0,1.0])|  1.0|
|NISSAN|black|night|   yes| (6,[2,3],[1.0,1.0])|  0.0|
|  VEGA|  red|  day|   yes| (6,[4,5],[1.0,1.0])|  0.0|
|   BMW| blue|  day|    no| (6,[1,5],[1.0,1.0])|  1.0|
|  Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|
|  VEGA|  red|night|    no|       (6,[4],[1.0])|  1.0|
|  Audi| blue|  day|   yes| (6,[0,5],[1.0,1.0])|  0.0|
|  VEGA|black|  day|   yes| (6,[3,5],[1.0,1.0])|  0.0|
|NISSAN| blue|  day|    no| (6,[2,5],[1.0,1.0])|  1.0|
|   BMW|black|night|   yes| (6,[1,3],[1.0,1.0])|  0.0|
|NISSAN|  red|  day|    no|[0.0,0.0,1.0,0.0,...|  1.0|
|  VEGA|black|night|   yes|       (6,[3],[1.0])|  0.0|
|   BMW|  red|  day|    no|[0.0,1.0,0.0,0.0,...|  1.0|
|  Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|
|  Audi| b

In [180]:
train,test=indexedDf.randomSplit([0.6,0.4],1234)

In [181]:
from pyspark.ml.classification import LogisticRegression 

In [182]:
lr=LogisticRegression(featuresCol="features",labelCol="label")

In [183]:
model=lr.fit(train)

In [184]:
pred_results=model.transform(test)

In [185]:
pred_results.show()

+------+-----+-----+------+--------------------+-----+--------------------+--------------------+----------+
| brand|color| time|stoled|            features|label|       rawPrediction|         probability|prediction|
+------+-----+-----+------+--------------------+-----+--------------------+--------------------+----------+
|  Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|[1.18683994821813...|[0.76617541515698...|       0.0|
|  Audi|black|  day|   yes|[1.0,0.0,0.0,1.0,...|  0.0|[1.18683994821813...|[0.76617541515698...|       0.0|
|  Audi| blue|night|   yes|       (6,[0],[1.0])|  0.0|[-1.9873533605182...|[0.12053714562044...|       1.0|
|   BMW| blue|  day|   yes| (6,[1,5],[1.0,1.0])|  0.0|[0.17632085360392...|[0.54396636635511...|       0.0|
|NISSAN|black|night|   yes| (6,[2,3],[1.0,1.0])|  0.0|[0.45179567556003...|[0.61106608698419...|       0.0|
|NISSAN|  red|  day|    no|[0.0,0.0,1.0,0.0,...|  1.0|[-1.5777855837937...|[0.17110932601919...|       1.0|
+------+-----+-----+------+-

In [186]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [187]:
evaluator=MulticlassClassificationEvaluator().setLabelCol("label")

In [188]:
accuracy=evaluator.evaluate(pred_results)

In [189]:
accuracy

0.8518518518518519

In [190]:
from pyspark.ml.linalg import Vectors, DenseVector
model.predict(DenseVector([1.0, 0.0, 0.0, 0.0, 1.0, 1.0]))

1.0

In [168]:
indexedDf.collect()

[Row(brand='BMW', color='black', time='night', stoled='yes', features=SparseVector(6, {1: 1.0, 3: 1.0}), label=0.0),
 Row(brand='Audi', color='black', time='night', stoled='no', features=SparseVector(6, {0: 1.0, 3: 1.0}), label=1.0),
 Row(brand='NISSAN', color='black', time='night', stoled='yes', features=SparseVector(6, {2: 1.0, 3: 1.0}), label=0.0),
 Row(brand='VEGA', color='red', time='day', stoled='yes', features=SparseVector(6, {4: 1.0, 5: 1.0}), label=0.0),
 Row(brand='BMW', color='blue', time='day', stoled='no', features=SparseVector(6, {1: 1.0, 5: 1.0}), label=1.0),
 Row(brand='Audi', color='black', time='day', stoled='yes', features=DenseVector([1.0, 0.0, 0.0, 1.0, 0.0, 1.0]), label=0.0),
 Row(brand='VEGA', color='red', time='night', stoled='no', features=SparseVector(6, {4: 1.0}), label=1.0),
 Row(brand='Audi', color='blue', time='day', stoled='yes', features=SparseVector(6, {0: 1.0, 5: 1.0}), label=0.0),
 Row(brand='VEGA', color='black', time='day', stoled='yes', features=Sp