In [1]:
import os
import sys

# Point both the Spark workers and the driver at the interpreter that is
# running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Statement order matters here: findspark is initialised before pyspark is
# imported.
# NOTE(review): presumably init() locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()
# The return value of find() is discarded (it is not the last expression of the
# cell, so nothing is displayed either).
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

# Build (or reuse) the local Spark entry points.
#
# getOrCreate() keeps this cell safe to re-run in the same kernel: constructing
# SparkContext(conf=...) directly raises "Cannot run multiple SparkContexts at
# once" on the second execution.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
# SQLContext is a legacy entry point; it is kept only so that existing
# references to `sqlcontext` keep working. New code should go through `spark`.
sqlcontext = SQLContext(sc)

In [54]:
# Load the play-tennis CSV: the first row is the header and column types are
# inferred from the data.
playtennisDf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("playtennis1.csv")
)
playtennisDf.show()

+--------+-----------+--------+-----+-----+
| Outlook|Temperature|Humidity|Windy|Class|
+--------+-----------+--------+-----+-----+
|   sunny|        hot|    high|false|    n|
|   sunny|        hot|    high| true|    n|
|Overcast|        hot|    high|false|    p|
|    rain|       mild|    high|false|    p|
|    rain|       cool|  normal|false|    p|
|    rain|       cool|  normal| true|    n|
|Overcast|       cool|  normal| true|    p|
|   sunny|       mild|    high|false|    n|
|   sunny|       cool|  normal|false|    p|
|    rain|       mild|  normal|false|    p|
|   sunny|       mild|  normal| true|    p|
|Overcast|       mild|    high| true|    p|
|Overcast|        hot|  normal|false|    p|
|    rain|       mild|    high| true|    n|
|    rain|        hot|    high|false|    n|
+--------+-----------+--------+-----+-----+



In [55]:
# Recast the true/false Windy flag to an integer and move it to the first
# column; the remaining columns are passed through unchanged.
windy_as_int = playtennisDf["Windy"].cast("Integer")
tennisDf = playtennisDf.select(windy_as_int, 'Outlook', 'Temperature', 'Humidity', 'Class')

In [56]:
#test_data=spark.createDataFrame([(0,"rain","hot","high")]).toDF("Windy","Outlook","Temperature","Humidity")


In [57]:
from pyspark.ml.feature import StringIndexer

In [58]:
# Index every column of tennisDf into a numeric "<name>_ind" column; the
# target column "Class" is indexed into "label".
indexer_train = StringIndexer(
    inputCols=["Windy", "Outlook", "Temperature", "Humidity", "Class"],
    outputCols=["Windy_ind", "Outlook_ind", "Temperature_ind", "Humidity_ind", "label"],
)

In [59]:
# Fit the indexer on tennisDf, then apply the fitted model to the same frame.
indexer_model = indexer_train.fit(tennisDf)
indexed_tennisDF = indexer_model.transform(tennisDf)

In [60]:
# Inspect the indexed frame: original columns plus "label" and the "*_ind" columns.
indexed_tennisDF.show()

+-----+--------+-----------+--------+-----+-----+---------+---------------+-----------+------------+
|Windy| Outlook|Temperature|Humidity|Class|label|Windy_ind|Temperature_ind|Outlook_ind|Humidity_ind|
+-----+--------+-----------+--------+-----+-----+---------+---------------+-----------+------------+
|    0|   sunny|        hot|    high|    n|  1.0|      0.0|            1.0|        1.0|         0.0|
|    1|   sunny|        hot|    high|    n|  1.0|      1.0|            1.0|        1.0|         0.0|
|    0|Overcast|        hot|    high|    p|  0.0|      0.0|            1.0|        2.0|         0.0|
|    0|    rain|       mild|    high|    p|  0.0|      0.0|            0.0|        0.0|         0.0|
|    0|    rain|       cool|  normal|    p|  0.0|      0.0|            2.0|        0.0|         1.0|
|    1|    rain|       cool|  normal|    n|  1.0|      1.0|            2.0|        0.0|         1.0|
|    1|Overcast|       cool|  normal|    p|  0.0|      1.0|            2.0|        2.0|    

In [61]:
# Assemble the independent (feature) columns into one vector with VectorAssembler
from pyspark.ml.feature import VectorAssembler

In [62]:
# Pack the four indexed predictors into a single "features" vector column.
# The order of inputCols fixes the position of each value inside the vector.
va = VectorAssembler(
    inputCols=["Windy_ind", "Outlook_ind", "Temperature_ind", "Humidity_ind"],
    outputCol="features",
)

In [63]:
# Append the assembled "features" vector column to the indexed frame.
finalized_DF=va.transform(indexed_tennisDF)

In [64]:
# Inspect the assembled frame. An all-zero feature row is printed in sparse
# form, e.g. (4,[],[]), rather than as [0.0,0.0,0.0,0.0].
finalized_DF.show()

+-----+--------+-----------+--------+-----+-----+---------+---------------+-----------+------------+-----------------+
|Windy| Outlook|Temperature|Humidity|Class|label|Windy_ind|Temperature_ind|Outlook_ind|Humidity_ind|         features|
+-----+--------+-----------+--------+-----+-----+---------+---------------+-----------+------------+-----------------+
|    0|   sunny|        hot|    high|    n|  1.0|      0.0|            1.0|        1.0|         0.0|[0.0,1.0,1.0,0.0]|
|    1|   sunny|        hot|    high|    n|  1.0|      1.0|            1.0|        1.0|         0.0|[1.0,1.0,1.0,0.0]|
|    0|Overcast|        hot|    high|    p|  0.0|      0.0|            1.0|        2.0|         0.0|[0.0,2.0,1.0,0.0]|
|    0|    rain|       mild|    high|    p|  0.0|      0.0|            0.0|        0.0|         0.0|        (4,[],[])|
|    0|    rain|       cool|  normal|    p|  0.0|      0.0|            2.0|        0.0|         1.0|[0.0,0.0,2.0,1.0]|
|    1|    rain|       cool|  normal|    n|  1.0

In [65]:
# 80/20 train/test split with a fixed seed.
train, test = finalized_DF.randomSplit(weights=[0.8, 0.2], seed=1234)

In [66]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.linalg import * #importing vectors

In [67]:
# Naive Bayes estimator reading the assembled "features" vector and the
# indexed "label" column.
# NOTE(review): NaiveBayes defaults to the multinomial model type, while the
# features here are category indices — confirm this is the intended treatment.
nb = NaiveBayes().setFeaturesCol("features").setLabelCol("label")

In [68]:
# Fit the Naive Bayes estimator on the training split.
model=nb.fit(train)



In [69]:
# Score the held-out split; the result carries the model's output columns
# (the next cell selects "probability" and "prediction" from it).
pred_results=model.transform(test)

In [70]:
# Show the raw inputs next to the true label and the model's output.
report_columns = [
    "Windy", "Outlook", "Temperature", "Humidity", "Class",
    "label", "probability", "prediction",
]
pred_results.select(*report_columns).show()

+-----+--------+-----------+--------+-----+-----+--------------------+----------+
|Windy| Outlook|Temperature|Humidity|Class|label|         probability|prediction|
+-----+--------+-----------+--------+-----+-----+--------------------+----------+
|    0|Overcast|        hot|  normal|    p|  0.0|[0.74052395776247...|       0.0|
|    0|    rain|       mild|    high|    p|  0.0|[0.53333333333333...|       0.0|
+-----+--------+-----------+--------+-----+-----+--------------------+----------+



In [71]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [72]:
# Evaluator that scores predictions by plain accuracy.
# metricName is set explicitly because MulticlassClassificationEvaluator
# defaults to "f1"; with the bare constructor the value stored as `accuracy`
# from this evaluator would actually be an F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [73]:
# evaluate() returns whichever metric `evaluator` is configured with.
# NOTE(review): this value is an accuracy only if the evaluator's metricName is
# "accuracy"; MulticlassClassificationEvaluator's default metric is "f1" —
# confirm the evaluator's configuration.
# The test split displayed above contains only two rows, so this score says
# little about the model.
accuracy=evaluator.evaluate(pred_results)
accuracy

1.0

In [87]:
# Hand-encoded feature vector, in VectorAssembler input order:
#   [Windy_ind, Outlook_ind, Temperature_ind, Humidity_ind]
#
# Index values as shown in the indexed_tennisDF.show() output:
#   Windy:        false=0.0, true=1.0
#   Outlook:      rain=0.0, sunny=1.0, Overcast=2.0
#   Temperature:  mild=0.0, hot=1.0, cool=2.0
#   Humidity:     high=0.0, normal=1.0
#
# So [0.0, 0.0, 2.0, 1.0] below means: not windy, rain, cool, normal humidity.
# NOTE(review): the previous note on this cell described "false, rain, hot, high";
# with the mapping above that sample would be [0.0, 0.0, 1.0, 0.0] — confirm
# which sample was intended.
model.predict(DenseVector([0.0, 0.0, 2.0, 1.0]))



0.0

In [89]:
# Per-class probabilities for the same hand-encoded vector as the predict() cell.
# Vector order is [Windy_ind, Outlook_ind, Temperature_ind, Humidity_ind].
# Per the indexed table, label 0.0 is class "p" and label 1.0 is class "n".
# NOTE(review): presumably the output positions follow the label index — confirm.
model.predictProbability(DenseVector([0.0, 0.0, 2.0, 1.0]))

DenseVector([0.6114, 0.3886])

In [90]:
# Raw per-class scores for the same hand-encoded vector as the predict() cell.
# The values printed below are negative, i.e. they are not probabilities.
model.predictRaw(DenseVector([0.0, 0.0, 2.0, 1.0]))

DenseVector([-4.5688, -5.0219])