In [1]:
import os
import sys

# Make the Spark driver and the workers use the interpreter running this kernel,
# so both sides agree on the Python version.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
# Kept ahead of `import pyspark`: findspark.init() is expected to make the local
# Spark installation's pyspark package importable.
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf().setAppName('appName').setMaster('local')

# getOrCreate() reuses an already-running session/context, so re-running this
# cell in a live kernel no longer fails on a second SparkContext being created.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Legacy entry point, kept so that code referring to `sqlcontext` still works;
# it is bound explicitly to the session created above.
sqlcontext = SQLContext(sc, sparkSession=spark)

In [75]:
# Read the play-tennis CSV: the first row supplies the column names and the
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
playtennisDf = csv_reader.csv("playtennis1.csv")
playtennisDf.show()

+--------+-----------+--------+-----+-----+
| Outlook|Temperature|Humidity|Windy|Class|
+--------+-----------+--------+-----+-----+
|   sunny|        hot|    high|false|    n|
|   sunny|        hot|    high| true|    n|
|Overcast|        hot|    high|false|    p|
|    rain|       mild|    high|false|    p|
|    rain|       cool|  normal|false|    p|
|    rain|       cool|  normal| true|    n|
|Overcast|       cool|  normal| true|    p|
|   sunny|       mild|    high|false|    n|
|   sunny|       cool|  normal|false|    p|
|    rain|       mild|  normal|false|    p|
|   sunny|       mild|  normal| true|    p|
|Overcast|       mild|    high| true|    p|
|Overcast|        hot|  normal|false|    p|
|    rain|       mild|    high| true|    n|
|    rain|        hot|    high|false|    n|
+--------+-----------+--------+-----+-----+



In [76]:
# Re-select every column, with Windy cast to an integer and moved to the front.
windy_as_int = playtennisDf.Windy.cast("Integer")
playtennisDf = playtennisDf.select(
    windy_as_int,
    'Outlook',
    'Temperature',
    'Humidity',
    'Class',
)

In [77]:
# Print the frame after the Windy cast (defaults spelled out: 20 rows, truncated cells).
playtennisDf.show(n=20, truncate=True)

+-----+--------+-----------+--------+-----+
|Windy| Outlook|Temperature|Humidity|Class|
+-----+--------+-----------+--------+-----+
|    0|   sunny|        hot|    high|    n|
|    1|   sunny|        hot|    high|    n|
|    0|Overcast|        hot|    high|    p|
|    0|    rain|       mild|    high|    p|
|    0|    rain|       cool|  normal|    p|
|    1|    rain|       cool|  normal|    n|
|    1|Overcast|       cool|  normal|    p|
|    0|   sunny|       mild|    high|    n|
|    0|   sunny|       cool|  normal|    p|
|    0|    rain|       mild|  normal|    p|
|    1|   sunny|       mild|  normal|    p|
|    1|Overcast|       mild|    high|    p|
|    0|Overcast|        hot|  normal|    p|
|    1|    rain|       mild|    high|    n|
|    0|    rain|        hot|    high|    n|
+-----+--------+-----------+--------+-----+



In [93]:
# 75/25 train/test split; the fixed seed keeps the split repeatable.
train, test = playtennisDf.randomSplit(weights=[0.75, 0.25], seed=123)

In [94]:
from pyspark.ml.feature import RFormula

In [95]:
# R-style formula: Class is the response, the other four columns are predictors.
# The encoded response is written to the "label" column.
rf = (
    RFormula()
    .setFormula("Class ~ Windy + Outlook + Temperature + Humidity")
    .setLabelCol("label")
)


In [96]:
from pyspark.ml.classification import NaiveBayes

In [97]:
# Naive Bayes classifier reading the "features" and "label" columns.
nb = NaiveBayes().setFeaturesCol("features").setLabelCol("label")


In [98]:
# Pipeline order: the formula transformer first, then the classifier.
stages = [
    rf,
    nb,
]

In [99]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [100]:
# Chain the stages into a single estimator.
pipeline = Pipeline(stages=stages)

In [102]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=train)


In [103]:
# Apply the fitted pipeline to the held-out split.
pred_res = model.transform(dataset=test)

In [107]:
# Print the scored test rows (defaults spelled out: 20 rows, truncated cells).
pred_res.show(n=20, truncate=True)

+-----+--------+-----------+--------+-----+--------------------+-----+--------------------+--------------------+----------+
|Windy| Outlook|Temperature|Humidity|Class|            features|label|       rawPrediction|         probability|prediction|
+-----+--------+-----------+--------+-----+--------------------+-----+--------------------+--------------------+----------+
|    0|    rain|       cool|  normal|    p|       (6,[1],[1.0])|  0.0|[-2.5314266654228...|[0.48837209302325...|       1.0|
|    0|   sunny|       cool|  normal|    p|           (6,[],[])|  0.0|[-0.5389965007326...|[0.58333333333333...|       0.0|
|    1|Overcast|       cool|  normal|    p| (6,[0,2],[1.0,1.0])|  0.0|[-4.2361747576613...|[0.79612468407750...|       0.0|
|    1|    rain|       mild|    high|    n|[1.0,1.0,0.0,1.0,...|  1.0|[-7.7102094632757...|[0.53152290154383...|       0.0|
|    1|   sunny|        hot|    high|    n|[1.0,0.0,0.0,0.0,...|  1.0|[-6.2286049223515...|[0.39962222660765...|       1.0|
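+-----+--------+-----------+--------+-----+--------------------+-----+--------------------+--------------------+----------+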

In [105]:
# Compare the "prediction" column against the true "label" column.
# labelCol must name the ground-truth column; pointing it at "prediction" would
# score the predictions against themselves.
# metricName is set explicitly because the evaluator's documented default is
# "f1", not accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [106]:
# Score the test predictions against the true labels.
# The metric is requested explicitly: without it the evaluator reports its
# documented default ("f1"), which is not accuracy.
evaluator.setLabelCol("label").setMetricName("accuracy").evaluate(pred_res)  # accuracy

0.6000000000000001