In [2]:
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession                    

In [3]:
spark = SparkSession.builder.master("local").\
    appName("CNPJ").\
        config("spark.executer.memory","1gb").\
            getOrCreate()

In [4]:
iris = spark.read.csv("D:\downloads\iris.csv",inferSchema=True,header=True)

In [5]:
iris.show(5)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [8]:
formula = RFormula(formula="class ~ .",featuresCol="features",labelCol="label",handleInvalid="skip")

In [9]:
iris_transf = formula.fit(iris).transform(iris).select("features","label")

In [12]:
iris_transf.show(150)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[4.8,3.4,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
|[4.3,3.0,1.1,0.1]|  0.0|
|[5.8,4.0,1.2,0.2]|  0.0|
|[5.7,4.4,1.5,0.4]|  0.0|
|[5.4,3.9,1.3,0.4]|  0.0|
|[5.1,3.5,1.4,0.3]|  0.0|
|[5.7,3.8,1.7,0.3]|  0.0|
|[5.1,3.8,1.5,0.3]|  0.0|
|[5.4,3.4,1.7,0.2]|  0.0|
|[5.1,3.7,1.5,0.4]|  0.0|
|[4.6,3.6,1.0,0.2]|  0.0|
|[5.1,3.3,1.7,0.5]|  0.0|
|[4.8,3.4,1.9,0.2]|  0.0|
|[5.0,3.0,1.6,0.2]|  0.0|
|[5.0,3.4,1.6,0.4]|  0.0|
|[5.2,3.5,1.5,0.2]|  0.0|
|[5.2,3.4,1.4,0.2]|  0.0|
|[4.7,3.2,1.6,0.2]|  0.0|
|[4.8,3.1,1.6,0.2]|  0.0|
|[5.4,3.4,1.5,0.4]|  0.0|
|[5.2,4.1,1.5,0.1]|  0.0|
|[5.5,4.2,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.0,3.2,1.

In [13]:
irisTreino, irisTeste = iris_transf.randomSplit([0.7,0.3])

In [14]:
irisTreino.count()

101

In [15]:
irisTeste.count()

49

In [16]:
nb = NaiveBayes(labelCol="label",featuresCol="features")

In [17]:
modelo = nb.fit(irisTreino)

In [18]:
previsao = modelo.transform(irisTeste)

In [19]:
previsao.show()

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,2.9,1.4,0.2]|  0.0|[-10.742669979103...|[0.66754198228247...|       0.0|
|[4.7,3.2,1.6,0.2]|  0.0|[-11.668963000583...|[0.68629094963761...|       0.0|
|[4.8,3.0,1.4,0.3]|  0.0|[-11.498474161851...|[0.66836168856034...|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|[-11.630685896805...|[0.67795207925316...|       0.0|
|[4.8,3.4,1.6,0.2]|  0.0|[-11.957499330152...|[0.71510209607847...|       0.0|
|[4.9,2.4,3.3,1.0]|  1.0|[-17.146298726659...|[0.11571231205200...|       1.0|
|[4.9,3.1,1.5,0.1]|  0.0|[-11.143374328795...|[0.72628737406873...|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|[-17.108021622880...|[0.11181084969987...|       1.0|
|[5.0,3.3,1.4,0.2]|  0.0|[-11.602385467587...|[0.74133795084324...|       0.0|
|[5.0,3.4,1.5,0.2]|  0.0|[-11.905072011764...|[0.738

In [20]:
avaliar = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label",metricName="accuracy")

In [21]:
resultado = avaliar.evaluate(previsao)

In [22]:
resultado

0.5918367346938775