# Atividade

In [2]:
import os

from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally; 'local[*]' asks for
# as many worker threads as there are logical cores on this machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

from pyspark.ml.feature import RFormula
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Preparando os dados

In [3]:
# Location of the iris CSV. Set the IRIS_CSV environment variable to run the
# notebook on another machine; the default is the original hard-coded path.
iris_csv_path = os.environ.get("IRIS_CSV", "/home/elvin/download/iris.csv")

# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer column types instead of reading every column as a string.
iris_temp = spark.read.csv(iris_csv_path, inferSchema=True, header=True)
iris_temp.show()

                                                                                

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|
|        5.0|       3.4|        1.5|       0.2|Iris-setosa|
|        4.4|       2.9|        1.4|       0.2|Iris-setosa|
|        4.9|       3.1|        1.5|       0.1|Iris-setosa|
|        5.4|       3.7|        1.5|       0.2|Iris-setosa|
|        4.8|       3.4|        1.6|       0.2|Iris-setosa|
|        4.8|       3.0|        1.4|       0.1|Iris-setosa|
|        4.3|       3.0|        1.1|    

In [14]:
# 'class ~ .' declares `class` as the label and every remaining column as a
# feature. The fitted formula produces a `features` vector column and a
# numeric `label` column, which are the only two columns kept.
formula = RFormula(
    formula='class ~ .',
    featuresCol='features',
    labelCol='label',
    handleInvalid='skip',
)
formula_model = formula.fit(iris_temp)
iris_trans = formula_model.transform(iris_temp).select('features', 'label')
iris_trans.show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[4.8,3.4,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
|[4.3,3.0,1.1,0.1]|  0.0|
|[5.8,4.0,1.2,0.2]|  0.0|
|[5.7,4.4,1.5,0.4]|  0.0|
|[5.4,3.9,1.3,0.4]|  0.0|
|[5.1,3.5,1.4,0.3]|  0.0|
|[5.7,3.8,1.7,0.3]|  0.0|
|[5.1,3.8,1.5,0.3]|  0.0|
+-----------------+-----+
only showing top 20 rows



In [10]:
# Sanity check: list the distinct values of the encoded label column.
distinct_labels = iris_trans.select('label').distinct()
distinct_labels.show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
|  2.0|
+-----+



In [13]:
# Split roughly 70% / 30% into training and test sets. The fixed seed makes
# the split repeatable, so the row counts and the accuracies computed later
# do not change from one run of the notebook to the next.
iris_treino, iris_teste = iris_trans.randomSplit([0.7, 0.3], seed=42)
print(iris_treino.count())
print(iris_teste.count())

107
43


## Criando os modelos de classificação

In [16]:
# Train a random forest on the training split. Forest training is randomized,
# so a fixed seed is passed to make the fitted model repeatable.
rfclassifier = RandomForestClassifier(featuresCol='features', labelCol='label', seed=42)
modelo = rfclassifier.fit(iris_treino)

# Score the held-out rows; the result carries the rawPrediction, probability
# and prediction columns next to the original features and label.
previsao = modelo.transform(iris_teste)
previsao.show()

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,2.9,1.4,0.2]|  0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.6,3.4,1.4,0.3]|  0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[4.9,2.4,3.3,1.0]|  1.0|[0.0,19.945906432...|[0.0,0.9972953216...|       1.0|
|[4.9,2.5,4.5,1.7]|  2.0|[0.0,17.973684210...|[0.0,0.8986842105...|       1.0|
|[4.9,3.1,1.5,0.1]|  0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|[0.0,19.945906432...|[0.0,0.9972953216...|       1.0|
|[5.0,3.4,1.5,0.2]|  0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[5.0,3.5,1.3,0.3]|  0.0|      [20.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|[5.0,3.6,1.4,0.2]|  0.0|      [20.0,0.0,0.0]|      

In [18]:
# Evaluator that compares the `prediction` column against `label` and
# reports accuracy; it is reused for every model evaluated in this notebook.
avaliacao = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

# Accuracy of the random forest on the test split.
accuracy = avaliacao.evaluate(previsao)
print(accuracy)

0.9767441860465116


In [19]:
# Fit a multinomial Naive Bayes model on the same training split and score
# the same test rows, so the result is comparable with the random forest.
# NOTE(review): the features here are continuous measurements, while the
# multinomial variant is aimed at count-like data; a Gaussian model type may
# fit better if the installed Spark version offers it -- confirm before changing.
nbclassifier = NaiveBayes(
    featuresCol='features',
    labelCol='label',
    modelType="multinomial",
)
modelo2 = nbclassifier.fit(iris_treino)
previsao2 = modelo2.transform(iris_teste)
previsao2.show()

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,2.9,1.4,0.2]|  0.0|[-10.882805061628...|[0.62421525719613...|       0.0|
|[4.6,3.4,1.4,0.3]|  0.0|[-11.934366673348...|[0.66949503664793...|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|[-12.679188312511...|[0.62368343836520...|       0.0|
|[4.9,2.4,3.3,1.0]|  1.0|[-17.263123600977...|[0.09853767015200...|       1.0|
|[4.9,2.5,4.5,1.7]|  2.0|[-22.232621251688...|[0.01732881069558...|       2.0|
|[4.9,3.1,1.5,0.1]|  0.0|[-11.284586525440...|[0.68725680233504...|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|[-17.223941367806...|[0.09520515532030...|       1.0|
|[5.0,3.4,1.5,0.2]|  0.0|[-12.046560795491...|[0.70013291645220...|       0.0|
|[5.0,3.5,1.3,0.3]|  0.0|[-12.132226340255...|[0.71545902799630...|       0.0|
|[5.0,3.6,1.4,0.2]|  0.0|[-12.072379820241...|[0.738

In [20]:
# Accuracy of the Naive Bayes predictions, measured with the same evaluator
# that was configured for the random forest.
accuracy2 = avaliacao.evaluate(dataset=previsao2)
print(accuracy2)

0.7906976744186046
