## Spark MILIB - modelo de classificação com -Decision Tree

Descrição

- facil de compreender e facil de explicar
- Variáveis preditoras são usadas para construir uma arvore que progressivamente preve valores target
- Dados de treino são usados para construir uma arvore de decisao
- as arvores de decisão se torna um modelo quye é usado para prever decisoes com novos dados

### Objetivo do estudo 
### classificar as espécies de flores listadas no dataset iris

In [26]:
# imports
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# criando a sessao spark
spSession = SparkSession.builder.master('local').appName('SparkMlLib').getOrCreate()

In [3]:
# carregando dataset iris
irisRDD = sc.textFile('5-Arquivos-Cap11/data/iris.csv')

In [5]:
irisRDD.take(5)

['Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species',
 '5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa']

In [6]:
## colocando em um cache para otimizar a performance
irisRDD.cache()

5-Arquivos-Cap11/data/iris.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [9]:
# removendo a primiera linha
irisRDD2 = irisRDD.filter(lambda x: "Sepal" not in x)
irisRDD2.cache()
irisRDD2.take(5)

['5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa',
 '5,3.6,1.4,0.2,setosa']

Limpeza dos dados

In [10]:
irisRDD3 = irisRDD2.map(lambda x: x.split(','))

In [12]:
irisRDD3.take(5)

[['5.1', '3.5', '1.4', '0.2', 'setosa'],
 ['4.9', '3', '1.4', '0.2', 'setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'setosa'],
 ['5', '3.6', '1.4', '0.2', 'setosa']]

In [14]:
irisRDD4 = irisRDD3.map(lambda x: Row(SEPAL_LENGTH=float(x[0]), SEPAL_WIDTH=float(x[1]),
                                     PETAL_LENGTH=float(x[2]), PETAL_WIDTH=float(x[3]), SPECIES=x[4]))
irisRDD4.take(5)

[Row(SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, PETAL_LENGTH=1.3, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=4.6, SEPAL_WIDTH=3.1, PETAL_LENGTH=1.5, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=5.0, SEPAL_WIDTH=3.6, PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SPECIES='setosa')]

In [15]:
# criando dataframe
irisDF = spSession.createDataFrame(irisRDD4)
irisDF.head(5)

[Row(SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, PETAL_LENGTH=1.3, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=4.6, SEPAL_WIDTH=3.1, PETAL_LENGTH=1.5, PETAL_WIDTH=0.2, SPECIES='setosa'),
 Row(SEPAL_LENGTH=5.0, SEPAL_WIDTH=3.6, PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SPECIES='setosa')]

In [16]:
irisDF.cache()

DataFrame[SEPAL_LENGTH: double, SEPAL_WIDTH: double, PETAL_LENGTH: double, PETAL_WIDTH: double, SPECIES: string]

In [17]:
# criando indice numérico para species
stringIndice = StringIndexer(inputCol='SPECIES', outputCol='IDX_SPECIES')
si_model = stringIndice.fit(irisDF)
irisNormDF = si_model.transform(irisDF)

In [20]:
irisNormDF.select('SPECIES','IDX_SPECIES').distinct().collect()

[Row(SPECIES='setosa', IDX_SPECIES=0.0),
 Row(SPECIES='virginica', IDX_SPECIES=2.0),
 Row(SPECIES='versicolor', IDX_SPECIES=1.0)]

### Análise exploratória

In [23]:
irisNormDF.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+------------------+
|summary|      SEPAL_LENGTH|       SEPAL_WIDTH|      PETAL_LENGTH|       PETAL_WIDTH|  SPECIES|       IDX_SPECIES|
+-------+------------------+------------------+------------------+------------------+---------+------------------+
|  count|               150|               150|               150|               150|      150|               150|
|   mean| 5.843333333333332|3.0573333333333337| 3.758000000000001|1.1993333333333331|     null|               1.0|
| stddev|0.8280661279778634|0.4358662849366978|1.7652982332594662|0.7622376689603467|     null|0.8192319205190406|
|    min|               4.3|               2.0|               1.0|               0.1|   setosa|               0.0|
|    max|               7.9|               4.4|               6.9|               2.5|virginica|               2.0|
+-------+------------------+------------------+------------------+--------------

In [25]:
# encontrando a correlação entre a variáveos
for i in irisNormDF.columns:
    if not(isinstance(irisNormDF.select(i).take(1)[0][0], str)):
        print("Correlação entre a variável target SPECIES e a variavel ", i, irisNormDF.stat.corr('IDX_SPECIES',i))

Correlação entre a variável target SPECIES e a variavel  SEPAL_LENGTH 0.7825612318100814
Correlação entre a variável target SPECIES e a variavel  SEPAL_WIDTH -0.4266575607811232
Correlação entre a variável target SPECIES e a variavel  PETAL_LENGTH 0.9490346990083887
Correlação entre a variável target SPECIES e a variavel  PETAL_WIDTH 0.9565473328764027
Correlação entre a variável target SPECIES e a variavel  IDX_SPECIES 1.0


## Pré processamento dos dados

In [34]:
# Criando um LabeledPOint (target, (Vector[features]))

def transformaVar(row):
    obj = (row['SPECIES'], row['IDX_SPECIES'], Vectors.dense([row['SEPAL_LENGTH'],row['SEPAL_WIDTH'],
                                                              row['PETAL_LENGTH'], row['PETAL_WIDTH']]))
    return obj

In [35]:
irisRDD5 = irisNormDF.rdd.map(transformaVar)

In [36]:
irisRDD5.take(5)

[('setosa', 0.0, DenseVector([5.1, 3.5, 1.4, 0.2])),
 ('setosa', 0.0, DenseVector([4.9, 3.0, 1.4, 0.2])),
 ('setosa', 0.0, DenseVector([4.7, 3.2, 1.3, 0.2])),
 ('setosa', 0.0, DenseVector([4.6, 3.1, 1.5, 0.2])),
 ('setosa', 0.0, DenseVector([5.0, 3.6, 1.4, 0.2]))]

In [39]:
irisDF = spSession.createDataFrame(irisRDD5,['species','label','features'])


In [40]:
irisDF.select('species','label','features').distinct().show()

+----------+-----+-----------------+
|   species|label|         features|
+----------+-----+-----------------+
|    setosa|  0.0|[4.6,3.6,1.0,0.2]|
| virginica|  2.0|[6.3,2.8,5.1,1.5]|
| virginica|  2.0|[6.3,3.3,6.0,2.5]|
|versicolor|  1.0|[6.0,2.9,4.5,1.5]|
|    setosa|  0.0|[4.4,3.0,1.3,0.2]|
|    setosa|  0.0|[4.3,3.0,1.1,0.1]|
|versicolor|  1.0|[5.9,3.2,4.8,1.8]|
|    setosa|  0.0|[5.1,3.5,1.4,0.2]|
|versicolor|  1.0|[6.8,2.8,4.8,1.4]|
| virginica|  2.0|[6.7,3.0,5.2,2.3]|
|versicolor|  1.0|[5.7,2.6,3.5,1.0]|
|versicolor|  1.0|[5.5,2.4,3.7,1.0]|
| virginica|  2.0|[6.4,3.1,5.5,1.8]|
|    setosa|  0.0|[4.8,3.1,1.6,0.2]|
|versicolor|  1.0|[5.6,2.9,3.6,1.3]|
|versicolor|  1.0|[6.6,3.0,4.4,1.4]|
| virginica|  2.0|[6.8,3.0,5.5,2.1]|
| virginica|  2.0|[7.7,2.8,6.7,2.0]|
|    setosa|  0.0|[4.9,3.0,1.4,0.2]|
| virginica|  2.0|[5.7,2.5,5.0,2.0]|
+----------+-----+-----------------+
only showing top 20 rows



In [41]:
irisDF.cache()

DataFrame[species: string, label: double, features: vector]

## Machine Learning

In [43]:
(dados_treino, dados_teste) = irisDF.randomSplit([0.7,0.3])

In [44]:
# maxdepth = 2 numero de nós na arvore de decisao
dfClassificer = DecisionTreeClassifier(maxDepth=2, labelCol='label', featuresCol='features')
modelo = dfClassificer.fit(dados_treino)

In [45]:
modelo.numNodes

5

In [46]:
modelo.depth

2

In [47]:
previsoes = modelo.transform(dados_teste)

In [48]:
previsoes.select('prediction','species','label').collect()

[Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(predic

In [49]:
avaliador = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='label',metricName='accuracy')
avaliador.evaluate(previsoes)

0.9487179487179487

In [50]:
previsoes.groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   14|
|  2.0|       2.0|    9|
|  1.0|       2.0|    2|
|  0.0|       0.0|   14|
+-----+----------+-----+

