# Aula Interativa 2 - Módulo 2 - Desenvolvimento de Soluções utilizando Spark

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("aula interativa 2") \
        .getOrCreate()

spark.version

'3.2.1'

In [6]:
titanic_df = spark.read.csv("/home/jovyan/work/titanic.csv", header="True", inferSchema="True")
                            
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
titanic_df.count()

891

In [10]:
titanic_df.groupBy("survived").count().show()

+--------+-----+
|survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [11]:
titanic_df.createOrReplaceTempView("table")

spark.sql("SELECT Survived, count(*) FROM table GROUP BY Survived").show()

+--------+--------+
|Survived|count(1)|
+--------+--------+
|       1|     342|
|       0|     549|
+--------+--------+



In [12]:
spark.sql("SELECT Survived, avg(Fare) FROM table GROUP BY Survived").show()

+--------+------------------+
|Survived|         avg(Fare)|
+--------+------------------+
|       1| 48.39540760233917|
|       0|22.117886885245877|
+--------+------------------+



In [14]:
# Criando uma UDF
def uppercase(str):
    return str.upper()
    
spark.udf.register("upperUDF", uppercase)

spark.sql("SELECT upperUDF(Name) FROM table").show(5)

+--------------------+
|      upperUDF(Name)|
+--------------------+
|BRAUND, MR. OWEN ...|
|CUMINGS, MRS. JOH...|
|HEIKKINEN, MISS. ...|
|FUTRELLE, MRS. JA...|
|ALLEN, MR. WILLIA...|
+--------------------+
only showing top 5 rows



In [24]:
# Criando o modelo de Árvore de Decisão
from pyspark.ml.classification import DecisionTreeClassifier

dtc = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")

In [48]:
#
from pyspark.ml.feature import VectorAssembler

vector_assembler = VectorAssembler(inputCols=["Age", "Fare", "SexVector"], outputCol="features")

In [22]:
mean_age = titanic_df.agg({"Age": "mean"}).collect()[0][0]
mean_age

29.69911764705882

In [23]:
titanic_df = titanic_df.fillna(mean_age, subset=["Age"])

In [34]:
# Dados categóricos
from pyspark.ml.feature import StringIndexer, OneHotEncoder

sex_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
sex_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVector')

In [49]:
# Criando o pipeline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[sex_indexer, sex_encoder, vector_assembler, dtc])

In [36]:
# Dividir o dado em treino (70%) e teste do modelo (30%)
train_data,test_data = titanic_df.randomSplit([0.7,0.3])

In [37]:
pipelineModel = pipeline.fit(train_data)

In [38]:
# testando o modelo
dtc_predictions = pipelineModel.transform(test_data)

In [39]:
# Avaliando o modelo
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_evaluator =  MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
                                                   
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
                                                   
dtc_acc            

0.7786259541984732

In [40]:
dtcModel = pipelineModel.stages[3]

dtcModel.depth

5

In [41]:
dtcModel.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6339c68355c7, depth=5, numNodes=35, numClasses=2, numFeatures=3\n  If (feature 2 in {1.0})\n   If (feature 0 <= 8.5)\n    If (feature 1 <= 21.0375)\n     Predict: 1.0\n    Else (feature 1 > 21.0375)\n     If (feature 1 <= 54.270849999999996)\n      If (feature 1 <= 26.125)\n       Predict: 1.0\n      Else (feature 1 > 26.125)\n       Predict: 0.0\n     Else (feature 1 > 54.270849999999996)\n      Predict: 1.0\n   Else (feature 0 > 8.5)\n    If (feature 1 <= 26.125)\n     If (feature 0 <= 14.75)\n      If (feature 1 <= 12.4125)\n       Predict: 1.0\n      Else (feature 1 > 12.4125)\n       Predict: 0.0\n     Else (feature 0 > 14.75)\n      Predict: 0.0\n    Else (feature 1 > 26.125)\n     If (feature 1 <= 30.5979)\n      If (feature 0 <= 53.0)\n       Predict: 1.0\n      Else (feature 0 > 53.0)\n       Predict: 0.0\n     Else (feature 1 > 30.5979)\n      Predict: 0.0\n  Else (feature 2 not in {1.0})\n   If (feature 1 <= 44.65)

In [42]:
dtcModel.featureImportances

SparseVector(3, {0: 0.106, 1: 0.2605, 2: 0.6335})

In [50]:
vector_assembler.getInputCols()

['Age', 'Fare', 'SexVector']

In [51]:
# A importância de cada atributo no modelo
list(zip(vector_assembler.getInputCols(), dtcModel.featureImportances))

[('Age', 0.10599036747724096),
 ('Fare', 0.2605250560765023),
 ('SexVector', 0.6334845764462568)]