In [80]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Aula Interativa 2 - ML") \
        .getOrCreate()

spark.version

'3.3.0'

In [81]:
titanic_df = spark.read.csv('/home/pcalais/XPE/cientista-dados/aula2/titanic.csv', header='True', inferSchema='True')

titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [97]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

sex_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndexer')

sex_encoder = OneHotEncoder(inputCol='SexIndexer', outputCol='SexVector')

In [98]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['Age', 'Pclass', 'Fare', 'SexVector'], outputCol='features')


In [99]:
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(labelCol='Survived', featuresCol='features')

In [100]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler, classifier])

In [101]:
train_data, test_data = titanic_df.randomSplit([0.7, 0.3])

In [102]:
predictSurvivedModel = pipeline.fit(train_data)

In [87]:
mean_age = titanic_df.agg({'Age': 'mean'}).collect()[0][0]
mean_age

29.69911764705882

In [88]:
titanic_df = titanic_df.fillna(mean_age, subset=['Age'])

In [89]:
train_data, test_data = titanic_df.randomSplit([0.7, 0.3])
predictSurvivedModel = pipeline.fit(train_data)

In [109]:
predictions = predictSurvivedModel.transform(test_data)
predictions.select('passengerId', 'fare', 'pclass', 'age', 'sex', 'rawPrediction', 'prediction').show(50)



+-----------+--------+------+-----------------+------+-------------+----------+
|passengerId|    fare|pclass|              age|   sex|rawPrediction|prediction|
+-----------+--------+------+-----------------+------+-------------+----------+
|          2| 71.2833|     1|             38.0|female|  [7.0,111.0]|       1.0|
|         11|    16.7|     3|              4.0|female|  [16.0,21.0]|       1.0|
|         17|  29.125|     3|              2.0|  male|    [1.0,4.0]|       1.0|
|         21|    26.0|     2|             35.0|  male| [268.0,35.0]|       0.0|
|         22|    13.0|     2|             34.0|  male| [268.0,35.0]|       0.0|
|         23|  8.0292|     3|             15.0|female|    [9.0,3.0]|       0.0|
|         27|   7.225|     3|29.69911764705882|  male| [268.0,35.0]|       0.0|
|         28|   263.0|     1|             19.0|  male|  [65.0,30.0]|       0.0|
|         32|146.5208|     1|29.69911764705882|female|  [7.0,111.0]|       1.0|
|         34|    10.5|     2|           

In [110]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Survived', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)

accuracy



0.8333333333333334

In [111]:
decisionTreeModel = predictSurvivedModel.stages[3]

In [112]:
decisionTreeModel.depth

5

In [113]:
decisionTreeModel.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ed941394ccf0, depth=5, numNodes=29, numClasses=2, numFeatures=4\n  If (feature 3 in {1.0})\n   If (feature 0 <= 3.5)\n    If (feature 1 <= 2.5)\n     Predict: 1.0\n    Else (feature 1 > 2.5)\n     If (feature 2 <= 36.8771)\n      Predict: 1.0\n     Else (feature 2 > 36.8771)\n      Predict: 0.0\n   Else (feature 0 > 3.5)\n    If (feature 1 <= 1.5)\n     If (feature 0 <= 17.5)\n      Predict: 1.0\n     Else (feature 0 > 17.5)\n      Predict: 0.0\n    Else (feature 1 > 1.5)\n     If (feature 2 <= 49.7521)\n      Predict: 0.0\n     Else (feature 2 > 49.7521)\n      If (feature 2 <= 69.425)\n       Predict: 1.0\n      Else (feature 2 > 69.425)\n       Predict: 0.0\n  Else (feature 3 not in {1.0})\n   If (feature 1 <= 2.5)\n    If (feature 0 <= 3.5)\n     Predict: 0.0\n    Else (feature 0 > 3.5)\n     Predict: 1.0\n   Else (feature 1 > 2.5)\n    If (feature 2 <= 24.075)\n     If (feature 2 <= 7.987500000000001)\n      If (feature 

In [114]:
list(zip(assembler.getInputCols(), decisionTreeModel.featureImportances))

[('Age', 0.1218858507486862),
 ('Pclass', 0.20540803023136753),
 ('Fare', 0.12335088444459665),
 ('SexVector', 0.5493552345753496)]

In [115]:
assembler.getInputCols()

['Age', 'Pclass', 'Fare', 'SexVector']