In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Aula Interativa 2 - ML") \
        .getOrCreate()

spark.version

23/01/31 02:28:11 WARN Utils: Your hostname, Deboras-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.68.104 instead (on interface en0)
23/01/31 02:28:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/31 02:28:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/31 02:28:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


'3.3.1'

In [2]:
titanic_df = spark.read.csv('/home/pcalais/XPE/cientista-dados/aula2/titanic.csv', header='True', inferSchema='True')

titanic_df.printSchema()

AnalysisException: Path does not exist: file:/home/pcalais/XPE/cientista-dados/aula2/titanic.csv

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

sex_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
sex_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVector')


In [None]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['Age', 'Pclass', 'Fare', 'SexVector'], outputCol='features')


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(labelCol='Survived', featuresCol='features')

classifier

DecisionTreeClassifier_332520258ac0

In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler, classifier])

In [None]:
train_data, test_data = titanic_df.randomSplit([0.7, 0.3])

In [None]:
predictSurvivedModel = pipeline.fit(train_data)

In [None]:
mean_age = titanic_df.agg({'Age': 'mean'}).collect()[0][0]
mean_age

29.699117647058763

In [None]:
titanic_df = titanic_df.fillna(mean_age, subset=['Age'])

In [None]:
train_data, test_data = titanic_df.randomSplit([0.7, 0.3])
predictSurvivedModel = pipeline.fit(train_data)

titanic_df.groupBy('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [None]:
predictions = predictSurvivedModel.transform(test_data)
predictions.select('passengerId', 'sex', 'sexVector', 'rawPrediction', 'prediction').show(5)



+-----------+------+-------------+-------------+----------+
|passengerId|   sex|    sexVector|rawPrediction|prediction|
+-----------+------+-------------+-------------+----------+
|          1|  male|(1,[0],[1.0])| [256.0,35.0]|       0.0|
|          2|female|    (1,[],[])|  [3.0,101.0]|       1.0|
|          3|female|    (1,[],[])|  [26.0,44.0]|       1.0|
|          8|  male|(1,[0],[1.0])|    [0.0,3.0]|       1.0|
|         10|female|    (1,[],[])|  [3.0,101.0]|       1.0|
+-----------+------+-------------+-------------+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Survived', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)

accuracy



0.8303886925795053

In [None]:
decisionTreeModel = predictSurvivedModel.stages[3]

decisionTreeModel.depth

5

In [None]:
decisionTreeModel.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_332520258ac0, depth=5, numNodes=29, numClasses=2, numFeatures=4\n  If (feature 3 in {1.0})\n   If (feature 2 <= 26.125)\n    If (feature 0 <= 4.5)\n     Predict: 1.0\n    Else (feature 0 > 4.5)\n     Predict: 0.0\n   Else (feature 2 > 26.125)\n    If (feature 0 <= 52.5)\n     If (feature 1 <= 1.5)\n      If (feature 2 <= 31.1375)\n       Predict: 1.0\n      Else (feature 2 > 31.1375)\n       Predict: 0.0\n     Else (feature 1 > 1.5)\n      Predict: 0.0\n    Else (feature 0 > 52.5)\n     Predict: 0.0\n  Else (feature 3 not in {1.0})\n   If (feature 1 <= 2.5)\n    If (feature 0 <= 4.5)\n     If (feature 1 <= 1.5)\n      Predict: 0.0\n     Else (feature 1 > 1.5)\n      Predict: 1.0\n    Else (feature 0 > 4.5)\n     Predict: 1.0\n   Else (feature 1 > 2.5)\n    If (feature 2 <= 24.808349999999997)\n     If (feature 0 <= 36.25)\n      Predict: 1.0\n     Else (feature 0 > 36.25)\n      If (feature 0 <= 48.5)\n       Predict: 0.0\n  

In [None]:
decisionTreeModel.featureImportances

SparseVector(4, {0: 0.1024, 1: 0.1788, 2: 0.1476, 3: 0.5712})

In [None]:
list(zip(assembler.getInputCols(), decisionTreeModel.featureImportances))

[('Age', 0.10235979363731586),
 ('Pclass', 0.17880959819711095),
 ('Fare', 0.14762111134855063),
 ('SexVector', 0.5712094968170226)]