In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler


In [14]:

# Iniciar uma sessão Spark
spark = SparkSession.builder \
    .appName("Titanic Survival Prediction") \
    .getOrCreate()

# Importando os dados
titanic = spark.read.csv("/user/titanic.csv", inferSchema=True, header=True, sep=";")

In [15]:
titanic.show()

+--------+-----+--------------------+------+----+-----+-----+--------+----+-------+--------+--------------------+
|Survived|Class|                Name|   Sex| Age|SibSp|ParCh|  Ticket|Fare|  Cabin|Embarked|            HomeDest|
+--------+-----+--------------------+------+----+-----+-----+--------+----+-------+--------+--------------------+
|       1|    1|Allen, Miss. Elis...|female|  29|    0|    0|   24160| 211|     B5|       S|        St Louis, MO|
|       1|    1|Allison, Master. ...|  male|   1|    1|    2|  113781| 152|C22 C26|       S|Montreal, PQ / Ch...|
|       0|    1|Allison, Miss. He...|female|   2|    1|    2|  113781| 152|C22 C26|       S|Montreal, PQ / Ch...|
|       0|    1|Allison, Mr. Huds...|  male|  30|    1|    2|  113781| 152|C22 C26|       S|Montreal, PQ / Ch...|
|       0|    1|Allison, Mrs. Hud...|female|  25|    1|    2|  113781| 152|C22 C26|       S|Montreal, PQ / Ch...|
|       1|    1| Anderson, Mr. Harry|  male|  48|    0|    0|   19952|  27|    E12|     

In [16]:
titanic = titanic.withColumn("HomeDest", when(col("HomeDest").isNull(), "Nao Informado").otherwise(col("HomeDest")))
titanic = titanic.withColumn("Cabin", when(col("Cabin").isNull(), "Nao Informado").otherwise(col("Cabin")))
titanic = titanic.withColumn("Ticket", when(col("Ticket").isNull(), "0").otherwise(col("Ticket")))
titanic = titanic.withColumn("Fare", when(col("Fare").isNull(), 0).otherwise(col("Fare")))
titanic = titanic.withColumn("Embarked", when(col("Embarked").isNull(), "Z").otherwise(col("Embarked")))
titanic = titanic.withColumn("SibSp", col("SibSp").cast("string"))
titanic = titanic.withColumn("ParCh", col("ParCh").cast("string"))

In [20]:
titanic.describe().show()

+-------+-------------------+------------------+--------------------+------+-----------------+------------------+------------------+------------------+-----------------+-----+--------+-------------------+
|summary|           Survived|             Class|                Name|   Sex|              Age|             SibSp|             ParCh|            Ticket|             Fare|Cabin|Embarked|           HomeDest|
+-------+-------------------+------------------+--------------------+------+-----------------+------------------+------------------+------------------+-----------------+-----+--------+-------------------+
|  count|               1309|              1309|                1309|  1309|             1046|              1309|              1309|              1309|             1309| 1309|    1309|               1309|
|   mean| 0.3819709702062643| 2.294881588999236|                NULL|  NULL|29.89770554493308|0.4988540870893812|0.3850267379679144| 249039.1368861024|33.33766233766234| NULL|    N

In [23]:
null_counts = [(col_name, titanic.where(col(col_name).isNull()).count()) for col_name in titanic.columns]
for col_name, count in null_counts:
    print(f"{col_name}: {count}")

Survived: 0
Class: 0
Name: 0
Sex: 0
Age: 263
SibSp: 0
ParCh: 0
Ticket: 0
Fare: 0
Cabin: 0
Embarked: 0
HomeDest: 0


In [24]:
titanic = titanic.drop("Name")

In [26]:
num_rows = titanic.count()

# Calculando a proporção para amostragem
sampling_ratio = 425 / num_rows

# Amostrando os sobreviventes e não sobreviventes
sobreviventes = titanic.filter(col("Survived") == 1).sample(False, sampling_ratio, seed=1234)
nao_sobreviventes = titanic.filter(col("Survived") == 0).sample(False, sampling_ratio, seed=1234)

# Unindo os DataFrames amostrados
balanced_df = sobreviventes.union(nao_sobreviventes)

In [13]:


# Preparando o dataset para modelagem
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(balanced_df) for column in ["Sex", "Embarked", "AgeGroup"]]
pipeline = Pipeline(stages=indexers)
transformed_df = pipeline.fit(balanced_df).transform(balanced_df)

assembler = VectorAssembler(inputCols=["Sex_index", "Fare", "Relatives", "Class_index", "AgeGroup_index", "Embarked_index"], outputCol="features")
final_df = assembler.transform(transformed_df)

# Dividindo o dataset em conjuntos de treinamento e validação
(training_data, validation_data) = final_df.randomSplit([0.7, 0.3], seed=1234)

# Criando o modelo de árvore de decisão
dt = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")

# Treinando o modelo
model = dt.fit(training_data)

# Avaliando o modelo no conjunto de validação
predictions = model.transform(validation_data)

# Calculando a acurácia
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# Calculando a precisão para sobreviventes e não sobreviventes
predictions.groupBy("Survived", "prediction").count().show()

# Calculando a área sob a curva ROC
evaluator_roc = BinaryClassificationEvaluator(labelCol="Survived")
roc_auc = evaluator_roc.evaluate(predictions)
print("ROC AUC:", roc_auc)

# Encerrando a sessão Spark
spark.stop()


TypeError: object of type 'DataFrame' has no len()