In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("HealthcareClassification")
    .getOrCreate()
)

# Load the dataset: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("healthcare_dataset.csv")
)

# Preview the first five rows.
df.show(5)

# Print the inferred schema to check the column types.
df.printSchema()


+-------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|         Name|Age|Gender|Blood Type|Medical Condition|Date of Admission|          Doctor|            Hospital|Insurance Provider|    Billing Amount|Room Number|Admission Type|Discharge Date| Medication|Test Results|
+-------------+---+------+----------+-----------------+-----------------+----------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+
|Bobby JacksOn| 30|  Male|        B-|           Cancer|       2024-01-31|   Matthew Smith|     Sons and Miller|        Blue Cross|18856.281305978155|        328|        Urgent|    2024-02-02|Paracetamol|      Normal|
| LesLie TErRy| 62|  Male|        A+|          Obesity|       2019-08-20| Samantha Davies|             Kim Inc|          Medicare|33

In [2]:
# Drop the columns that do not contribute to the model.
dropped_cols = [
    "Name",
    "Doctor",
    "Hospital",
    "Room Number",
    "Date of Admission",
    "Discharge Date",
]
df = df.drop(*dropped_cols)

In [3]:
from pyspark.sql.functions import col, sum

# Column names of the current DataFrame.
columnas = df.columns

# Null count per column: each isNull() flag is cast to 0/1 and summed.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported above in this cell),
# not Python's built-in sum.
null_counts = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in columnas
]
df.select(null_counts).show()


+---+------+----------+-----------------+------------------+--------------+--------------+----------+------------+
|Age|Gender|Blood Type|Medical Condition|Insurance Provider|Billing Amount|Admission Type|Medication|Test Results|
+---+------+----------+-----------------+------------------+--------------+--------------+----------+------------+
|  0|     0|         0|                0|                 0|             0|             0|         0|           0|
+---+------+----------+-----------------+------------------+--------------+--------------+----------+------------+



In [4]:
# En caso de que existan valores perdidos
# df = df.dropna()

In [5]:
# Indexar variables categóricas

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Categorical columns to encode as numeric indices.
categorical_cols = [
    "Gender",
    "Blood Type",
    "Medical Condition",
    "Insurance Provider",
    "Admission Type",
    "Medication",
    "Test Results",
]

# One StringIndexer per column; each writes its result to a new
# "<column>_index" column.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in categorical_cols
]

# Fit every indexer on df and append the resulting index columns to it.
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)


In [6]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1. Feature columns: use the EXACT names as they appear in the DataFrame.
#    "Age" and "Billing Amount" are used as-is; the "<name>_index" columns are
#    the ones produced by the StringIndexer stage.
feature_cols = [
    "Age", "Billing Amount",
    "Gender_index", "Blood Type_index", "Medical Condition_index",
    "Insurance Provider_index", "Admission Type_index", "Medication_index"
]

# 2. Assemble the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# 3. Keep only the feature vector and the indexed "Test Results" column,
#    renamed to "label".
df = df.select("features", col("Test Results_index").alias("label"))

# 4. Train / test split (80% / 20%) with a fixed seed.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# 5. Train the model and score the test set.
#    The forest is seeded as well: without an explicit seed its row and
#    feature sampling are not pinned, so the metrics could differ between
#    runs even though the split itself is fixed.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50, seed=42)
rf_model = rf.fit(train_data)
rf_pred = rf_model.transform(test_data)

# 6. Funciones para métricas y matriz de confusión
def evaluar_modelo(pred_df, nombre_modelo):
    """Print classification metrics for a predictions DataFrame.

    Computes accuracy, weighted precision, weighted recall and F1 with a
    MulticlassClassificationEvaluator and prints each one with three decimals.

    Args:
        pred_df: DataFrame to evaluate; the evaluator reads its "label" and
            "prediction" columns.
        nombre_modelo: Model name shown in the printed header.
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    def _metric(metric_name):
        # Evaluate pred_df with the evaluator's metricName overridden for this call only.
        return evaluator.evaluate(pred_df, {evaluator.metricName: metric_name})

    # All four metrics are computed, in this order, before anything is printed.
    acc, prec, rec, f1 = (
        _metric(metric_name)
        for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
    )

    print(f"\n📊 Resultados para {nombre_modelo}:")
    print(f"Accuracy  : {acc:.3f}")
    print(f"Precision : {prec:.3f}")
    print(f"Recall    : {rec:.3f}")
    print(f"F1-score  : {f1:.3f}")
