In [None]:
import pandas as pd
import os

In [None]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("aljarah/xAPI-Edu-Data")

# print("Path to dataset files:", path)

In [None]:
# Location of the xAPI-Edu-Data CSV file.
# The XAPI_EDU_DATA_CSV environment variable overrides the location so the
# notebook can run on any machine; when it is unset, the original local
# kagglehub cache path is used as the fallback.
df_path = os.environ.get(
    "XAPI_EDU_DATA_CSV",
    r"C:\Users\Fhuan\.cache\kagglehub\datasets\aljarah\xAPI-Edu-Data\versions\6\xAPI-Edu-Data.csv",
)
df = pd.read_csv(df_path)

# Show a preview rather than dumping the whole frame into the cell output.
display(df.head())

In [None]:
df.columns

In [None]:
# Replace placeholder strings for missing data with a real missing-value marker.
# The result is reassigned instead of using inplace=True.
df = df.replace(["?", "NA", "None", ""], pd.NA)

# Impute numeric columns with the column mean.
num_cols = ['raisedhands','VisITedResources','AnnouncementsView','Discussion']
# Coerce to a numeric dtype first: a column that contained a placeholder such
# as "?" is loaded as object dtype, and the mean could not be computed on it.
# Values that still cannot be parsed become NaN and are imputed below.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Impute categorical columns with the mode (most frequent value).
# NOTE(review): the means and modes are computed on the full dataset, before
# the train/test split performed later in the notebook.
cat_cols = [
    'gender','NationalITy','PlaceofBirth','StageID','GradeID','SectionID',
    'Topic','Semester','Relation','ParentAnsweringSurvey',
    'ParentschoolSatisfaction','StudentAbsenceDays','Class'
]
# mode() returns a frame (there may be ties); row 0 holds the first mode of each column.
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# pandas result
print("DataFrame procesado con Pandas:")
print(df.head())



# ================================
#     PREPROCESSING IN SPARK
# ================================
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
)

spark = SparkSession.builder.appName("Preprocessing").getOrCreate()

# Convert pandas -> Spark DataFrame
spark_df = spark.createDataFrame(df)

# --- 1. Encoding of categorical feature columns ---
# 'Class' is deliberately NOT listed here: it is the prediction target (the
# notebook indexes it into "label" for the classifier). Encoding it into the
# feature vector would leak the answer into the model's inputs and make the
# evaluation metrics meaningless.
categorical_cols = [
    'gender','NationalITy','PlaceofBirth','StageID','GradeID','SectionID',
    'Topic','Semester','Relation','ParentAnsweringSurvey',
    'ParentschoolSatisfaction','StudentAbsenceDays'
]

# One StringIndexer per categorical column: string value -> numeric index.
# handleInvalid="keep" assigns unseen/invalid values to an extra index instead
# of raising. The loop variable is `c` so it does not reuse the name of the
# imported `col` function.
indexers = [
    StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep")
    for c in categorical_cols
]

# One-hot encode each indexed column.
encoders = [
    OneHotEncoder(inputCol=f"{c}_idx", outputCol=f"{c}_oh")
    for c in categorical_cols
]

# --- 2. Numeric columns ---
numeric_cols = ['raisedhands','VisITedResources','AnnouncementsView','Discussion']

# Assemble the one-hot vectors and the numeric columns into one feature vector.
assembler = VectorAssembler(
    inputCols=[f"{c}_oh" for c in categorical_cols] + numeric_cols,
    outputCol="vector_features"
)

# --- 3. Normalization ---
# Rescale the assembled feature vector with MinMaxScaler.
scaler = MinMaxScaler(inputCol="vector_features", outputCol="scaled_features")



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Index the target column "Class" into the numeric "label" column.
label_indexer = StringIndexer(
    handleInvalid="keep", inputCol="Class", outputCol="label"
)

# Predictive model: a random forest trained on the scaled feature vector.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="scaled_features",
    maxDepth=6,
    numTrees=100,
)

# Full pipeline, in execution order: index -> one-hot encode -> assemble ->
# scale -> index the label -> fit the classifier.
pipeline_stages = [*indexers, *encoders, assembler, scaler, label_indexer, rf]
pipeline = Pipeline(stages=pipeline_stages)


In [None]:
# Split the Spark DataFrame 80/20 into train and test sets with a fixed seed.
split_frames = spark_df.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = split_frames


In [None]:
# Fit every pipeline stage on the training split only.
model = pipeline.fit(dataset=train_data)


In [None]:
# Run the fitted pipeline on the held-out test split.
predictions = model.transform(dataset=test_data)

# Preview the first 10 rows: label, predicted class and class probabilities.
preview_cols = ["label", "prediction", "probability"]
predictions.select(*preview_cols).show(n=10)


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction"
)
accuracy = evaluator.evaluate(predictions)

print(f"Precisión del modelo: {accuracy:.4f}")


In [None]:
# F1 score of the predictions on the test split.
f1_evaluator = MulticlassClassificationEvaluator(
    metricName="f1", labelCol="label", predictionCol="prediction"
)
f1 = f1_evaluator.evaluate(predictions)

print(f"F1-Score: {f1:.4f}")


In [None]:
# Bring only the true and predicted labels to pandas for plotting.
label_vs_prediction = predictions.select("label", "prediction")
pred_pd = label_vs_prediction.toPandas()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=pred_pd["label"], y_pred=pred_pd["prediction"])

# Draw the matrix as an annotated heatmap on an explicit Axes object.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_title("Matriz de Confusión")
ax.set_xlabel("Predicción")
ax.set_ylabel("Real")
plt.show()


In [None]:
# The random forest is the last stage of the pipeline, so the fitted model is
# the last entry of model.stages.
rf_model = model.stages[-1]

# Per-feature importances as a dense array; positions are indices into the
# assembled feature vector.
importances = rf_model.featureImportances.toArray()

# Bar chart of importance by feature index (plotted once; the original cell
# contained this whole block twice and drew the same figure two times).
plt.figure()
plt.bar(range(len(importances)), importances)
plt.title("Importancia de Características")
plt.xlabel("Índice de Variable")
plt.ylabel("Importancia")
plt.show()


In [None]:
# Bar chart of class frequencies, first for the true labels and then for the
# predicted ones. Each chart is drawn and shown on its own figure.
for column, title in (
    ("label", "Distribución de Clases Reales"),
    ("prediction", "Distribución de Clases Predichas"),
):
    ax = pred_pd[column].value_counts().plot.bar()
    ax.set_title(title)
    plt.show()
