In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when, mean


spark = SparkSession.builder \
    .appName("Tarea3_16") \
    .getOrCreate()


In [2]:
# 2. Cargar datos desde el CSV
df = spark.read.csv("student_depression_dataset.csv", header=True, inferSchema=True)

In [3]:
# 3. Mostrar primeras filas y esquema
df.show(5)
df.printSchema()


+---+------+----+-------------+----------+-----------------+-------------+----+------------------+----------------+-------------------+--------------+-------+-------------------------------------+----------------+----------------+--------------------------------+----------+
| id|Gender| Age|         City|Profession|Academic Pressure|Work Pressure|CGPA|Study Satisfaction|Job Satisfaction|     Sleep Duration|Dietary Habits| Degree|Have you ever had suicidal thoughts ?|Work/Study Hours|Financial Stress|Family History of Mental Illness|Depression|
+---+------+----+-------------+----------+-----------------+-------------+----+------------------+----------------+-------------------+--------------+-------+-------------------------------------+----------------+----------------+--------------------------------+----------+
|  2|  Male|33.0|Visakhapatnam|   Student|              5.0|          0.0|8.97|               2.0|             0.0|        '5-6 hours'|       Healthy|B.Pharm|                 

In [4]:
# 4. Limpieza de datos
# Quitar duplicados
df = df.dropDuplicates()

In [5]:
# Quitar filas con valores nulos o vacíos
df = df.dropna(how="any")

In [8]:
# Contar valores nulos por columna (análisis previo)
null_counts = df.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in df.columns])
null_counts.show()

+---+------+---+----+----------+-----------------+-------------+----+------------------+----------------+--------------+--------------+------+-------------------------------------+----------------+----------------+--------------------------------+----------+
| id|Gender|Age|City|Profession|Academic Pressure|Work Pressure|CGPA|Study Satisfaction|Job Satisfaction|Sleep Duration|Dietary Habits|Degree|Have you ever had suicidal thoughts ?|Work/Study Hours|Financial Stress|Family History of Mental Illness|Depression|
+---+------+---+----+----------+-----------------+-------------+----+------------------+----------------+--------------+--------------+------+-------------------------------------+----------------+----------------+--------------------------------+----------+
|  0|     0|  0|   0|         0|                0|            0|   0|                 0|               0|             0|             0|     0|                                    0|               0|               0|         

In [9]:
# 5. Transformaciones necesarias (renombrar columnas, convertir tipos si aplica)
# Ejemplo: cambiar nombres de columnas si tienen espacios
for c in df.columns:
    df = df.withColumnRenamed(c, c.strip().replace(" ", "_"))

In [10]:
# 6. Análisis exploratorio (EDA)
print("=== Descripción estadística ===")
df.describe().show()

=== Descripción estadística ===
+-------+------------------+------+-----------------+-------------+----------------+------------------+--------------------+------------------+------------------+--------------------+--------------+--------------+----------+-------------------------------------+-----------------+------------------+--------------------------------+-------------------+
|summary|                id|Gender|              Age|         City|      Profession| Academic_Pressure|       Work_Pressure|              CGPA|Study_Satisfaction|    Job_Satisfaction|Sleep_Duration|Dietary_Habits|    Degree|Have_you_ever_had_suicidal_thoughts_?| Work/Study_Hours|  Financial_Stress|Family_History_of_Mental_Illness|         Depression|
+-------+------------------+------+-----------------+-------------+----------------+------------------+--------------------+------------------+------------------+--------------------+--------------+--------------+----------+-------------------------------------+

In [11]:
# Distribución por columna clave
if 'Gender' in df.columns or 'gender' in df.columns:
    df.groupBy("Gender").count().show()

+------+-----+
|Gender|count|
+------+-----+
|Female|12354|
|  Male|15547|
+------+-----+



In [12]:
# Ejemplo: promedio de edad o síntomas
numeric_cols = [c for (c, dtype) in df.dtypes if dtype in ('int', 'double')]
for colname in numeric_cols:
    df.select(mean(col(colname)).alias(f"Average_{colname}")).show()

+----------------+
|      Average_id|
+----------------+
|70442.1494211677|
+----------------+

+-----------------+
|      Average_Age|
+-----------------+
|25.82230027597577|
+-----------------+

+-------------------------+
|Average_Academic_Pressure|
+-------------------------+
|       3.1412135765743163|
+-------------------------+

+---------------------+
|Average_Work_Pressure|
+---------------------+
| 4.300921113938568...|
+---------------------+

+-----------------+
|     Average_CGPA|
+-----------------+
|7.656104171893657|
+-----------------+

+--------------------------+
|Average_Study_Satisfaction|
+--------------------------+
|         2.943837138453819|
+--------------------------+

+------------------------+
|Average_Job_Satisfaction|
+------------------------+
|    6.809791763736067E-4|
+------------------------+

+------------------------+
|Average_Work/Study_Hours|
+------------------------+
|       7.156983620658758|
+------------------------+

+------------------+
|

In [13]:
# 7. Guardar resultados procesados
df.write.mode("overwrite").csv("processed_student_depression_data", header=True)

In [14]:
# Finalizar sesión Spark
spark.stop()