In [1]:
# Python 3.10.11
import os 
os.environ['SPARK_HOME']=r'C:/spark/'
os.environ['HADOOP_HOME'] = r'C:/hadoop/'
os.environ['PYSPARK_DRIVER_PYTHON']='jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS']='lab'
os.environ['PYSPARK_PYTHON']='python'

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# Initialize a Spark session
spark = SparkSession.builder.appName("ETL_Desastres").getOrCreate()
spark

In [10]:
# Datos crudos y dataframes de spark
clima_data = [(2023, 22.5, 230.1), (2024, 22.7, 228.6), (2025, 22.9, 227.5), (2026, 23.1, 226.7),
             (2027, 23.2, 226.4), (2028, 23.4, 226.2), (2029, 23.6, 226.1), (2030, 23.8, 225.1)]
desastres_data = [(2023, 2, 15, 6, 7, 50), (2024, 1, 12, 8, 9, 46), (2025, 3, 16, 5, 6, 47),
                  (2026, 4, 12, 10, 13, 52), (2027, 5, 12, 6, 5, 41), (2028, 4, 18, 3, 2, 39),
                  (2029, 2, 19, 5, 6, 49), (2030, 4, 20, 6, 7, 50)]
muertes_data = [(2023, 1000, 1300, 1200, 1150, 1500), (2024, 1200, 1250, 1260, 1678, 1940),
                (2025, 987, 1130, 1160, 1245, 1200), (2026, 1560, 1578, 1856, 1988, 1245),
                (2027, 1002, 943, 1345, 1232, 986), (2028, 957, 987, 1856, 1567, 1756),
                (2029, 1285, 1376, 1465, 1432, 1236), (2030, 1145, 1456, 1345, 1654, 1877)]

In [11]:
clima_schema = StructType([
    StructField("año", IntegerType(), True),
    StructField("Temperatura", FloatType(), True),
    StructField("Oxigeno", FloatType(), True)
])

desastres_schema = StructType([
    StructField("año", IntegerType(), True),
    StructField("Tsunamis", IntegerType(), True),
    StructField("Olas_Calor", IntegerType(), True),
    StructField("Terremotos", IntegerType(), True),
    StructField("Erupciones", IntegerType(), True),
    StructField("Incendios", IntegerType(), True)
])

muertes_schema = StructType([
    StructField("año", IntegerType(), True),
    StructField("R_Menor15", IntegerType(), True),
    StructField("R_15_a_30", IntegerType(), True),
    StructField("R_30_a_45", IntegerType(), True),
    StructField("R_45_a_60", IntegerType(), True),
    StructField("R_M_a_60", IntegerType(), True)
])

clima_df = spark.createDataFrame(clima_data, clima_schema)
desastres_df = spark.createDataFrame(desastres_data, desastres_schema)
muertes_df = spark.createDataFrame(muertes_data, muertes_schema)

In [12]:
from pyspark.sql.functions import when

clima_df = clima_df.withColumn("Cuatrenio", when(clima_df["año"] < 2026, "2023-2026").otherwise("2027-2030"))
clima_df.show()

+----+-----------+-------+---------+
| año|Temperatura|Oxigeno|Cuatrenio|
+----+-----------+-------+---------+
|2023|       22.5|  230.1|2023-2026|
|2024|       22.7|  228.6|2023-2026|
|2025|       22.9|  227.5|2023-2026|
|2026|       23.1|  226.7|2027-2030|
|2027|       23.2|  226.4|2027-2030|
|2028|       23.4|  226.2|2027-2030|
|2029|       23.6|  226.1|2027-2030|
|2030|       23.8|  225.1|2027-2030|
+----+-----------+-------+---------+



In [14]:
# Agregacion final
final_result = clima_df.join(desastres_df, "año").join(muertes_df, "año") \
    .groupBy("Cuatrenio") \
    .agg(avg("Temperatura").alias("Temp_AVG"),
         avg("Oxigeno").alias("Oxi_AVG"),
         sum("Tsunamis").alias("T_Tsunamis"),
         sum("Olas_Calor").alias("T_OlasCalor"),
         sum("Terremotos").alias("T_Terremotos"),
         sum("Erupciones").alias("T_Erupciones"),
         sum("Incendios").alias("T_Incendios"),
         avg((muertes_df["R_Menor15"] + muertes_df["R_15_a_30"])).alias("M_Jovenes_AVG"),
         avg((muertes_df["R_30_a_45"] + muertes_df["R_45_a_60"])).alias("M_Adutos_AVG"),
         avg(muertes_df["R_M_a_60"]).alias("M_Ancianos_AVG"))

# Mostrar resultados
final_result.show()

+---------+------------------+------------------+----------+-----------+------------+------------+-----------+-------------+------------------+------------------+
|Cuatrenio|          Temp_AVG|           Oxi_AVG|T_Tsunamis|T_OlasCalor|T_Terremotos|T_Erupciones|T_Incendios|M_Jovenes_AVG|      M_Adutos_AVG|    M_Ancianos_AVG|
+---------+------------------+------------------+----------+-----------+------------+------------+-----------+-------------+------------------+------------------+
|2027-2030|23.420000076293945|             226.1|        19|         81|          30|          33|        231|       2457.8|            3148.0|            1420.0|
|2023-2026|22.700000127156574|228.73333740234375|         6|         43|          19|          22|        143|       2289.0|2564.3333333333335|1546.6666666666667|
+---------+------------------+------------------+----------+-----------+------------+------------+-----------+-------------+------------------+------------------+



In [15]:
spark.stop()