<a href="https://colab.research.google.com/github/diogocristovao/SPBD_tp1/blob/main/spbd_tp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#@title Install PySpark
!pip install pyspark findspark --quiet

In [3]:
#@title Download the dataset

!wget -q -O energy-readings.csv https://raw.githubusercontent.com/smduarte/spbd-2425/refs/heads/main/docs/labs/projs/energy-readings.csv
!head -10 energy-readings.csv

date;sensor;energy
2024-02-01 00:00:00;D;2615.0
2024-02-01 00:00:18;C;1098.8
2024-02-01 00:00:25;A;650.5
2024-02-01 00:00:33;J;966.7
2024-02-01 00:00:42;H;2145.4
2024-02-01 00:00:54;E;1874.0
2024-02-01 00:01:52;K;841.2
2024-02-01 00:02:00;E;1874.1
2024-02-01 00:02:20;I;927.2


In [14]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]') \
						.appName('energy').getOrCreate()

sc = spark.sparkContext
try :
    readings = spark.read.csv('energy-readings.csv',
                             sep =';', header=True, inferSchema=True)

    readings.printSchema()


    readings.show(11)
except Exception as err:
    print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+-------------------+------+------+
|               date|sensor|energy|
+-------------------+------+------+
|2024-02-01 00:00:00|     D|2615.0|
|2024-02-01 00:00:18|     C|1098.8|
|2024-02-01 00:00:25|     A| 650.5|
|2024-02-01 00:00:33|     J| 966.7|
|2024-02-01 00:00:42|     H|2145.4|
|2024-02-01 00:00:54|     E|1874.0|
|2024-02-01 00:01:52|     K| 841.2|
|2024-02-01 00:02:00|     E|1874.1|
|2024-02-01 00:02:20|     I| 927.2|
|2024-02-01 00:02:36|     K| 841.3|
|2024-02-01 00:03:24|     G| 833.7|
+-------------------+------+------+
only showing top 11 rows



In [16]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings_february = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Calcula o valor máximo e mínimo de energia para cada sensor no mês de fevereiro
    energy_per_sensor = readings_february.groupBy("sensor").agg(
        max("energy").alias("max_energy"),
        min("energy").alias("min_energy")
    )

    # Exibe o valor máximo e mínimo de energia de cada sensor no mês de fevereiro
    energy_per_sensor.show()

except Exception as err:
   print(err)


root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+------+----------+----------+
|sensor|max_energy|min_energy|
+------+----------+----------+
|     K|    1067.7|     841.2|
|     F|    908.41|     748.0|
|     E|   2322.76|    1874.0|
|     B|    757.31|     627.5|
|     D|    3102.4|    2615.0|
|     C|   1356.02|    1098.8|
|     J|   1197.55|     966.7|
|     A|    816.88|     650.5|
|     G|   1002.17|     833.7|
|     I|   1278.61|     927.2|
|     H|    2625.0|    2145.4|
+------+----------+----------+



In [8]:
#@title Alinea a)
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Para cada sensor, queremos encontrar o valor de energia inicial e final em fevereiro
    # Usa o primeiro e o último valor de energia de cada sensor para calcular o consumo total
    total_energy = readings.groupBy("sensor").agg(
        (max("energy") - min("energy")).alias("total_energy_consumed")
    )

     # Calcula a soma da energia total gasta por todos os sensores
    total_energy_sum = total_energy.agg(sum("total_energy_consumed").alias("total_energy_all_sensors")).collect()[0]["total_energy_all_sensors"]


    # Exibe o consumo total de energia de cada sensor
    total_energy.show()

     # Exibe a soma da energia total gasta pelos 11 sensores
    print("Total energy consumed by all sensors:", total_energy_sum)


except Exception as err:
   print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+------+---------------------+
|sensor|total_energy_consumed|
+------+---------------------+
|     K|                226.5|
|     F|   160.40999999999997|
|     E|    448.7600000000002|
|     B|   129.80999999999995|
|     D|    487.4000000000001|
|     C|               257.22|
|     J|    230.8499999999999|
|     A|               166.38|
|     G|    168.4699999999999|
|     I|   351.40999999999985|
|     H|    479.5999999999999|
+------+---------------------+

Total energy consumed by all sensors: 3106.8099999999995


In [7]:
#@title Alinea b)

from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Extrai a data (sem o tempo) para agrupar por dia
    readings = readings.withColumn("date", to_date("date"))

    # Usa Window Functions para pegar a última leitura de cada sensor por dia
    window_spec = Window.partitionBy("sensor", "date").orderBy(desc("date"))
    daily_last_reading = readings.withColumn("row_number", row_number().over(window_spec)) \
                                 .filter(col("row_number") == 1) \
                                 .drop("row_number")

    # Calcula o total acumulado diário de energia consumida por sensor
    daily_running_total = daily_last_reading.groupBy("date").agg(
        sum("energy").alias("running_total_energy")
    ).orderBy("date")

    # Exibe o total acumulado de energia consumida por dia
    daily_running_total.show(truncate=False)

except Exception as err:
   print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+----------+--------------------+
|date      |running_total_energy|
+----------+--------------------+
|2024-02-01|13328.000000000002  |
|2024-02-02|13448.300000000001  |
|2024-02-09|14377.2             |
|2024-02-10|14433.5             |
|2024-02-11|14547.599999999997  |
|2024-02-12|14665.599999999999  |
|2024-02-13|14776.300000000001  |
|2024-02-14|14889.299999999997  |
|2024-02-15|14982.4             |
|2024-02-16|15063.799999999997  |
|2024-02-18|15293.6             |
|2024-02-19|15351.599999999999  |
|2024-02-20|15431.400000000001  |
|2024-02-21|15515.4             |
|2024-02-22|15598.5             |
|2024-02-23|15675.400000000001  |
|2024-02-24|15839.800000000001  |
|2024-02-25|15903.369999999997  |
|2024-02-26|16003.189999999997  |
|2024-02-27|16095.89            |
+----------+--------------------+
only showing top 20 rows





For each sensor, separately:
Compute the total energy consumed and the average energy consumption per day.


In [9]:
#@title Alinea c)

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Calcula o consumo total de energia para cada sensor
    total_energy = readings.groupBy("sensor").agg(
        (max("energy") - min("energy")).alias("total_energy_consumed")
    )

    # Calcula o número de dias de leitura em fevereiro para cada sensor
    days_count = readings.groupBy("sensor").agg(countDistinct("date").alias("days_count"))

    # Junta os dados de total de energia e de contagem de dias
    total_energy = total_energy.join(days_count, on="sensor")

    # Calcula o consumo médio de energia por dia para cada sensor
    total_energy = total_energy.withColumn(
        "average_daily_energy_consumed", col("total_energy_consumed") / col("days_count")
    )

    # Exibe o consumo total de energia e o consumo médio diário de cada sensor
    total_energy.select("sensor", "total_energy_consumed", "average_daily_energy_consumed").show()

except Exception as err:
   print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+------+---------------------+-----------------------------+
|sensor|total_energy_consumed|average_daily_energy_consumed|
+------+---------------------+-----------------------------+
|     K|                226.5|          0.01350143061516452|
|     F|   160.40999999999997|         0.009727713765918737|
|     E|    448.7600000000002|          0.02545433919455475|
|     B|   129.80999999999995|         0.007758651604805448|
|     D|    487.4000000000001|          0.02851292851292852|
|     C|               257.22|         0.014900938477580814|
|     J|    230.8499999999999|         0.013889891696750896|
|     A|               166.38|          0.00989709119029207|
|     G|    168.4699999999999|         0.010093463543226884|
|     I|   351.40999999999985|          0.02112854737854737|
|     H|    479.5999999999999|          0.02783678681293168|
+------+-----------------

For each sensor, separately:

Compute the day of the month with minimum and maximum energy consumption.

In [19]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings_february = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Calcula o consumo diário de energia para cada sensor
    daily_energy = readings_february.groupBy("sensor", "date").agg(
        (max("energy") - min("energy")).alias("daily_energy_consumed")
    )

    # Encontra o valor mínimo e máximo de consumo diário para cada sensor
    min_energy_per_sensor = daily_energy.groupBy("sensor").agg(
        min("daily_energy_consumed").alias("min_daily_energy")
    )

    max_energy_per_sensor = daily_energy.groupBy("sensor").agg(
        max("daily_energy_consumed").alias("max_daily_energy")
    )

    # Faz o join para encontrar o dia correspondente ao consumo mínimo de energia para cada sensor
    min_energy_day = min_energy_per_sensor.join(
        daily_energy,
        (daily_energy["sensor"] == min_energy_per_sensor["sensor"]) &
        (daily_energy["daily_energy_consumed"] == min_energy_per_sensor["min_daily_energy"])
    ).select(daily_energy["sensor"], daily_energy["date"].alias("min_energy_date"), "min_daily_energy")

    # Faz o join para encontrar o dia correspondente ao consumo máximo de energia para cada sensor
    max_energy_day = max_energy_per_sensor.join(
        daily_energy,
        (daily_energy["sensor"] == max_energy_per_sensor["sensor"]) &
        (daily_energy["daily_energy_consumed"] == max_energy_per_sensor["max_daily_energy"])
    ).select(daily_energy["sensor"], daily_energy["date"].alias("max_energy_date"), "max_daily_energy")

    # Exibe o dia de consumo mínimo de energia para cada sensor
    print("Dia com consumo mínimo de energia para cada sensor:")
    min_energy_day.show()

    # Exibe o dia de consumo máximo de energia para cada sensor
    print("Dia com consumo máximo de energia para cada sensor:")
    max_energy_day.show()

except Exception as err:
    print(err)


root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

Column date#643, sensor#644 are ambiguous. It's probably because you joined several Datasets together, and some of these Datasets are the same. This column points to one of the Datasets but Spark is unable to figure out which one. Please alias the Datasets with different names via `Dataset.as` before joining them, and specify the column using qualified name, e.g. `df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set spark.sql.analyzer.failAmbiguousSelfJoin to false to disable this check.
