<a href="https://colab.research.google.com/github/diogocristovao/SPBD_tp1/blob/main/spbd_tp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#@title Install PySpark
!pip install pyspark findspark --quiet

In [4]:
#@title Download the dataset

!wget -q -O energy-readings.csv https://raw.githubusercontent.com/smduarte/spbd-2425/refs/heads/main/docs/labs/projs/energy-readings.csv
!head -10 energy-readings.csv

date;sensor;energy
2024-02-01 00:00:00;D;2615.0
2024-02-01 00:00:18;C;1098.8
2024-02-01 00:00:25;A;650.5
2024-02-01 00:00:33;J;966.7
2024-02-01 00:00:42;H;2145.4
2024-02-01 00:00:54;E;1874.0
2024-02-01 00:01:52;K;841.2
2024-02-01 00:02:00;E;1874.1
2024-02-01 00:02:20;I;927.2


In [5]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]') \
						.appName('energy').getOrCreate()

sc = spark.sparkContext
try :
    readings = spark.read.csv('energy-readings.csv',
                             sep =';', header=True, inferSchema=True)

    readings.printSchema()


    readings.show(11)
except Exception as err:
    print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+-------------------+------+------+
|               date|sensor|energy|
+-------------------+------+------+
|2024-02-01 00:00:00|     D|2615.0|
|2024-02-01 00:00:18|     C|1098.8|
|2024-02-01 00:00:25|     A| 650.5|
|2024-02-01 00:00:33|     J| 966.7|
|2024-02-01 00:00:42|     H|2145.4|
|2024-02-01 00:00:54|     E|1874.0|
|2024-02-01 00:01:52|     K| 841.2|
|2024-02-01 00:02:00|     E|1874.1|
|2024-02-01 00:02:20|     I| 927.2|
|2024-02-01 00:02:36|     K| 841.3|
|2024-02-01 00:03:24|     G| 833.7|
+-------------------+------+------+
only showing top 11 rows



In [None]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings_february = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Calcula o valor máximo e mínimo de energia para cada sensor no mês de fevereiro
    energy_per_sensor = readings_february.groupBy("sensor").agg(
        max("energy").alias("max_energy"),
        min("energy").alias("min_energy")
    )

    # Exibe o valor máximo e mínimo de energia de cada sensor no mês de fevereiro
    energy_per_sensor.show()

except Exception as err:
   print(err)


root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+------+----------+----------+
|sensor|max_energy|min_energy|
+------+----------+----------+
|     K|    1067.7|     841.2|
|     F|    908.41|     748.0|
|     E|   2322.76|    1874.0|
|     B|    757.31|     627.5|
|     D|    3102.4|    2615.0|
|     C|   1356.02|    1098.8|
|     J|   1197.55|     966.7|
|     A|    816.88|     650.5|
|     G|   1002.17|     833.7|
|     I|   1278.61|     927.2|
|     H|    2625.0|    2145.4|
+------+----------+----------+



In [16]:
#@title Alinea a)
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Para cada sensor, queremos encontrar o valor de energia inicial e final em fevereiro
    # Usa o primeiro e o último valor de energia de cada sensor para calcular o consumo total
    total_energy = readings.groupBy("sensor").agg(
        round((max("energy") - min("energy")), 2).alias("total_energy_consumed")
    )

    # Calcula a soma da energia total gasta por todos os sensores
    total_energy_sum = total_energy.agg(round(sum("total_energy_consumed"), 2).alias("total_energy_all_sensors")).collect()[0]["total_energy_all_sensors"]



    # Exibe o consumo total de energia de cada sensor
    total_energy.show()

     # Exibe a soma da energia total gasta pelos 11 sensores
    print("Total energy consumed by all sensors:", total_energy_sum)


except Exception as err:
   print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+------+---------------------+
|sensor|total_energy_consumed|
+------+---------------------+
|     K|                226.5|
|     F|               160.41|
|     E|               448.76|
|     B|               129.81|
|     D|                487.4|
|     C|               257.22|
|     J|               230.85|
|     A|               166.38|
|     G|               168.47|
|     I|               351.41|
|     H|                479.6|
+------+---------------------+

Total energy consumed by all sensors: 3106.81


In [17]:
#@title Alinea b)

from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Extrai a data (sem o tempo) para agrupar por dia
    readings = readings.withColumn("date", to_date("date"))

    # Usa Window Functions para pegar a última leitura de cada sensor por dia
    window_spec = Window.partitionBy("sensor", "date").orderBy(desc("date"))
    daily_last_reading = readings.withColumn("row_number", row_number().over(window_spec)) \
                                 .filter(col("row_number") == 1) \
                                 .drop("row_number")

    daily_running_total = daily_last_reading.groupBy("date").agg(
        round(sum("energy"), 2).alias("running_total_energy")
    ).orderBy("date")

    # Exibe o total acumulado de energia consumida por dia
    daily_running_total.show(truncate=False)

except Exception as err:
   print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+----------+--------------------+
|date      |running_total_energy|
+----------+--------------------+
|2024-02-01|13328.0             |
|2024-02-02|13448.3             |
|2024-02-09|14377.2             |
|2024-02-10|14433.5             |
|2024-02-11|14547.6             |
|2024-02-12|14665.6             |
|2024-02-13|14776.3             |
|2024-02-14|14889.3             |
|2024-02-15|14982.4             |
|2024-02-16|15063.8             |
|2024-02-18|15293.6             |
|2024-02-19|15351.6             |
|2024-02-20|15431.4             |
|2024-02-21|15515.4             |
|2024-02-22|15598.5             |
|2024-02-23|15675.4             |
|2024-02-24|15839.8             |
|2024-02-25|15903.37            |
|2024-02-26|16003.19            |
|2024-02-27|16095.89            |
+----------+--------------------+
only showing top 20 rows





For each sensor, separately:
Compute the total energy consumed and the average energy consumption per day.


In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, countDistinct, year, month, to_date, round

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]').appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    readings = spark.read.csv('energy-readings.csv', sep=';', header=True, inferSchema=True)

    # Certifique-se de que a coluna 'date' está no formato de data
    readings = readings.withColumn("date", to_date(col("date"), "yyyy-MM-dd"))

    # Imprime o esquema dos dados
    readings.printSchema()

    # Filtra apenas os dados de fevereiro de 2024
    readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Calcula o consumo total de energia para cada sensor com arredondamento
    total_energy = readings.groupBy("sensor").agg(
        round((max("energy") - min("energy")), 2).alias("total_energy_consumed")
    )

    # Calcula o número de dias de leitura em fevereiro para cada sensor
    days_count = readings.groupBy("sensor").agg(countDistinct("date").alias("days_count"))

    # Exibe a coluna "days_count" para cada sensor
    print("Número de dias de leitura (days_count) para cada sensor:")
    days_count.show()

    # Junta os dados de total de energia e de contagem de dias
    total_energy = total_energy.join(days_count, on="sensor")

    # Calcula o consumo médio de energia por dia para cada sensor com arredondamento
    total_energy = total_energy.withColumn(
        "average_daily_energy_consumed", round(col("total_energy_consumed") / col("days_count"), 2)
    )

    # Exibe o consumo total de energia e o consumo médio diário de cada sensor
    total_energy.select("sensor", "total_energy_consumed", "average_daily_energy_consumed").show()

except Exception as err:
    print("Erro:", err)


root
 |-- date: date (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

Número de dias de leitura (days_count) para cada sensor:
+------+----------+
|sensor|days_count|
+------+----------+
|     K|        22|
|     F|        22|
|     E|        22|
|     B|        22|
|     D|        22|
|     C|        22|
|     J|        22|
|     A|        22|
|     G|        22|
|     I|        22|
|     H|        22|
+------+----------+

+------+---------------------+-----------------------------+
|sensor|total_energy_consumed|average_daily_energy_consumed|
+------+---------------------+-----------------------------+
|     K|                226.5|                         10.3|
|     F|               160.41|                         7.29|
|     E|               448.76|                         20.4|
|     B|               129.81|                          5.9|
|     D|                487.4|                        22.15|
|     C|               257.22|        

For each sensor, separately:

Compute the day of the month with minimum and maximum energy consumption.

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max, year, month, to_date, first, last

# Configura a Spark Session
spark = SparkSession.builder.master('local[*]').appName('energy').getOrCreate()

# Carrega o arquivo CSV
try:
    # Carrega o arquivo CSV e converte a coluna 'date' para o tipo de data
    readings = spark.read.csv('energy-readings.csv', sep=';', header=True, inferSchema=True)
    readings = readings.withColumn("date", to_date(col("date"), "yyyy-MM-dd"))

    # Filtra apenas os dados de fevereiro de 2024
    readings_february = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Para cada sensor e dia, obtemos a primeira e última leitura do dia
    daily_consumption = readings_february.groupBy("sensor", "date").agg(
        first("energy").alias("first_reading"),
        last("energy").alias("last_reading")
    )

    # Calcula o consumo diário de energia
    daily_consumption = daily_consumption.withColumn(
        "daily_energy_consumption", col("last_reading") - col("first_reading")
    )

    # Identifica o dia com o consumo mínimo e máximo de energia para cada sensor
    min_consumption_day = daily_consumption.groupBy("sensor").agg(
        round(min("daily_energy_consumption"), 3).alias("min_daily_energy"),
        first("date").alias("day_min_consumption")  # Ajustar para mostrar o dia correto
    )

    max_consumption_day = daily_consumption.groupBy("sensor").agg(
        round(max("daily_energy_consumption"), 3).alias("max_daily_energy"),
        last("date").alias("day_max_consumption")  # Ajustar para mostrar o dia correto
    )

    # Junte os resultados para ter ambos os dias em uma única tabela
    result = min_consumption_day.join(max_consumption_day, on="sensor")

    # Exibe os dias de consumo mínimo e máximo de energia para cada sensor
    result.select("sensor", "day_min_consumption", "min_daily_energy", "day_max_consumption", "max_daily_energy").show()

except Exception as err:
    print("Erro:", err)


+------+-------------------+----------------+-------------------+----------------+
|sensor|day_min_consumption|min_daily_energy|day_max_consumption|max_daily_energy|
+------+-------------------+----------------+-------------------+----------------+
|     K|         2024-02-14|             1.2|         2024-02-20|            10.2|
|     F|         2024-02-27|             0.8|         2024-02-09|           12.87|
|     E|         2024-02-16|             4.7|         2024-02-13|            20.6|
|     B|         2024-02-26|             0.1|         2024-02-22|             9.9|
|     D|         2024-02-11|             5.7|         2024-02-21|            26.4|
|     C|         2024-02-23|             1.6|         2024-02-12|            14.0|
|     J|         2024-02-09|             1.7|         2024-02-28|            10.0|
|     A|         2024-02-28|            0.77|         2024-02-23|             8.1|
|     G|         2024-02-22|             0.7|         2024-02-18|             9.3|
|   