In [None]:
#@title Install PySpark
!pip install pyspark findspark --quiet

In [None]:
#@title Download the dataset

!wget -q -O energy-readings.csv https://raw.githubusercontent.com/smduarte/spbd-2425/refs/heads/main/docs/labs/projs/energy-readings.csv
!head -10 energy-readings.csv

date;sensor;energy
2024-02-01 00:00:00;D;2615.0
2024-02-01 00:00:18;C;1098.8
2024-02-01 00:00:25;A;650.5
2024-02-01 00:00:33;J;966.7
2024-02-01 00:00:42;H;2145.4
2024-02-01 00:00:54;E;1874.0
2024-02-01 00:01:52;K;841.2
2024-02-01 00:02:00;E;1874.1
2024-02-01 00:02:20;I;927.2


In [None]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('energy')
    .getOrCreate()
)
sc = spark.sparkContext

try:
    # Semicolon-separated file with a header row; column types are inferred.
    csv_reader = spark.read.options(sep=';', header=True, inferSchema=True)
    readings = csv_reader.csv('energy-readings.csv')

    # Show the inferred schema followed by a five-row preview.
    readings.printSchema()
    readings.show(5)
except Exception as err:
    # Only the error message is printed; the cell itself does not fail.
    print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

+-------------------+------+------+
|               date|sensor|energy|
+-------------------+------+------+
|2024-02-01 00:00:00|     D|2615.0|
|2024-02-01 00:00:18|     C|1098.8|
|2024-02-01 00:00:25|     A| 650.5|
|2024-02-01 00:00:33|     J| 966.7|
|2024-02-01 00:00:42|     H|2145.4|
+-------------------+------+------+
only showing top 5 rows



In [None]:
#@title Alinea a)
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse (or start) the local Spark session.
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

try:
    # Load the semicolon-separated readings; column types are inferred.
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # The printed schema lists the available columns: date, sensor, energy.
    readings.printSchema()

    # Keep only February 2024. The timestamp column is named "date" and the
    # sensor column "sensor"; the previous names ("timestamp", "sensor_id")
    # do not exist in the file and made the query fail to resolve.
    # The filtered frame gets its own name so `readings` keeps the raw data.
    feb_readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Per-sensor consumption over the month, computed as max - min of the
    # energy column.
    # NOTE(review): this equals "last reading minus first reading" only if a
    # sensor's readings never decrease (the sample rows suggest a cumulative
    # meter) - confirm against the dataset description.
    total_energy = feb_readings.groupBy("sensor").agg(
        (max("energy") - min("energy")).alias("total_energy_consumed")
    )

    # Display the total energy consumed by each sensor.
    total_energy.show()

except Exception as err:
    # Only the error message is printed; the cell itself does not fail.
    print(err)

root
 |-- date: timestamp (nullable = true)
 |-- sensor: string (nullable = true)
 |-- energy: double (nullable = true)

[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `timestamp` cannot be resolved. Did you mean one of the following? [`date`, `energy`, `sensor`].;
'Filter ((year('timestamp) = 2024) AND (month('timestamp) = 2))
+- Relation [date#57,sensor#58,energy#59] csv



In [None]:
#@title Alinea b)

from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse (or start) the local Spark session.
spark = SparkSession.builder.master('local[*]') \
                    .appName('energy').getOrCreate()

try:
    # Load the semicolon-separated readings; column types are inferred.
    readings = spark.read.csv('energy-readings.csv',
                              sep=';', header=True, inferSchema=True)

    # The printed schema lists the available columns: date, sensor, energy.
    readings.printSchema()

    # Keep only February 2024.
    feb_readings = readings.filter((year("date") == 2024) & (month("date") == 2))

    # Add the calendar day as a separate column and keep the full timestamp.
    # Overwriting "date" with the day would make every row of a
    # (sensor, day) partition tie in the ordering below, so the "last"
    # reading of the day would be picked arbitrarily.
    readings_by_day = feb_readings.withColumn("day", to_date("date"))

    # Rank each sensor's readings within a day from latest to earliest and
    # keep the latest one. Afterwards the timestamp is dropped and "day" is
    # renamed to "date", giving the columns sensor, energy, date.
    window_spec = Window.partitionBy("sensor", "day").orderBy(desc("date"))
    daily_last_reading = readings_by_day.withColumn("row_number", row_number().over(window_spec)) \
                                        .filter(col("row_number") == 1) \
                                        .drop("row_number", "date") \
                                        .withColumnRenamed("day", "date")

    # Sum the end-of-day readings of all sensors for each day.
    # NOTE(review): the sample rows suggest energy is a cumulative meter
    # value, so this sum includes whatever the meters showed before
    # February, and a sensor with no reading on a given day contributes
    # nothing to that day - confirm this matches the exercise statement.
    daily_running_total = daily_last_reading.groupBy("date").agg(
        sum("energy").alias("running_total_energy")
    ).orderBy("date")

    # Display the per-day totals without truncating column values.
    daily_running_total.show(truncate=False)

except Exception as err:
    # Only the error message is printed; the cell itself does not fail.
    print(err)