# Setup Spark Cluster

In [37]:
from pyspark.sql import SparkSession

In [38]:
# Build up the session configuration step by step, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("Solar Power")
session_builder = session_builder.master("spark://ea99ad5384cf:7077")

# Executor/core/memory settings handed to the builder one key at a time.
resource_settings = {
    "spark.executor.cores": 1,
    "spark.cores.max": 6,
    "spark.executor.memory": "512M",
}
for setting_name, setting_value in resource_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

24/12/27 15:36:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [39]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

# Read Weather CSV data

In [40]:
# Read Weather CSV data
from pyspark.sql import functions as F

# Explicit column types for the CSV, given as a DDL-style schema string.
_schema = "timestamp timestamp, solar_intensity float, temp float"

# Minutes elapsed since midnight, derived from each row's timestamp.
_ts = F.col("timestamp")
_minute_of_day = F.hour(_ts) * 60 + F.minute(_ts)

# Index of the 15-minute bucket a row falls in, after subtracting a 60-minute offset.
# NOTE(review): with this offset, rows before 01:00 get a negative index - confirm intended.
_interval_idx = F.floor((_minute_of_day - 60) / 15)

weather_df = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/iceberg/warehouse/weather_history_splitted_resampled/2013-01-01.csv")
    .withColumn("15_min_interval", _interval_idx)
)

In [41]:
weather_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- solar_intensity: float (nullable = true)
 |-- temp: float (nullable = true)
 |-- 15_min_interval: long (nullable = true)



# Optimize Number of Partitions and Shuffle 

In [42]:
# Check Spark Shuffle Partition setting

# Report the configured shuffle-partition count next to the number of partitions
# the loaded weather DataFrame currently has. (Label typo "shiffle" corrected.)
print("# shuffle: ", spark.conf.get("spark.sql.shuffle.partitions"))
print("# partitions: ", weather_df.rdd.getNumPartitions())

# shiffle:  200
# partitions:  6


In [43]:
# Show the SparkContext's default parallelism as this cell's output.
spark.sparkContext.defaultParallelism

6

In [44]:
# One partition count drives both the shuffle setting and the explicit repartition.
# NOTE(review): 92 presumably relates to the number of 15-minute buckets - confirm.
_interval_partitions = 92

spark.conf.set("spark.sql.shuffle.partitions", _interval_partitions)

# Hash-repartition the weather rows on their 15-minute bucket index.
weather_partitioned_df = weather_df.repartition(_interval_partitions, F.col("15_min_interval"))

In [49]:
# Re-check the shuffle-partition setting and the partition count of the
# repartitioned DataFrame. (Label typo "shiffle" corrected.)
print("# shuffle: ", spark.conf.get("spark.sql.shuffle.partitions"))
print("# partitions: ", weather_partitioned_df.rdd.getNumPartitions())

# shiffle:  92




# partitions:  92


In [35]:
# weather_partitioned_df.show()

                                                                                

+--------------------+---------------+----------+---------------+
|           timestamp|solar_intensity|      temp|15_min_interval|
+--------------------+---------------+----------+---------------+
| 2013-01-01 21:15:00|            0.0|     14.25|             81|
|2013-01-01 21:15:...|            0.0| 14.250009|             81|
|2013-01-01 21:15:...|            0.0| 14.250016|             81|
|2013-01-01 21:15:...|            0.0| 14.250025|             81|
|2013-01-01 21:15:...|            0.0| 14.250033|             81|
|2013-01-01 21:15:...|            0.0| 14.250042|             81|
|2013-01-01 21:15:...|            0.0|  14.25005|             81|
|2013-01-01 21:15:...|            0.0| 14.250058|             81|
|2013-01-01 21:15:...|            0.0| 14.250067|             81|
|2013-01-01 21:15:...|            0.0| 14.250075|             81|
|2013-01-01 21:15:...|            0.0| 14.250083|             81|
|2013-01-01 21:15:...|            0.0| 14.250092|             81|
|2013-01-0

# Calc Solar Power

solar_panel_rating_kwh*1000 -> to get wh

solar_panel_rating_kwh*`1000/(60*60*1000/5)`

------------------------------------^ first `60` to convert hour to minute

-----------------------------------------^ second `60` is to convert minute to seconds

------------------------------------------------^ `1000` is to convert seconds to ms, and `/5` because the data is sampled every `5ms`

</br>

The solar panel's power rating (`10kwh` in our example) is specified at a solar intensity of
`1000W/square meter` and a temperature of `25c`.

</br>
As `solar_intensity` increases, the generated power increases; as `temp` increases, the generated power decreases.

In [45]:
# Panel rating and the reference conditions (solar intensity, temperature) it is quoted at.
solar_panel_rating_kwh = 10
solar_intensity_power_rating = 1000
temp_power_rating = 25

# Number of 5 ms samples in one hour: 60 min * 60 s * 1000 ms / 5 ms.
_samples_per_hour = 60 * 60 * 1000 / 5
solar_panel_rating_w_5ms = solar_panel_rating_kwh * 1000 / _samples_per_hour

# Formula applied to every row:
#   solar_power_w = solar_panel_rating_w_5ms
#       * (1 / (1 - (temp_power_rating - temp) / temp_power_rating))
#       * (1 - (solar_intensity_power_rating - solar_intensity) / solar_intensity_power_rating)
#
# NOTE(review): the temperature factor reduces to temp_power_rating / temp, so a temp
# of 0 makes its denominator 0 - confirm how that case should be handled.
_temp_factor = 1 / (1 - (temp_power_rating - F.col("temp")) / temp_power_rating)
_intensity_factor = 1 - (solar_intensity_power_rating - F.col("solar_intensity")) / solar_intensity_power_rating

_generation = solar_panel_rating_w_5ms * _temp_factor * _intensity_factor

# Attach the computed value, then drop the raw weather columns it was derived from.
solar_panel_readings_df = (
    weather_partitioned_df
    .withColumn("current_generation_watt", _generation)
    .drop("solar_intensity", "temp")
)

In [46]:
# Print the schema of the derived readings DataFrame.
solar_panel_readings_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- 15_min_interval: long (nullable = true)
 |-- current_generation_watt: double (nullable = true)



In [47]:
# Persist the readings as CSV with a header row, partitioned by the 15-minute
# interval column; mode("overwrite") replaces whatever already exists at the target path.
_readings_writer = (
    solar_panel_readings_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .partitionBy("15_min_interval")
)
_readings_writer.save("/home/iceberg/warehouse/weather_history_splitted_resampled/solar_panel_readings/2013-01-01.csv")

                                                                                

In [48]:
# Print the physical plan Spark would use to compute the readings DataFrame.
solar_panel_readings_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [timestamp#110, 15_min_interval#116L, (((1.0 / (1.0 - (cast((25.0 - temp#112) as double) / 25.0))) * 0.013888888888888888) * (1.0 - (cast((1000.0 - solar_intensity#111) as double) / 1000.0))) AS current_generation_watt#125]
   +- Exchange hashpartitioning(15_min_interval#116L, 92), REPARTITION_BY_NUM, [plan_id=447]
      +- Project [timestamp#110, solar_intensity#111, temp#112, FLOOR((cast((((hour(timestamp#110, Some(Etc/UTC)) * 60) + minute(timestamp#110, Some(Etc/UTC))) - 60) as double) / 15.0)) AS 15_min_interval#116L]
         +- FileScan csv [timestamp#110,solar_intensity#111,temp#112] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/warehouse/weather_history_splitted_resampled/2013-0..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<timestamp:timestamp,solar_intensity:float,temp:float>




In [50]:
# Total generation per 15-minute interval. The aggregate is a SUM over the interval's
# samples, so the result column is labelled as a total (it was previously mislabelled
# with an "avg_" prefix even though no average is computed).
(
    solar_panel_readings_df
    .groupBy("15_min_interval")
    .agg(F.sum("current_generation_watt").alias("total_current_generation_watt"))
    .show()
)

                                                                                

+---------------+---------------------------+
|15_min_interval|avg_current_generation_watt|
+---------------+---------------------------+
|             81|                        0.0|
|             91|                        0.0|
|             19|                        0.0|
|             90|                        0.0|
|             35|         3446.0537665243382|
|             61|         2138.1184121672377|
|              4|                        0.0|
|             33|         3567.1697686414577|
|             11|                        0.0|
|             57|          2621.049297203433|
|             20|                        0.0|
|             15|                        0.0|
|             25|                        0.0|
|             38|          3339.619220014445|
|             86|                        0.0|
|             28|         1918.5592792420696|
|             34|          3707.909386065704|
|             42|          3099.275463319669|
|             10|                 

In [52]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()