# Setup Spark Cluster

In [123]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [124]:
# Connect to the standalone Spark master and create (or reuse) the session
# used by the rest of this notebook.
spark = (
    SparkSession.builder
    .master("spark://9611ff031a11:7077")
    .appName("Solar Power")
    # Resource limits: 512 MB and one core per executor, at most 6 cores in total.
    .config("spark.executor.memory", "512M")
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .getOrCreate()
)

In [125]:
# Display the session object as the cell output.
spark

# Read Weather CSV data

In [126]:
# Load one day of weather readings with an explicit schema (no schema
# inference), then derive the hour from each reading's timestamp.
# `F` (pyspark.sql.functions) is imported in the notebook's import cell.
_schema = "timestamp timestamp, solar_intensity float, temp float"

weather_df = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/iceberg/warehouse/weather_history_splitted_resampled/2013-01-01.csv")
    .withColumn("hour", F.hour(F.col("timestamp")))
)

In [127]:
# Print the column names, types and nullability of the loaded DataFrame.
weather_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- solar_intensity: float (nullable = true)
 |-- temp: float (nullable = true)
 |-- hour: integer (nullable = true)



# Optimize Number of Partitions and Shuffle 

In [128]:
# Compare the session's shuffle-partition setting with the number of
# partitions of the freshly loaded DataFrame.
# NOTE(review): the "shiffle" label is misspelled; it is kept as is so the
# printed text does not change.
_shuffle_setting = spark.conf.get("spark.sql.shuffle.partitions")
print("# shiffle: ", _shuffle_setting)

_input_partitions = weather_df.rdd.getNumPartitions()
print("# partitions: ", _input_partitions)

# shiffle:  200
# partitions:  6


In [129]:
# Show the SparkContext's default parallelism as the cell output.
spark.sparkContext.defaultParallelism

6

In [130]:
# Use a single partition count for both the session-wide shuffle setting and
# the explicit repartition on `hour`, so the two cannot drift apart.
_hour_partitions = 23

spark.conf.set("spark.sql.shuffle.partitions", _hour_partitions)
weather_partitioned_df = weather_df.repartition(_hour_partitions, F.col("hour"))

In [138]:
# Re-check the shuffle setting and the partition count after repartitioning.
_shuffle_setting = spark.conf.get("spark.sql.shuffle.partitions")
print("# shiffle: ", _shuffle_setting)

_repartitioned_count = weather_partitioned_df.rdd.getNumPartitions()
print("# partitions: ", _repartitioned_count)

# shiffle:  23




# partitions:  23


In [33]:
# weather_partitioned_df.show()



+--------------------+---------------+---------+----+
|           timestamp|solar_intensity|     temp|hour|
+--------------------+---------------+---------+----+
| 2013-01-01 18:00:00|         14.525|   16.835|  18|
|2013-01-01 18:00:...|      14.523789| 16.83496|  18|
|2013-01-01 18:00:...|      14.522579| 16.83492|  18|
|2013-01-01 18:00:...|      14.521369| 16.83488|  18|
|2013-01-01 18:00:...|      14.520159| 16.83484|  18|
|2013-01-01 18:00:...|      14.518948|16.834803|  18|
|2013-01-01 18:00:...|      14.517737|16.834763|  18|
|2013-01-01 18:00:...|      14.516527|16.834723|  18|
|2013-01-01 18:00:...|      14.515317|16.834682|  18|
|2013-01-01 18:00:...|      14.514106|16.834644|  18|
|2013-01-01 18:00:...|      14.512896|16.834604|  18|
|2013-01-01 18:00:...|      14.511685|16.834564|  18|
|2013-01-01 18:00:...|      14.510475|16.834524|  18|
|2013-01-01 18:00:...|      14.509265|16.834486|  18|
|2013-01-01 18:00:...|      14.508054|16.834446|  18|
|2013-01-01 18:00:...|      

                                                                                

In [58]:
# weather_partitioned_avg_df = weather_partitioned_df.groupBy("hour").agg(F.avg("temp").alias("avg_temp"))


In [59]:
# Write data for performance Benchmarking

# weather_partitioned_avg_df.write.format("noop").mode("overwrite").save()

In [43]:
# weather_partitioned_avg_df.show()

                                                                                

+----+------------------+
|hour|          avg_temp|
+----+------------------+
|  14|18.922791989506617|
|  18|16.978458910542063|
|   6|  9.98958299305174|
|   3|10.066041906261445|
|   7| 9.982791267459922|
|  15|18.861208586766985|
|   2|10.499624821007252|
|   4|10.001875093800491|
|  22|13.999959184028043|
|  12|18.467873648031553|
|   9|10.485956468833818|
|  17|17.562625767366093|
|  11|17.251248770969443|
|  19|15.485125461803543|
|  20|14.417791607675287|
|  21|15.042750562471813|
|   8| 10.11683338186476|
|  13|18.840041614619892|
|  16|18.489166768927046|
|  23|12.619364792573174|
+----+------------------+
only showing top 20 rows



# Calc Solar Power

`solar_panel_rating_kwh * 1000` converts the rating from kWh to Wh.

solar_panel_rating_kwh*`1000/(60*60*1000/5)`

------------------------------------^ first `60` to convert hour to minute

-----------------------------------------^ second `60` is to convert minute to seconds

------------------------------------------------^ `1000` converts seconds to milliseconds, and `/5` is there because the data is sampled every `5 ms`

<br>

The solar panel's power rating (`10 kWh` in our example) is specified at a solar intensity of `1000 W/m²`
and a temperature of `25 °C`.

<br>
As `solar_intensity` increases the generated power increases, and as `temp` increases the generated power decreases.

In [132]:
# Panel rating and the reference conditions the rating is quoted at.
solar_panel_rating_kwh = 10
solar_intensity_power_rating = 1000
temp_power_rating = 25

# Rating scaled to a single sample: x1000, then divided by the number of
# 5 ms intervals in one hour (60 * 60 * 1000 / 5).
solar_panel_rating_w_5ms = solar_panel_rating_kwh * 1000 / (60 * 60 * 1000 / 5)

# Per-row model:
#   solar_power_w = solar_panel_rating_w_5ms
#       * (1 / (1 - (temp_power_rating - temp) / temp_power_rating))
#       * (1 - (solar_intensity_power_rating - solar_intensity) / solar_intensity_power_rating)
# The first factor is built from `temp`, the second from `solar_intensity`.
_temp_factor = 1 / (1 - (temp_power_rating - F.col("temp")) / temp_power_rating)
_intensity_factor = (
    1 - (solar_intensity_power_rating - F.col("solar_intensity")) / solar_intensity_power_rating
)

# Add the computed column, then drop the raw inputs that are no longer needed.
solar_panel_readings_df = (
    weather_partitioned_df
    .withColumn(
        "current_generation_watt",
        solar_panel_rating_w_5ms * _temp_factor * _intensity_factor,
    )
    .drop("solar_intensity", "temp")
)

In [133]:
# Print the schema of the derived readings DataFrame.
solar_panel_readings_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- current_generation_watt: double (nullable = true)



In [135]:
# Persist the readings as CSV with a header row, partitioned on disk by
# `hour`, replacing any previous output at this path.
(
    solar_panel_readings_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .partitionBy("hour")
    .save("/home/iceberg/warehouse/weather_history_splitted_resampled/solar_panel_readings/2013-01-01.csv")
)

                                                                                

In [137]:
# Print the physical plan Spark builds for the readings DataFrame.
solar_panel_readings_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [timestamp#311, hour#317, (((1.0 / (1.0 - (cast((25.0 - temp#313) as double) / 25.0))) * 0.013888888888888888) * (1.0 - (cast((1000.0 - solar_intensity#312) as double) / 1000.0))) AS current_generation_watt#326]
   +- Exchange hashpartitioning(hour#317, 23), REPARTITION_BY_NUM, [plan_id=1114]
      +- Project [timestamp#311, solar_intensity#312, temp#313, hour(timestamp#311, Some(Etc/UTC)) AS hour#317]
         +- FileScan csv [timestamp#311,solar_intensity#312,temp#313] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/warehouse/weather_history_splitted_resampled/2013-0..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<timestamp:timestamp,solar_intensity:float,temp:float>




In [24]:
# Total generation per hour. The aggregate is a sum over all readings in the
# hour, so the column is labelled as a total rather than an average.
(
    solar_panel_readings_df
    .groupBy("hour")
    .agg(F.sum("current_generation_watt").alias("total_current_generation_watt"))
    .show()
)

                                                                                

+----+---------------------------+
|hour|avg_current_generation_watt|
+----+---------------------------+
|  14|         11387.961751424062|
|  18|         1.8147681686008805|
|   6|                        0.0|
|   3|                        0.0|
|   7|         22.347037731300503|
|  15|         10413.181870815331|
|   2|                        0.0|
|   4|                        0.0|
|  22|                        0.0|
|  12|         11969.988631811231|
|  17|          3308.708864947764|
|   9|         14416.146357168736|
|  11|         12308.133476173876|
|  19|                        0.0|
|  20|                        0.0|
|   8|          7281.759642823639|
|  21|                        0.0|
|  13|         11785.498389172582|
|  16|          8739.396919769139|
|  23|                        0.0|
+----+---------------------------+
only showing top 20 rows



                                                                                

In [121]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()