In [12]:
# Import necessary functionality
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, udf, cos, radians, sin, sqrt, atan2
from pyspark.sql.types import FloatType
import math

# Create a SparkSession
spark = SparkSession.builder.appName("ReadCSV").getOrCreate()

# Read wind core sites data
df = spark.read.option("sep", ";").csv("data/metobs_maxMeanWindSpeed_active_sites.csv", header=True, inferSchema=True)

# Read wind turbine data
df_2 = spark.read.option("sep", ",").csv("data/VBK_export_allman_prod - Vindkraftverk.csv", header=True, inferSchema=True)

df.show(n=5)
df_2.show(n=5)


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/chickenthug/Documents/school/DIC/project/data/metobs_maxMeanWindSpeed_active_sties.csv.

In [None]:
# Function for transformation sweref99 tm to longitude and latitude coordinates
def sweref99_to_latlon(E, N):
    # Constants for SWEREF99 TM projection
    E0 = 500000  # False Easting in meters
    N0 = 0       # False Northing in meters
    F0 = 0.9996  # Scale factor at central meridian
    lo0 = math.radians(15)  # Central meridian in radians
    a = 6378137.0  # Semi-major axis of WGS 84 ellipsoid in meters
    la0 = 0  # Latitude of projection origin in radians

    # Convert
    lat = la0 + (N - N0) / (a * F0)
    long = lo0 + (E - E0) / (a * F0 * cos(lat))

    return lat, long

# Calculate distance between points (E, N) and (lat, long)
def distance(E, N, lat, long):

    # Convert E-kooridnat and N-kooridnat to (lat1, lon1)
    lat1, lon1 = sweref99_to_latlon(E, N)
    
    # Convert to radians
    lat2 = radians(lat)
    lon2 = radians(long)
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    
    # Radius of the Earth in kilometers (mean value)
    radius_earth = 6371.0
    
    # Calculate the distance
    distance = radius_earth * c
    
    return distance

distance_udf = udf(distance, FloatType())

In [None]:
# Clean uneccesary spaces after Elområde (Electricity zone)
df_2 = df_2.withColumn("Elområde", regexp_replace(col("Elområde"), "\\s+", ""))

# Filter out turbines:
#    not on land, 
#    not mounted and 
#    not in Elområde 1

df_2 = df_2.filter((col("Status") == "Uppfört") & (col("Placering") == "Land") & (col("Elområde") == "Luleå"))

# Make life easier
df_2.createOrReplaceTempView("df2")

# Get relevant fields
df_2 = spark.sql("SELECT `Verk-ID`, `E-koordinat`, `N-koordinat`, `Uppfört`, `Maxeffekt (MW)` FROM df2")

# Join tables ignoring no longer active wind core sites
df_2 = df_2.crossJoin(df.filter((col("Aktiv") == "Ja")))

# Add column that calculate distance between all turbines and wind core sites
df_final = df_2.withColumn("distance (km)", distance(df_2["E-koordinat"], df_2["N-koordinat"], df_2["Latitud"], df_2["Longitud"]))

df_final.createOrReplaceTempView("dfinal")
# Select the closest wind core site for each wind turbine (defined by Verk-ID)
df_final = spark.sql("SELECT * FROM dfinal WHERE `distance (km)` IN (SELECT MIN(`distance (km)`) FROM dfinal GROUP BY `Verk-ID`)")


df_final.createOrReplaceTempView("dfinal")
# Find needed wind core sites
spark.sql("SELECT DISTINCT Namn FROM dfinal ORDER BY Namn").show(n=50, truncate=False)

spark.stop()

+----------------------+
|Namn                  |
+----------------------+
|Arvidsjaur A          |
|Buresjön A            |
|Haparanda A           |
|Holmön A              |
|Kiruna Flygplats      |
|Latnivaara A          |
|Luleå-Kallax Flygplats|
|Malå-Brännan A        |
|Norsjö A              |
|Paharova A            |
|Pajala A              |
|Petisträsk A          |
|Pite-Rönnskär A       |
|Saittarova A          |
|Skellefteå Flygplats  |
|Stora Sjöfallet A     |
|Storön A              |
|Umeå Flygplats        |
|Ylinenjärvi A         |
|Älvsbyn A             |
+----------------------+

