In [1]:
import requests
import json
import time
import os
import ConnectionConfig as cc
from datetime import datetime, timedelta
from pyspark.sql.functions import col, collect_list, regexp_replace, split, trim

# NOTE(review): setupEnvironment() and set_connectionProfile() live in the
# local ConnectionConfig module, which is not visible from this notebook.
# Presumably they prepare the Spark environment and select the "veloDB"
# settings that cc.get_Property(...) returns in the JDBC cell below - confirm.
cc.setupEnvironment()
cc.set_connectionProfile("veloDB")

# `spark` is the handle used further down for spark.read and spark.stop().
# NOTE(review): the second argument (4) is presumably the number of local
# cores/partitions - confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("Weather_Data", 4)
# The return value is discarded and this is not the last expression of the
# cell, so the call produces no visible output here.
spark.getActiveSession()

# Open-Meteo archive endpoint; fetch_openmeteo_data() reads this global.
url = "https://archive-api.open-meteo.com/v1/archive"

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "D:\KdG\Year 2\Integration\sparkdelta\venv\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\KdG\Year 2\Integration\sparkdelta\venv\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\KdG\Year 2\Integration\sparkdelta\venv\Lib\site-packages\py4j\clientserver.py", line 539, in send_c

Py4JError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext

In [2]:
def fetch_openmeteo_data(lat, lon, timestamp, zipcode, timeout=30):
    """Fetch the hourly Open-Meteo reading that matches ``timestamp``.

    One day of hourly ``temperature_2m``, ``precipitation`` and ``weathercode``
    values is requested from the module-level ``url`` endpoint, and the entry
    whose list index equals the hour of ``timestamp`` is selected.

    Args:
        lat: Latitude sent to the API.
        lon: Longitude sent to the API.
        timestamp: Value accepted by ``to_datetime`` (a ``datetime`` or an
            ISO-8601 string); it is stored unchanged in the result.
        zipcode: Postal code; stored in the result as ``int(zipcode)``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys ``timestamp``, ``latitude``, ``longitude``,
        ``temperature``, ``precipitation``, ``weather_code``, ``zipCode`` and
        ``weather_condition``, or ``None`` when the request fails, the status
        code is not 200, or the response lacks the expected hourly values.

    Raises:
        ValueError, TypeError: If ``zipcode`` cannot be converted with ``int``.
    """
    dt = to_datetime(timestamp)
    date_str = dt.strftime("%Y-%m-%d")
    hour_index = dt.hour

    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": date_str,
        "end_date": date_str,
        "hourly": "temperature_2m,precipitation,weathercode",
        # NOTE(review): "auto" presumably makes the API return hours in the
        # location's local time, which is what indexing by dt.hour relies on
        # - confirm against the Open-Meteo documentation.
        "timezone": "auto"
    }

    # requests never times out unless told to, so a stalled connection would
    # block the whole download loop. Network failures are reported the same
    # way as a bad status code so the caller's `if weather_data:` check works.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Cannot fetch: {exc}")
        return None

    if response.status_code != 200:
        print(f"Cannot fetch: {response.status_code}")
        return None

    try:
        # ValueError covers a body that is not valid JSON; TypeError covers a
        # payload whose "hourly" member is null or not a mapping.
        hourly = response.json()["hourly"]
        temperature = hourly["temperature_2m"][hour_index]
        precipitation = hourly["precipitation"][hour_index]
        code = hourly["weathercode"][hour_index]
    except (KeyError, IndexError, TypeError, ValueError):
        print("No hourly weather")
        return None

    return {
        "timestamp": timestamp,
        "latitude": lat,
        "longitude": lon,
        "temperature": temperature,
        "precipitation": precipitation,
        "weather_code": code,
        "zipCode": int(zipcode),
        "weather_condition": classify_weather(temperature, code, precipitation)
    }


def generate_weather_condition(weather_data):
    """Label a weather payload as Unpleasant, Pleasant, Neutral or Unknown.

    The payload must be a mapping with a ``"main"`` entry (read for its
    ``"temperature"`` value) and a non-empty ``"weather"`` sequence whose
    first item supplies the ``"main"`` and ``"description"`` texts.

    Returns:
        "Unknown"    - payload is empty or lacks "main" / a non-empty "weather".
        "Unpleasant" - either text mentions a form of precipitation.
        "Pleasant"   - temperature above 15 °C and the main text is "clear".
        "Neutral"    - everything else.
    """
    payload_usable = (
        bool(weather_data)
        and "main" in weather_data
        and "weather" in weather_data
        and bool(weather_data["weather"])
    )
    if not payload_usable:
        return "Unknown"

    temp = weather_data["main"].get("temperature")
    headline = weather_data["weather"][0].get("main", "").lower()
    detail = weather_data["weather"][0].get("description", "").lower()

    # Any precipitation keyword in either text makes the hour unpleasant.
    for word in ("rain", "snow", "drizzle", "thunderstorm", "sleet"):
        if word in headline or word in detail:
            return "Unpleasant"

    # Warm (strictly above 15 °C) and clear is pleasant; the rest is neutral.
    if temp is not None and temp > 15 and headline == "clear":
        return "Pleasant"
    return "Neutral"

# WMO weather interpretation codes that denote precipitation of any kind.
# The drizzle and snow groups were previously missing, so such hours were only
# flagged when the precipitation amount happened to be present and non-zero.
_UNPLEASANT_WEATHER_CODES = frozenset({
    51, 53, 55,      # drizzle
    56, 57,          # freezing drizzle
    61, 63, 65,      # rain
    66, 67,          # freezing rain
    71, 73, 75, 77,  # snow fall / snow grains
    80, 81, 82,      # rain showers
    85, 86,          # snow showers
    95, 96, 99,      # thunderstorm (with or without hail)
})


def classify_weather(temp, weather_code, precipitation):
    """Classify one hourly reading as Unpleasant, Pleasant, Neutral or Unknown.

    Args:
        temp: Temperature of the hour; compared against 15.
        weather_code: Numeric weather code of the hour.
        precipitation: Precipitation amount of the hour; may be ``None``.

    Returns:
        "Unknown"    - ``temp`` or ``weather_code`` is ``None``.
        "Unpleasant" - the code denotes rain, drizzle, snow or a thunderstorm,
                       or a positive precipitation amount was reported.
        "Pleasant"   - temperature strictly above 15 with code 0 (clear sky).
        "Neutral"    - everything else.
    """
    if temp is None or weather_code is None:
        return "Unknown"

    has_precipitation = precipitation is not None and precipitation > 0
    if weather_code in _UNPLEASANT_WEATHER_CODES or has_precipitation:
        return "Unpleasant"
    if temp > 15 and weather_code == 0:  # clear sky
        return "Pleasant"
    return "Neutral"

def to_datetime(iso_timestamp):
    """Parse ``iso_timestamp`` into a ``datetime``.

    The argument is passed through ``str()`` first, so both ISO-8601 strings
    and ``datetime`` objects are accepted.
    """
    as_text = str(iso_timestamp)
    parsed = datetime.fromisoformat(as_text)
    return parsed

In [3]:
# Read the rides, locks and stations tables over JDBC.
def _read_jdbc_table(table_name):
    """Load ``table_name`` using the connection properties exposed by ``cc``."""
    reader = spark.read.format("jdbc")
    reader = reader.option("driver", cc.get_Property("driver"))
    reader = reader.option("url", cc.get_Property("url"))
    reader = reader.option("user", cc.get_Property("username"))
    reader = reader.option("password", cc.get_Property("password"))
    reader = reader.option("dbtable", table_name)
    return reader.load()


df_rides = _read_jdbc_table("rides")
df_locks = _read_jdbc_table("locks")
df_stations = _read_jdbc_table("stations")

# gpscoord is a string such as "(51.2034,4.39221)": strip the parentheses and
# split on the comma; element 0 becomes latitude and element 1 longitude.
gps_parts = split(regexp_replace("s.gpscoord", r"[()]", ""), ",")

# ride -> start lock -> station, so every ride carries its start station's
# zip code and coordinates.
df_rides_with_stations = (
    df_rides.alias("r")
    .join(df_locks.alias("l"), col("r.startlockid") == col("l.lockid"))
    .join(df_stations.alias("s"), col("l.stationid") == col("s.stationid"))
    .select(
        col("r.rideid"),
        col("r.starttime"),
        col("s.zipcode").alias("start_zipcode"),
        col("s.gpscoord"),
        trim(gps_parts[0]).cast("double").alias("latitude"),
        trim(gps_parts[1]).cast("double").alias("longitude"),
    )
)

print("=== Schema ===")
df_rides_with_stations.printSchema()
print("=== Data ===")
df_rides_with_stations.show()

=== Schema ===
root
 |-- rideid: integer (nullable = true)
 |-- starttime: timestamp (nullable = true)
 |-- start_zipcode: string (nullable = true)
 |-- gpscoord: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

=== Data ===
+------+-------------------+-------------+-----------------+--------+---------+
|rideid|          starttime|start_zipcode|         gpscoord|latitude|longitude|
+------+-------------------+-------------+-----------------+--------+---------+
|     4|2015-09-22 00:00:00|         2018|(51.2034,4.39221)| 51.2034|  4.39221|
|     9|2015-09-22 00:00:00|         2000|(51.2187,4.40066)| 51.2187|  4.40066|
|    12|2015-09-22 00:00:00|         2000|(51.2092,4.40293)| 51.2092|  4.40293|
|    13|2015-09-22 00:00:00|         2610|(51.1771,4.39978)| 51.1771|  4.39978|
|    18|2019-09-22 08:41:48|         2018|(51.2034,4.39221)| 51.2034|  4.39221|
|    23|2019-09-22 08:23:27|         2000|(51.2187,4.40066)| 51.2187|  4.4006

In [4]:
# Number of weather samples written per zip code, and the pause between two
# API calls. Both used to be inline literals.
SAMPLES_PER_ZIPCODE = 3
REQUEST_PAUSE_SECONDS = 4

os.makedirs("weather", exist_ok=True)

# collect_list() skips nulls independently for each column, so one null
# starttime/latitude/longitude would shift that list against the other two.
# Dropping incomplete rows first keeps index i of all three lists on the same
# ride (and removes a null zip code, which int() could not convert later).
complete_rides = df_rides_with_stations.dropna(
    subset=["start_zipcode", "starttime", "latitude", "longitude"]
)

postal_code_timestamps = complete_rides.groupBy("start_zipcode") \
    .agg(collect_list("starttime").alias("timestamps"),
         collect_list("latitude").alias("latitudes"),
         collect_list("longitude").alias("longitudes")) \
    .collect()

for row in postal_code_timestamps:
    start_zipcode = row["start_zipcode"]
    # Copy the lists so the padding below does not mutate the collected rows.
    timestamps = list(row["timestamps"])
    latitudes = list(row["latitudes"])
    longitudes = list(row["longitudes"])

    missing = SAMPLES_PER_ZIPCODE - len(timestamps)
    if missing > 0:
        # Fewer rides than samples: pad with recent timestamps, as before,
        # and reuse the last known coordinate of this zip code. Previously
        # only the timestamps were padded, so latitudes[i] / longitudes[i]
        # raised IndexError for such zip codes.
        # NOTE(review): the archive endpoint may have no data for very recent
        # hours; fetch_openmeteo_data() then returns None and no file is
        # written - confirm this fallback is still wanted.
        timestamps.extend(datetime.now() - timedelta(hours=i) for i in range(missing))
        latitudes.extend([latitudes[-1]] * missing)
        longitudes.extend([longitudes[-1]] * missing)

    for i in range(SAMPLES_PER_ZIPCODE):
        timestamp = timestamps[i]
        latitude = latitudes[i]
        longitude = longitudes[i]
        weather_data = fetch_openmeteo_data(latitude, longitude, timestamp, start_zipcode)
        if weather_data:
            formatted_timestamp = timestamp.strftime("%Y-%m-%d_%H-%M-%S")

            filename = f"weather/{start_zipcode}_{formatted_timestamp}.json"
            with open(filename, "w") as f:
                # default=str serialises the datetime stored under "timestamp".
                json.dump(weather_data, f, indent=4, default=str)
            print(f"Saved weather data for {start_zipcode} at {timestamp} to {filename}")

        # Pause between requests regardless of the outcome.
        time.sleep(REQUEST_PAUSE_SECONDS)

spark.stop()

Saved weather data for 2140 at 2019-09-22 08:03:38 to weather/2140_2019-09-22_08-03-38.json
Saved weather data for 2140 at 2019-09-22 08:49:18 to weather/2140_2019-09-22_08-49-18.json
Saved weather data for 2140 at 2019-09-22 08:11:14 to weather/2140_2019-09-22_08-11-14.json
Saved weather data for 2170 at 2019-09-22 08:08:47 to weather/2170_2019-09-22_08-08-47.json
Saved weather data for 2170 at 2019-09-22 09:10:02 to weather/2170_2019-09-22_09-10-02.json
Saved weather data for 2170 at 2019-09-22 10:57:35 to weather/2170_2019-09-22_10-57-35.json
Saved weather data for 2100 at 2019-09-22 08:33:43 to weather/2100_2019-09-22_08-33-43.json
Saved weather data for 2100 at 2019-09-22 08:34:50 to weather/2100_2019-09-22_08-34-50.json
Saved weather data for 2100 at 2019-09-22 08:36:21 to weather/2100_2019-09-22_08-36-21.json
Saved weather data for 2060 at 2019-09-22 08:42:03 to weather/2060_2019-09-22_08-42-03.json
Saved weather data for 2060 at 2019-09-22 08:37:23 to weather/2060_2019-09-22_08