In [1]:
import requests
import json
import time
import os
import ConnectionConfig as cc
from datetime import datetime, timedelta
from pyspark.sql.functions import col, collect_list, regexp_replace, split, trim

# Prepare the runtime through the project-local ConnectionConfig helper
# (its exact effects are defined in that module and are not visible here).
cc.setupEnvironment()
# Select the "veloDB" connection profile; presumably this is what the later
# cc.get_Property("driver"/"url"/"username"/"password") lookups read from
# -- TODO confirm against ConnectionConfig.
cc.set_connectionProfile("veloDB")

# Start the Spark session used by every later cell. The app name is
# "Weather_Data"; the second argument (4) is presumably a core/partition
# count -- verify against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("Weather_Data", 4)
# NOTE(review): the return value is discarded here, so this call has no
# visible effect in this cell.
spark.getActiveSession()

# Open-Meteo historical archive endpoint; read as a module-level global by
# fetch_openmeteo_data.
url = "https://archive-api.open-meteo.com/v1/archive"

In [2]:
def fetch_openmeteo_data(lat, lon, timestamp, zipcode, timeout=30):
    """Fetch the hourly Open-Meteo archive reading that matches ``timestamp``.

    The archive endpoint (module-level ``url``) is queried for the single
    calendar day of ``timestamp``; the entry at index ``timestamp.hour`` is
    then taken from each hourly series.

    Args:
        lat: Latitude, sent as the ``latitude`` query parameter.
        lon: Longitude, sent as the ``longitude`` query parameter.
        timestamp: Anything ``to_datetime`` accepts (a ``datetime`` or an
            ISO-formatted string). Stored unchanged in the result.
        zipcode: Zip code of the location; stored as ``int(zipcode)``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A dict with the keys ``timestamp``, ``latitude``, ``longitude``,
        ``temperature``, ``precipitation``, ``weather_code``, ``zipCode`` and
        ``weather_condition``; or ``None`` when the request fails, the
        response status is not 200, or the response carries no usable
        hourly data.
    """
    dt = to_datetime(timestamp)
    date_str = dt.strftime("%Y-%m-%d")
    hour_index = dt.hour

    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": date_str,
        "end_date": date_str,
        "hourly": "temperature_2m,precipitation,weathercode",
        # NOTE(review): with "auto" the hourly series is presumably expressed
        # in the location's local time, so hour_index is a local hour --
        # confirm against the Open-Meteo documentation.
        "timezone": "auto"
    }

    # Network-level failures (DNS, refused connection, timeout) are reported
    # the same way as a bad status code so one bad call cannot abort a batch.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Cannot fetch: {exc}")
        return None

    if response.status_code != 200:
        print(f"Cannot fetch: {response.status_code}")
        return None

    try:
        data = response.json()
        hourly = data["hourly"]
        temperature = hourly["temperature_2m"][hour_index]
        precipitation = hourly["precipitation"][hour_index]
        code = hourly["weathercode"][hour_index]
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError: body is not valid JSON; TypeError: "hourly" (or the
        # payload itself) is null or not indexable as expected.
        print("No hourly weather")
        return None

    return {
        "timestamp": timestamp,
        "latitude": lat,
        "longitude": lon,
        "temperature": temperature,
        "precipitation": precipitation,
        "weather_code": code,
        "zipCode": int(zipcode),
        "weather_condition": classify_weather(temperature, code, precipitation)
    }


def generate_weather_condition(weather_data):
    """Label a weather payload as "Unpleasant", "Pleasant", "Neutral" or "Unknown".

    The payload is expected to be a mapping with a ``"main"`` mapping (read
    for its ``"temperature"`` entry) and a non-empty ``"weather"`` list whose
    first element provides the ``"main"`` and ``"description"`` texts.

    Returns:
        "Unknown" when the payload is empty or lacks either section;
        "Unpleasant" when a precipitation keyword appears in either text;
        "Pleasant" when the temperature is above 15 and the main text is
        "clear"; "Neutral" otherwise.
    """
    payload_complete = (
        bool(weather_data)
        and "main" in weather_data
        and "weather" in weather_data
        and bool(weather_data["weather"])
    )
    if not payload_complete:
        return "Unknown"

    temperature = weather_data["main"].get("temperature")
    first_entry = weather_data["weather"][0]
    weather_main = first_entry.get("main", "").lower()
    weather_description = first_entry.get("description", "").lower()

    # Unpleasant: any form of precipitation mentioned in either text
    for keyword in ("rain", "snow", "drizzle", "thunderstorm", "sleet"):
        if keyword in weather_main or keyword in weather_description:
            return "Unpleasant"

    # Pleasant: temperature above 15°C and a clear sky
    is_warm = temperature is not None and temperature > 15
    if is_warm and weather_main == "clear":
        return "Pleasant"

    # Neutral: everything else
    return "Neutral"

def classify_weather(temp, weather_code, precipitation):
    """Classify one hourly reading as "Unpleasant", "Pleasant", "Neutral" or "Unknown".

    Args:
        temp: Temperature of the reading; compared against 15.
        weather_code: Numeric weather code of the reading.
        precipitation: Precipitation amount; may be ``None``.

    Returns:
        "Unknown" when ``temp`` or ``weather_code`` is ``None``;
        "Unpleasant" for a rain/thunderstorm code or any positive
        precipitation; "Pleasant" when ``temp`` is above 15 and the code
        is 0; "Neutral" otherwise.
    """
    if temp is None or weather_code is None:
        return "Unknown"

    # Codes treated as rainy / thunderstorm conditions.
    wet_codes = frozenset((61, 63, 65, 66, 67, 80, 81, 82, 95, 96, 99))

    if weather_code in wet_codes:
        return "Unpleasant"
    if precipitation is not None and precipitation > 0:
        return "Unpleasant"

    # Code 0 is the clear-sky code.
    return "Pleasant" if temp > 15 and weather_code == 0 else "Neutral"

def to_datetime(iso_timestamp):
    """Convert an ISO-formatted timestamp to a ``datetime``.

    The argument is first turned into text with ``str()``, so both ISO
    strings and ``datetime`` objects are accepted.
    """
    iso_text = str(iso_timestamp)
    return datetime.fromisoformat(iso_text)

In [3]:
# Load rides, locks, and stations data
def _read_jdbc_table(table_name):
    """Read one database table into a Spark DataFrame over JDBC.

    The driver, url, username and password are looked up through
    ``cc.get_Property`` on every call, so all tables share one connection
    configuration.

    Args:
        table_name: Value passed as the JDBC ``dbtable`` option.

    Returns:
        The DataFrame produced by ``spark.read ... .load()``.
    """
    return (spark.read
            .format("jdbc")
            .option("driver", cc.get_Property("driver"))
            .option("url", cc.get_Property("url"))
            .option("dbtable", table_name)
            .option("user", cc.get_Property("username"))
            .option("password", cc.get_Property("password"))
            .load())


df_rides = _read_jdbc_table("rides")
df_locks = _read_jdbc_table("locks")
df_stations = _read_jdbc_table("stations")

# gpscoord is a "(lat,lon)" string: drop the parentheses and split on the
# comma once, then reuse the pieces for both coordinate columns.
gps_parts = split(regexp_replace("s.gpscoord", r"[()]", ""), ",")

# Resolve each ride's start lock to its station (inner joins: rides without a
# matching lock or station are dropped) and keep the fields needed for the
# weather lookup.
df_rides_with_stations = (
    df_rides.alias("r")
    .join(df_locks.alias("l"), col("r.startlockid") == col("l.lockid"))
    .join(df_stations.alias("s"), col("l.stationid") == col("s.stationid"))
    .select(
        col("r.rideid"),
        col("r.starttime"),
        col("s.zipcode").alias("start_zipcode"),
        col("s.gpscoord"),
        trim(gps_parts[0]).cast("double").alias("latitude"),
        trim(gps_parts[1]).cast("double").alias("longitude"),
    )
)

print("=== Schema ===")
df_rides_with_stations.printSchema()
print("=== Data ===")
df_rides_with_stations.show()

=== Schema ===
root
 |-- rideid: integer (nullable = true)
 |-- starttime: timestamp (nullable = true)
 |-- start_zipcode: string (nullable = true)
 |-- gpscoord: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

=== Data ===
+-------+-------------------+-------------+-----------------+--------+---------+
| rideid|          starttime|start_zipcode|         gpscoord|latitude|longitude|
+-------+-------------------+-------------+-----------------+--------+---------+
|2043020|2021-09-05 12:04:36|         2000|(51.2195,4.41164)| 51.2195|  4.41164|
|2043029|2021-09-01 11:35:23|         2100|(51.2293,4.46783)| 51.2293|  4.46783|
|2043034|2021-09-01 11:38:46|         2060|(51.2201,4.43266)| 51.2201|  4.43266|
|2043038|2021-09-01 11:11:24|         2140|(51.2083,4.44597)| 51.2083|  4.44597|
|2043041|2021-09-01 11:15:13|         2000|(51.2281,4.41159)| 51.2281|  4.41159|
|2043045|2021-09-01 11:23:33|         2000|(51.2098,4.39089)| 51.2098

In [4]:
# Number of weather samples stored per zip code, and the pause between two
# API calls (seconds) to stay gentle on the public endpoint.
SAMPLES_PER_ZIPCODE = 3
REQUEST_PAUSE_SECONDS = 4

os.makedirs("weather", exist_ok=True)

# Bring the ride start times and station coordinates of every zip code to the
# driver, one row per zip code.
postal_code_timestamps = df_rides_with_stations.groupBy("start_zipcode") \
    .agg(collect_list("starttime").alias("timestamps"),
         collect_list("latitude").alias("latitudes"),
         collect_list("longitude").alias("longitudes")) \
    .collect()

for row in postal_code_timestamps:
    start_zipcode = row["start_zipcode"]
    # Work on copies so the padding below never touches the collected rows.
    timestamps = list(row["timestamps"])
    latitudes = list(row["latitudes"])
    longitudes = list(row["longitudes"])

    # A zip code whose coordinates could not be parsed has nothing to query.
    if not latitudes or not longitudes:
        print(f"No coordinates for {start_zipcode}, skipping")
        continue

    # Zip codes with fewer rides than samples are topped up with recent
    # timestamps (now, now - 1h, ...).
    missing = SAMPLES_PER_ZIPCODE - len(timestamps)
    if missing > 0:
        timestamps.extend(datetime.now() - timedelta(hours=i) for i in range(missing))

    # The coordinate lists must be padded as well; otherwise indexing them
    # alongside the padded timestamps raises IndexError. The last known
    # coordinate of the zip code is reused (a non-positive multiplier yields
    # an empty list, so full lists are left unchanged).
    latitudes.extend([latitudes[-1]] * (SAMPLES_PER_ZIPCODE - len(latitudes)))
    longitudes.extend([longitudes[-1]] * (SAMPLES_PER_ZIPCODE - len(longitudes)))

    samples = zip(timestamps[:SAMPLES_PER_ZIPCODE],
                  latitudes[:SAMPLES_PER_ZIPCODE],
                  longitudes[:SAMPLES_PER_ZIPCODE])
    for timestamp, latitude, longitude in samples:
        weather_data = fetch_openmeteo_data(latitude, longitude, timestamp, start_zipcode)
        if weather_data:
            formatted_timestamp = timestamp.strftime("%Y-%m-%d_%H-%M-%S")

            filename = f"weather/{start_zipcode}_{formatted_timestamp}.json"
            with open(filename, "w") as f:
                # default=str serialises the datetime stored under "timestamp".
                json.dump(weather_data, f, indent=4, default=str)
            print(f"Saved weather data for {start_zipcode} at {timestamp} to {filename}")

        time.sleep(REQUEST_PAUSE_SECONDS)

spark.stop()

Saved weather data for 2100 at 2021-09-01 11:35:23 to weather/2100_2021-09-01_11-35-23.json
Saved weather data for 2100 at 2021-09-01 11:43:31 to weather/2100_2021-09-01_11-43-31.json
Saved weather data for 2100 at 2021-09-01 11:52:08 to weather/2100_2021-09-01_11-52-08.json
Saved weather data for 2060 at 2021-09-01 11:38:46 to weather/2060_2021-09-01_11-38-46.json
Saved weather data for 2060 at 2021-09-01 11:17:04 to weather/2060_2021-09-01_11-17-04.json
Saved weather data for 2060 at 2021-09-01 11:58:31 to weather/2060_2021-09-01_11-58-31.json
Saved weather data for 2140 at 2021-09-01 11:11:24 to weather/2140_2021-09-01_11-11-24.json
Saved weather data for 2140 at 2021-09-01 11:36:03 to weather/2140_2021-09-01_11-36-03.json
Saved weather data for 2140 at 2021-09-01 12:56:13 to weather/2140_2021-09-01_12-56-13.json
Saved weather data for 2170 at 2021-09-01 12:11:42 to weather/2170_2021-09-01_12-11-42.json
Saved weather data for 2170 at 2021-09-05 14:32:45 to weather/2170_2021-09-05_14