### Connect to the 511NY (511ny.org) API to download road events

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, IntegerType, BooleanType

# Schedule schema: an array of structs, one struct per schedule entry.
# The mapping below lists each entry's fields in order with their Spark types;
# every field is declared nullable.
_schedule_field_types = {
    "ScheduleId": IntegerType(),
    "Start": StringType(),
    "End": StringType(),
    "Continuous": BooleanType(),
    "ActiveDays": ArrayType(StringType()),
    "Impact": StringType(),
}
schedule_schema = ArrayType(
    StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in _schedule_field_types.items()
    ])
)

# Main event schema: one struct per road event.
# Field order below is the column order of the resulting DataFrame; every
# field is declared nullable. Date-like fields are kept as strings here.
_event_field_types = {
    "LastUpdated": StringType(),
    "Latitude": DoubleType(),
    "Longitude": DoubleType(),
    "PlannedEndDate": StringType(),
    "Reported": StringType(),
    "StartDate": StringType(),
    "ID": StringType(),
    "RegionName": StringType(),
    "CountyName": StringType(),
    "Severity": StringType(),
    "RoadwayName": StringType(),
    "DirectionOfTravel": StringType(),
    "Description": StringType(),
    "Location": StringType(),
    "LanesAffected": StringType(),
    "LanesStatus": StringType(),
    # Assumed to be an array of empty objects — TODO confirm against real payloads.
    "LcsEntries": ArrayType(StructType([])),
    "NavteqLinkId": StringType(),
    "PrimaryLocation": StringType(),
    "SecondaryLocation": StringType(),
    "FirstArticleCity": StringType(),
    "SecondCity": StringType(),
    "EventType": StringType(),
    "EventSubType": StringType(),
    "MapEncodedPolyline": StringType(),
    # Array of schedule structs (see schedule_schema).
    "Schedule": schedule_schema,
}
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in _event_field_types.items()
])


In [0]:
import requests
import datetime
from pyspark.sql.functions import to_timestamp, current_timestamp

# API request URL.
# NOTE(review): API_KEY is intentionally blank in the notebook source; supply the
# key at run time (e.g. from a secret store) rather than committing it here.
API_KEY = ""
URL = f"https://511ny.org/api/getevents?key={API_KEY}&format=json"

# Make the request.
# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors (bad key, throttling, outage) here
# instead of as a confusing JSON/schema failure further down.
response = requests.get(URL, timeout=60)
response.raise_for_status()
data = response.json()

# Create Spark DataFrame using the explicit event schema.
df = spark.createDataFrame(data, schema=schema)
# "Reported" is loaded as a string; parse it into a timestamp column named
# "ReportedDate" and drop the original string column.
df = df.withColumn("ReportedDate", to_timestamp(df["Reported"],"dd/MM/yyyy HH:mm:ss")).drop("Reported")

# Display DataFrame
display(df)


Databricks visualization. Run in Databricks to view.

## Download Historical Weather Data for Road Collisions Using Open-Meteo

In [None]:
%pip install openmeteo-requests requests-cache retry-requests numpy pandas

In [None]:
import time

import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error.
# expire_after = -1 means cached responses never expire.
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here.
# The order of variables in hourly is important: the response exposes them by
# position, so HOURLY_VARIABLES is used both for the request and for decoding.
url = "https://archive-api.open-meteo.com/v1/archive"
HOURLY_VARIABLES = [
    "temperature_2m",
    "precipitation",
    "rain",
    "snowfall",
    "snow_depth",
    "weather_code",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_gusts_10m",
]

RATE_LIMIT_MESSAGE = "Hourly API request limit exceeded. Please try again in the next hour."
RATE_LIMIT_WAIT_SECONDS = 3600
MAX_RATE_LIMIT_RETRIES = 3


def _fetch_weather(client, endpoint, params):
    """Call the weather API, waiting and retrying when the hourly limit is hit.

    Args:
        client: the Open-Meteo client whose ``weather_api`` method is called.
        endpoint: URL of the API endpoint.
        params: query parameters for the request.

    Returns:
        Whatever ``client.weather_api`` returns for a successful call.

    Raises:
        Exception: any error other than the hourly rate-limit message is
            re-raised immediately; the rate-limit error is re-raised once
            MAX_RATE_LIMIT_RETRIES waits have been used up.
    """
    for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            return client.weather_api(endpoint, params=params)
        except Exception as e:
            if RATE_LIMIT_MESSAGE not in str(e) or attempt == MAX_RATE_LIMIT_RETRIES:
                raise
            # Wait out the hourly quota window, then retry the SAME request so
            # this location is neither skipped nor filled with stale data.
            time.sleep(RATE_LIMIT_WAIT_SECONDS)


def _hourly_frame(response, crash_date, latitude, longitude):
    """Convert one API response into a pandas DataFrame of hourly readings.

    Columns are: datetime, date, latitude, longitude, followed by one column
    per entry of HOURLY_VARIABLES (in that order).
    """
    hourly = response.Hourly()
    hourly_data = {"datetime": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}
    hourly_data["date"] = crash_date
    hourly_data["latitude"] = latitude
    hourly_data["longitude"] = longitude
    # Variables come back in the order they were requested.
    for position, variable_name in enumerate(HOURLY_VARIABLES):
        hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()
    return pd.DataFrame(data = hourly_data)


# NOTE(review): weather_dates_df, catalog_name and schema_name are not defined in
# this part of the notebook. The loop assumes each element of weather_dates_df
# exposes a crash_date attribute (e.g. collected Rows) — confirm.
for d in weather_dates_df:
    # Distinct (date, lon, lat) combinations for this crash date, with
    # coordinates rounded to one decimal place.
    collision_data = spark.sql(f"select distinct crash_date, round(longitude,1) longitude , round(latitude,1) latitude from {catalog_name}.{schema_name}.collision_bronze where year(crash_date) = 2024 and longitude is not null and latitude is not null and round(longitude,1) != 0.0 and round(latitude,1) != 0.0 and crash_date = '{d.crash_date}'").collect()

    hourly_frames = []

    for c, r in enumerate(collision_data, start = 1):
        params = {
            "latitude": r.latitude,
            "longitude": r.longitude,
            "start_date": r.crash_date,
            "end_date": r.crash_date,
            "hourly": HOURLY_VARIABLES,
            "temperature_unit": "fahrenheit",
            "wind_speed_unit": "mph",
            "precipitation_unit": "inch",
            "timezone": "America/New_York"
        }
        responses = _fetch_weather(openmeteo, url, params)

        # Only one location is requested per call, so only the first response is used.
        hourly_frames.append(_hourly_frame(responses[0], r.crash_date, r.latitude, r.longitude))
        print(c)

    # Spark cannot build a DataFrame from an empty pandas frame, so skip dates
    # that have no qualifying collisions.
    if not hourly_frames:
        continue

    # Concatenate once per date instead of growing the frame inside the loop.
    weather_df = pd.concat(hourly_frames)
    weather_sdf = spark.createDataFrame(weather_df)
    # NOTE(review): the write below is disabled, so each date's weather_sdf is
    # replaced by the next iteration and nothing is persisted.
    #weather_sdf.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.weather_history_bronze")