In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.window import Window
from pyspark.sql import DataFrame
from datetime import datetime
import h3_pyspark

# Notebook configuration
# `max_h3_resolution` is the finest H3 resolution for which a column is
# generated; `project` is the database name used in the table identifiers.
max_h3_resolution = 12
project = "project_aiu"

# Run date rendered as "<day> <month name> <year>"; it is embedded in the
# table comment of the CREATE TABLE statement.
today = f"{datetime.today():%d %B %Y}"

# Spark Session Initialization
# The builder chain is wrapped in parentheses so that each setting sits on its
# own line without line-continuation backslashes.
spark = (
    SparkSession.builder
    .appName("OSN statevectors H3 addition")
    # Logging and console output
    .config("spark.log.level", "ERROR")
    .config("spark.ui.showConsoleProgress", "false")
    # Storage access
    .config("spark.hadoop.fs.azure.ext.cab.required.group", "eur-app-aiu-dev")
    .config("spark.kerberos.access.hadoopFileSystems", "abfs://storage-fs@cdpdldev0.dfs.core.windows.net/data/project/aiu.db/unmanaged")
    # Driver and executor sizing
    .config("spark.driver.cores", "1")
    .config("spark.driver.memory", "8G")
    .config("spark.executor.memory", "6G")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "2")
    .config("spark.dynamicAllocation.maxExecutors", "12")
    # Timeouts (the heartbeat interval is set below the network timeout)
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "400s")
    .enableHiveSupport()
    .getOrCreate()
)

## Read raw data 
#df = spark.sql(f"SELECT * FROM `{project}`.`osn_statevectors_clustered`;")

# Create OSN tracks db
# Names of the H3 columns, one per resolution, ordered from the finest
# resolution (max_h3_resolution) down to resolution 0.
h3_res = [f"h3_res_{res}" for res in range(max_h3_resolution, -1, -1)]

# Column definitions for the CREATE TABLE statement. Joining with "," places a
# separator between definitions only, so the last definition carries no
# trailing comma and no special case for resolution 0 is needed.
h3_res_sql = ",".join(
    f"h3_res_{res} STRING COMMENT 'H3 cell identifier for lat and lon with H3 resolution {res}.'"
    for res in range(max_h3_resolution, -1, -1)
)

# Comma-separated list of the H3 column names.
h3_res_str = ', '.join(h3_res)

# CREATE TABLE statement for `osn_h3_statevectors` in the `project` database:
# the state-vector columns listed below, followed by the H3 column definitions
# interpolated from `h3_res_sql`. `today` is interpolated into the table comment.
# The statement is only assembled as a string here; this cell does not run it.
# NOTE(review): the variable is called `create_osn_tracks_sql` although the
# table it creates is `osn_h3_statevectors` — confirm the name is intentional.
create_osn_tracks_sql = f"""
    CREATE TABLE IF NOT EXISTS `{project}`.`osn_h3_statevectors` (
        event_time BIGINT COMMENT 'This column contains the unix (aka POSIX or epoch) timestamp for which the state vector was valid.',
        icao24 STRING COMMENT 'This column contains the 24-bit ICAO transponder ID which can be used to track specific airframes over different flights.',
        lat DOUBLE COMMENT 'This column contains the last known latitude of the aircraft.',
        lon DOUBLE COMMENT 'This column contains the last known longitude of the aircraft.',
        velocity DOUBLE COMMENT 'This column contains the speed over ground of the aircraft in meters per second.',
        heading DOUBLE COMMENT 'This column represents the direction of movement (track angle in degrees) as the clockwise angle from the geographic north.',
        vert_rate DOUBLE COMMENT 'This column contains the vertical speed of the aircraft in meters per second.',
        callsign STRING COMMENT 'This column contains the callsign that was broadcast by the aircraft.',
        on_ground BOOLEAN COMMENT 'This flag indicates whether the aircraft is broadcasting surface positions (true) or airborne positions (false).',
        alert BOOLEAN COMMENT 'This flag is a special indicator used in ATC.',
        spi BOOLEAN COMMENT 'This flag is a special indicator used in ATC.',
        squawk STRING COMMENT 'This 4-digit octal number is another transponder code which is used by ATC and pilots for identification purposes and indication of emergencies.',
        baro_altitude DOUBLE COMMENT 'This column indicates the aircrafts altitude. As the names suggest, baroaltitude is the altitude measured by the barometer (in meter).',
        geo_altitude DOUBLE COMMENT 'This column indicates the aircrafts altitude. As the names suggest, geoaltitude is determined using the GNSS (GPS) sensor (in meter).',
        last_pos_update DOUBLE COMMENT 'This unix timestamp indicates the age of the position.',
        last_contact DOUBLE COMMENT 'This unix timestamp indicates the time at which OpenSky received the last signal of the aircraft.',
        serials ARRAY<INT> COMMENT 'The serials column is a list of serials of the ADS-B receivers which received the message.',
        {h3_res_sql}
    )
    COMMENT '`{project}`.`osn_statevectors_clustered` with added H3 tags (generated based on callsign, icao24 grouping with 30 min signal gap intolerance). Last updated: {today}.'
    STORED AS parquet
    TBLPROPERTIES ('transactional'='false');
"""

Setting spark.hadoop.yarn.resourcemanager.principal to quinten.goens
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/22 08:10:44 WARN JettyUtils: GET /jobs/ failed: java.util.NoSuchElementException: Failed to get the application information. If you are starting up Spark, please wait a while until it's ready.
java.util.NoSuchElementException: Failed to get the application information. If you are starting up Spark, please wait a while until it's ready.
	at org.apache.spark.status.AppStatusStore.applicationInfo(AppStatusStore.scala:51)
	at org.apache.spark.ui.jobs.AllJobsPage.render(AllJobsPage.scala:276)
	at org.apache.spark.ui.WebUI.$anonfun$attachPage$1(WebUI.scala:90)
	at org.apache.spark.ui.JettyUtils$$anon$1.doGet(JettyUtils.scala:81)
	at javax.servlet.http.HttpServlet.service(HttpServlet.java:687)
	at javax.servlet.h

In [2]:
# Print the assembled CREATE TABLE statement for visual inspection.
print(create_osn_tracks_sql)


    CREATE TABLE IF NOT EXISTS `project_aiu`.`osn_h3_statevectors` (
        event_time BIGINT COMMENT 'This column contains the unix (aka POSIX or epoch) timestamp for which the state vector was valid.',
        icao24 STRING COMMENT 'This column contains the 24-bit ICAO transponder ID which can be used to track specific airframes over different flights.',
        lat DOUBLE COMMENT 'This column contains the last known latitude of the aircraft.',
        lon DOUBLE COMMENT 'This column contains the last known longitude of the aircraft.',
        velocity DOUBLE COMMENT 'This column contains the speed over ground of the aircraft in meters per second.',
        heading DOUBLE COMMENT 'This column represents the direction of movement (track angle in degrees) as the clockwise angle from the geographic north.',
        vert_rate DOUBLE COMMENT 'This column contains the vertical speed of the aircraft in meters per second.',
        callsign STRING COMMENT 'This column contains the callsign

24/04/22 08:17:15 WARN WatcherWebSocketListener: Exec Failure javax.net.ssl.SSLException Connection reset


In [20]:
from IPython.display import display, HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, pandas_udf, col, PandasUDFType, lit, round
from pyspark.sql.types import DoubleType, StructType, StructField
from pyspark.sql import functions as F
from pyspark.sql import Window

import os, time
import subprocess
import os,shutil
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

# Settings

## Project settings
# Database name used in the table identifiers of this notebook.
project = "project_aiu"

# Run date rendered as "<day> <month name> <year>".
today = f"{datetime.today():%d %B %Y}"

# Spark Session Initialization
# Copy the add-on's log4j properties into Spark's conf directory before the
# session is built (logging setup).
shutil.copy("/runtime-addons/cmladdon-2.0.40-b150/log4j.properties", "/etc/spark/conf/")

# The builder chain is wrapped in parentheses so that each setting sits on its
# own line without line-continuation backslashes.
spark = (
    SparkSession.builder
    .appName("Extract OPDI")
    # Logging
    .config("spark.log.level", "ERROR")
    # Storage access
    .config("spark.hadoop.fs.azure.ext.cab.required.group", "eur-app-aiu-dev")
    .config("spark.kerberos.access.hadoopFileSystems", "abfs://storage-fs@cdpdldev0.dfs.core.windows.net/data/project/aiu.db/unmanaged")
    # Driver and executor sizing
    .config("spark.driver.cores", "1")
    .config("spark.driver.memory", "11G")
    .config("spark.executor.memory", "7G")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "2")
    .config("spark.dynamicAllocation.maxExecutors", "6")
    # Timeouts (the heartbeat interval is set below the network timeout)
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "400s")
    # Upper bound on results collected to the driver
    .config("spark.driver.maxResultSize", "4g")
    .enableHiveSupport()
    .getOrCreate()
)

# Get environment variables
engine_id = os.getenv('CDSW_ENGINE_ID')
domain = os.getenv('CDSW_DOMAIN')

if engine_id and domain:
    # Format the URL
    url = f"https://spark-{engine_id}.{domain}"

    # Display the clickable URL
    display(HTML(f'<a href="{url}">{url}</a>'))
else:
    # os.getenv returns None for an unset variable; formatting that into the
    # URL would render a dead "https://spark-None.None" link, so skip it.
    url = None
    print("CDSW_ENGINE_ID and/or CDSW_DOMAIN is not set; skipping the Spark link.")


Setting spark.hadoop.yarn.resourcemanager.principal to quinten.goens


In [38]:
def download_and_unzip(url):
    """Download a zip archive and extract it into the current working directory.

    Parameters
    ----------
    url : str
        Address of the zip file to fetch.

    Returns
    -------
    list of str
        The archive's member names as reported by ``ZipFile.namelist()``.
        An empty list is returned when the response status is not 200, so
        callers can always iterate over the result.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having imported these modules first.
    import io
    import zipfile

    import requests

    response = requests.get(url)
    if response.status_code != 200:
        # Without an explicit return this path would hand None to the caller.
        return []

    # The archive is unpacked from memory; nothing but its members is written.
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        zip_ref.extractall(".")
        return zip_ref.namelist()
        
def filter_down_to_osn_bbox(df, lng_col = 'longitude_deg', lat_col = 'latitude_deg'):
    """Keep only the rows of ``df`` whose coordinates fall inside the OSN bounding box.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one longitude and one latitude column.
    lng_col : str
        Name of the longitude column.
    lat_col : str
        Name of the latitude column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``xmin <= longitude <= xmax`` and
        ``ymin <= latitude <= ymax`` (bounds inclusive).
    """
    # Bounding box OSN data
    xmin = -25.86653
    ymin = 26.74617
    xmax = 49.65699
    ymax = 70.25976

    # The mask is computed from the frame passed in, not from a notebook
    # global, so the function works for any frame and any column names.
    in_lng = df[lng_col].between(xmin, xmax)
    in_lat = df[lat_col].between(ymin, ymax)

    return df[in_lng & in_lat]

SyntaxError: invalid syntax (3628970362.py, line 14)

In [60]:
import requests
import zipfile
import io
import pandas as pd

# Airport Hex

# Location of the zip archive to fetch.
airport_hex_url = "https://zenodo.org/records/10797224/files/airport_hex.zip"

# Fetch the archive and unpack it into the working directory.
files = download_and_unzip(airport_hex_url)

# Load the extracted parquet file, then keep only the rows inside the OSN
# bounding box.
apt_hex = pd.read_parquet(path='./airport_hex/airport_hex_res_5_radius_40_nm.parquet')
apt_hex = filter_down_to_osn_bbox(df=apt_hex)

# Turn the filtered pandas frame into a Spark DataFrame.
apt_hex_spark = spark.createDataFrame(data=apt_hex)

NameError: name 'filter_down_to_osn_bbox' is not defined

In [41]:
# Load a single runway parquet file into pandas to inspect its columns.
df = pd.read_parquet('runway_hex/h3_res_11_rwy_EGLL.parquet')

In [56]:
# Print every column name wrapped in single quotes and followed by a comma,
# one per line, so the output can be pasted into a Python list literal.
for column_name in df.columns:
    print(f"'{column_name}',")

'id',
'airport_ref',
'airport_ident',
'length_ft',
'width_ft',
'surface',
'lighted',
'closed',
'le_ident',
'le_latitude_deg',
'le_longitude_deg',
'le_elevation_ft',
'le_heading_degT',
'le_displaced_threshold_ft',
'he_ident',
'he_latitude_deg',
'he_longitude_deg',
'he_elevation_ft',
'he_heading_degT',
'he_displaced_threshold_ft',
'gate_id',
'hex_id',
'gate_id_nr',


In [None]:
# Runway Hex

runway_hex_url = "https://zenodo.org/records/10797224/files/runway_hex.zip"
# `or []` keeps the loop below safe if the download helper returns nothing.
files = download_and_unzip(runway_hex_url) or []

# Columns kept from every runway file (loop-invariant, so defined once).
cols = [
    'id',
    'airport_ident',
    'le_ident',
    'he_ident',
    'gate_id',
    'hex_id',
    'gate_id_nr'
]

dfs = []

for file in files:
    # The archive listing contains directory entries (e.g. 'runway_hex/');
    # those are not parquet files and are skipped.
    if file.endswith('/'):
        continue

    df = pd.read_parquet(file)
    df = filter_down_to_osn_bbox(df, lng_col = 'le_longitude_deg', lat_col = 'le_latitude_deg')

    # list.append mutates the list in place and returns None, so its result
    # must not be assigned back to `dfs`.
    dfs.append(df[cols])

# pd.concat raises on an empty list; fall back to an empty frame that still
# has the expected columns.
rwy_hex = pd.concat(dfs) if dfs else pd.DataFrame(columns=cols)

In [None]:
# Write the combined runway frame to 'rwy_hex.parquet' (relative path).
rwy_hex.to_parquet('rwy_hex.parquet')

In [33]:
# Last expression of the cell: shows the Spark DataFrame's repr (column names and types).
apt_hex_spark

DataFrame[id: bigint, ident: string, latitude_deg: double, longitude_deg: double, elevation_ft: double, type: string, hex_id: string, hex_lat: double, hex_lon: double]

In [36]:
# Query text: every column of `osn_h3_statevectors`, capped at 10000 rows.
# The string has no placeholders, so a plain (non-f) literal is used.
osn_h3_sv = """
SELECT *
FROM `project_aiu`.`osn_h3_statevectors` LIMIT 10000
"""

# Run the query through Spark and convert the result to a pandas DataFrame.
df = (
    spark.sql(osn_h3_sv)
    .toPandas()
)

                                                                                

In [35]:
# Last expression of the cell: renders the pandas frame as the cell output.
df

Unnamed: 0,event_time,icao24,lat,lon,velocity,heading,vert_rate,callsign,on_ground,alert,...,h3_res_9,h3_res_8,h3_res_7,h3_res_6,h3_res_5,h3_res_4,h3_res_3,h3_res_2,h3_res_1,h3_res_0
0,1686636385,49f137,50.090755,14.281106,,,,,True,False,...,891e3558b4fffff,881e3558b5fffff,871e3558bffffff,861e3558fffffff,851e355bfffffff,841e355ffffffff,831e35fffffffff,821e37fffffffff,811e3ffffffffff,801ffffffffffff
1,1686636390,49f137,50.090755,14.281106,,,,,True,False,...,891e3558b4fffff,881e3558b5fffff,871e3558bffffff,861e3558fffffff,851e355bfffffff,841e355ffffffff,831e35fffffffff,821e37fffffffff,811e3ffffffffff,801ffffffffffff
2,1686636395,49f137,50.090755,14.281113,,,,ODGUMOV1,True,False,...,891e3558b4fffff,881e3558b5fffff,871e3558bffffff,861e3558fffffff,851e355bfffffff,841e355ffffffff,831e35fffffffff,821e37fffffffff,811e3ffffffffff,801ffffffffffff
3,1686636400,49f137,50.090755,14.281106,,,,ODGUMOV1,True,False,...,891e3558b4fffff,881e3558b5fffff,871e3558bffffff,861e3558fffffff,851e355bfffffff,841e355ffffffff,831e35fffffffff,821e37fffffffff,811e3ffffffffff,801ffffffffffff
4,1686636405,49f137,50.090755,14.281113,,,,ODGUMOV1,True,False,...,891e3558b4fffff,881e3558b5fffff,871e3558bffffff,861e3558fffffff,851e355bfffffff,841e355ffffffff,831e35fffffffff,821e37fffffffff,811e3ffffffffff,801ffffffffffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1686636905,4cafcc,59.646067,17.953815,52.271156,190.203974,-0.32512,SAS83L,True,False,...,8908862182fffff,8808862183fffff,870886218ffffff,86088621fffffff,85088623fffffff,8408863ffffffff,830886fffffffff,820887fffffffff,8108bffffffffff,8009fffffffffff
9996,1686636910,4cafcc,59.646067,17.953815,52.271156,190.203974,-0.32512,SAS83L,True,False,...,8908862182fffff,8808862183fffff,870886218ffffff,86088621fffffff,85088623fffffff,8408863ffffffff,830886fffffffff,820887fffffffff,8108bffffffffff,8009fffffffffff
9997,1686636915,4cafcc,59.647196,17.954217,52.271156,190.203974,-0.32512,SAS83L,True,False,...,89088621827ffff,8808862183fffff,870886218ffffff,86088621fffffff,85088623fffffff,8408863ffffffff,830886fffffffff,820887fffffffff,8108bffffffffff,8009fffffffffff
9998,1686636920,4cafcc,59.647196,17.954217,52.271156,190.203974,-0.32512,SAS83L,True,False,...,89088621827ffff,8808862183fffff,870886218ffffff,86088621fffffff,85088623fffffff,8408863ffffffff,830886fffffffff,820887fffffffff,8108bffffffffff,8009fffffffffff
