In [1]:
from pyspark.sql import SparkSession
import os
import subprocess
from collections import defaultdict
import pandas as pd

# Spark Session Initialization
!cp /runtime-addons/cmladdon-2.0.40-b150/log4j.properties /etc/spark/conf/

spark = SparkSession.builder \
    .appName("project_aiu") \
    .config("spark.hadoop.fs.azure.ext.cab.required.group", "eur-app-aiu-dev") \
    .config("spark.yarn.access.hadoopFileSystems", "abfs://storage-fs@cdpdldev0.dfs.core.windows.net/data/project/aiu.db/unmanaged") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "8G") \
    .config("spark.executor.memory", "5G") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.instances", "2") \
    .config("spark.dynamicAllocation.maxExecutors", "6") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "400s") \
    .enableHiveSupport() \
    .getOrCreate()

# Hive settings
spark.sql("SET hive.exec.dynamic.partition = true")
spark.sql("SET hive.exec.dynamic.partition.mode = nonstrict")

# Functions

def execute_shell_command(command):
    """Run ``command`` through the shell and hand back what it printed.

    Args:
        command: Command line, passed to the shell as a single string.

    Returns:
        A ``(stdout, stderr)`` tuple of decoded text, each with leading and
        trailing whitespace stripped. The exit status is not reported.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    captured_out = completed.stdout.decode().strip()
    captured_err = completed.stderr.decode().strip()
    return captured_out, captured_err

def group_files_by_month(files_to_download):
    """Group file paths by a month key taken from their third path component.

    The key is built from the first two ``-``-separated tokens of the third
    ``/``-separated component of each path, re-joined with ``-`` (for a
    component such as ``2022-05-02`` this gives ``2022-05``).

    Args:
        files_to_download: Iterable of path strings with at least three
            ``/``-separated components.

    Returns:
        A ``defaultdict(list)`` mapping each key to the paths that produced it,
        in input order.

    Raises:
        IndexError: If a non-blank path has fewer than three components.
    """
    monthly_files = defaultdict(list)
    for file in files_to_download:
        if not file.strip():
            # Splitting empty command output on newlines yields [''];
            # a blank entry has no path components to index, so skip it.
            continue
        year_month = '-'.join(file.split("/")[2].split("-")[:2])
        monthly_files[year_month].append(file)
    return monthly_files

def set_up_mc():
    """Make the MinIO client available and register the ``opensky`` alias.

    When no file named ``mc`` exists in the working directory, the client is
    downloaded with ``curl`` and made executable. The alias is (re)set on every
    call; ``$OSN_USERNAME`` and ``$OSN_KEY`` are expanded by the shell. The
    output of the commands is not inspected.
    """
    mc_missing = not os.path.isfile("mc")
    if mc_missing:
        install_steps = (
            'curl -O https://dl.min.io/client/mc/release/linux-amd64/mc',
            'chmod +x mc',
        )
        for step in install_steps:
            execute_shell_command(step)

    alias_command = './mc alias set opensky https://s3.opensky-network.org $OSN_USERNAME $OSN_KEY'
    execute_shell_command(alias_command)
set_up_mc()

# Execute mc find command to list the remote files matching */states_*.parquet
stdout, find_err = execute_shell_command('./mc find opensky/ec-datadump/ --path "*/states_*.parquet"')
if find_err:
    # Surface listing problems instead of silently continuing with no files.
    print(f"Error listing remote files: {find_err}")

# Keep only non-blank lines: empty output would otherwise become [''] and
# break the month grouping below.
files_to_download = [line for line in stdout.splitlines() if line.strip()]

# Group files by month
monthly_files = group_files_by_month(files_to_download)

# Read the log of already-processed files, one entry per line.
# A missing log simply means nothing has been processed yet.
processed_files_path = '../processed_files.log'
processed_files = []
if os.path.exists(processed_files_path):
    with open(processed_files_path, 'r') as log_handle:
        processed_files = log_handle.read().splitlines()

# Source (camelCase) column name -> target (snake_case) column name.
# Names that are identical on both sides are listed too, so the mapping
# covers every column.
column_name_mapping = dict(
    eventTime="event_time",
    icao24="icao24",
    lat="lat",
    lon="lon",
    velocity="velocity",
    heading="heading",
    vertRate="vert_rate",
    callsign="callsign",
    onGround="on_ground",
    alert="alert",
    spi="spi",
    squawk="squawk",
    baroAltitude="baro_altitude",
    geoAltitude="geo_altitude",
    lastPosUpdate="last_pos_update",
    lastContact="last_contact",
    serials="serials",
)

Setting spark.hadoop.yarn.resourcemanager.principal to quinten.goens


In [None]:
year_month = "2022-05"

# Remote files of the selected month, plus the base name used for matching
# against the processed-files log. `.get` avoids inserting an empty key into
# the defaultdict when the month has no files.
files = pd.DataFrame({"monthly_files": monthly_files.get(year_month, [])})
files['filename'] = files.monthly_files.str.split("/").str[-1]

# Keep only the files whose base name is not yet in the log.
unprocessed_files_df = files[~files['filename'].isin(processed_files)]
unprocessed_files_list = list(unprocessed_files_df['monthly_files'])
unprocessed_files_str = " ".join(unprocessed_files_list)

local_file_path = 'data/ec-datadump/'
if unprocessed_files_list:
    # One `mc cp` call with all sources followed by the target directory.
    cp_command = f'./mc cp {unprocessed_files_str} {local_file_path}'
    out, err = execute_shell_command(cp_command)

    if err:
        print(f"Error for {cp_command}: {err}")
else:
    # With no sources the command would be just `./mc cp <target>`, so skip it.
    out, err = "", ""
    print(f"No unprocessed files for {year_month}; nothing to download.")

# Process the file using Spark immediately after download

# Cleanup (still disabled). The log must hold base names, because the filter
# above matches on the base name rather than the full remote path.
#with open(processed_files_path, 'a') as f:
#    for fn in unprocessed_files_list:
#        f.write(fn.split("/")[-1] + '\n')

In [8]:
# Read every parquet file in the local download directory into one DataFrame,
# then rename the source columns to their snake_case names.
local_file_path = 'data/ec-datadump/'
df = spark.read.parquet(local_file_path)
for source_name in column_name_mapping:
    df = df.withColumnRenamed(source_name, column_name_mapping[source_name])

# Reorder for insertion
#all_columns = df.columns
#selected_columns = [col for col in all_columns if col not in ['icao24', 'callsign']]
#reordered_columns = selected_columns + ['icao24', 'callsign']
#df_reordered = df.select(reordered_columns)

                                                                                

In [6]:
# Read a 1000-row sample of the existing data from the original table
df_original = spark.sql("SELECT * FROM project_aiu.osn_ec_datadump LIMIT 1000")

# Write the sample out as a bucketed table. saveAsTable() returns None, so its
# result is deliberately not assigned to a variable.
(df_original.write
    .bucketBy(10, 'event_time', 'icao24', 'callsign')  # adjust the number of buckets and bucketing columns as needed
    .mode("append")  # appends to the table if it already exists; it does NOT overwrite
    .saveAsTable("project_aiu.osn_ec_datadump_bucketed2"))


AnalysisException: Can not create the managed table('`project_aiu`.`osn_ec_datadump_bucketed2`'). The associated location('abfs://storage-fs@cdpdldev0.dfs.core.windows.net/data/project/aiu.db/unmanaged/osn_ec_datadump_bucketed2') already exists.

In [8]:
# DDL for a non-transactional Parquet table clustered into 1024 buckets on
# (event_time, icao24, callsign), with a comment on every column.
bucketed_table_ddl = """
CREATE TABLE `project_aiu`.`osn_ec_datadump_bucketed2` (
  event_time BIGINT COMMENT 'This column contains the unix (aka POSIX or epoch) timestamp for which the state vector was valid.',
  icao24 STRING COMMENT 'This column contains the 24-bit ICAO transponder ID which can be used to track specific airframes over different flights.',
  lat DOUBLE COMMENT 'This column contains the last known latitude of the aircraft.',
  lon DOUBLE COMMENT 'This column contains the last known longitude of the aircraft.',
  velocity DOUBLE COMMENT 'This column contains the speed over ground of the aircraft in meters per second.',
  heading DOUBLE COMMENT 'This column represents the direction of movement (track angle in degrees) as the clockwise angle from the geographic north.',
  vert_rate DOUBLE COMMENT 'This column contains the vertical speed of the aircraft in meters per second.',
  callsign STRING COMMENT 'This column contains the callsign that was broadcast by the aircraft.',
  on_ground BOOLEAN COMMENT 'This flag indicates whether the aircraft is broadcasting surface positions (true) or airborne positions (false).',
  alert BOOLEAN COMMENT 'This flag is a special indicator used in ATC.',
  spi BOOLEAN COMMENT 'This flag is a special indicator used in ATC.',
  squawk STRING COMMENT 'This 4-digit octal number is another transponder code which is used by ATC and pilots for identification purposes and indication of emergencies.',
  baro_altitude DOUBLE COMMENT 'This column indicates the aircrafts altitude. As the names suggest, baroaltitude is the altitude measured by the barometer (in meter).',
  geo_altitude DOUBLE COMMENT 'This column indicates the aircrafts altitude. As the names suggest, geoaltitude is determined using the GNSS (GPS) sensor (in meter).',
  last_pos_update DOUBLE COMMENT 'This unix timestamp indicates the age of the position.',
  last_contact DOUBLE COMMENT 'This unix timestamp indicates the time at which OpenSky received the last signal of the aircraft.',
  serials ARRAY<INT> COMMENT 'The serials column is a list of serials of the ADS-B receivers which received the message.'
)
COMMENT 'OpenSky Network EUROCONTROL datadump (for PRU) - Weekly updated.'
CLUSTERED BY (event_time, icao24, callsign) INTO 1024 BUCKETS -- Adjust the number of buckets based on testing and your specific requirements
STORED AS PARQUET
TBLPROPERTIES ('transactional'='false');
"""

# Submit the statement; the returned DataFrame is the cell's displayed value.
spark.sql(bucketed_table_ddl)

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:Got exception: org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException Status code: -1 error code: null error message: Auth failure: HTTP Error -1CustomTokenProvider getAccessToken threw java.io.IOException : Ambiguous group role mappings for the authenticated user. (user: quinten.goens))

In [9]:

# insertInto() matches DataFrame columns to table columns by position, not by
# name, so put the DataFrame's columns in the table's order before appending.
# A column missing from `df` makes the select fail instead of being misaligned.
target_table = "project_aiu.osn_ec_datadump_bucketed"
target_columns = spark.table(target_table).columns
df.select(*target_columns).write.mode("append").insertInto(target_table)




KeyboardInterrupt: 



In [None]:
# Download every not-yet-processed file, one at a time, month by month, and
# record each one in the processed-files log.
processed_files_set = set(processed_files)  # the log stores base names
local_file_path = 'data/ec-datadump/'

for year_month, files_in_month in monthly_files.items():
    # Compare on the base name, because that is what gets written to the log.
    files_not_processed = [
        file for file in files_in_month
        if file.split("/")[-1] not in processed_files_set
    ]

    for file in files_not_processed:
        file_name = file.split("/")[-1]
        local_file = os.path.join(local_file_path, file_name)

        # Copy only the current file into the local download directory.
        cp_command = f'./mc cp "{file}" {local_file_path}'
        out, err = execute_shell_command(cp_command)

        if err:
            # Leave the file out of the log so a later run retries it.
            print(f"Error for {cp_command}: {err}")
            continue

        # NOTE(review): no Spark/Hive load step exists here yet. Until one is
        # added, files are logged as processed without having been loaded.

        # Delete the local copy to save space
        if os.path.isfile(local_file):
            os.remove(local_file)

        # Update the log of processed files after successful load to Hive
        with open(processed_files_path, 'a') as f:
            f.write(file_name + '\n')
        processed_files_set.add(file_name)

# Stop the SparkSession
spark.stop()