In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    lit, lag, when, to_date, concat, avg, abs, col, unix_timestamp, to_timestamp
)
from pyspark.sql import DataFrame
import os
import pandas as pd
from datetime import datetime, date
import dateutil.relativedelta
import calendar
import h3_pyspark
import gc

# Settings
## Config
project = "project_opdi"
max_h3_resolution = 12
start_month = date(2022, 1, 1)

## Which months to process
today = date.today()
end_month = today - dateutil.relativedelta.relativedelta(months=1) # We work on the d-1 months

# Getting today's date formatted
today = today.strftime('%d %B %Y')
#    .config("spark.log.level", "ERROR") \
#    .config("spark.ui.showConsoleProgress", "false") \
# Spark Session Initialization
spark = SparkSession.builder \
    .appName("OPDI Ingestion") \
    .config("spark.hadoop.fs.azure.ext.cab.required.group", "eur-app-opdi") \
    .config("spark.kerberos.access.hadoopFileSystems", "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged") \
    .config("spark.executor.extraClassPath", "/opt/spark/optional-lib/iceberg-spark-runtime-3.3_2.12-1.3.1.1.20.7216.0-70.jar") \
    .config("spark.driver.extraClassPath", "/opt/spark/optional-lib/iceberg-spark-runtime-3.3_2.12-1.3.1.1.20.7216.0-70.jar") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.iceberg.handle-timestamp-without-timezone", "true") \
    .config("spark.sql.catalog.spark_catalog.warehouse", "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "20G") \
    .config("spark.executor.memory", "6G") \
    .config("spark.executor.memoryOverhead", "3G") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.instances", "3") \
    .config("spark.dynamicAllocation.maxExecutors", "4") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "400s") \
    .config("spark.driver.maxResultSize", "6g") \
    .config("spark.shuffle.compress", "true") \
    .config("spark.shuffle.spill.compress", "true") \
    .enableHiveSupport() \
    .getOrCreate()

Setting spark.hadoop.yarn.resourcemanager.principal to quinten.goens


In [8]:
!hdfs dfs -ls abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv

Dec 04, 2024 5:08:32 PM org.apache.knox.gateway.shell.KnoxSession createClient
INFO: Using default JAAS configuration
24/12/04 17:08:33 ERROR common.DefaultRequestExecutor: Error executing request: org.apache.knox.gateway.shell.ErrorResponse: https://dl-live-ent-idbroker0.az-live.x9er-zkvz.cloudera.site:8444/gateway/azure-cab/cab/api/v1/credentials: HTTP/1.1 403 Forbidden
24/12/04 17:08:33 ERROR idbroker.AbstractIDBClient: Cloud Access Broker response: {
  "error": "Ambiguous group role mappings for the authenticated user.",
  "auth_id": "quinten.goens"
}

24/12/04 17:08:33 ERROR common.DefaultRequestExecutor: Error executing request: org.apache.knox.gateway.shell.ErrorResponse: https://dl-live-ent-idbroker0.az-live.x9er-zkvz.cloudera.site:8444/gateway/azure-cab/cab/api/v1/credentials: HTTP/1.1 403 Forbidden
24/12/04 17:08:33 ERROR idbroker.AbstractIDBClient: Cloud Access Broker response: {
  "error": "Ambiguous group role mappings for the authenticated user.",
  "auth_id": "quinten.go

In [6]:
%fs rm -r abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv

UsageError: Line magic function `%fs` not found.


In [7]:
import os

# Define the folder path
folder_path = "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv"

# Use os.system to call Azure CLI or equivalent for deletion
os.system(f"az storage blob delete-batch --source my-container --pattern '{folder_path}'")

sh: 1: az: not found


32512

In [5]:
folder_path = "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv"

# Use Spark's wholeTextFiles to inspect the folder
file_paths = spark.sparkContext.wholeTextFiles(folder_path).keys().collect()
print(file_paths)

[Stage 0:>                                                          (0 + 1) / 1]

['abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv/part-00000-6ff38fe7-c5b7-4409-8a08-c8ffa4a4704f-c000.csv']


                                                                                

In [20]:
# Define the folder path to delete
folder_path = "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv"

# Access the Hadoop FileSystem
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())

# Delete the folder recursively
folder_to_delete = spark._jvm.org.apache.hadoop.fs.Path(folder_path)
delete_status = fs.delete(folder_to_delete, True)  # 'True' enables recursive delete

# Check the deletion status
if delete_status:
    print(f"Folder {folder_path} deleted successfully.")
else:
    print(f"Failed to delete folder {folder_path}.")


IllegalArgumentException: Wrong FS: abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.csv, expected: file:///

In [10]:
%%time
df = spark.sql("""
SELECT track_id, 
       icao24, 
       callsign,
       CAST(event_time AS STRING) AS event_time,
       lat, 
       lon, 
       baro_altitude, 
       geo_altitude
FROM project_opdi.osn_tracks
WHERE event_time BETWEEN TIMESTAMP('2022-01-01 00:00:00') AND TIMESTAMP('2022-01-01 06:00:00')
""")

csv_path = "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.parquet"
df.coalesce(1).write.mode("overwrite").parquet(csv_path)

                                                                                

CPU times: user 9.38 ms, sys: 4.29 ms, total: 13.7 ms
Wall time: 20.1 s


In [16]:
# Get Hadoop FileSystem object
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
path = spark._jvm.org.apache.hadoop.fs.Path("abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/")

# List all files in the directory
file_status = fs.listStatus(path)
files = [file.getPath().toString() for file in file_status]

# Print all files in the folder
for file in files:
    print(file)


IllegalArgumentException: Wrong FS: abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002, expected: file:///

In [11]:
csv_path = "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged/v002/test.parquet"
df = spark.read.parquet(csv_path, header=True, inferSchema=True).limit(10)
df.toPandas()


Unnamed: 0,track_id,icao24,callsign,event_time,lat,lon,baro_altitude,geo_altitude
0,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:19:45,26.748776,47.931501,11727.18,12062.46
1,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:19:50,26.756882,47.944647,11742.42,12077.7
2,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:19:55,26.762054,47.953975,11757.66,12085.32
3,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:00,26.769196,47.968641,11772.9,12108.18
4,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:05,26.774613,47.981415,11788.14,12123.42
5,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:10,26.776614,47.986644,11795.76,12131.04
6,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:15,26.78369,48.008247,11818.62,12153.9
7,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:20,26.786499,48.018753,11826.24,12153.9
8,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:25,26.786499,48.018753,11826.24,12153.9
9,b8976e557478d88d686719e3f426611957318986833f69...,399a40,VLJ584H,2022-01-01 01:20:30,26.786499,48.018753,11826.24,12161.52


In [None]:

df.write.mode("overwrite").csv(csv_path)


In [None]:
pd.DataFrame({''}

In [4]:
pd.read_csv('/tmp/asmt_2022-01-01.csv')

IsADirectoryError: [Errno 21] Is a directory: '/tmp/asmt_2022-01-01.csv'

In [5]:
df.count()

                                                                                

1679032

In [4]:
pd.read_parquet("/tmp/asmt_2022-01-01.parquet")

In [3]:
written_df = spark.read.parquet("/tmp/asmt_2022-01-01.parquet")
written_df.show(10)

                                                                                

AnalysisException: Unable to infer schema for Parquet. It must be specified manually.

In [15]:
df.count()

                                                                                

9149687

In [None]:
df.toPandas()

                                                                                

In [None]:
df['event_time'] = pd.to_datetime(df['event_time'])

In [None]:
spark.stop()

In [25]:
df