In [None]:
# Spark imports
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import udf, pandas_udf, col, PandasUDFType, lit, round, array_contains, from_unixtime
from pyspark.sql.functions import col, radians, sin, cos, sqrt, atan2, array, collect_list, struct, row_number, expr
from pyspark.sql.functions import monotonically_increasing_id, row_number, col
from pyspark.sql.types import DoubleType, StructType, StructField
from pyspark.sql.functions import when, split, col, concat_ws,  min, max, to_date
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Regular imports
from IPython.display import display, HTML
import os, time
import subprocess
import os,shutil
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import h3pandas
import h3

# Settings
project = "project_opdi"

# Today's date as a formatted string (day, full month name, year)
today = datetime.today().strftime('%d %B %Y')

# Setting logging properties (currently disabled)
#shutil.copy("/runtime-addons/cmladdon-2.0.40-b154/log4j.properties", "/etc/spark/conf/") 

# Spark session initialization: reuses an existing session if one is active,
# otherwise creates one with Hive support and the resource settings below.
spark = (
    SparkSession.builder
    .appName("OSN Flight Table")
    .config("spark.log.level", "ERROR")
    .config("spark.hadoop.fs.azure.ext.cab.required.group", "eur-app-opdi")
    .config("spark.kerberos.access.hadoopFileSystems", "abfs://storage-fs@cdpdllive.dfs.core.windows.net/data/project/opdi.db/unmanaged")
    .config("spark.driver.cores", "1")
    .config("spark.driver.memory", "8G")
    .config("spark.executor.memory", "8G")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "2")
    .config("spark.dynamicAllocation.maxExecutors", "20")
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "400s")
    .config("spark.driver.maxResultSize", "4g")
    .enableHiveSupport()
    .getOrCreate()
)

# Read the engine id and domain from the environment.
# NOTE(review): os.getenv returns None when a variable is unset, which yields a
# link containing the literal text "None" - presumably both are always set on the platform.
engine_id = os.getenv('CDSW_ENGINE_ID')
domain = os.getenv('CDSW_DOMAIN')

# Address built from the two values above (presumably the Spark UI of this session)
url = f"https://spark-{engine_id}.{domain}"

# Render the address as a clickable link in the cell output
display(HTML(f'<a href="{url}">{url}</a>'))

## Query testing sample function

In [None]:
import shutil
from datetime import datetime, date
import dateutil.relativedelta
import calendar
import h3_pyspark
import os
import pandas as pd
from datetime import datetime, date
import dateutil.relativedelta
import calendar


In [None]:

def generate_months(start_date, end_date):
    """Generate the month-by-month sequence of dates between two dates.

    The first element is ``start_date`` itself; each following element is the
    first day of the next calendar month, as long as it is not after
    ``end_date``. Pass a ``start_date`` that falls on the first of a month to
    obtain only first-of-month dates.

    Args:
    start_date (datetime.date): The starting date.
    end_date (datetime.date): The ending date (inclusive upper bound).

    Returns:
    list: ``start_date`` followed by the first day of each subsequent month
    that is <= ``end_date``. Empty when ``start_date`` is after ``end_date``.
    """
    months = []
    current = start_date
    while current <= end_date:
        months.append(current)
        # Step to the first day of the next month. `replace` is used rather than
        # calling `date(...)` so the function does not depend on the global name
        # `date`, which is easily rebound in a notebook (e.g. as a loop variable).
        if current.month == 12:
            current = current.replace(year=current.year + 1, month=1, day=1)
        else:
            current = current.replace(month=current.month + 1, day=1)
    return months

def get_start_end_of_month(date):
    """Return the POSIX timestamps of the first and last second of a month.

    Args:
    date: Any object exposing ``year`` and ``month`` (e.g. ``datetime.date``);
        only these two attributes are read.

    Returns:
    tuple: ``(first_second, last_second)`` as floats, i.e. the timestamps of
    day 1 at 00:00:00 and of the last day of the month at 23:59:59.
    Both datetimes are naive, so ``timestamp()`` interprets them in the
    machine's local time zone.
    """
    # monthrange gives (weekday of day 1, number of days); only the day count is needed
    days_in_month = calendar.monthrange(date.year, date.month)[1]

    month_open = datetime(date.year, date.month, 1)
    month_close = datetime(date.year, date.month, days_in_month, 23, 59, 59)

    return month_open.timestamp(), month_close.timestamp()

# Settings
## Config
# NOTE: `project` repeats the value already assigned in the first cell.
project = "project_opdi"
max_h3_resolution = 12
# Lower bound of the month range handed to generate_months
start_month = date(2022, 1, 1)

## Which months to process
# NOTE: rebinds `today`, which the first cell set to a formatted string, to a `date` object.
today = date.today()
# Upper bound: the same day one calendar month ago, so the range stops at the previous month
end_month = today - dateutil.relativedelta.relativedelta(months=1) # We work up to the previous month (m-1)


In [None]:
# Build the list of months from start_month up to (and not beyond) end_month
to_process_months = generate_months(start_month, end_month)

In [None]:
# Inspect the generated month list
to_process_months

In [None]:
## Load logs
# Path is relative to the notebook's working directory.
# Presumably the log holds one row per already-processed month - TODO confirm schema.
fpath = '../../logs/02_osn-tracks-etl-log.parquet'
processed_months = pd.read_parquet(fpath)

In [None]:
# WARNING: destructive and not idempotent - this overwrites the log file at `fpath`,
# keeping only the first 29 rows after sorting by the 'months' column.
# NOTE(review): presumably a one-off rollback of the log so that later months are
# processed again; confirm the row count (29) before re-running this cell.
processed_months.sort_values('months').reset_index(drop=True).iloc[0:29].to_parquet(fpath)

In [None]:
# Inspect the upper bound of the month range
end_month

In [None]:
# importing datetime module
from datetime import datetime, date
import time
from datetime import datetime, timedelta
from typing import List

def fetch_tracks(spark, date, ades, adep):
    """Build and print the track query and output file name for one day.

    Query execution and the parquet export are currently disabled, so the
    function only prints the SQL text and the target file name, and creates
    the output directory as a side effect.

    Args:
    spark: Spark session; unused while the query execution is commented out.
    date (datetime): Day to extract; matched against ``oft.DOF``.
    ades: Value matched against ``oft.ADES``; a null value (e.g. ``None``)
        disables the filter.
    adep: Value matched against ``oft.ADEP``; a null value (e.g. ``None``)
        disables the filter.

    Returns:
    None
    """
    # Day in the YYYY-MM-DD form used by the SQL filter and the file name
    day_label = date.strftime('%Y-%m-%d')

    # Epoch seconds of the given day.
    # NOTE(review): time.mktime interprets the time tuple as local time - confirm
    # that `event_time` uses the same reference.
    day_epoch = int(time.mktime(date.timetuple()))

    # Restrict the track table to +/- 1.5 days around the day before joining
    half_window = 1.5*24*60*60
    start_time_unix = day_epoch - half_window
    end_time_unix = day_epoch + half_window

    # Optional filters; '*' stands for "any" in the directory and file names.
    # NOTE(review): the values are interpolated into the SQL text unescaped -
    # only pass trusted values.
    if pd.isnull(ades):
        ades, ades_sql = '*', ''
    else:
        ades_sql = f"AND oft.ADES = '{ades}'"

    if pd.isnull(adep):
        adep, adep_sql = '*', ''
    else:
        adep_sql = f"AND oft.ADEP = '{adep}'"

    query = f"""
    SELECT otc.* 
    FROM (
        SELECT *
        FROM project_opdi.osn_tracks_clustered
        WHERE event_time BETWEEN {start_time_unix} AND {end_time_unix}
        ) otc
    JOIN project_opdi.osn_flight_table oft 
        ON otc.track_id = oft.id
    WHERE oft.DOF = TO_DATE('{day_label}')
    {ades_sql} 
    {adep_sql};"""

    print(query)

    # Execution is disabled for now:
    #df = spark.sql(query).toPandas()

    # One output directory per (adep, ades) pair, created on demand
    path = f'{adep}-to-{ades}/'
    os.makedirs(path, exist_ok=True)
    fname = f'{path}tracks_ades_{ades}_adep_{adep}_{day_label}.parquet'
    print(fname)
    # NOTE(review): the disabled export originally wrote to `fpath`, a name not
    # defined in this function; `fname` looks like the intended target.
    #df.to_parquet(fname)


def get_all_days_between(start_date: datetime, end_date: datetime) -> List[datetime]:
    """
    List every day from start_date to end_date, both ends included.

    The inputs are expected to be datetimes at 00:00:00; each returned value is
    start_date shifted by a whole number of days.

    Parameters:
    start_date (datetime): First day of the range.
    end_date (datetime): Last day of the range.

    Returns:
    List[datetime]: One datetime per day in the range, in chronological order.

    Raises:
    ValueError: If start_date is later than end_date.
    """
    if start_date > end_date:
        raise ValueError("Start date must be before or equal to end date")

    # Number of whole days separating the two bounds
    span_days = (end_date - start_date).days

    days = []
    for offset in range(span_days + 1):
        days.append(start_date + timedelta(days=offset))
    return days

# Example usage: build the query and file name for every day of the period, filtered on ADES = LPPT
start = datetime(2024, 3, 1)
end = datetime(2024, 6, 30)
days_list = get_all_days_between(start, end)

print("Starting ADES: LPPT")
# The loop variable is deliberately not named `date`: at module level that would
# rebind the imported `datetime.date` class to a datetime instance and break any
# later `date(...)` call in the notebook.
for day in days_list:
    print(f"Processing: {day}")
    fetch_tracks(spark, day, ades = 'LPPT', adep = None)

In [None]:
import pandas as pd
# Empty placeholder frame.
# NOTE(review): the following cells read columns such as 'event_time' from `df`,
# which an empty frame does not have - presumably `df` is meant to hold tracks
# loaded elsewhere (the Spark query in fetch_tracks is currently disabled).
df = pd.DataFrame()

In [None]:
# Add a 'time' column: 'event_time' parsed as seconds since the Unix epoch (unit='s').
# Note that this mutates `df` in place.
df['time'] = pd.to_datetime(df['event_time'],unit='s')

In [None]:
import plotly.express as px

In [None]:
# Histogram of the 'time' column
px.histogram(df, x='time')

In [None]:
# 'baro_altitude' against 'time', one line per track_id
px.line(df,x='time',y='baro_altitude', color = 'track_id')

In [None]:
# 'lon' against 'lat', one line per track_id.
# NOTE(review): lat is on the x-axis and lon on the y-axis, which is transposed
# relative to the usual map orientation (lon on x, lat on y) - confirm this is intended.
px.line(df,x='lat',y='lon', color = 'track_id')