In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import glob
import gzip
import os
from pathlib import Path

import dask.dataframe as dd
import hdf5storage
import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask import delayed
from dask.distributed import Client, as_completed, progress
from tqdm import tqdm

import unfoc

from utig_radar_loading import file_util, stream_util, geo_util, segment_splits, opr_gps_file_generation, opr_header_generation, preprocessing

In [None]:
# Notebook-wide library configuration
pd.set_option("mode.copy_on_write", True)  # opt in to pandas copy-on-write behaviour
tqdm.pandas()  # register tqdm's progress-reporting methods on pandas objects
hv.extension('bokeh')  # use the Bokeh plotting backend for HoloViews

In [None]:
# Build (or read from cache) the index of data files and derive the per-stream artifact table.
use_cache = True
cache_dir = "outputs/file_index.csv"  # NOTE: despite the name, this is the path of a CSV file

# Root of the UTIG data tree. The UTIG_DATA_DIR environment variable overrides it so the
# notebook is not tied to one machine; without it the original location is used.
base_path = os.environ.get("UTIG_DATA_DIR", "/kucresis/scratch/data/UTIG")

df_files = file_util.load_file_index_df(base_path, cache_dir, read_cache=use_cache)
df_artifacts = file_util.create_artifacts_df(df_files) # df_artifacts is a dataframe with one row per stream file

df_artifacts.head()

In [None]:
# Arrange the artifacts by transect, keeping only the stream types that are needed.
# For each artifact kind: the accepted stream types and the accepted file names.

usable_artifact_types = dict(
    gps=dict(
        stream_types=["GPSnc1", "GPStp2", "GPSap3"],
        file_names=["xds.gz"],
    ),
    radar=dict(
        stream_types=["RADnh5", "RADnh3", "RADnh2", "RADnh4", "RADjh1"],
        file_names=["bxds"],
    ),
    imu=dict(
        stream_types=["AVNnp1"],
        file_names=["bxds"],
    ),
)

df_transects = file_util.arrange_by_transect(df_artifacts, usable_artifact_types)
df_transects.head()

In [None]:
# Attach season information to every transect; the result exposes the 'season'
# column that the following cells list and filter on.
df_all_seasons = file_util.assign_seasons(df_transects)
df_all_seasons.head()

In [None]:
# Show every season present in the dataset, in ascending order
print("The following seasons were found in the dataset:")
seasons = np.sort(np.asarray(df_all_seasons['season'].unique()))
print(seasons)

### Select a single season to extract

In [None]:
# Season to extract; compared against the 'season' column of df_all_seasons below
season_year = 2017

In [None]:
# Build df_season (the selected season, ordered by start time) and report missing data

season_mask = df_all_seasons['season'] == season_year
df_season = df_all_seasons[season_mask].sort_values(by='start_timestamp')

# Transects without a radar file are set aside and removed from df_season
has_radar = df_season['radar_path'].notnull()
df_season_missing_data = df_season[~has_radar]
df_season = df_season[has_radar]

n_missing_radar = len(df_season_missing_data)
if n_missing_radar > 0:
    print(f"[WARNING] Missing radar data for {n_missing_radar} transects out of {len(df_season) + n_missing_radar}")

# Transects without IMU data stay in df_season; they are only reported
df_season_missing_imu = df_season[df_season['imu_path'].isnull()]
n_missing_imu = len(df_season_missing_imu)
if n_missing_imu > 0:
    print(f"[WARNING] Missing IMU data for {n_missing_imu} transects")

# Summary of this season

# - Types of stream files:
for label, column in (("GPS", 'gps_stream_type'), ("Radar", 'radar_stream_type'), ("IMU", 'imu_stream_type')):
    print(f"{label} stream types: {df_season[column].unique()}")

# 'set' and 'prj' are read after reset_index(), i.e. they may live in the index
season_flat = df_season.reset_index()

# - Sets
print(f"Sets: {season_flat['set'].unique()}")

# - Projects
print(f"Projects: {season_flat['prj'].unique()}")

# - Aircraft: first three characters of the first transect's set name
ac_ident = season_flat['set'].iloc[0][:3]
print(f"Aircraft identifier: {ac_ident}")

# - Season name
season_name = f"{season_year}_Antarctica_Basler{ac_ident}"
print(f"Season name: {season_name}")

In [None]:
# Assign each transect to a segment

# Split based on the 'tim' counter (in ms)
df_season = segment_splits.assign_segments(
    df_season,
    timestamp_field='tim',
    parse_ct=False,
    timestamp_split_threshold=2000,
)
# OR, split based on the 'TIMESTAMP' field
#df_season = segment_splits.assign_segments(df_season, timestamp_field='TIMESTAMP', parse_ct=True, timestamp_split_threshold=pd.Timedelta(milliseconds=2000))

segment_ids = df_season['segment_path'].unique()
n_segments = len(segment_ids)
max_segments_per_day = df_season['segment_number'].max()

print(f"Created {n_segments} segments. Maximum segment number on a single day is {max_segments_per_day}.")
df_season.head()

In [None]:
# Create map of segments

segment_dfs = geo_util.load_gps_data(df_season)
missing_data_dfs = geo_util.load_gps_data(df_season_missing_data)

paths = []

# Add missing data (drawn in red)
if len(missing_data_dfs) > 0:
    _, missing_line = geo_util.create_path(missing_data_dfs)
    paths.append(missing_line.opts(color='red', line_width=3).relabel('Missing Radar Data'))
else:
    print("No missing data to display.")

# Bucket the GPS dataframes by segment in a single pass instead of re-scanning the
# whole list for every segment. Each dataframe is filed under the 'segment_path'
# value of its first row; the original order is kept within each bucket.
dfs_by_segment = {}
for gps_df in segment_dfs:
    dfs_by_segment.setdefault(gps_df['segment_path'].iloc[0], []).append(gps_df)

# Add segments with data, one labelled path per segment
for segment_path in df_season['segment_path'].unique():
    _, segment_line = geo_util.create_path(dfs_by_segment.get(segment_path, []))
    paths.append(segment_line.relabel(f"Segment {segment_path}"))

# Overlay all paths on the Antarctica basemap; `p` is the final figure used by the save cell
p = stream_util.create_antarctica_basemap() * hv.Overlay(paths)
p = p.opts(aspect='equal', frame_width=800, frame_height=800, tools=['hover'])
p = p.opts(title=season_name, legend_position='right')
p

In [None]:
# Save the map as an HTML file. The target folder is created first because nothing
# earlier in the notebook creates outputs/maps/.
Path("outputs/maps").mkdir(parents=True, exist_ok=True)
hv.save(p, f"outputs/maps/{season_name}.html")

### Create GPS support files for each segment

In [None]:
# Run the GPS-file generator once per (date, segment number) group.
# The flag below is forwarded as the generator's `overwrite` argument.
overwrite_existing_gps_files = False

segment_gps_inputs = df_season.groupby(['segment_date_str', 'segment_number'])[
    ['segment_date_str', 'segment_number', 'gps_path']
]
gps_paths = segment_gps_inputs.apply(
    opr_gps_file_generation.make_segment_gps_file,
    include_groups=False,
    output_base_dir=f"outputs/gps/{season_name}",
    overwrite=overwrite_existing_gps_files,
)

gps_paths

### Create parameter spreadsheet starting templates

In [None]:
def radar_paths_ordered(x):
    """Format a group's radar folders as a brace-delimited, quoted list string.

    The rows of ``x`` are ordered by 'start_timestamp'. Each 'radar_path' is
    reduced to the four path components that precede its last component, and
    the results are joined as ``{'a', 'b', ...}``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the 'start_timestamp' and 'radar_path' columns.

    Returns
    -------
    str
        The formatted list; ``{''}`` when ``x`` has no rows.
    """
    ordered_paths = x.sort_values('start_timestamp')['radar_path'].tolist()
    folders = []
    for radar_path in ordered_paths:
        # Drop the last component and keep the four components before it
        kept_parts = Path(radar_path).parts[-5:-1]
        folders.append(str(Path(*kept_parts)))
    return "{'" + "', '".join(folders) + "'}"

# One formatted radar-folder string per (date, segment number) group; the resulting
# index is also used as the row index of the parameter sheets further down.
radar_paths = df_season.groupby(['segment_date_str', 'segment_number'])[['radar_path', 'start_timestamp']].apply(radar_paths_ordered)
radar_paths

In [None]:
def first_prj_set_str(x):
    """Return the distinct 'prj' values of a group as a list.

    NOTE(review): despite the name, this neither picks a single "first" value nor
    uses the 'set' column -- confirm whether that is intended.
    """
    return list(x['prj'].unique())


def _ordered_transect_names(x):
    """Return the group's 'trn' values ordered by 'start_timestamp'."""
    return list(x.sort_values('start_timestamp')['trn'])


# The helper above is deliberately NOT called `transect_names`: the result Series below
# uses that name, and sharing it replaced the function with its own output.
segment_groups = df_season.reset_index().groupby(['segment_date_str', 'segment_number'])
mission_names = segment_groups[['prj', 'set']].apply(first_prj_set_str)
transect_names = segment_groups[['start_timestamp', 'trn']].apply(_ordered_transect_names)

In [None]:
# Load the season's default parameter values and make sure the output folder exists
defaults = preprocessing.load_defaults(f'src/utig_radar_loading/defaults/{season_name}.yaml')

base_params_dir = Path(f'outputs/params/{season_name}')
base_params_dir.mkdir(parents=True, exist_ok=True)

def make_parameter_sheet(default_values, segments, overrides=None):
    """Build a parameter table with one row per segment.

    Parameters
    ----------
    default_values : dict
        Column name -> default value; passed to ``pd.DataFrame`` so every row
        starts with these values.
    segments : index-like
        Row index of the resulting table.
    overrides : dict, optional
        Column name -> replacement value. Each entry is assigned as a column, so
        it replaces a default column of the same name or adds a new one. Series
        values are aligned on the table's index by pandas.

    Returns
    -------
    pandas.DataFrame
        The parameter table.
    """
    # Use None instead of a shared mutable `{}` default
    if overrides is None:
        overrides = {}

    df = pd.DataFrame(default_values, index=segments)
    for key, value in overrides.items():
        df[key] = value
    return df

# Write one CSV template per parameter sheet, in this order.
# Each entry: (key into `defaults`, output file name, per-segment column overrides).
sheet_specs = [
    ('cmd', 'cmd.csv', {
        'mission_names': mission_names,
        'notes': transect_names
    }),
    ('records', 'records.csv', {
        'file.board_folder_name': radar_paths,
        'gps.fn': gps_paths
    }),
    ('qlook', 'qlook.csv', {}),
    ('radar', 'radar.csv', {}),
]

for defaults_key, csv_name, sheet_overrides in sheet_specs:
    sheet = make_parameter_sheet(defaults[defaults_key], radar_paths.index, overrides=sheet_overrides)
    sheet.to_csv(base_params_dir / csv_name)

### Generate temporary header files

In [None]:
# Transects to generate header files for. Currently the whole season; the commented-out
# .loc[...] selection at the end of the line restricts it to a few transects by index tuple.
df_season_subset = df_season #.loc[[('PEL', 'JKB2u', 'Y20b'), ('ICP10', 'JKB2u', 'F01T02a'), ('PEL', 'JKB2u', 'X48a')]]
df_season_subset.head()

In [None]:
# Start a local Dask cluster for the parallel header extraction in the next cell.
# The dask imports (Client, delayed, progress, ...) live in the notebook's import cell.
# NOTE: re-running this cell creates another Client; close the previous one first
# (client.close()) if the old workers should be released.
n_dask_workers = 10  # number of workers requested from the local cluster

print("Setting up Dask LocalCluster for parallel processing...")
client = Client(n_workers=n_dask_workers)
print(f"Dashboard link: {client.dashboard_link}")
client

In [None]:
header_base_dir = f"/kucresis/scratch/tteisberg_sta/scripts/opr_user_tmp/headers/rds/{season_name}/"

# Get file locations (this is fast, no need to parallelize)
header_file_locations = df_season_subset['radar_path'].apply(opr_header_generation.get_header_file_location, base_dir=header_base_dir)

# Parallelized version using Dask

# Create one delayed task per radar file whose header file does not exist yet.
# The row position of every queued file is recorded so that the results can be labelled
# with the matching index entries (not with the full index, which is longer as soon as
# any header file already exists).
header_file_locations_to_generate = []
pending_positions = []
delayed_tasks = []
for position, (path, fn) in enumerate(zip(df_season_subset['radar_path'], header_file_locations)):
    if Path(fn).exists():
        print(f"Header file already exists for {path}, skipping.")
    else:
        delayed_tasks.append(delayed(opr_header_generation.get_header_information)(path))
        header_file_locations_to_generate.append(fn)
        pending_positions.append(position)

# Index entries of the queued files, in the same order as header_file_locations_to_generate
pending_index = df_season_subset.index[np.asarray(pending_positions, dtype=np.intp)]

if len(header_file_locations_to_generate) > 0:
    # Compute in parallel with progress bar
    print(f"Processing {len(delayed_tasks)} files in parallel...")

    # Submit all tasks
    futures = client.compute(delayed_tasks)

    # Track progress
    progress(futures)

    # Gather results (same order as delayed_tasks)
    headers_list = client.gather(futures)

    # One header per queued file, labelled with that file's index entry
    headers = pd.Series(headers_list, index=pending_index)
else:
    print("No header files need to be generated, all files already exist.")
    # Keep `headers` defined (and empty) so later cells do not hit a NameError or stale data
    headers = pd.Series([], index=pending_index, dtype=object)

In [None]:
# TODO: Seems like this hangs or behaves weirdly when overwriting an existing file...

# Write every newly extracted header to its target file (savemat format '7.3').
if len(header_file_locations_to_generate) > 0:
    # Headers and target locations are paired by position, so they must be the same length
    if len(headers) != len(header_file_locations_to_generate):
        raise ValueError(
            f"Have {len(headers)} headers but {len(header_file_locations_to_generate)} "
            "header files to generate; re-run the header extraction cell."
        )

    # header_file_locations_to_generate is a plain list: iterate it directly
    # (calling .values() on it raises AttributeError)
    for header, fn in zip(headers.values, header_file_locations_to_generate):
        fn = Path(fn)
        fn.parent.mkdir(parents=True, exist_ok=True)

        header_tmp = header.copy()
        header_tmp['offset'] = header_tmp['offset'].astype(np.int64)  # Convert offsets to int64 for saving -- TODO: should have been this type originally

        print(f"Writing header to {fn}")
        hdf5storage.savemat(str(fn), header_tmp, format='7.3')

TODO

Header writing has just been updated to the new Nb x Nx x Nc offset format. The next step is to regenerate the records files in MATLAB and check that the records look correct. After that, the data-loading pathways need to be updated.

In [None]:
# Inspect the int64-converted offsets of the last extracted header.
# Read it from `headers` rather than from a loop variable left behind by another cell,
# which does not exist if that loop never ran.
headers.iloc[-1]['offset'].astype(np.int64)

In [None]:
# Dry run: for every header that is pending, report the target file and the shape, dtype
# and size of each array that would be saved. Nothing is written and no directory is created.
# Headers are paired with the pending locations, the same pairing the write cell uses.
for header, fn in zip(headers.values, header_file_locations_to_generate):
    fn = Path(fn)
    print(f"[DRYRUN] Writing header to {fn}")

    header_tmp = header.copy()
    header_tmp['offset'] = header_tmp['offset'].astype(np.int64)  # Convert offsets to int64 for saving

    for k, v in header_tmp.items():
        print(f"  {k}: {v.shape} {v.dtype} itemsize={v.itemsize} total size={v.nbytes / (1024*1024):.2f} MB ")

In [None]:
# Load and parse the CT data of the first radar file in the subset.
# The bxds record index (fpos / header_len / header) is built in the cell that consumes it,
# so the identical, redundant indexing pass that used to run here has been dropped.
input_filename = df_season_subset.iloc[0]['radar_path']
ct_data = stream_util.load_ct_file(input_filename)
ct_data = stream_util.parse_CT(ct_data)

In [None]:
# Display the parsed CT table for inspection
ct_data

In [None]:
# Prototype: build a per-rseq table of channel data offsets for one bxds file.

# Index the file; the indexer's items are unpacked into parallel tuples of
# file positions, header lengths and header objects.
fpos, header_len, header = zip(*unfoc.index_RADnhx_bxds(input_filename, full_header=True))

# Per-record 'rseq' and 'choff' header attributes
rseq = np.array([h.rseq for h in header])
choff = np.array([h.choff for h in header])

# One row per indexed record; the four channel offset columns start out missing
df = pd.DataFrame({
    'rseq': rseq,
    'choff': choff,
    'start_fpos': fpos,
    'header_len': header_len,
    'ch0_offset': pd.NA,
    'ch1_offset': pd.NA,
    'ch2_offset': pd.NA,
    'ch3_offset': pd.NA
})

# Attach the CT columns by index label
# NOTE(review): assumes ct_data rows line up one-to-one with the indexed records -- confirm
df = df.join(ct_data[['tim', 'TIMESTAMP']])

# Row positions of the records with choff == 0 and choff == 2
choff0 = np.where(df['choff'] == 0)[0]
choff2 = np.where(df['choff'] == 2)[0]
# choff == 0 rows fill the ch0/ch1 offsets, choff == 2 rows fill ch2/ch3. The first offset of
# each pair is start_fpos + header_len; the second is 6400 further on.
# NOTE(review): 6400 is presumably the size of one channel's data in the record -- confirm
df.iloc[choff0, df.columns.get_loc('ch0_offset')] = df.iloc[choff0]['start_fpos'] + df.iloc[choff0]['header_len']
df.iloc[choff0, df.columns.get_loc('ch1_offset')] = df.iloc[choff0]['start_fpos'] + df.iloc[choff0]['header_len'] + 6400
df.iloc[choff2, df.columns.get_loc('ch2_offset')] = df.iloc[choff2]['start_fpos'] + df.iloc[choff2]['header_len']
df.iloc[choff2, df.columns.get_loc('ch3_offset')] = df.iloc[choff2]['start_fpos'] + df.iloc[choff2]['header_len'] + 6400

df = df[['tim', 'TIMESTAMP', 'rseq', 'ch0_offset', 'ch1_offset', 'ch2_offset', 'ch3_offset']]
# Group by rseq and get first non-NAN value for each channel
df = df.groupby('rseq').first()
# Set nan values to -2^31
df = df.fillna(-2**31)

df

#offsets_array = df_offsets.to_numpy()
#offsets_array = np.expand_dims(offsets_array, axis=0) # Add a board axis

#offsets_array

# STOP HERE

## Break segments into frames

In [None]:
# Split every segment into frames of roughly `break_distance` km of along-track distance.
# Transects are walked in start-time order and cut wherever the frame being filled reaches
# the target length. Every cut yields one entry: a copy of the transect's row plus the GPS
# index / 'tim' bounds of the piece and the frame number it belongs to.
break_distance = 50 # km

frame_outputs = {}  # segment_path -> {frame number -> list of entries}
all_entries = []    # every entry, in creation order

segment_paths = df_season['segment_path'].unique()
for seg in segment_paths:
    print(f"Processing segment: {seg}")
    seg_df = df_season[df_season['segment_path'] == seg].sort_values('start_timestamp')
    # Note: Should have already been sorted, but just in case

    frame_idx = 1 # Frame index we're currently assigning
    accumulated_km = 0 # Sum of line-km currently assigned to frame_idx

    frame_outputs[seg] = {frame_idx: []}
    last_x, last_y = None, None  # Last projected point of the previous transect, if any

    for transect_iloc in range(len(seg_df)):
        print(f" -> Allocating transect {transect_iloc} {seg_df.index[transect_iloc]}")

        # Load the geometry of this transect
        df = stream_util.load_gzipped_stream_file(
            seg_df.iloc[transect_iloc]['gps_path'],
            debug=False, parse=True, parse_kwargs={'use_ct': True}
            )

        x_proj, y_proj, line_length_m = geo_util.project_split_and_simplify(
            df['LON'].values, df['LAT'].values, calc_length=True, simplify_tolerance=None)

        # Drop the final projected element
        # NOTE(review): presumably a trailing extra element returned by project_split_and_simplify -- confirm
        x_proj = x_proj[:-1]
        y_proj = y_proj[:-1]

        # Calculate the along-track distance, accounting for possible distance from the
        # end of the last transect
        deltas = np.sqrt(np.diff(x_proj)**2 + np.diff(y_proj)**2) / 1000  # Convert to km
        # Compare against None explicitly: a projected coordinate of exactly 0.0 is a valid
        # previous point and must not be mistaken for "no previous transect"
        if last_x is not None:
            deltas = np.insert(deltas, 0, np.sqrt((x_proj[0] - last_x)**2 + (y_proj[0] - last_y)**2) / 1000)
        else:
            deltas = np.insert(deltas, 0, 0)
        dist = np.cumsum(deltas)

        # Allocate parts of this transect to frames
        transect_start_tim = df['tim'].iloc[0]
        transect_start_idx = 0
        while transect_start_tim < df['tim'].iloc[-1]:
            # Distance still missing from the frame currently being filled
            remaining_distance = break_distance - accumulated_km

            dists_from_idx = np.maximum(0, dist - dist[transect_start_idx])

            # Pick the point whose distance from the current start is closest to the remaining
            # distance. Only points strictly ahead of the current start are candidates:
            # searching the whole array could select an index at or before the start (all of
            # those have distance 0), which would never advance and loop forever.
            candidates = dists_from_idx[transect_start_idx + 1:]
            if len(candidates) == 0:
                # No point left ahead of the current start; nothing more to allocate
                break
            break_idx = transect_start_idx + 1 + np.argmin(np.abs(candidates - remaining_distance))
            break_tim = df['tim'].iloc[break_idx]

            entry = seg_df.iloc[transect_iloc:transect_iloc+1].copy()
            entry['gps_idx_start'] = transect_start_idx
            entry['gps_idx_stop'] = break_idx
            entry['tim_start'] = transect_start_tim
            entry['tim_stop'] = break_tim
            entry['frame_number'] = frame_idx

            all_entries.append(entry)

            # Add an entry to this frame and update distance
            frame_outputs[seg][frame_idx].append(entry)
            accumulated_km += dist[break_idx] - dist[transect_start_idx]
            print(f"   -> Assigned indices {transect_start_idx} to {break_idx} (distance {dist[break_idx] - dist[transect_start_idx]} km) to frame {frame_idx}, now at {accumulated_km} km")

            # Move transect start index
            transect_start_idx = break_idx
            transect_start_tim = break_tim

            # Check if the frame is full (within 2% of the target length)
            if accumulated_km >= 0.98*break_distance:
                print(f"    Frame {frame_idx} is full with {accumulated_km} km")
                frame_idx += 1
                accumulated_km = 0
                frame_outputs[seg][frame_idx] = []

        # Remember where this transect ended so the gap to the next one is counted
        last_x, last_y = x_proj[-1], y_proj[-1]


In [None]:
# Combine all frame entries into one table indexed by (date, segment number, frame number)
frames_plan_df = pd.concat(all_entries).reset_index().set_index(['segment_date_str', 'segment_number', 'frame_number'])
frames_plan_df

In [None]:
def make_segment_gps_file(x):
    """Generate the GPS file for one segment unless it already exists.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single segment. After ``reset_index()`` it must provide
        'segment_date_str', 'segment_number' and 'gps_path'.

    Returns
    -------
    str or None
        Path of the generated file, or None when the file already existed and
        generation was skipped.
    """
    segment_rows = x.reset_index()
    date_str = segment_rows['segment_date_str'].iloc[0]
    segment_number = segment_rows['segment_number'].iloc[0]
    print(f"{date_str}_{segment_number}")

    unique_gps_paths = list(segment_rows['gps_path'].unique())
    output_path = f"outputs/gps/{season_name}/gps_{date_str}_{segment_number}.mat"

    # Never overwrite: an existing file is left untouched and reported
    if Path(output_path).exists():
        print(f"File {output_path} already exists. Skipping generation. If you want to regenerate, manually delete the file.")
        return None

    opr_gps_file_generation.generate_gps_file(unique_gps_paths, output_path, format='hdf5')
    return output_path

frames_plan_df.groupby(['segment_date_str', 'segment_number']).apply(make_segment_gps_file)

In [None]:
# Small sample (first three rows) of the frame plan, used by the preprocessing calls below
frames_plan_df_tmp = frames_plan_df[:3]
frames_plan_df_tmp

In [None]:
# Extract headers from the distinct bxds files referenced by the sample frame plan
# NOTE(review): this rebinds `headers`, which an earlier cell uses for the Series of
# generated header dicts -- consider a distinct name.
bxds_files = list(frames_plan_df_tmp['radar_path'].unique())
headers = preprocessing.extract_headers(bxds_files)

In [None]:
# Build segments from the sample frame plan, its bxds files and their extracted headers
segments = preprocessing.create_segments_from_frames(frames_plan_df_tmp, bxds_files, headers)

In [None]:
# # Extract headers (if needed for records_create)
#   headers = preprocessing.extract_headers(bxds_files)

#   # Create segments from existing frames
#   segments = preprocessing.create_segments_from_frames(
#       frames_plan_df,
#       bxds_files,
#       headers
#   )

#   # Generate parameters
#   params = preprocessing.generate_all_parameters(
#       segments,
#       season_name='2022_Antarctica_BaslerMKB',
#       radar_name='rds',
#       defaults_file='path/to/2022_Antarctica_BaslerMKB.yaml',
#       base_dir='/data/path',
#       board_folder_name='F01'
#   )

#   # Write spreadsheets
#   preprocessing.write_parameter_spreadsheet(
#       params,
#       'output/2022_Antarctica_BaslerMKB_param'
#   )

#   # Save headers for MATLAB records_create
#   preprocessing.save_temporary_headers(
#       headers,
#       bxds_files,
#       Path('/opr_tmp'),
#       '2022_Antarctica_BaslerMKB',
#       'F01'
#   )