In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from pathlib import Path
from tqdm import tqdm
import gzip
import holoviews as hv
import hdf5storage

import unfoc

from utig_radar_loading import file_util, stream_util, geo_util, segment_splits, opr_gps_file_generation, opr_header_generation, preprocessing

In [None]:
# Enable pandas copy-on-write mode for the whole notebook
pd.options.mode.copy_on_write = True
# Register tqdm with pandas (adds the progress_* methods such as progress_apply)
tqdm.pandas()
# Render HoloViews plots with the Bokeh backend
hv.extension('bokeh')

# Step 0: Indexing all files

This stage indexes all of the UTIG data files that can be found and creates a DataFrame of "artifacts" corresponding to each individual data file.

The artifacts DataFrame is then grouped by transect (identified by project/set/transect, or P/S/T, triples in UTIG terminology). Finally, data is read from the context files to assign a season to each transect. Once seasons are assigned, the list of season years found is printed so that the user can select a year to process in the following stages.

In [None]:
use_cache = True  # forwarded as read_cache to load_file_index_df
cache_dir = "outputs/file_index.csv"  # NOTE(review): despite the name this is a CSV file path, not a directory
base_path = "/kucresis/scratch/data/UTIG"  # root of the UTIG data tree handed to the indexer

# Build (or, presumably, read back from the cache file) the index of data files
df_files = file_util.load_file_index_df(base_path, cache_dir, read_cache=use_cache)
df_artifacts = file_util.create_artifacts_df(df_files) # df_artifacts is a dataframe with one row per stream file

df_artifacts.head()

In [None]:
# TODO: Debug
# Debugging aid: show every artifact belonging to one hard-coded transect.
# NOTE: this binds the notebook-level names `prj` and `trn`, which later cells reuse as loop variables.

# Find all transects matching a specific prj and trn
prj, trn = 'ASB', 'R04Ea'
df_tmp = df_artifacts[(df_artifacts['prj'] == prj) & (df_artifacts['trn'] == trn)]
df_tmp
#df_tmp = df_tmp[df_tmp['stream'] == 'GPStp2']
#df_tmp.iloc[0]['full_path']

In [None]:
# Group the artifacts by transect, keeping only the stream types needed downstream.
# Each entry lists the accepted stream types and file names for one kind of artifact.
usable_artifact_types = {
    "gps": {
        "stream_types": ["GPSnc1", "GPStp2", "GPSap3", "GPSkc1"],
        "file_names": ["xds.gz"],
    },
    # We assume that if a bxds1 file exists, it implies a bxds2 file
    "radar": {
        "stream_types": ["RADnh5", "RADnh3", "RADnh2", "RADnh4", "RADjh1"],
        "file_names": ["bxds", "bxds1"],
    },
    "imu": {
        "stream_types": ["AVNnp1"],
        "file_names": ["bxds"],
    },
}

df_transects = file_util.arrange_by_transect(df_artifacts, usable_artifact_types)

# Optionally look for position-bearing GPS streams recorded under a different set
add_from_other_sets = True
if add_from_other_sets:
    usable_artifact_types_other_sets = {
        "gps_with_position": {
            "stream_types": ["GPSnc1", "GPStp2", "GPSap3"],
            "file_names": ["xds.gz"],
        },
    }
    df_transects_other_sets = file_util.arrange_by_transect(
        df_artifacts, usable_artifact_types_other_sets, ignore_set=True
    )

    # Drop columns df_transects already has, then keep the originating set under a new name
    existing_columns = set(df_transects.columns)
    overlapping_columns = [c for c in df_transects_other_sets if c in existing_columns]
    df_transects_other_sets = (
        df_transects_other_sets
        .drop(columns=overlapping_columns)
        .rename(columns={'set': 'other_set_source'})
    )

    # Attach the extra columns to df_transects, matching on project and transect only
    df_transects = df_transects.join(df_transects_other_sets, on=['prj', 'trn'])

df_transects.head()

In [None]:
#df_transects[df_transects.index == ('ASB', 'R04Ea')]['gps_path'].values[0] # TODO

In [None]:
# Add season information to the transect table; the result carries a 'season' column used below
df_all_seasons = file_util.assign_seasons(df_transects)
df_all_seasons.head()

In [None]:
# Show the distinct season values in ascending order so one can be picked in Step 1
print("The following seasons were found in the dataset:")
seasons = np.sort(df_all_seasons['season'].unique())
print(seasons)

# Step 1: Select a single season to extract

From here on, the rest of this workflow operates on a single season at a time. The season is identified initially by a year, corresponding to the year at the start of the season.

If available, `season_gps_postprocessed_dir` should be set to the path of the post-processed GPS and IMU data. If it is not available, it can be set to None:

```
season_gps_postprocessed_dir = "/resfs/GROUPS/CRESIS/dataproducts/metadata/2015_Antarctica_BaslerJKB/gps" # Preferred if available
season_gps_postprocessed_dir = None # Data can be processed from field GPS and IMU data only if that's all that's available
```

In [None]:
# Year identifying the season to process; compared against the 'season' column of df_all_seasons
season_year = 2008 # Want: 2008 -- waiting on GPS data

# Directory holding the post-processed GPS files for this season, or None to use field GPS data only
season_gps_postprocessed_dir = "/resfs/GROUPS/CRESIS/dataproducts/metadata/2008_Antarctica_BaslerJKB/gps"

In [None]:
# Restrict the transect table to the selected season
df_season = df_all_seasons[df_all_seasons['season'] == season_year]
df_season = df_season.sort_index()

# Fail early with a clear message instead of a KeyError from set_index further down
if len(df_season) == 0:
    raise ValueError(f"No transects found for season_year={season_year}.")

# Combine rows with matching prj and trn, but different set - take the row with radar data if available.
# If several rows have radar data, print a warning and keep the first of them.
df_reset = df_season.reset_index()

# Group by prj and trn
grouped = df_reset.groupby(['prj', 'trn'])

rows_to_keep = []
for (prj, trn), group in grouped:
    rows_with_radar = group[group['radar_path'].notna()]

    if len(rows_with_radar) > 1:
        # More than one set provides radar data for this transect
        print(f"[WARNING] Multiple rows with radar data for prj={prj}, trn={trn}:")
        print(f"  Sets: {rows_with_radar['set'].tolist()}")

    # Prefer rows with radar data; otherwise fall back to the group as a whole.
    # For a single-row group both choices select that same row.
    candidates = rows_with_radar if len(rows_with_radar) > 0 else group
    rows_to_keep.append(candidates.iloc[0])

# Reconstruct the dataframe with the original (prj, set, trn) index
df_season = pd.DataFrame(rows_to_keep)
df_season = df_season.set_index(['prj', 'set', 'trn'])
df_season = df_season.sort_index()

In [None]:
# Create df_season (filtered to selected season) and check for missing data
#
# This cell narrows df_season step by step (timing-only GPS, missing radar, missing
# post-processed GPS) and keeps the removed rows in df_season_missing_data and
# df_season_missing_postprocessed_gps for the map further down. The steps depend on
# each other's results, so their order matters.
# NOTE: df_season is overwritten in place; re-running this cell operates on the
# already-filtered table.

#df_season = df_all_seasons[df_all_seasons['season'] == season_year]
#df_season = df_season.sort_index()

# Add post-processed GPS paths
if season_gps_postprocessed_dir is not None:
    df_season = file_util.add_postprocessed_gps_paths(df_season, season_gps_postprocessed_dir, ignore_set=True)

    # Update start timestamps based on post-processed GPS data if available
    df_season['start_timestamp'] = df_season.apply(file_util.get_start_timestamp, axis=1)
else:
    # Keep the column present so the checks below work without post-processed data
    df_season['postprocessed_gps_path'] = None

# Check for timing-only GPS streams
# (rows whose only GPS source is GPSkc1 and that have no post-processed GPS file are dropped)
only_timing_mask = (df_season['gps_stream_type'] == 'GPSkc1') & (df_season['postprocessed_gps_path'].isnull())
df_season_timing_only = df_season[only_timing_mask]
if len(df_season_timing_only) > 0:
    print(f"[WARNING] There are {len(df_season_timing_only)} transects with only timing-only GPS data (GPSkc1) and no post-processed GPS data.")
    df_season = df_season[~only_timing_mask]

# Handle missing radar data
# (from here on df_season only contains rows with a radar_path)
df_season_missing_data = df_season[df_season['radar_path'].isnull()]
df_season = df_season[df_season['radar_path'].notnull()]

if len(df_season_missing_data) > 0:
    print(f"[WARNING] Missing radar data for {len(df_season_missing_data)} transects out of {len(df_season)+len(df_season_missing_data)}")
    if np.any(df_season_missing_data['gps_stream_type'] == 'GPSkc1'): # Timing only GPS type
        gpskc1_only_count = np.sum(df_season_missing_data['gps_stream_type'] == 'GPSkc1')
        print(f"  - Of these, {gpskc1_only_count} transects only have GPSkc1 timing-only GPS data")
    if np.any(df_season_missing_data['postprocessed_gps_path'].notnull()):
        postprocessed_gps_count = np.sum(df_season_missing_data['postprocessed_gps_path'].notnull())
        print(f"  - Of these, {postprocessed_gps_count} transects have post-processed GPS data available")
        print(df_season_missing_data[df_season_missing_data['postprocessed_gps_path'].notnull()].index)

# Handle missing post-processed GPS data
# (rows are only removed when a post-processed directory was configured; otherwise every
# remaining row lands in df_season_missing_postprocessed_gps and stays in df_season)
df_season_missing_postprocessed_gps = df_season[df_season['postprocessed_gps_path'].isnull()]
if season_gps_postprocessed_dir is not None and len(df_season_missing_postprocessed_gps) > 0:
    print(f"[WARNING] Missing post-processed GPS data for {len(df_season_missing_postprocessed_gps)} transects")

    df_season = df_season[df_season['postprocessed_gps_path'].notnull()]

# Display information about this season

# - Types of stream files:
print(f"GPS stream types: {df_season['gps_stream_type'].unique()}")
print(f"Radar stream types: {df_season['radar_stream_type'].unique()}")
print(f"IMU stream types: {df_season['imu_stream_type'].unique()}")

# - Sets
print(f"Sets: {df_season.reset_index()['set'].unique()}")

# - Projects
print(f"Projects: {df_season.reset_index()['prj'].unique()}")

# - Aircraft
# First three characters of the first row's set name.
# NOTE(review): raises IndexError if every transect was filtered out above, and assumes
# all sets in the season share the same prefix — TODO confirm.
ac_ident = df_season.reset_index()['set'].iloc[0][:3]
print(f"Aircraft identifier: {ac_ident}")

# - Season name
# Used below for the map title, output folders and the seasons_config YAML file name
season_name = f"{season_year}_Antarctica_Basler{ac_ident}"

# Disabled testing switch: when enabled it only appends a suffix to season_name
if False: # TODO: For testing only -- limit to first 5 segments
    season_name += "TEST2" # TODO: Remove after testing
    #df_season = df_season.head(5)
    #print("[WARNING] Limiting to first 5 segments for testing purposes")
print(f"Season name: {season_name}")

In [None]:
# Check for postprocessed GPS files without corresponding radar data in df_season

def _first_gps_timestamp(gps_file):
    """Return the first GPS_TIME of a post-processed GPS file, for display only.

    Returns a pd.Timestamp when the file has at least one row, None when it has
    no rows, and a short "Error: ..." string when loading or parsing fails
    (best-effort: a broken file must not abort the report).
    """
    try:
        gps_df = opr_gps_file_generation.load_and_parse_postprocessed_gps_file(gps_file)
        if len(gps_df) > 0:
            # NOTE(review): fromtimestamp converts to the machine's local time zone — confirm that is intended
            return pd.Timestamp.fromtimestamp(gps_df.iloc[0]['GPS_TIME'])
    except Exception as e:
        return f"Error: {str(e)[:30]}"
    return None


if season_gps_postprocessed_dir is not None:
    # Get all postprocessed GPS files in the directory (sorted so the report order is reproducible)
    gps_files = sorted(glob.glob(f"{season_gps_postprocessed_dir}/*PUTG1B_*_position.txt"))

    # Flatten the (prj, set, trn) index once instead of once per GPS file
    season_rows = df_season.reset_index()

    # Parse filenames and check against df_season
    unmatched_gps_files = []

    for f in gps_files:
        parts = Path(f).stem.split('_')
        if len(parts) < 6:
            continue
        prj = parts[-4]
        set_ = parts[-3]
        trn = parts[-2]

        # Check if this PRJ/TRN exists in df_season with radar data
        # Note: df_season uses ignore_set=True for GPS matching, so we check by prj and trn
        matching_rows = season_rows[(season_rows['prj'] == prj) & (season_rows['trn'] == trn)]

        if len(matching_rows) == 0:
            # No match at all in df_season
            reason = 'Not in df_season (not in this season or filtered out)'
        elif matching_rows['radar_path'].isna().all():
            # Match exists but no radar data
            reason = 'No radar data'
        else:
            # Matched to a transect with radar data: nothing to report
            continue

        unmatched_gps_files.append({
            'file': Path(f).name,
            'prj': prj,
            'set': set_,
            'trn': trn,
            # The GPS file is only loaded for files that are actually reported
            'first_timestamp': _first_gps_timestamp(f),
            'reason': reason
        })

    if len(unmatched_gps_files) > 0:
        print(f"\nFound {len(unmatched_gps_files)} postprocessed GPS files without corresponding radar data in df_season:")
        print("-" * 130)
        print(f"  {'Filename':<60} | {'PRJ':>5} {'SET':>6} {'TRN':<10} | {'First Timestamp':<19} | {'Reason'}")
        print("-" * 130)
        for item in unmatched_gps_files:
            ts_str = str(item['first_timestamp'])[:19] if item['first_timestamp'] else 'N/A'
            print(f"  {item['file']:<60} | {item['prj']:>5} {item['set']:>6} {item['trn']:<10} | {ts_str:<19} | {item['reason']}")
    else:
        print("\nAll postprocessed GPS files in the directory have corresponding radar data in df_season.")

In [None]:
# Assign each transect to a segment
# NOTE: df_season is overwritten with the result, so re-running this cell feeds the
# already-segmented table back into assign_segments.

df_season = segment_splits.assign_segments(df_season, timestamp_field='tim', parse_ct=False, timestamp_split_threshold=2000) # Split based on the 'tim' counter (in ms)
#df_season = segment_splits.assign_segments(df_season, timestamp_field='TIMESTAMP', parse_ct=True, timestamp_split_threshold=pd.Timedelta(milliseconds=2000)) # OR, split based on the 'TIMESTAMP' field

# Number of distinct segment_path values, and the largest segment_number in the table
n_segments = len(df_season['segment_path'].unique())
max_segments_per_day = df_season['segment_number'].max()

print(f"Created {n_segments} segments. Maximum segment number on a single day is {max_segments_per_day}.")
df_season.head()

In [None]:
# Create map of segments
# Load the GPS tracks for the three groups of transects that are drawn on the map.

# Transects without radar data (only those that have a field GPS file)
missing_data_dfs = geo_util.load_gps_data(df_season_missing_data.dropna(subset=['gps_path']))

# Transects with radar data but no post-processed GPS file, loaded from field GPS
only_field_segment_dfs = geo_util.load_gps_data(df_season_missing_postprocessed_gps, source_type='field')

# Transects kept in df_season, loaded from post-processed GPS.
# NOTE: segment_dfs is only defined when a post-processed GPS directory is configured.
if season_gps_postprocessed_dir is not None:
    segment_dfs = geo_util.load_gps_data(df_season, source_type='postprocessed')

In [None]:
# Collect one styled, labelled path layer per data category, then overlay them on the basemap
paths = []

# Transects without radar data (red)
if len(missing_data_dfs) == 0:
    print("No missing data to display.")
else:
    missing_radar_layer = geo_util.create_path(missing_data_dfs)[1]
    paths.append(missing_radar_layer.opts(color='red', line_width=2).relabel('Missing Radar Data'))

# Transects for which only field GPS data is available (purple)
if len(only_field_segment_dfs) > 0:
    field_only_layer = geo_util.create_path(only_field_segment_dfs)[1]
    paths.append(field_only_layer.opts(color='purple', line_width=2).relabel('Field GPS Data Only'))

# Transects with radar and post-processed GPS data (blue)
if season_gps_postprocessed_dir is not None:
    postprocessed_layer = geo_util.create_path(segment_dfs)[1]
    paths.append(postprocessed_layer.opts(color='blue', line_width=1).relabel('Radar + Post-processed GPS Data'))

# Individually labelled segments (doesn't work well for large number of segments)
# for segment_path in df_season['segment_path'].unique():
#     dfs_list_tmp = [df for df in segment_dfs if df['segment_path'].iloc[0] == segment_path]
#     _, p = geo_util.create_path(dfs_list_tmp)
#     p = p.relabel(f"Segment {segment_path}")
#     paths.append(p)

# `p` is the finished map; the following cell saves it
p = (
    (stream_util.create_antarctica_basemap() * hv.Overlay(paths))
    .opts(aspect='equal', frame_width=800, frame_height=800, tools=['hover'])
    .opts(title=season_name, legend_position='right')
)
p

In [None]:
# Save the interactive map as a standalone HTML file.
# Create the output folder first (as is done for outputs/params) in case it does not exist yet.
map_output_path = Path(f"outputs/maps/{season_name}.html")
map_output_path.parent.mkdir(parents=True, exist_ok=True)
hv.save(p, str(map_output_path))

### Create GPS support files for each segment

In [None]:
# Inspect the season table before generating the per-segment GPS files
df_season

In [None]:
# # TODO DEBUG FORCE USING FIELD GPS

# df_tmp = df_season.rename(columns={
#         "gps_path": "gps_path_timing_only",
#         "gps_stream_type": "gps_stream_type_timing_only"
#     }).rename(columns={
#         "gps_with_position_path": "gps_path",
#         "gps_with_position_stream_type": "gps_stream_type"
#     })

# gps_paths = df_tmp.groupby(['segment_date_str', 'segment_number'])[['segment_date_str', 'segment_number', 'gps_path']].apply(
#     opr_gps_file_generation.make_segment_gps_file,
#     include_groups=False,
#     output_base_dir=f"outputs/gps/{season_name}",
#     overwrite=True)

# gps_paths

In [None]:
# Forwarded as `overwrite` to make_segment_gps_file
# (presumably controls whether existing output files are regenerated — confirm in opr_gps_file_generation)
overwrite_existing_gps_files = False

# Call make_segment_gps_file once per (segment_date_str, segment_number) group;
# output_base_dir and overwrite are passed through to it as keyword arguments.
# gps_paths is indexed by segment and is written to records.csv as 'gps.fn' further down.
gps_paths = df_season.groupby(['segment_date_str', 'segment_number'])[['segment_date_str', 'segment_number', 'gps_path', 'postprocessed_gps_path']].apply(
    opr_gps_file_generation.make_segment_gps_file,
    include_groups=False,
    output_base_dir=f"outputs/gps/{season_name}",
    overwrite=overwrite_existing_gps_files)

gps_paths

### Create parameter spreadsheet starting templates

In [None]:
def radar_paths_ordered(x):
    """Format the radar folders of one segment as a brace-delimited, quoted list.

    The rows of ``x`` are ordered by 'start_timestamp'. Each 'radar_path' is
    reduced to the four path components that precede its last component, and
    the results are joined as ``{'a', 'b', ...}``.
    (Looks like MATLAB cell-array syntax — TODO confirm with the consumer of records.csv.)
    """
    ordered_paths = x.sort_values('start_timestamp')['radar_path'].tolist()

    short_folders = []
    for full_path in ordered_paths:
        components = Path(full_path).parts
        # Components -5..-2: drop the last component and keep the four before it
        short_folders.append(str(Path(*components[-5:-1])))

    return "{'" + "', '".join(short_folders) + "'}"

# One formatted radar-folder string per (segment_date_str, segment_number) segment;
# its index is reused below as the row index of every parameter sheet
radar_paths = df_season.groupby(['segment_date_str', 'segment_number'])[['radar_path', 'start_timestamp']].apply(radar_paths_ordered)
radar_paths

In [None]:
def first_prj_set_str(x):
    """Return the distinct 'prj' values of a segment's rows, in order of first appearance.

    Note: despite the name, the result is a list and only the 'prj' column is read.
    """
    return list(x['prj'].unique())


def ordered_transect_names(x):
    """Return the 'trn' values of a segment's rows, ordered by 'start_timestamp'."""
    return list(x.sort_values('start_timestamp')['trn'])


# The helper above is deliberately not called `transect_names`: that name is bound to the
# resulting Series below, which would overwrite the function it was computed with.
segment_groups = df_season.reset_index().groupby(['segment_date_str', 'segment_number'])

# Per-segment lists, indexed by (segment_date_str, segment_number)
mission_names = segment_groups[['prj', 'set']].apply(first_prj_set_str)
transect_names = segment_groups[['start_timestamp', 'trn']].apply(ordered_transect_names)

In [None]:
# Default parameter values for this season, read from its YAML configuration file
defaults = preprocessing.load_defaults(f'seasons_config/{season_name}.yaml')

# Folder that receives the generated parameter sheets (created if missing)
base_params_dir = Path(f'outputs/params/{season_name}')
base_params_dir.mkdir(parents=True, exist_ok=True)

def make_parameter_sheet(default_values, segments, overrides=None):
    """Build a parameter table with one row per segment.

    Parameters
    ----------
    default_values : dict
        Column name -> default value, passed to ``pd.DataFrame``; scalar values
        are repeated for every row.
    segments : index-like
        Row index of the resulting sheet.
    overrides : dict, optional
        Column name -> value assigned after the defaults, adding or replacing
        columns. ``None`` (the default) means no overrides.

    Returns
    -------
    pd.DataFrame
        The defaults indexed by ``segments`` with the overrides applied.
    """
    df = pd.DataFrame(default_values, index=segments)
    # `overrides or {}` avoids a shared mutable default argument
    for key, value in (overrides or {}).items():
        df[key] = value
    return df

# Every sheet has one row per segment, using the index of radar_paths
segment_index = radar_paths.index
param_defaults = defaults['params']

# cmd sheet: defaults plus the per-segment mission names and transect notes
cmd_sheet = make_parameter_sheet(param_defaults['cmd'], segment_index, overrides={
    'mission_names': mission_names,
    'notes': transect_names
})
cmd_sheet.to_csv(base_params_dir / 'cmd.csv')

# records sheet: defaults plus the per-segment radar folders and GPS file
records_sheet = make_parameter_sheet(param_defaults['records'], segment_index, overrides={
    'file.board_folder_name': radar_paths,
    'gps.fn': gps_paths
})
records_sheet.to_csv(base_params_dir / 'records.csv')

# Remaining sheets are written from their defaults alone, when the season config defines them
sheets_with_defaults_only = ['qlook', 'sar', 'array', 'radar', 'post', 'analysis_noise']

for sheet_name in sheets_with_defaults_only:
    if sheet_name not in param_defaults:
        continue
    defaults_only_sheet = make_parameter_sheet(param_defaults[sheet_name], segment_index)
    defaults_only_sheet.to_csv(base_params_dir / f'{sheet_name}.csv')


### Generate temporary header files

In [None]:
# Transects to generate header files for: currently the whole season.
# The commented-out .loc selection restricts this to three specific transects.
df_season_subset = df_season #.loc[[('PEL', 'JKB2u', 'Y20b'), ('ICP10', 'JKB2u', 'F01T02a'), ('PEL', 'JKB2u', 'X48a')]]
df_season_subset.head()

In [None]:
# Spot check: post-processed GPS path of the first transect in the subset
df_season_subset.iloc[0]['postprocessed_gps_path']

In [None]:
import dask.dataframe as dd
from dask.distributed import Client, as_completed, progress
from dask import delayed

print("Setting up Dask LocalCluster for parallel processing...")
# Start a Dask client with 10 workers; the header-generation cell below submits its tasks to it.
# NOTE(review): re-running this cell creates a new client without closing the previous one — confirm that is acceptable.
client = Client(n_workers=10)
print(f"Dashboard link: {client.dashboard_link}")
client

In [None]:
# Base directory for the generated header files.
# NOTE(review): absolute, user-specific location — adjust when running elsewhere.
header_base_dir = f"/kucresis/scratch/tteisberg_sta/scripts/opr_user_tmp/headers/rds/{season_name}/"

# Get file locations (this is fast, no need to parallelize)
# One header file location per radar_path in df_season_subset, in the same order
header_file_locations = df_season_subset['radar_path'].apply(opr_header_generation.get_header_file_location, base_dir=header_base_dir)

# Function to get header and save it
def get_and_save_header(path, fn):
    """Read the header information of one radar file and store it as a MAT file.

    Parameters
    ----------
    path : str
        Radar file passed to ``opr_header_generation.get_header_information``.
    fn : str
        Output file; missing parent directories are created.

    Returns
    -------
    The header object returned by ``get_header_information``.
    """
    header = opr_header_generation.get_header_information(path)

    # Make sure the destination folder exists before writing
    output_path = Path(fn)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Written in MATLAB v7.3 format, replacing any existing file
    hdf5storage.savemat(str(output_path), header, format='7.3', truncate_existing=True)
    print(f"Saved header to {fn}")

    return header

In [None]:
# Expand RADjh1 paths to include both channels (bxds1 and bxds2)
# RADjh1 files have separate files per channel, so each channel needs its own header
expanded_radar_paths = []
expanded_header_file_locations = []

for path, fn in zip(df_season_subset['radar_path'], header_file_locations):
    stream_dir = Path(path).parent

    if stream_dir.name != 'RADjh1':
        # Not RADjh1: keep the path and its header location unchanged
        expanded_radar_paths.append(path)
        expanded_header_file_locations.append(fn)
        continue

    # RADjh1: add an entry for every channel file that exists on disk
    for channel_file in ('bxds1', 'bxds2'):
        channel_path = stream_dir / channel_file
        if not channel_path.exists():
            print(f"[WARNING] RADjh1 channel file not found: {channel_path}")
            continue
        expanded_radar_paths.append(str(channel_path))
        # The header location is computed per channel file
        expanded_header_file_locations.append(
            opr_header_generation.get_header_file_location(str(channel_path), header_base_dir)
        )

print(f"Original paths: {len(df_season_subset['radar_path'])}")
print(f"Expanded paths: {len(expanded_radar_paths)} (includes both RADjh1 channels)")

# The header-generation cell below works on the expanded lists
radar_paths_to_process = expanded_radar_paths
header_file_locations_to_process = expanded_header_file_locations

In [None]:
# Parallelized version using Dask
# Queue one delayed task per radar file whose header file does not exist yet
delayed_tasks = []
files_to_process = []

for path, fn in zip(radar_paths_to_process, header_file_locations_to_process):
    if Path(fn).exists():
        print(f"Header file already exists for {path} at {fn}, skipping.")
        continue
    delayed_tasks.append(delayed(get_and_save_header)(path, fn))
    files_to_process.append((path, fn))

if len(delayed_tasks) == 0:
    print("No header files need to be generated, all files already exist.")
else:
    print(f"Processing and saving {len(delayed_tasks)} header files in parallel...")

    # Submit everything to the cluster, show a progress bar, then collect the returned headers
    futures = client.compute(delayed_tasks)
    progress(futures)
    headers_list = client.gather(futures)

    print(f"Successfully generated and saved {len(headers_list)} header files.")