In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from pathlib import Path
from tqdm import tqdm
import gzip
import holoviews as hv
import hdf5storage

from utig_radar_loading import file_util, stream_util, geo_util, opr_gps_file_generation, preprocessing

In [None]:
pd.options.mode.copy_on_write = True
tqdm.pandas()
hv.extension('bokeh')

In [None]:
# Forwarded as `read_cache` to file_util.load_file_index_df below.
use_cache = True
# NOTE: despite the name, this is the path of a single CSV file, not a directory.
cache_dir = "outputs/file_index.csv"
# Hard-coded absolute location of the UTIG data; adjust when running elsewhere.
base_path = "/kucresis/scratch/data/UTIG"

# Index of the files under base_path (presumably read back from the CSV above when
# read_cache is True -- confirm in file_util.load_file_index_df).
df_files = file_util.load_file_index_df(base_path, cache_dir, read_cache=use_cache)

# Artifact table derived from the file index; this is what gets grouped into
# transects further down the notebook.
df_artifacts = file_util.create_artifacts_df(df_files)

In [None]:
df_artifacts.head()

In [None]:
def arrange_by_transect(df_artifacts, streams):
    """
    Group by transects (unique combinations of (prj, set, trn)) and pull out paths
    to the desired data streams.

    streams is a dictionary mapping names of data categories to a list of acceptable
    stream types. For example:
    { "gps": ["GPSnc1", "GPSnc2"],
      "radar": ["RADnh5", "RADnh6"] }

    The resulting dataframe will have two columns per entry in the streams dictionary:
    <data category>_stream_type will contain the matched stream type and
    <data category>_path will contain the path to the data file.

    If multiple matching stream types are available, preference will be given to the
    first stream type in the list. If no matching stream types are available, columns
    will be filled with NaN.
    """
    
    def agg_fn(group):
        df = pd.DataFrame(index=[0])
        
        # Look for requested data streams
        for data_category in streams.keys():
            df[f"{data_category}_stream_type"] = np.nan
            df[f"{data_category}_path"] = np.nan
            
            matching_entry = group[(group['stream'].isin(streams[data_category]['stream_types'])) & \
                (group['file_name'].isin(streams[data_category]['file_names']))]
            if not matching_entry.empty:
                df[f"{data_category}_stream_type"] = matching_entry['stream'].values[0]
                df[f"{data_category}_path"] = matching_entry['full_path'].values[0]

        # Add in any other unique keys
        for k in group:
            if k in ['full_path', 'stream', 'processing_level', 'processing_type']:
                continue
            
            if len(group[k].unique()) == 1:
                df[k] = str(group[k].values[0]) # TODO

        return df

    df = df_artifacts.groupby(['prj', 'set', 'trn']).apply(agg_fn, include_groups=False)
    df.index = df.index.droplevel(-1)
    return df

df_transects = arrange_by_transect(df_artifacts, {
    "gps": {"stream_types": ["GPSnc1", "GPStp2", "GPSap3"], "file_names": ["xds.gz"]},
    "radar": {"stream_types": ["RADnh5", "RADnh3", "RADnh2", "RADnh4", "RADjh1"], "file_names": ["bxds"]}
})
df_transects

In [None]:
def get_start_timestamp(transect):
    """
    Return the 'TIMESTAMP' of the first CT record for a transect's GPS stream.

    `transect` is a row (or any mapping) with a 'gps_path' entry. Only the first
    row of the CT data is loaded (nrows=1) and then parsed with
    stream_util.parse_CT.

    Returns None when the transect has no GPS path (NaN, None or pd.NA).
    """
    fp = transect['gps_path']
    # pd.isna also catches None / pd.NA, not just float NaN
    if pd.isna(fp):
        return None

    ct_df = stream_util.load_ct_file(fp, read_csv_kwargs={'nrows': 1})
    ct_df = stream_util.parse_CT(ct_df)

    return ct_df.iloc[0]['TIMESTAMP']

def get_end_timestamp(transect):
    """
    Return the 'TIMESTAMP' parsed from the last non-blank line of the gzipped
    file at transect['gps_path'].

    The line is read as whitespace-separated CT columns and passed through
    stream_util.parse_CT. Returns None when the transect has no GPS path or the
    file contains no non-blank line.

    NOTE(review): this opens `gps_path` itself, whereas get_start_timestamp goes
    through stream_util.load_ct_file -- confirm that this is the intended file.
    """
    from io import StringIO

    fp = transect['gps_path']
    if pd.isna(fp):
        return None

    # Seeking backwards in a gzip stream still decompresses the whole file, so
    # make a single forward pass and remember the last non-blank line. This also
    # copes with single-line files and files that end in a blank line.
    last_line = None
    with gzip.open(fp, 'rb') as f:
        for raw_line in f:
            if raw_line.strip():
                last_line = raw_line
    if last_line is None:
        return None

    # Load and parse just the last line
    ct_columns = ['prj', 'set', 'trn', 'seq', 'clk_y', 'clk_n', 'clk_d', 'clk_h', 'clk_m', 'clk_s', 'clk_f', 'tim']
    ct_df = pd.read_csv(StringIO(last_line.decode()), sep=r'\s+', names=ct_columns, index_col=False)
    ct_df = stream_util.parse_CT(ct_df)
    return ct_df.iloc[0]['TIMESTAMP']

def season_from_datetime(d):
    """
    Map a timestamp to its field-season year.

    A season is labelled by the year in which it starts: dates from June onwards
    belong to that calendar year's season, dates from January to May belong to
    the previous year's season.

    Returns NaN for missing timestamps (None, NaT or NaN) instead of raising.
    """
    if pd.isna(d):
        return np.nan
    return d.year if d.month >= 6 else d.year - 1

# Work on a copy: a plain assignment would only alias df_transects, so the new
# columns below would silently be added to df_transects as well.
df_all_seasons = df_transects.copy()

# Start time of each transect (None where the transect has no GPS stream) and the
# field season it falls into.
df_all_seasons['start_timestamp'] = df_all_seasons.apply(get_start_timestamp, axis=1)
df_all_seasons['season'] = df_all_seasons['start_timestamp'].apply(season_from_datetime)
df_all_seasons = df_all_seasons.sort_values('prj')
df_all_seasons.head()

In [None]:
# List every season present in the dataset, in ascending order
print("The following seasons were found in the dataset:")
seasons = np.sort(df_all_seasons['season'].unique())
print(seasons)

### Select a single season to extract

In [None]:
season_year = 2018

In [None]:
# All transects of the selected season, in chronological order
season_rows = df_all_seasons.loc[df_all_seasons['season'] == season_year].sort_values(by='start_timestamp')

# Split into transects with and without a radar file
has_radar = season_rows['radar_path'].notnull()
df_season_missing_data = season_rows[~has_radar]
df_season = season_rows[has_radar]

print(f"Missing radar data for {len(df_season_missing_data)} transects out of {len(df_season)+len(df_season_missing_data)}")

In [None]:
df_season['gps_stream_type'].unique(), df_season['radar_stream_type'].unique()

In [None]:
df_season.reset_index()['set'].unique()

In [None]:
ac_ident = df_season.reset_index()['set'].iloc[0][:3]

season_name = f"{season_year}_Antarctica_Basler{ac_ident}"
season_name

In [None]:
# Projects in the season
df_season.index.get_level_values(0).unique()

In [None]:
# Merge segments
#
# Walk through the season's transects in row order and group consecutive transects
# into segments. A new segment starts whenever the first value of `timestamp_field`
# in a transect's radar CT data differs from the last value of the previous
# transect by more than `timestamp_split_threshold`. Segments are named
# <YYYYMMDD>_<NN>, with NN restarting at 01 whenever the date changes.

timestamp_field = 'tim'
timestamp_split_threshold = 1000
parse_ct = False

# Alternative: split on parsed wall-clock timestamps instead of the raw counter
# timestamp_field = 'TIMESTAMP'
# timestamp_split_threshold = pd.Timedelta(milliseconds=10000)
# parse_ct = True

last_segment_ct = stream_util.load_ct_file(df_season.iloc[0]['radar_path'])
if parse_ct:
    last_segment_ct = stream_util.parse_CT(last_segment_ct)

# Placeholder values; rows whose CT file cannot be loaded keep these.
df_season['segment_path'] = ""
df_season['segment_date_str'] = ""
df_season['segment_number'] = -1

# Column positions do not change inside the loop, so look them up once
segment_date_str_col = df_season.columns.get_loc('segment_date_str')
segment_path_col = df_season.columns.get_loc('segment_path')
segment_number_col = df_season.columns.get_loc('segment_number')

current_segment_datestring = df_season.iloc[0]['start_timestamp'].strftime("%Y%m%d")
current_segment_idx = 1

df_season.iloc[0, segment_date_str_col] = current_segment_datestring
df_season.iloc[0, segment_path_col] = f"{current_segment_datestring}_{current_segment_idx:02d}"
df_season.iloc[0, segment_number_col] = current_segment_idx


print(f"Initial segment path is: {df_season.iloc[0]['segment_path']}")

for row_iloc in tqdm(range(1, len(df_season))):
    try:
        curr_segment_ct = stream_util.load_ct_file(df_season.iloc[row_iloc]['radar_path'])
        if parse_ct:
            curr_segment_ct = stream_util.parse_CT(curr_segment_ct)

        # Gap between the end of the previous transect and the start of this one
        delta_from_last = curr_segment_ct[timestamp_field].iloc[0] - last_segment_ct[timestamp_field].iloc[-1]

        if np.abs(delta_from_last) > timestamp_split_threshold:
            new_datestring = df_season.iloc[row_iloc]['start_timestamp'].strftime("%Y%m%d")
            if new_datestring == current_segment_datestring:
                current_segment_idx += 1
            else:
                current_segment_idx = 1
                current_segment_datestring = new_datestring

            print(f"Segment path changed to {current_segment_datestring}_{current_segment_idx:02d}. Delta in '{timestamp_field}' was {delta_from_last}")

        df_season.iloc[row_iloc, segment_date_str_col] = current_segment_datestring
        df_season.iloc[row_iloc, segment_path_col] = f"{current_segment_datestring}_{current_segment_idx:02d}"
        df_season.iloc[row_iloc, segment_number_col] = current_segment_idx

        last_segment_ct = curr_segment_ct
    except Exception as e:
        # Best effort: report the failure and move on. The row keeps its
        # placeholder segment ("" / -1) and the next transect is compared
        # against the last transect that loaded successfully.
        print(f"Could not load index {row_iloc}")
        print(df_season.iloc[row_iloc]['radar_path'])
        print(e)

df_season.head()

In [None]:
df_season_missing_data.head()

In [None]:
def load_gps_data(transects_df):
    """
    Load the GPS stream of every transect in `transects_df`.

    Each row must provide a 'gps_path'. The stream is loaded with
    stream_util.load_gzipped_stream_file and reduced to the columns
    'prj', 'set', 'trn', 'clk_y', 'LAT', 'LON' and 'TIMESTAMP'; any of these
    that the loaded stream lacks is filled with NaN. If the row has a
    'segment_path' entry, it is copied onto every record of that transect.

    Returns a list with one DataFrame per input row, in input order.
    """
    necessary_keys = ['prj', 'set', 'trn', 'clk_y', 'LAT', 'LON', 'TIMESTAMP']
    segment_dfs = []

    for _, row in tqdm(transects_df.iterrows(), total=len(transects_df)):
        df = stream_util.load_gzipped_stream_file(
            row['gps_path'], debug=False, parse=True, parse_kwargs={'use_ct': True})

        # Track lengths used to be computed here but were never used; they also
        # read 'LON'/'LAT' before the fallback below, which defeated it.
        for k in necessary_keys:
            if k not in df:
                df[k] = np.nan

        df_sub = df[necessary_keys].copy()

        if 'segment_path' in row:
            df_sub['segment_path'] = row['segment_path']

        segment_dfs.append(df_sub)
    return segment_dfs

segment_dfs = load_gps_data(df_season)
missing_data_dfs = load_gps_data(df_season_missing_data)

In [None]:
n_segments = len(df_season['segment_path'].unique())
max_segments_per_day = df_season['segment_number'].max()

print(f"Created {n_segments} segments. Maximum segment number on a single day is {max_segments_per_day}.")

In [None]:
paths = []

# Add missing data (transects without radar data are drawn in red)
if len(missing_data_dfs) > 0:
    _, missing_path = geo_util.create_path(missing_data_dfs)
    paths.append(missing_path.opts(color='red', line_width=3).relabel('Missing Radar Data'))
else:
    print("No missing data to display.")

# Bucket the per-transect GPS frames by segment once, instead of rescanning the
# whole list for every segment. Empty frames carry no segment label and are skipped.
dfs_by_segment = {}
for gps_df in segment_dfs:
    if gps_df.empty:
        continue
    dfs_by_segment.setdefault(gps_df['segment_path'].iloc[0], []).append(gps_df)

# Add segments with data
for segment_path in df_season['segment_path'].unique():
    segment_gps_dfs = dfs_by_segment.get(segment_path, [])
    if not segment_gps_dfs:
        continue
    _, segment_plot = geo_util.create_path(segment_gps_dfs)
    paths.append(segment_plot.relabel(f"Segment {segment_path}"))

# `p` is the finished map; the next cell saves it
p = stream_util.create_antarctica_basemap() * hv.Overlay(paths)
p = p.opts(aspect='equal', frame_width=800, frame_height=800, tools=['hover'])
p = p.opts(title=season_name, legend_position='right')
p

In [None]:
hv.save(p, f"outputs/maps/{season_name}.html")

In [None]:
df_season[df_season['segment_path'] == '20190114_01']

## Create GPS files for each segment

In [None]:
def make_segment_gps_file(x):
    """
    Generate the GPS file for one segment and return its absolute path.

    `x` is the group of rows belonging to a single segment; after
    reset_index() it must provide the columns 'segment_date_str',
    'segment_number' and 'gps_path'. The unique GPS paths of the group are
    passed to opr_gps_file_generation.generate_gps_file, which writes
    outputs/gps/<season_name>/gps_<segment_date_str>_<segment_number>.mat.

    An existing file is never overwritten; delete it manually to regenerate.
    """
    x = x.reset_index()
    segment_label = f"{x['segment_date_str'].iloc[0]}_{x['segment_number'].iloc[0]}"
    print(segment_label)
    transect_gps_paths = list(x['gps_path'].unique())
    output_path = Path(f"outputs/gps/{season_name}/gps_{segment_label}.mat")

    # Only generate if the file does not exist
    if not output_path.exists():
        # Make sure the season's output directory is there before writing
        output_path.parent.mkdir(parents=True, exist_ok=True)
        opr_gps_file_generation.generate_gps_file(transect_gps_paths, output_path, format='hdf5')
    else:
        print(f"File {output_path} already exists. Skipping generation. If you want to regenerate, manually delete the file.")

    return output_path.resolve()

gps_paths = df_season.groupby(['segment_date_str', 'segment_number'])[['segment_date_str', 'segment_number', 'gps_path']].apply(make_segment_gps_file, include_groups=False)
gps_paths

In [None]:
def radar_paths_ordered(x):
    """
    Build the board-folder list for one segment as a single string.

    The rows of `x` are ordered by 'start_timestamp'. For each 'radar_path' the
    four directory components directly above the file are kept. The result has
    the form "{'a/b/c/d', 'e/f/g/h'}".
    """
    ordered_files = x.sort_values('start_timestamp')['radar_path'].tolist()

    board_folders = []
    for radar_file in ordered_files:
        # parts[-5:-1] = the four path components just above the file name
        parent_parts = Path(radar_file).parts[-5:-1]
        board_folders.append(str(Path(*parent_parts)))

    return "{'" + "', '".join(board_folders) + "'}"

radar_paths = df_season.groupby(['segment_date_str', 'segment_number'])[['radar_path', 'start_timestamp']].apply(radar_paths_ordered)
radar_paths

In [None]:
def first_prj_set_str(x):
    """Return the unique 'prj' values found in the group `x`, as a list."""
    return list(x['prj'].unique())

def transect_names_ordered(x):
    """Return the 'trn' values of the group `x`, ordered by 'start_timestamp'."""
    return list(x.sort_values('start_timestamp')['trn'])

# The helper above is deliberately not called `transect_names`: the result below
# uses that name, and sharing it would overwrite the function and break a re-run
# of this cell.
segments_grouped = df_season.reset_index().groupby(['segment_date_str', 'segment_number'])
mission_names = segments_grouped[['prj', 'set']].apply(first_prj_set_str)
transect_names = segments_grouped[['start_timestamp', 'trn']].apply(transect_names_ordered)

In [None]:
defaults = preprocessing.load_defaults('src/utig_radar_loading/defaults/2018_Antarctica_BaslerJKB.yaml')

def make_parameter_sheet(default_values, segments, overrides=None):
    """
    Build a parameter sheet with one row per segment.

    Parameters
    ----------
    default_values : dict
        Column name -> default value; every segment gets the same value.
    segments : index-like
        Row index of the sheet (one entry per segment).
    overrides : dict, optional
        Column name -> replacement value. Each value is assigned as a whole
        column, so a Series is aligned on `segments` and a scalar is broadcast.
        Columns not present in `default_values` are added.

    Returns
    -------
    pandas.DataFrame
    """
    df = pd.DataFrame(default_values, index=segments)
    for key, value in (overrides or {}).items():
        df[key] = value
    return df

# to_csv does not create missing directories, so make sure the target exists
params_dir = Path('outputs/params')
params_dir.mkdir(parents=True, exist_ok=True)

# One CSV per parameter sheet; every sheet has one row per segment
make_parameter_sheet(defaults['cmd'], radar_paths.index, overrides={
    'mission_names': mission_names,
    'notes': transect_names
}).to_csv(params_dir / 'cmd.csv')

make_parameter_sheet(defaults['records'], radar_paths.index, overrides={
    'file.board_folder_name': radar_paths,
    'gps.fn': gps_paths
}).to_csv(params_dir / 'records.csv')

make_parameter_sheet(defaults['qlook'], radar_paths.index).to_csv(params_dir / 'qlook.csv')
make_parameter_sheet(defaults['radar'], radar_paths.index).to_csv(params_dir / 'radar.csv')

### Generate temporary header files

In [None]:
df_season.head()

In [None]:
def get_header_information(f):
    """
    Collect the header arrays for one radar file from its parsed CT data.

    Returns a dict with three entries, each with one value per CT record:
    'comp_time'  -- 'TIMESTAMP' as float64 seconds since 1970-01-01,
    'radar_time' -- the raw 'tim' values,
    'offset'     -- the record number (0, 1, 2, ...).
    """
    ct_data = stream_util.parse_CT(stream_util.load_ct_file(f))

    unix_epoch = pd.Timestamp("1970-01-01")
    seconds_since_epoch = (ct_data['TIMESTAMP'] - unix_epoch) / pd.Timedelta('1s')

    return {
        'comp_time': seconds_since_epoch.values.astype(np.float64),
        'radar_time': ct_data['tim'].values,
        # TODO: Unclear what 'offset' is supposed to be
        'offset': np.arange(len(ct_data)),
    }

def get_header_file_location(f, base_dir=None):
    """
    Return the path (as a string) of the temporary header file for radar file `f`.

    The header file sits under `base_dir`, inside the same four-level folder
    structure that directly contains `f`, and is named <stem of f>.mat.

    `base_dir` defaults to the header directory of the current `season_name`.
    The default is resolved on every call, so it follows later changes to
    `season_name` instead of freezing the value seen when the function was defined.
    """
    if base_dir is None:
        base_dir = f"/kucresis/scratch/tteisberg_sta/scripts/opr_user_tmp/headers/rds/{season_name}/"

    p = Path(f)
    # parts[-5:-1] = the four path components just above the file name
    board_folder_name = Path(*p.parts[-5:-1])
    return str(Path(base_dir) / board_folder_name / (p.stem + '.mat'))


headers = df_season['radar_path'].progress_apply(get_header_information)
header_file_locations = df_season['radar_path'].apply(get_header_file_location)

In [None]:
for header, fn in zip(headers.values, header_file_locations.values):
    fn = Path(fn)
    fn.parent.mkdir(parents=True, exist_ok=True)
    print(f"Writing header to {fn}")
    hdf5storage.savemat(str(fn), header, format='7.3')

## Break segments into frames

In [None]:
break_distance = 50 # km

frame_outputs = {}  # segment path -> {frame number -> list of one-row entry DataFrames}
all_entries = []    # every entry of every segment, in the order it was assigned

segment_paths = df_season['segment_path'].unique()
for seg in segment_paths:
    print(f"Processing segment: {seg}")
    # Note: Should have already been sorted, but just in case
    seg_df = df_season[df_season['segment_path'] == seg].sort_values('start_timestamp')

    frame_idx = 1 # Frame index we're currently assigning
    accumulated_km = 0 # Sum of line-km currently assigned to frame_idx

    frame_outputs[seg] = {frame_idx: []}
    last_x, last_y = None, None

    for transect_iloc in range(len(seg_df)):
        print(f" -> Allocating transect {transect_iloc} {seg_df.index[transect_iloc]}")

        # Load the geometry of this transect
        df = stream_util.load_gzipped_stream_file(
            seg_df.iloc[transect_iloc]['gps_path'],
            debug=False, parse=True, parse_kwargs={'use_ct': True}
            )

        x_proj, y_proj, line_length_m = geo_util.project_split_and_simplify(
            df['LON'].values, df['LAT'].values, calc_length=True, simplify_tolerance=None)

        # NOTE(review): the last projected point is dropped; the indices into
        # `dist` below are also used to index df['tim'], so this presumably
        # brings the two to the same length -- confirm against geo_util.
        x_proj = x_proj[:-1]
        y_proj = y_proj[:-1]

        # Calculate the along-track distance. dist[0] holds the gap to the end of
        # the previous transect (0 for the first transect of a segment).
        # NOTE(review): allocation below only uses differences dist[i] - dist[j],
        # so that gap never contributes to accumulated_km -- confirm intent.
        deltas = np.sqrt(np.diff(x_proj)**2 + np.diff(y_proj)**2) / 1000  # Convert to km
        if last_x is not None:  # explicit None test: a coordinate of 0.0 is valid
            deltas = np.insert(deltas, 0, np.sqrt((x_proj[0] - last_x)**2 + (y_proj[0] - last_y)**2) / 1000)
        else:
            deltas = np.insert(deltas, 0, 0)
        dist = np.cumsum(deltas)

        # Allocate parts of this transect to frames
        transect_start_tim = df['tim'].iloc[0]
        transect_start_idx = 0
        while transect_start_tim < df['tim'].iloc[-1]:
            # Find the sample whose distance from the current start is closest to
            # what is still missing in the current frame
            remaining_distance = break_distance - accumulated_km

            dists_from_idx = np.maximum(0, dist - dist[transect_start_idx])

            break_idx = np.argmin(np.abs(dists_from_idx - remaining_distance))
            if break_idx <= transect_start_idx:
                # The closest sample is at or before the current start (little
                # distance left in the frame and a large step to the next sample).
                # Without this guard the loop would stall or walk backwards, so
                # always advance by at least one sample.
                break_idx = transect_start_idx + 1
                if break_idx >= len(dist):
                    break  # no samples left in this transect
            break_tim = df['tim'].iloc[break_idx]

            entry = seg_df.iloc[transect_iloc:transect_iloc+1].copy()
            entry['gps_idx_start'] = transect_start_idx
            entry['gps_idx_stop'] = break_idx
            entry['tim_start'] = transect_start_tim
            entry['tim_stop'] = break_tim
            entry['frame_number'] = frame_idx

            all_entries.append(entry)

            # Add an entry to this frame and update distance
            frame_outputs[seg][frame_idx].append(entry)
            accumulated_km += dist[break_idx] - dist[transect_start_idx]
            print(f"   -> Assigned indices {transect_start_idx} to {break_idx} (distance {dist[break_idx] - dist[transect_start_idx]} km) to frame {frame_idx}, now at {accumulated_km} km")

            # Move transect start index
            transect_start_idx = break_idx
            transect_start_tim = break_tim

            # Check if the frame is full (within 2% of the target length)
            if accumulated_km >= 0.98*break_distance:
                print(f"    Frame {frame_idx} is full with {accumulated_km} km")
                frame_idx += 1
                accumulated_km = 0
                frame_outputs[seg][frame_idx] = []

        # Remember where this transect ended for the gap to the next one
        last_x, last_y = x_proj[-1], y_proj[-1]


In [None]:
frames_plan_df = pd.concat(all_entries).reset_index().set_index(['segment_date_str', 'segment_number', 'frame_number'])
frames_plan_df

In [None]:
# make_segment_gps_file is defined in the "Create GPS files for each segment"
# section above and is reused here. Redefining it in this cell silently replaced
# that definition with one that returns None for existing files and a relative
# string otherwise, which would corrupt `gps_paths` if the earlier cell were
# re-run afterwards.
frames_plan_df.groupby(['segment_date_str', 'segment_number']).apply(make_segment_gps_file)

In [None]:
frames_plan_df_tmp = frames_plan_df[:3]
frames_plan_df_tmp

In [None]:
# Extract headers from bxds files
# Only the radar files referenced by the (truncated) frames_plan_df_tmp are used.
bxds_files = list(frames_plan_df_tmp['radar_path'].unique())
# NOTE(review): this rebinds `headers`, which earlier holds the per-file header
# Series that the header-writing loop iterates over via `headers.values`;
# re-running that loop after this cell would pick up this object instead.
headers = preprocessing.extract_headers(bxds_files)

In [None]:
segments = preprocessing.create_segments_from_frames(
      frames_plan_df_tmp,
      bxds_files,
      headers
  )

In [None]:
# # Extract headers (if needed for records_create)
#   headers = preprocessing.extract_headers(bxds_files)

#   # Create segments from existing frames
#   segments = preprocessing.create_segments_from_frames(
#       frames_plan_df,
#       bxds_files,
#       headers
#   )

#   # Generate parameters
#   params = preprocessing.generate_all_parameters(
#       segments,
#       season_name='2022_Antarctica_BaslerMKB',
#       radar_name='rds',
#       defaults_file='path/to/2022_Antarctica_BaslerMKB.yaml',
#       base_dir='/data/path',
#       board_folder_name='F01'
#   )

#   # Write spreadsheets
#   preprocessing.write_parameter_spreadsheet(
#       params,
#       'output/2022_Antarctica_BaslerMKB_param'
#   )

#   # Save headers for MATLAB records_create
#   preprocessing.save_temporary_headers(
#       headers,
#       bxds_files,
#       Path('/opr_tmp'),
#       '2022_Antarctica_BaslerMKB',
#       'F01'
#   )