In [1]:
import os
import re
from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Dict
import pandas as pd

In [2]:
# Paths and mapping
data_path = "/Users/moshe/deepvoice/soundbay/noaa_tmp"
txt_path = os.path.join(data_path, "disk_tree_NOAA.txt")
annotations_path = os.path.join(data_path, "annotations")

In [3]:
file_to_date_mapping: Dict[str, datetime] = {
    "NEFSC_MA-RI_202202_COX01_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202110_NS01_narwlog.csv": datetime(1970, 1, 1),
    "NEFSC_MA-RI_202111_COX01_narwlog.csv": datetime(1970, 1, 1),
    "NEFSC_MA-RI_202205_COX01_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202205_NS02_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202202_NS02_narwlog.csv": datetime(2006, 1, 1),
    "NEFSC_MA-RI_202102_COX01_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202202_NS01_narwlog.csv": datetime(2006, 1, 1),
    "NEFSC_MA-RI_202103_NS02_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202107_COX01_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202107_NS01_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202107_NS02_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202103_NS01_narwlog.csv": datetime(2021, 1, 1),
    "NEFSC_MA-RI_202110_NS02_narwlog.csv": datetime(1970, 1, 1),
}

In [4]:
def find_closest_number(target: int, numbers: List[int]) -> Optional[int]:
    """Find the closest number to the target, prioritizing numbers <= target."""
    if not numbers:
        return None

    numbers = sorted(numbers)
    left_numbers = [num for num in numbers if num <= target]
    return max(left_numbers) if left_numbers else min(numbers)

def extract_serial_number_from_path(path: str) -> Optional[int]:
    """Extract serial number from the WAV file path."""
    try:
        return int(path.split("/")[-1].split(".")[1])
    except (IndexError, ValueError):
        print(path, "error")
        return None

def get_closest_wav(serial_number: int, grandparent: str, wav_files: List[str]) -> Optional[str]:
    """Find the closest WAV file by serial number within a specified directory."""
    relevant_wavs = [file for file in wav_files if file.split("/")[0] == grandparent]
    wav_numbers = [extract_serial_number_from_path(f) for f in relevant_wavs]
    wav_numbers = [num for num in wav_numbers if num is not None]
    closest_number = find_closest_number(serial_number, wav_numbers)
    return next((f for f in relevant_wavs if f".{closest_number}." in f), None)

def find_surrounding_files(serial_number: int, grandparent: str, wav_files: List[str]) -> Tuple[Optional[str], Optional[str]]:
    """Find WAV files just before and after the given serial number."""
    relevant_wavs = [file for file in wav_files if file.split("/")[0] == grandparent]
    wav_numbers = sorted(
        num for num in (extract_serial_number_from_path(f) for f in relevant_wavs) if num is not None
    )

    previous, next_ = None, None
    for i, num in enumerate(wav_numbers):
        if num >= serial_number:
            previous = wav_numbers[i - 1] if i > 0 else None
            next_ = num
            break

    previous_file = next((f for f in relevant_wavs if previous and f".{previous}." in f), None)
    next_file = next((f for f in relevant_wavs if next_ and f".{next_}." in f), None)
    return (previous_file, next_file)

def calculate_time_difference(ts1: Optional[int], ts2: Optional[int], fractional_seconds: float = 0.0) -> Optional[float]:
    """Calculate time difference in seconds between two timestamps."""
    if ts1 is None or ts2 is None:
        return None
    try:
        dt1 = datetime.strptime(str(ts1), "%y%m%d%H%M%S")
        dt2 = datetime.strptime(str(ts2), "%y%m%d%H%M%S")
        return abs((dt1 - dt2).total_seconds()) + fractional_seconds
    except ValueError:
        print(ts1, ts2, "errorrrr")
        return None

def seconds_to_datetime(start_date: datetime, seconds: float) -> datetime:
    """Convert a number of seconds into a datetime object from the given start date."""
    return start_date + timedelta(seconds=seconds)

def parse_tree_lines(lines: List[str]) -> List[Tuple[int, str]]:
    """Parse tree structure lines and return depth-name tuples."""
    pattern = re.compile(r"^(?P<indent>[\s\xa0│]*)(?:├── |└── )(?P<name>.+)$")
    parsed = []
    for line in lines:
        if "├── " not in line and "└── " not in line:
            continue
        match = pattern.match(line.rstrip("\n"))
        if match:
            depth = len(match.group("indent")) // 4
            name = match.group("name").strip()
            parsed.append((depth, name))
    return parsed

def tree_to_filepaths(parsed_lines: List[Tuple[int, str]]) -> List[str]:
    """Convert parsed tree lines into full file paths."""
    file_paths = []
    stack = []
    for i, (depth, name) in enumerate(parsed_lines):
        stack = stack[:depth]
        is_dir = i + 1 < len(parsed_lines) and parsed_lines[i + 1][0] > depth
        if is_dir:
            stack.append(name)
        elif "." in name:
            file_paths.append("/".join(stack + [name]))
        else:
            stack.append(name)
    return file_paths

def parse_tree_file(filename: str, filter_wav_only: bool = True) -> List[str]:
    """Read and parse a tree file to extract WAV file paths."""
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()
    parsed = parse_tree_lines(lines)
    paths = tree_to_filepaths(parsed)
    return [x for x in paths if x.endswith(".wav")] if filter_wav_only else paths


In [5]:
def process_nefsc_mari_dataset(
    annotations_path: str,
    disk_tree_path: str,
    file_to_date_mapping: Dict[str, datetime],
) -> pd.DataFrame:
    """Process and enrich NEFSC MA-RI annotations with WAV file metadata."""
    dfs = []
    for filename in os.listdir(annotations_path):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(annotations_path, filename))
        df = df.loc[~((df["species"] == 999) & (df["call_type"] == 0))]
        df["filename"] = filename
        dfs.append(df)

    df = pd.concat(dfs)
    df["start_time"] = df["start_time"].astype(str).str.replace(",", "").astype(float)
    df["end_time"] = df["end_time"].astype(str).str.replace(",", "").astype(float)

    df["start_time_date"] = df.apply(
        lambda x: seconds_to_datetime(file_to_date_mapping.get(x["filename"]), x["start_time"]), axis=1
    )
    df["end_time_date"] = df.apply(
        lambda x: seconds_to_datetime(file_to_date_mapping.get(x["filename"]), x["end_time"]), axis=1
    )
    df["serial_number_int"] = df["start_time_date"].apply(lambda x: int(x.strftime("%y%m%d%H%M%S")))
    df["serial_number"] = df["start_time_date"].apply(lambda x: x.timestamp())

    wav_files = parse_tree_file(disk_tree_path)
    df["closest_wav"] = df.apply(
        lambda x: get_closest_wav(x["serial_number_int"], x["filename"].split("_narwlog.csv")[0], wav_files), axis=1
    )
    df["closest_pair"] = df.apply(
        lambda x: find_surrounding_files(x["serial_number_int"], x["filename"].split("_narwlog.csv")[0], wav_files), axis=1
    )

    df["fractional_seconds"] = df["start_time"] - df["start_time"].astype(int)
    df["time_diff_seconds"] = df.apply(
        lambda x: calculate_time_difference(
            x["serial_number_int"],
            extract_serial_number_from_path(x["closest_wav"]),
            x["fractional_seconds"]
        ), axis=1
    )

    df["Begin Time (s)"] = df["time_diff_seconds"]
    df["End Time (s)"] = df.apply(
        lambda x: x["Begin Time (s)"] + (x["end_time"] - x["start_time"]) if x["Begin Time (s)"] is not None else None,
        axis=1
    )
    return df

In [6]:
# Process dataset
df = process_nefsc_mari_dataset(
    annotations_path=annotations_path,
    disk_tree_path=txt_path,
    file_to_date_mapping=file_to_date_mapping
)

In [7]:
df.head()

Unnamed: 0,start_time,end_time,start_freq,end_freq,species,call_type,filename,start_time_date,end_time_date,serial_number_int,serial_number,closest_wav,closest_pair,fractional_seconds,time_diff_seconds,Begin Time (s),End Time (s)
1,1634706000.0,1634706000.0,62.7845,120.924,7,1,NEFSC_MA-RI_202110_NS02_narwlog.csv,2021-10-20 05:01:29.740,2021-10-20 05:01:30.460,211020050129,1634706000.0,NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.21...,(NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.2...,0.74,11851.74,11851.74,11852.46
2,1634706000.0,1634706000.0,41.8543,113.947,7,1,NEFSC_MA-RI_202110_NS02_narwlog.csv,2021-10-20 05:08:10.720,2021-10-20 05:08:11.510,211020050810,1634706000.0,NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.21...,(NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.2...,0.72,12252.72,12252.72,12253.51
3,1634707000.0,1634707000.0,72.0869,188.366,7,1,NEFSC_MA-RI_202110_NS02_narwlog.csv,2021-10-20 05:10:17.100,2021-10-20 05:10:18.160,211020051017,1634707000.0,NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.21...,(NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.2...,0.1,12379.1,12379.1,12380.16
4,1634707000.0,1634707000.0,65.1101,148.831,7,1,NEFSC_MA-RI_202110_NS02_narwlog.csv,2021-10-20 05:12:16.050,2021-10-20 05:12:16.910,211020051216,1634707000.0,NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.21...,(NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.2...,0.05,12498.05,12498.05,12498.91
5,1634708000.0,1634708000.0,60.459,120.924,7,1,NEFSC_MA-RI_202110_NS02_narwlog.csv,2021-10-20 05:28:09.870,2021-10-20 05:28:10.850,211020052809,1634708000.0,NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.21...,(NEFSC_MA-RI_202110_NS02/6125_64kHz_UTC/6125.2...,0.87,13451.87,13451.87,13452.85


In [33]:
wav_files = parse_tree_file(txt_path)
single_df = pd.read_csv("/Users/moshe/deepvoice/soundbay/noaa_tmp/annotations/NEFSC_MA-RI_202202_COX01_narwlog.csv")
single_df["filename"] = "NEFSC_MA-RI_202202_COX01_narwlog.csv"

single_df["start_time"] = single_df["start_time"].astype(str).str.replace(",", "").astype(float)
single_df["end_time"] = single_df["end_time"].astype(str).str.replace(",", "").astype(float)

single_df["start_time_date"] = single_df.apply(
        lambda x: seconds_to_datetime(file_to_date_mapping.get(x["filename"]), x["start_time"]), axis=1
    )
single_df["serial_number_int"] = single_df["start_time_date"].apply(lambda x: int(x.strftime("%y%m%d%H%M%S")))
single_df["serial_number"] = single_df["start_time_date"].apply(lambda x: x.timestamp())
single_df["closest_wav"] = single_df.apply(
        lambda x: get_closest_wav(x["serial_number_int"], x["filename"].split("_narwlog.csv")[0], wav_files), axis=1
    )

In [34]:
single_df.head()

Unnamed: 0,start_time,end_time,start_freq,end_freq,species,call_type,filename,start_time_date,serial_number_int,serial_number,closest_wav
0,36078469.19,36078470.06,116.586,186.093,999,0,NEFSC_MA-RI_202202_COX01_narwlog.csv,2022-02-22 13:47:49.190,220222134749,1645538000.0,NEFSC_MA-RI_202202_COX01/6124_64kHz_UTC/6124.2...
1,36081331.78,36081332.62,143.492,286.989,7,1,NEFSC_MA-RI_202202_COX01_narwlog.csv,2022-02-22 14:35:31.780,220222143531,1645541000.0,NEFSC_MA-RI_202202_COX01/6124_64kHz_UTC/6124.2...
2,36088826.63,36088827.44,123.312,251.115,999,0,NEFSC_MA-RI_202202_COX01_narwlog.csv,2022-02-22 16:40:26.630,220222164026,1645548000.0,NEFSC_MA-RI_202202_COX01/6124_64kHz_UTC/6124.2...
3,36092320.46,36092321.96,78.4692,206.272,999,0,NEFSC_MA-RI_202202_COX01_narwlog.csv,2022-02-22 17:38:40.460,220222173840,1645552000.0,NEFSC_MA-RI_202202_COX01/6124_64kHz_UTC/6124.2...
4,36092520.04,36092521.1,71.7428,195.061,999,0,NEFSC_MA-RI_202202_COX01_narwlog.csv,2022-02-22 17:42:00.040,220222174200,1645552000.0,NEFSC_MA-RI_202202_COX01/6124_64kHz_UTC/6124.2...


In [32]:
len(set(wav_files))

9830

In [18]:
really_smalle = (df['End Time (s)'] - df['Begin Time (s)']) < 0.05

In [19]:
really_smalle.sum()

18

In [27]:
aaa = df['End Time (s)'] > 4 * 3600
aaa.sum()

8