# REFORMATTING AUDACITY FILES TO BE ABLE TO BE USED BY OPSO 0.12
OPSO contains helper functions to load raven files  
Audacity annotations must be converted before they can be used with OpenSoundscape, because Audacity generates TSV files instead of the expected CSV files.

In [None]:

import pandas as pd
import os
import glob

### Prerequisite 2: Set paths (Absolute)
Make sure that your file structure looks like this
+ *"unique name for source of data (i.e. Macaulay, XC, field)"* ....... *folder*
    + csv files ....... *folder*
    + tsv files ....... *folder*
        + file1.wav
        + file2.wav
        + ...
        + fileN.wav


The `tsv files` folder should contain all of your labels exported from Audacity.  
The `csv files` folder should be COMPLETELY EMPTY; this notebook will populate it with temporary CSVs.

In [None]:
# Folder whose entries are listed and converted in Step 1.
pathtsv = "/home/dah238/Kauai-Amakihi/Annotations/macaulay/tsv files"
# Working folder: Step 1 writes the converted CSVs here, Step 2 writes the
# *_reform.csv files here, and Step 3 reads the *_reform.csv files from here.
pathcsv = "/home/dah238/Kauai-Amakihi/Annotations/macaulay/csv files"
# Audio folder handed to formatting_csv() by the Step 2 loop.
audiofolder = "/home/dah238/Kauai-Amakihi/Audio/Macaulay Focal Recordings"
# Destination of the combined CSV written in Step 3.
save_path = '/home/dah238/Kauai-Amakihi/Annotations/macaulay/combined_output_macaulay_v3.csv'
# NOTE(review): a second audio path whose value differs from `audiofolder`
# above (parent directory vs. the Macaulay sub-folder) -- confirm which one
# each step is meant to search.
audio_folder = '/home/dah238/Kauai-Amakihi/Audio'

### Step 1: Convert audacity generated TSV files to CSV files

In [None]:
# Create the CSV output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(pathcsv, exist_ok=True)
def tsv_to_csv_pandas(tsv_file, csv_file):
    """Convert one tab-separated label file into a header-less CSV file.

    The input is treated as having no header row, so every line -- including
    the first -- is carried over as data.  (Letting pandas promote the first
    line to column names corrupts it: duplicate values such as a point label
    ``5.0<TAB>5.0<TAB>x`` are renamed to ``5.0.1`` and an empty label becomes
    ``Unnamed: 2``.)  This matches Step 2, which reads the CSV back with
    ``header=None`` and assigns the column names itself.

    Args:
        tsv_file: Path of the tab-separated file to read.
        csv_file: Path of the comma-separated file to write.

    Returns:
        None.  Files with no content are skipped with a printed notice and
        no CSV is written for them.
    """
    if os.path.getsize(tsv_file) == 0:
        print(f"Skipped empty file: {tsv_file}")
        return

    try:
        df = pd.read_csv(tsv_file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Non-zero size but only blank lines: nothing to convert.
        print(f"Skipped empty file: {tsv_file}")
        return

    # index=False / header=False: write the rows only, no synthetic index
    # column and no synthetic 0,1,2 header line.
    df.to_csv(csv_file, index=False, header=False)


# Convert every label file found in pathtsv into a CSV of the same stem in
# pathcsv.  Sorted so the log output is in a stable order.
dir_list = sorted(os.listdir(pathtsv))

for item in dir_list:
    tsv_path = os.path.join(pathtsv, item)

    # Sub-folders and hidden files (e.g. .DS_Store) are not label files.
    if item.startswith('.') or not os.path.isfile(tsv_path):
        continue

    # splitext drops the real extension whatever its length; slicing off a
    # fixed four characters mangles names such as "rec.label" or "rec".
    stem = os.path.splitext(item)[0]
    is_empty = os.path.getsize(tsv_path) == 0

    # The converter prints its own "Skipped empty file" notice.
    tsv_to_csv_pandas(tsv_path, os.path.join(pathcsv, f'{stem}.csv'))

    # Only claim success when a CSV was actually produced.
    if not is_empty:
        print(f"Conversion complete: {item} converted to {stem}.csv using pandas")

### Step 2: Format the CSVs to include file path in the CSV

In [None]:
def _find_audio_file(folder, base_name, extensions=('wav', 'm4a', 'mp3')):
    """Return the audio file under *folder* that best matches *base_name*, or None.

    The folder is searched recursively.  A file whose stem equals *base_name*
    (ignoring case) wins; otherwise the first file whose stem merely contains
    *base_name* is returned, trying *extensions* in the given order.
    Candidates are visited in sorted order so the choice is reproducible.
    """
    wanted = base_name.lower()
    fallback = None
    for ext in extensions:
        # Escape both parts: "[", "]", "*" and "?" in a folder or file name
        # would otherwise be interpreted as glob wildcards.
        pattern = os.path.join(
            glob.escape(folder), '**', f'*{glob.escape(base_name)}*.{ext}'
        )
        for candidate in sorted(glob.glob(pattern, recursive=True)):
            stem = os.path.splitext(os.path.basename(candidate))[0].lower()
            if stem == wanted:
                return candidate
            if fallback is None and wanted in stem:
                fallback = candidate
    return fallback


def formatting_csv(csv_file, audiofolder):
    """Add column names and the matching audio path to one annotation CSV.

    The CSV is read without a header row; its first three columns are named
    ``start_time``, ``end_time`` and ``annotation``.  A ``file`` column is
    added holding the absolute path of the audio file found under
    *audiofolder* for this CSV's base name.  When no audio file is found a
    warning is printed and the placeholder ``<audiofolder>/<base>.UNKNOWN``
    is stored instead.  The result is written to
    ``<pathcsv>/<base>_reform.csv`` (``pathcsv`` is the module-level working
    folder).

    Args:
        csv_file: Path of the header-less annotation CSV to reformat.
        audiofolder: Root folder that is searched recursively for the audio.

    Returns:
        None.
    """
    # Read the CSV and rename columns
    df = pd.read_csv(csv_file, header=None)
    df.rename(columns={0: 'start_time', 1: 'end_time', 2: 'annotation'}, inplace=True)

    base_name = os.path.splitext(os.path.basename(csv_file))[0]

    # Search the folder the caller passed in, not an unrelated global.
    matched_file = _find_audio_file(audiofolder, base_name)

    # Handle matched/unmatched audio
    if matched_file:
        audio_path = os.path.abspath(matched_file)
    else:
        print(f"‚ö†Ô∏è No match found for {base_name}")
        audio_path = os.path.join(audiofolder, f"{base_name}.UNKNOWN")

    df['file'] = audio_path

    # Define output directory
    output_dir = pathcsv
    os.makedirs(output_dir, exist_ok=True)

    # Save file
    output_path = os.path.join(output_dir, f"{base_name}_reform.csv")
    df.to_csv(output_path, index=False)
    print(f"‚úÖ Saved reformatted CSV: {output_path}")


# === Main section ===
# Run formatting_csv() on every raw CSV in pathcsv.

if not os.path.isdir(pathcsv):
    raise FileNotFoundError(f"‚ùå Directory not found: {pathcsv}")

# Leave out *_reform.csv files: they are this step's own output, so picking
# them up on a re-run would create *_reform_reform.csv files whose header
# line is read back as an annotation row.
dir_list = sorted(
    f for f in os.listdir(pathcsv)
    if f.endswith('.csv') and not f.endswith('_reform.csv')
)

if not dir_list:
    print(f"‚ö†Ô∏è No CSV files found in {pathcsv}")
else:
    for item in dir_list:
        full_path = os.path.join(pathcsv, item)
        formatting_csv(full_path, audiofolder)
        print(f"Conversion complete: {item}")

### Optional Step: Remove all old CSV files

In [None]:
# === Remove every CSV under pathcsv that is not a *_reform.csv ===
print("\nüßπ Cleaning up non-reformatted CSV files...")

deleted_count = 0
for folder, _subdirs, names in os.walk(pathcsv):
    # Intermediate files: any .csv that does not carry the _reform suffix.
    leftovers = [
        name for name in names
        if name.endswith('.csv') and not name.endswith('_reform.csv')
    ]
    for name in leftovers:
        full_path = os.path.join(folder, name)
        try:
            os.remove(full_path)
        except Exception as e:
            # Report and carry on with the remaining files.
            print(f"‚ùå Error deleting {full_path}: {e}")
        else:
            deleted_count += 1
            print(f"üóëÔ∏è Deleted: {full_path}")

print(f"\n‚úÖ Cleanup complete. {deleted_count} original CSV file(s) deleted.")

### Step 3: Combine all Reformatted (with file path column and headers) CSVs into one combined output

In [None]:
# Concatenate every *_reform.csv in pathcsv into one table, rename its
# "file" column to "audio_file", and write the result to save_path.
pattern = os.path.join(pathcsv, '*_reform.csv')
# glob returns matches in arbitrary order; sort so the combined CSV has the
# same row order on every run and every machine.
reform_files = sorted(glob.glob(pattern))

if not reform_files:
    print(f"‚ö†Ô∏è No reformatted CSV files found in {pathcsv}.")
else:
    df_list = []
    for f in reform_files:
        # Best effort: an unreadable file is reported and skipped so the
        # remaining files are still combined.
        try:
            df = pd.read_csv(f)
            df_list.append(df)
            print(f"üìÑ Added: {os.path.basename(f)}")
        except Exception as e:
            print(f"‚ùå Error reading {f}: {e}")

    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        combined_df = combined_df.rename(columns={"file": "audio_file"})

        combined_df.to_csv(save_path, index=False)

        print(f"\n‚úÖ Combined CSV saved to: {save_path}")
        print(f"üìä Total rows: {len(combined_df)} across {len(df_list)} files")
    else:
        print("‚ö†Ô∏è No valid data frames were loaded. Combined file not created.")

### Optional Step: Remove all formatted CSVs now that combined_output.csv has been created

In [None]:
# === Delete the *_reform.csv files once the combined CSV exists ===
print("\nüßπ Cleaning up reformatted CSV files...")

deleted_count = 0
if not os.path.isfile(save_path):
    # The per-file CSVs are the only copy of the reformatted data until the
    # combined file has been written, so refuse to delete them before that.
    print(f"Combined CSV not found at {save_path}; reformatted CSV files were kept.")
else:
    for root, dirs, files in os.walk(pathcsv):
        for file in files:
            if file.endswith('_reform.csv'):
                full_path = os.path.join(root, file)
                try:
                    os.remove(full_path)
                    deleted_count += 1
                    print(f"üóëÔ∏è Deleted: {full_path}")
                except OSError as e:
                    # Report and carry on with the remaining files.
                    print(f"‚ùå Error deleting {full_path}: {e}")

print(f"\n‚úÖ Cleanup complete. {deleted_count} reformatted CSV file(s) deleted.")