# Regularize and Clean AudioSet Training and Evaluation Sets

In [1]:
import pandas as pd
import csv
import os

### Utils

In [4]:
def process_csv(input, folder):
    """
    Regularizes a segments CSV so that only the first three commas of each row act as delimiters.

    The first two lines of the file are discarded, the third line is kept unchanged as the
    header, and in every later line each comma after the third one is replaced by ';'.
    Empty lines are skipped. The result is written to '<folder>/<input stem>_reg.csv'.
    The header and the complete list of output lines are also printed.

    Parameters:
    - input: str, the name of the CSV file to read
    - folder: str, the name of the folder where the output CSV file will be written (created if missing)
    """
    header_index = 2      # zero-based index of the column-header line; earlier lines are dropped
    delimiter_commas = 3  # commas that stay as real field separators in a data row

    os.makedirs(folder, exist_ok=True)

    lines_to_write = []
    # Read raw lines instead of parsing fields: the commas are rewritten here, so no CSV
    # dialect should get the chance to split, truncate or unquote a row beforehand.
    with open(input, "r") as f:
        for i, raw in enumerate(f):
            text = raw.rstrip("\n")
            if i < header_index or not text:
                continue
            if i == header_index:
                # Printed as a one-element list to keep the existing output format.
                print([text])
                lines_to_write.append(text)
                continue
            # Everything after the third comma is the label list; its commas become ';'.
            # With fewer than three commas the last piece has no comma left, so this is a no-op.
            *fields, labels = text.split(",", delimiter_commas)
            lines_to_write.append(",".join([*fields, labels.replace(",", ";")]))

    print(lines_to_write)

    output = os.path.join(folder, f"{os.path.splitext(os.path.basename(input))[0]}_reg.csv")

    with open(output, "w") as fout:
        fout.writelines(f"{ltw}\n" for ltw in lines_to_write)

def clean_csv(input, folder, remove_labs):
    """
    Cleans the specified CSV file by removing rows whose ' positive_labels' cell holds an unwanted label.

    Each cell is split on ';' or ',' and the pieces are compared with `remove_labs` as whole
    labels (surrounding spaces and double quotes ignored), so removing '/m/09x0r' does not
    also drop a row whose only label is a longer ID such as '/m/09x0rz'. Rows with a missing
    ' positive_labels' value are removed as well. The result is saved as
    '<folder>/<input stem>_clean.csv' and a confirmation message is printed.

    Parameters:
    - input: string, the name of the CSV file to clean
    - folder: string, the folder to save the cleaned CSV file (created if missing)
    - remove_labs: list, a list of strings to filter out from 'positive_labels'
    """
    os.makedirs(folder, exist_ok=True)
    df = pd.read_csv(input, quotechar='"')

    # Normalise the labels to drop the same way the cell pieces are normalised below.
    unwanted = {str(lab).strip(' "') for lab in remove_labs}

    def _is_clean(cell):
        # A missing cell carries no usable labels, so the row is discarded.
        if pd.isna(cell):
            return False
        found = {piece.strip(' "') for piece in str(cell).replace(',', ';').split(';')}
        return unwanted.isdisjoint(found)

    # Explicit boolean Series aligned to the frame's index, so an empty frame filters cleanly too.
    mask = pd.Series(
        [_is_clean(cell) for cell in df[' positive_labels']],
        index=df.index,
        dtype=bool,
    )
    cleaned = df[mask]

    # Save to CSV
    output = os.path.join(folder, f"{os.path.splitext(os.path.basename(input))[0]}_clean.csv")
    cleaned.to_csv(output, index=False)
    print(f"Saved successfully in '{folder}' as '{os.path.basename(output)}'.")

In [5]:
# Directory that receives both the regularized (*_reg.csv) and cleaned (*_clean.csv) files
output_folder = "trains_evals"

In [6]:
# Base names of the segment files to run through the two-step pipeline
segment_sets = ("eval_segments", "balanced_train_segments")
speech_and_music = ['/m/09x0r', '/m/04rlf']  # speech & music

# Step 1: regularize every raw file first
for set_name in segment_sets:
    process_csv(f"{set_name}.csv", output_folder)

# Step 2: filter the regularized files, in the same order
for set_name in segment_sets:
    clean_csv(f'{output_folder}/{set_name}_reg.csv', output_folder, remove_labs=speech_and_music)

['# YTID, start_seconds, end_seconds, positive_labels']
['# YTID, start_seconds, end_seconds, positive_labels', '--4gqARaEJE, 0.000, 10.000, "/m/068hy;/m/07q6cd_;/m/0bt9lr;/m/0jbk"', '--BfvyPmVMo, 20.000, 30.000, "/m/03l9g"', '--U7joUcTCo, 0.000, 10.000, "/m/01b_21"', '--i-y1v8Hy8, 0.000, 9.000, "/m/04rlf;/m/09x0r;/t/dd00004;/t/dd00005"', '-0BIyqJj9ZU, 30.000, 40.000, "/m/07rgt08;/m/07sq110;/t/dd00001"', '-0CamVQdP_Y, 0.000, 6.000, "/m/04rlf;/m/07pbtc8;/m/09x0r"', '-0Gj8-vB1q4, 30.000, 40.000, "/m/0140xf;/m/02cjck;/m/04rlf"', '-0RWZT-miFs, 420.000, 430.000, "/m/03v3yw;/m/0k4j"', '-0YUDn-1yII, 30.000, 40.000, "/m/02cjck;/m/04rlf"', '-0jeONf82dE, 21.000, 31.000, "/m/03k3r;/m/04rlf;/m/07q5rw0;/m/09x0r;/m/0jbk"', '-0nqfRcnAYE, 370.000, 380.000, "/m/04brg2"', '-0p7hKXZ1ww, 30.000, 40.000, "/g/122z_qxw;/m/09x0r"', '-0vPFx-wRRI, 30.000, 40.000, "/m/025_jnm;/m/04rlf"', '-0xzrMun0Rs, 30.000, 40.000, "/m/01g90h;/m/04rlf"', '-0yRK50zyTI, 30.000, 40.000, "/m/07pzfmf;/m/09x0r"', '-116CjQ3MAg, 160.0