In [62]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

classes_labels = {
    0: "Alarm_bell_ringing",
    1: "Blender",
    2: "Cat",
    3: "Dishes",
    4: "Dog",
    5: "Electric_shaver_toothbrush",
    6: "Frying",
    7: "Running_water",
    8: "Speech",
    9: "Vacuum_cleaner",
}

In [3]:
# load ref
ref = pd.read_csv("db/audioset/audioset_train_strong.tsv", sep="\t")
mid2name = pd.read_csv("db/audioset/mid_to_display_name.tsv", sep="\t", names=["mid", "label"])
mid2label = dict()
label2mid = dict()
for mid, label in mid2name.itertuples(index=False):
    mid2label[mid] = label
    label2mid[label] = mid
# load preds
results = np.load("../audioset_preds.npy", allow_pickle=True)
print(len(results))

101161


In [4]:
pred_dict = dict()
for x in results:
    segment_id = x["filename"].split("/")[-1][:-4]
    pred_dict[segment_id] = x["preds"]

In [5]:
n_frames = 156
audioset_preds_by_label = dict()
ref_group_label = ref.groupby(by=["label"])
for mid, g in tqdm(ref_group_label):
    if len(set(g.segment_id)) < 4: # continue if label has few samples
        continue
    label = mid2label[mid]
    audioset_preds_by_label[label] = []
    for segment_id, start, end, _ in g.itertuples(index=False):
        if segment_id in pred_dict:
            preds = pred_dict[segment_id]
            start_frame = round(n_frames * (start / 10))
            end_frame = round(n_frames * (end / 10))
            audioset_preds_by_label[label].append(preds[:, start_frame:end_frame])

100%|██████████| 447/447 [00:01<00:00, 223.86it/s]


In [6]:
audioset_preds_mean = dict()
for label, preds in audioset_preds_by_label.items():
    preds = np.concatenate(preds, -1)
    audioset_preds_mean[label] = preds.mean(-1)


In [7]:
min_thres = 0.3
label_candidates = []
for label, pred_mean in audioset_preds_mean.items():
    if (pred_mean > min_thres).any():
        label_candidates.append(label)
print(len(label_candidates), label_candidates)

107 ['Kettle whistle', 'Toothbrush', 'Sink (filling or washing)', 'Aircraft engine', 'Dial tone', 'Washing machine', 'Drill', 'Conversation', 'Ringtone', 'Laughter', 'Chainsaw', 'Toilet flush', 'Stairs', 'Sine wave', 'Smoke detector, smoke alarm', 'Lawn mower', 'Cat', 'Busy signal', 'Cutlery, silverware', 'Babbling', 'Wind chime', 'Telephone dialing, DTMF', 'Beep, bleep', 'Electric shaver, electric razor', 'Water tap, faucet', 'Propeller, airscrew', 'Hiccup', 'Blender, food processor', 'Narration, monologue', 'Female speech, woman speaking', 'Bell', 'Glass', 'Ice cream truck, ice cream van', 'Bathtub (filling or washing)', 'Hammer', 'Jackhammer', 'Hair dryer', 'Doorbell', 'Jet engine', 'Alarm clock', 'Dishes, pots, and pans', 'Electric toothbrush', 'Battle cry', 'Bark', 'Male speech, man speaking', 'Cupboard open or close', 'Domestic animals, pets', 'Rapping', 'Radio', 'Mechanical bell', 'Sonar', 'Telephone', 'Tuning fork', 'Fill (with liquid)', 'Sizzle', 'Snap', 'Ding', 'Chopping (foo

In [8]:
audioset2desed_list = []
for label in label_candidates:
    preds_mean = audioset_preds_mean[label]
    audioset2desed_list.append(
        {
            "AudioSet": label,
            "DESED": classes_labels[preds_mean.argmax()],
            "pred": max(preds_mean),
            "n_samples": len(set(ref_group_label.get_group(label2mid[label]).segment_id)),
        }
    )
audioset2desed = pd.DataFrame(audioset2desed_list)

In [9]:
audioset2desed = audioset2desed.sort_values(by=["DESED"], ignore_index=True)

In [16]:
audioset2desed.to_csv("desed-lab/data/audioset2desed.tsv", sep="\t", index=False)

In [17]:
audioset2desed.loc[20:40]

Unnamed: 0,AudioSet,DESED,pred,n_samples
20,"Beep, bleep",Alarm_bell_ringing,0.362647,2338
21,Alert,Alarm_bell_ringing,0.360047,27
22,Wind chime,Alarm_bell_ringing,0.546944,205
23,Sine wave,Alarm_bell_ringing,0.370052,385
24,"Telephone dialing, DTMF",Alarm_bell_ringing,0.341993,208
25,Ringtone,Alarm_bell_ringing,0.554473,296
26,Busy signal,Alarm_bell_ringing,0.510701,199
27,"Smoke detector, smoke alarm",Alarm_bell_ringing,0.72201,87
28,Dial tone,Alarm_bell_ringing,0.555186,203
29,Chainsaw,Blender,0.366771,379


In [67]:
selected_audioset_labels = {
    "Alarm_bell_ringing": ["Doorbell", "Telephone bell ringing", "Ding-dong", "Fire alarm", "Chime",
    "Bicycle bell", "Wind chime", "Ringtone", "Busy signal", "Smoke detector, smoke alarm", "Dial tone"],
    "Blender": ["Blender, food processor"],
    "Cat": ["Cat", "Caterwaul", "Meow"], 
    "Dishes": ["Dishes, pots, and pans", "Cutlery, silverware"],
    "Dog": ["Dog", "Bow-wow", "Howl", "Bark"],
    "Electric_shaver_toothbrush": ["Electric shaver, electric razor", "Electric toothbrush"],
    "Frying": ["Frying (food)", "Sizzle"],
    "Running_water": ["Toilet flush"],
    "Speech": ["Female speech, woman speaking", "Male speech, man speaking", "Conversation", "Narration, monologue"],
    "Vacuum_cleaner": ["Vacuum cleaner"]
}
print(sum([len(v) for v in selected_audioset_labels.values()]))

31


In [68]:
df_list = []
for k, labels in selected_audioset_labels.items():
    for label in labels:
        _df = ref_group_label.get_group(label2mid[label])
        _df.label = k
        df_list.append(_df)
df = pd.concat(df_list, axis=0)
df.segment_id = df.segment_id + ".wav"
df = df.rename(columns={"segment_id": "filename", "start_time_seconds": "onset", "end_time_seconds": "offset", "label": "event_label"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [69]:
flist = os.listdir("db/audioset/train_strong")

df_list = []
for filename, g in tqdm(df.groupby("filename")):
    if filename in flist:
        df_list.append(g)
df = pd.concat(df_list, axis=0)
df

100%|██████████| 48423/48423 [00:34<00:00, 1394.48it/s]


Unnamed: 0,filename,onset,offset,event_label
447873,--7UmfOkRbM_30000.wav,0.000,1.906,Speech
447876,--7UmfOkRbM_30000.wav,2.528,4.150,Speech
447878,--7UmfOkRbM_30000.wav,5.110,6.268,Speech
447880,--7UmfOkRbM_30000.wav,6.717,7.795,Speech
447882,--7UmfOkRbM_30000.wav,8.323,10.000,Speech
...,...,...,...,...
746048,zzz3PZXRQ_8_30000.wav,6.530,10.000,Alarm_bell_ringing
746039,zzz3PZXRQ_8_30000.wav,0.000,0.909,Speech
746041,zzz3PZXRQ_8_30000.wav,1.070,1.874,Speech
746046,zzz3PZXRQ_8_30000.wav,3.608,5.547,Speech


In [70]:
df_list = []
for l, g in df.groupby("event_label"):
    if len(g) > 500:
        g = g.sample(500)
    df_list.append(g)
df = pd.concat(df_list, axis=0)
df

Unnamed: 0,filename,onset,offset,event_label
389460,WPXwLt0p4RY_30000.wav,1.174,2.272,Alarm_bell_ringing
95773,9mhdoH0MMM8_400000.wav,8.438,10.000,Alarm_bell_ringing
570441,UxjPjw04DF4_30000.wav,4.808,5.856,Alarm_bell_ringing
280539,9IF2WssLfPk_100000.wav,0.000,2.639,Alarm_bell_ringing
502846,rs9RVyWkHBo_30000.wav,8.819,9.644,Alarm_bell_ringing
...,...,...,...,...
278839,ygTDx1KnhYI_10000.wav,0.000,10.000,Vacuum_cleaner
80583,ygf34D5ZaL4_180000.wav,0.000,10.000,Vacuum_cleaner
335206,zdRjHLR4TzQ_110000.wav,0.000,10.000,Vacuum_cleaner
194658,zmHlSeVAEdw_170000.wav,0.000,10.000,Vacuum_cleaner


In [71]:
for l, g in df.groupby(by="event_label"):
    print(l, len(g))

Alarm_bell_ringing 500
Blender 373
Cat 500
Dishes 500
Dog 500
Electric_shaver_toothbrush 369
Frying 500
Running_water 359
Speech 500
Vacuum_cleaner 294


In [61]:
df.to_csv("db/audioset/audioset2desed_train_strong.tsv", sep="\t", index=False)