In [1]:
import numpy as np
import pandas as pd
import json
import os

In [2]:
# Strong-label annotations downloaded from
# https://research.google.com/audioset/download_strong.html
# The file is tab-separated, one labelled event per row.
X = pd.read_csv(filepath_or_buffer='audioset_train_strong.tsv', delimiter='\t')

In [3]:
# from https://github.com/audioset/ontology
# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform's default text encoding (which is not UTF-8 everywhere).
with open("ontology.json", "r", encoding="utf-8") as read_file:
    # data is a list of dictionaries, one per ontology entry
    data = json.load(read_file)

In [4]:
def translate(x, ontology=None):
    """Return the cleaned ontology name for the label stored in one row.

    The first ontology entry whose ``"id"`` equals ``x['label']`` is used; its
    ``"name"`` is returned with spaces removed and commas replaced by
    underscores (``"A b, c"`` becomes ``"Ab_c"``).

    Parameters
    ----------
    x : mapping-like
        Anything indexable by ``'label'`` (e.g. a DataFrame row or a dict).
    ontology : list of dict, optional
        Entries to search, each with ``"id"`` and ``"name"`` keys. When
        omitted, the module-level ``data`` loaded from ontology.json is used.

    Returns
    -------
    str
        The cleaned name, or ``'N/A'`` when the row has no usable label, no
        entry matches, or the matching entry is malformed.
    """
    # Resolved outside the try block on purpose: if ``data`` was never loaded
    # the NameError should surface instead of silently labelling every row 'N/A'.
    entries = data if ontology is None else ontology
    try:
        label = x['label']
        match = next(item for item in entries if item["id"] == label)
        return match['name'].replace(" ", "").replace(",", "_")
    except (StopIteration, LookupError, TypeError, AttributeError):
        # Only lookup-related failures are treated as "unknown label"; a bare
        # ``except`` here would also swallow KeyboardInterrupt, making the
        # long-running apply() over the whole frame impossible to interrupt.
        return 'N/A'

In [5]:
# A demo of translate's operation. The lookup goes through translate itself
# (it only needs something indexable by 'label'), so this demo cannot drift
# away from what the function actually does.
translate({'label': '/m/0cmf2'})

'Fixed-wingaircraft_airplane'

In [6]:
# The time intervals are listed in the tsv file next to their corresponding ids;
# taking them from the tsv is assumed to be easier than splitting each youtube
# link name (in the json) into parts.
# Built column-wise with zip instead of a row-by-row apply(axis=1), which is slow
# on a large frame; each cell is still a (start, end) tuple.
X['time_interval'] = list(zip(X['start_time_seconds'], X['end_time_seconds']))

# Link = "youtu.be/" + segment_id without its trailing "_<suffix>" part.
# Only the last underscore is cut, because the id itself may contain underscores.
# An id with no underscore at all is kept whole rather than becoming an empty link.
X['link'] = "youtu.be/" + X['segment_id'].str.rsplit('_', n=1).str[0]
X.head()

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,time_interval,link
0,b0RFKhbpFJA_30000,0.0,10.0,/m/03m9d0z,"(0.0, 10.0)",youtu.be/b0RFKhbpFJA
1,b0RFKhbpFJA_30000,4.753,5.72,/m/05zppz,"(4.753, 5.72)",youtu.be/b0RFKhbpFJA
2,b0RFKhbpFJA_30000,0.0,10.0,/m/07pjwq1,"(0.0, 10.0)",youtu.be/b0RFKhbpFJA
3,b0RFKhbpFJA_30000,6.899,7.01,/m/07qjznt,"(6.899, 7.01)",youtu.be/b0RFKhbpFJA
4,b0RFKhbpFJA_30000,8.534,9.156,/t/dd00092,"(8.534, 9.156)",youtu.be/b0RFKhbpFJA


In [7]:
# Scanning the whole ontology list once per row is what made this cell take a
# long time. Build the id -> cleaned-name table once and map every label
# through it instead.
id_to_name = {}
for item in data:
    # setdefault keeps the first entry seen for an id, matching a first-match scan
    id_to_name.setdefault(item["id"], item["name"].replace(" ", "").replace(",", "_"))

# Labels that are not in the ontology come back as NaN from map(); mark them 'N/A'.
X['name'] = X['label'].map(id_to_name).fillna('N/A')
X.head()

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,time_interval,link,name
0,b0RFKhbpFJA_30000,0.0,10.0,/m/03m9d0z,"(0.0, 10.0)",youtu.be/b0RFKhbpFJA,Wind
1,b0RFKhbpFJA_30000,4.753,5.72,/m/05zppz,"(4.753, 5.72)",youtu.be/b0RFKhbpFJA,Malespeech_manspeaking
2,b0RFKhbpFJA_30000,0.0,10.0,/m/07pjwq1,"(0.0, 10.0)",youtu.be/b0RFKhbpFJA,Buzz
3,b0RFKhbpFJA_30000,6.899,7.01,/m/07qjznt,"(6.899, 7.01)",youtu.be/b0RFKhbpFJA,Tick
4,b0RFKhbpFJA_30000,8.534,9.156,/t/dd00092,"(8.534, 9.156)",youtu.be/b0RFKhbpFJA,Windnoise(microphone)


In [8]:
# Names of the sound classes to keep. They are compared against the 'name'
# column, so each entry is written in the cleaned form (no spaces, commas
# turned into underscores).
classes = [
    "Motorboat_speedboat",
    "Racecar_autoracing",
    "Carpassingby",
    "Tiresqueal",
    "Caralarm",
    "Vehiclehorn_carhorn_honking",
    "Airbrake",
    "Airhorn_truckhorn",
    "Bus",
    "Motorcycle",
    "Trafficnoise_roadwaynoise",
    "Railroadcar_trainwagon",
    "Trainwheelssquealing",
    "Helicopter",
    "Fixed-wingaircraft_airplane",
    "Lightengine(highfrequency)",
    "Mediumengine(midfrequency)",
    "Heavyengine(lowfrequency)",
]

In [9]:
# Keep only the rows whose name is one of the selected classes. isin builds the
# boolean mask in one vectorized call instead of a Python loop over row positions.
Y = X[X['name'].isin(classes)]
Y.head()

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,time_interval,link,name
255,3zOfKHtXxws_30000,4.181,4.75,/m/02mfyn,"(4.181, 4.75)",youtu.be/3zOfKHtXxws,Caralarm
269,-x1F1bVe6JM_30000,1.809,10.0,/t/dd00066,"(1.809, 10.0)",youtu.be/-x1F1bVe6JM,Mediumengine(midfrequency)
376,YRS0I6WrjOs_30000,0.0,10.0,/t/dd00066,"(0.0, 10.0)",youtu.be/YRS0I6WrjOs,Mediumengine(midfrequency)
466,18PPxEB6Cb4_540000,0.0,10.0,/m/02rlv9,"(0.0, 10.0)",youtu.be/18PPxEB6Cb4,Motorboat_speedboat
476,1MjtxzIAz4s_30000,0.0,10.0,/t/dd00065,"(0.0, 10.0)",youtu.be/1MjtxzIAz4s,Lightengine(highfrequency)


In [21]:
# Number of rows per class name (len of each group's 'link' column).
Y.groupby(by='name').aggregate({'link': len})

Unnamed: 0_level_0,link
name,Unnamed: 1_level_1
Airbrake,503
Airhorn_truckhorn,926
Bus,365
Caralarm,630
Carpassingby,390
Fixed-wingaircraft_airplane,243
Heavyengine(lowfrequency),583
Helicopter,396
Lightengine(highfrequency),153
Mediumengine(midfrequency),2142


In [16]:
# Collapse to one row per segment_id: gather that segment's time intervals and
# names into lists, and reduce its links with np.unique.
df = Y.groupby(by='segment_id').aggregate({
    'time_interval': list,
    'name': list,
    'link': np.unique,
})
# Write the per-segment table to disk.
df.to_csv(path_or_buf='clean_download.csv')

In [18]:
def x(a):
    """Return ``a`` plus 2."""
    return a + 2

x(3)

5