In [24]:
import sys
import os
import pandas as pd

TUT_CSV = 'Datasets/TUT18_train.csv'
SCAPPER_CSV = 'Datasets/scrapper_train_dataset.csv'
AUDIOSET_CSV = 'class_labels_indices.csv'

TUT_DF = pd.read_csv(TUT_CSV)
SCAPPER_DF = pd.read_csv(SCAPPER_CSV)
AUDIOSET_DF = pd.read_csv(AUDIOSET_CSV)


In [28]:
scapper_scenes = SCAPPER_DF['acoustic_scene_label'].unique()
tut_scenes = TUT_DF['labels'].unique()
audioset_events = AUDIOSET_DF['display_name'].to_list()

In [20]:
import re
pattern = r'[^a-zA-Z0-9\s]'
event_label_list = []
scapper_unique_events = set()

for i in range (len(SCAPPER_DF['events_label_list'])):
    event_list = re.sub(pattern, '', SCAPPER_DF['events_label_list'][i])
    event_label_list.append(event_list.split(' '))

    scapper_unique_events.update(event_list.split(' '))

In [29]:
from sentence_transformers import SentenceTransformer
language_model = SentenceTransformer('all-MiniLM-L6-v2')

scapper_scenes_semantic_embeddings = language_model.encode(scapper_scenes)
tut_scenes_semantic_embeddings = language_model.encode(tut_scenes)
scapper_event_semantic_embeddings = language_model.encode(list(scapper_unique_events))
audioset_event_semantic_embeddings = language_model.encode(audioset_events)


In [55]:
from sklearn.metrics.pairwise import cosine_distances

def get_embeddings_distance(label1, label2):
    semantic_embeddings1 = np.reshape(language_model.encode(label1), (1, -1))
    semantic_embeddings2 = np.reshape(language_model.encode(label2), (1, -1))
    return cosine_distances(semantic_embeddings1, semantic_embeddings2)

In [58]:
scapper_tut_scenes_df = pd.DataFrame(index=scapper_scenes, columns=tut_scenes)

for i in range(len(tut_scenes)):
    for j in range(len(scapper_scenes)):
        cosine_distanc = get_embeddings_distance(tut_scenes[i], scapper_scenes[j])
        scapper_tut_scenes_df[tut_scenes[i]][scapper_scenes[j]] = cosine_distanc


In [59]:
scapper_tut_scenes_df

Unnamed: 0,airport,bus,metro,metro_station,park,public_square,shopping_mall,street_pedestrian,street_traffic,tram
bus,[[0.56299496]],[[0.0]],[[0.63388085]],[[0.63291025]],[[0.57808065]],[[0.77133995]],[[0.7124024]],[[0.6849394]],[[0.61847985]],[[0.62227905]]
busystreet,[[0.7786833]],[[0.7707808]],[[0.780583]],[[0.77740395]],[[0.81745017]],[[0.7635962]],[[0.80434763]],[[0.7257683]],[[0.7191614]],[[0.7486269]]
office,[[0.6667457]],[[0.6474495]],[[0.70749116]],[[0.7138093]],[[0.66423905]],[[0.6953491]],[[0.7377806]],[[0.7137201]],[[0.7505362]],[[0.7773689]]
openairmarket,[[0.57567334]],[[0.7776028]],[[0.84255636]],[[0.8416214]],[[0.8504772]],[[0.7633233]],[[0.7735147]],[[0.8848965]],[[0.8722714]],[[0.72082406]]
park,[[0.666398]],[[0.57808065]],[[0.68195534]],[[0.6800808]],[[0.0]],[[0.7549496]],[[0.66364014]],[[0.6830393]],[[0.6923012]],[[0.6651525]]
quietstreet,[[0.83606726]],[[0.8781643]],[[0.8349311]],[[0.8735795]],[[0.8455584]],[[0.75372744]],[[0.84339213]],[[0.7419493]],[[0.84801966]],[[0.79420066]]
restaurant,[[0.6609124]],[[0.54670084]],[[0.66154945]],[[0.66234535]],[[0.5512289]],[[0.80988026]],[[0.6068625]],[[0.76710606]],[[0.81833196]],[[0.7055687]]
supermarket,[[0.71209264]],[[0.6994479]],[[0.71578526]],[[0.75346863]],[[0.72066957]],[[0.8455949]],[[0.42583978]],[[0.7846128]],[[0.8210941]],[[0.6781119]]
tube,[[0.7166865]],[[0.6667471]],[[0.88900834]],[[0.86957026]],[[0.82274616]],[[0.79702944]],[[0.853956]],[[0.8693592]],[[0.8263468]],[[0.7189609]]
tubestation,[[0.8214318]],[[0.65559435]],[[0.95246065]],[[0.9118539]],[[0.90046316]],[[0.9227542]],[[0.9588466]],[[0.8882047]],[[0.86309165]],[[0.79150736]]


In [66]:
scapper_tut_scenes_df.to_csv('scrapper_tut_scenes_cosineDistance.csv', index=False)

In [61]:
scapper_audioset_df = pd.DataFrame(index=audioset_events, columns=list(scapper_unique_events))

for i in range(len(list(scapper_unique_events))):
    for j in range(len(audioset_events)):
        cosine_distanc = get_embeddings_distance(list(scapper_unique_events)[i], audioset_events[j])
        scapper_audioset_df[list(scapper_unique_events)[i]][audioset_events[j]] = cosine_distanc


In [67]:
scapper_audioset_df.to_csv('scrapper_audioset_events_cosineDistance.csv', index=False)

In [49]:

import numpy as np

cosine_distances(np.reshape(scapper_scenes_semantic_embeddings[3], (1, -1)), np.reshape(tut_scenes_semantic_embeddings[2], (1, -1)))

array([[0.8425564]], dtype=float32)

In [50]:
scapper_scenes[3], tut_scenes[2]

('openairmarket', 'metro')