# Building validation dataset for vocals classification

In [1]:
from typing import List, Set
import requests
import itertools

In [2]:
import musicbrainzngs
musicbrainzngs.set_useragent("research", 1.0)

In [3]:
def get_recordings_from_artists(mbids: List[str]) -> Set[str]:
    recordings = set()
    for artist in mbids:
        resp = musicbrainzngs.browse_recordings(artist=artist)
        for recording in resp["recording-list"]:
            recordings.add(recording["id"])
    return recordings

def get_recordings_from_releases(mbids: List[str]) -> Set[str]:
    recordings = set()
    for release in mbids:
        resp = musicbrainzngs.browse_recordings(release=release)
        for recording in resp["recording-list"]:
            recordings.add(recording["id"])
    return recordings

In [4]:
def is_in_acousticbrainz(recording_mbid: str) -> bool:
    resp = requests.get("https://acousticbrainz.org/api/v1/%s/count" % recording_mbid).json()
    return resp["count"] > 0

In [5]:
artists_with = [
    "6bbb3983-ce8a-4971-96e0-7cae73268fc4",  # Jungle
    "debabff3-2559-46e5-862d-ef2a906d7010",  # Daryl Hall and John Oates
    "31a4c5ca-1899-4a44-a4b1-31e1921ddf17",  # Daryl Hall
    "d1fc999f-6184-41a6-bcb1-7c59bf74a6e1",  # K.Flay
]
releases_with = [
    "69f69c69-9018-45c9-a245-1b993df9919c",  # Furniteur - Furniteur
    "aaa0a2b0-e48f-42af-8cd2-7e6b09d2240c",  # Calvin Harris - Motion
    "98215ea8-f57d-49f6-8a77-cad6957b1181",  # Ellie Goulding - Halcyon Days
    "eed328a1-5f3c-33d5-bf9e-b9033175124c",  # Rammstein - Mutter
    "f3bfed3d-c1d2-3599-b6d7-3916be6c53dc",  # Rammstein - Rosenrot
    "3b87c7dd-2e90-42da-bc4a-ebadf91161ec",  # Bon Iver - Bon Iver, Bon Iver
    "7755956b-7886-40fc-8474-a0d86d08fa06",  # Poe - Haunted
]

artists_without = [
    "147ad01e-3496-44a0-b77b-55e089759b3c",  # Jón Hallur Haraldsson
    "7eefe357-3aaa-4d89-b530-3b131cca1b35",  # Ludique
    "a2af1f31-c9eb-4fff-990c-c4f547a11b75",  # Solar Fields
    "162c7a95-9b96-4b7a-bac2-6f2c2e2357e7",  # M|O|O|N
    "8208d8f9-61b6-47f2-90c0-2680dadd56a8",  # Stellardrone
]
releases_without = [
    "863857c5-a6b4-43d6-97be-2f39b7e75d5f",  # Moderat - III (Instrumentals)
    "a44cd2fc-f8f0-42a4-aa45-743223e14642",  # Hotline Miami 2: Wrong Number
    "f2d22a23-fa13-4248-a4a3-855468609de4",  # Hotline Miami - Official Soundtrack
]

In [6]:
with_vocals = {
    "name": "With vocals",
    "recordings": set(list(filter(
        is_in_acousticbrainz, 
        set(itertools.chain(
            get_recordings_from_artists(artists_with),
            get_recordings_from_releases(releases_with),
        ))
    ))),
}
without_vocals = {
    "name": "Without vocals",
    "recordings": set(list(filter(
        is_in_acousticbrainz, 
        set(itertools.chain(
            get_recordings_from_artists(artists_without),
            get_recordings_from_releases(releases_without),
        ))
    ))),
}

In [7]:
with open("dataset.csv", "w") as f:

    # With vocals
    for mbid in with_vocals["recordings"]:
        f.write("%s,%s\n" % (mbid, with_vocals["name"]))
    
    # Without vocals
    for mbid in without_vocals["recordings"]:
        f.write("%s,%s\n" % (mbid, without_vocals["name"]))

### Stats

In [8]:
print("Recordings with vocals:", len(with_vocals["recordings"]))
print("Recordings without vocals:", len(without_vocals["recordings"]))

Recordings with vocals: 113
Recordings without vocals: 105
