In [1]:
import musicbrainzngs
import time
import pandas as pd

In [17]:
def get_tag_artists(tags, artists):
    list_genres = []
    list_artists = [item['artist']['name']  for item in artists['release-group']['artist-credit'] if 'artist' in item]
    if 'tag-list' in tags['release-group']:
        list_genres = [tag['name'] for tag in tags['release-group']['tag-list']]
    return list_genres, list_artists

In [18]:
def get_data(artist, releases):
    rows = []
    for release in releases:
        time.sleep(1)
        tags = musicbrainzngs.get_release_group_by_id(release['id'], includes=["tags"])
        artists = musicbrainzngs.get_release_group_by_id(release['id'], includes=["artists"])
        genre, art = get_tag_artists(tags, artists)
        rows.append({'title':tags['release-group']['title'], 'autor': artist, 'artists':art, 'genre': genre})
    df = pd.DataFrame(rows)
    return df

In [19]:
def get_singles(artist, limit = 100, offset = 0, total_releases = 0):
    releases = []
    # Extraer los primeros 100 registros
    result = musicbrainzngs.browse_release_groups(artist = artist[0], release_type=['single'], limit = limit, offset = offset)
    releases += result['release-group-list']
    total_releases = result['release-group-count'] - limit
    # Mientras haya más registros sin extraer
    while total_releases >= 0:
        offset += limit
        result = musicbrainzngs.browse_release_groups(artist = artist[0], release_type=['single'], limit = limit, offset = offset)
        releases += result['release-group-list']
        total_releases = total_releases - result['release-group-count']
    df = get_data(artist[1], releases)
    return df
    

In [20]:
# Variables
musicbrainzngs.set_useragent('viz', '1.0')
columnas = ['title', 'autor', 'artists', 'genre']
data_songs = pd.DataFrame(columns=columnas)

# Artists
list_artist_id = [("aabb1d9f-be12-45b3-a84d-a1fc3e8181fd", 'Tiësto'),
                  ("302bd7b9-d012-4360-897a-93b00c855680", "David Guetta"),
                  ("91d8e441-73a6-48a6-aed1-2bef4da87799", "Angerfist"),
                  ("a39acc4a-985e-4173-a9ed-f325f2d3bc1c", "Steve Aoki"),
                  ("5dd3cf5b-474d-465a-a539-456d7e125ebc", "Nina Kraviz"),
                  ("477b8c0c-c5fc-4ad2-b5b2-191f0bf2a9df", "Armin van Buuren"),
                  ("170d6637-36e0-49dd-a225-c4fd5b77a285", "Hardwell"),
                  ("897be100-4719-4b8b-8e17-d373ae728e9b", "Miss K8"),
                  ("167f05d9-c9d2-48b5-977d-98d7c78834f8", "Miss Monique"),
                  ("f81ace47-638d-4729-a155-1be48dc2fa48", "Nora en Pure"),
                  ("af6bd973-3e30-4339-94b5-85b139362e6d", "Miss Kittin"),
                  ("a3ee920f-4e7f-4993-8aca-4b8538cfaa4a", "Afrojack"),
                  ("ccb77c78-ffc3-4790-9006-61f2277e9c91", "Scott Brown"),
                  ("026c4d7c-8dfe-46e8-ab14-cf9304d6863d", "Paul Elstak")]
               

for artist in list_artist_id:
    df = get_singles(artist)
    data_songs = pd.concat([data_songs, df], ignore_index = False)

In [40]:
# Eliminar filas sin genero musical
data_songs = data_songs.dropna(subset=['genre'], how='any')
data_songs = data_songs.reset_index(drop=True)

In [41]:
# Convertir palabras compuestas en listas de elementos
data_songs['genre'] = data_songs['genre'].apply(lambda x: [item.strip(' []') for item in x.split(' / ')] if ' / ' in x else x)
data_songs['genre'] = data_songs['genre'].apply(lambda x: [item.strip(' []') for item in x.split('/')] if '/' in x else x)
data_songs['genre'] = data_songs['genre'].apply(lambda x: [item.strip(' []') for item in x.split(' & ')] if ' & ' in x else x)

# Aplanar listas artistas y generos
data_songs = data_songs.explode('artists').explode('genre')
data_songs = data_songs.reset_index(drop = True)

In [42]:
# Normalizacion de los datos
replacement_genre_dict= {
    "hip-hop" : "hip hop",
    "hip-house" : "hip house"
}

data_songs['genre'] = data_songs['genre'].replace(replacement_genre_dict)

# Normalizar datos autores
replacement_artist_dict = {
    "DJ Tiësto": "Tiësto",
    "DJ Scott Brown": "Scott Brown",
    "DJ Paul Elstak": "Paul Elstak",
    "DJ Paul": "Paul Elstak"
}
data_songs['autor'] = data_songs['autor'].replace(replacement_artist_dict)
data_songs['artists'] = data_songs['artists'].replace(replacement_artist_dict)
data_songs.head()

Unnamed: 0,title,autor,artists,genre,artist_gender
0,The Tube,Tiësto,Tiësto,electronic,male
1,The Tube,Tiësto,Tiësto,trance,male
2,Sparkles,Tiësto,Tiësto,electronic,male
3,Sparkles,Tiësto,Tiësto,trance,male
4,Theme From Norefjell,Tiësto,Tiësto,electronic,male


In [43]:
data_songs.to_csv("data_songs.csv", index=False)

In [25]:
# Obtener el sexo de los artistas
artists_sex = {}
artists_country = {}
all_artists = data_songs['artists'].unique()
for name in all_artists:
    time.sleep(1)
    res = musicbrainzngs.search_artists(artist = name)
    type_value = res['artist-list'][0].get('type', None)
    if type_value == 'Person':
        # Si 'type' es 'Person', obtenemos el género
        artists_sex[name] = res['artist-list'][0].get('gender', 'unknown')
    else:
        # En otros casos o si 'type' no está presente, asignamos 'unknown'
        artists_sex[name] = 'unknown'

In [44]:
# Crear dataframe sexo y concatenar al original
artists_sex_df = pd.DataFrame(list(artists_sex.items()), columns=['artists', 'artist_gender'])
data_songs = pd.merge(data_songs, artists_sex_df, on ='artists', how='left')

In [45]:
# Eliminar filas sin genero musical
data_songs = data_songs.dropna(subset=['genre'], how='any')
data_songs = data_songs.reset_index(drop=True)

Unnamed: 0,title,autor,artists,genre,artist_gender_x,artist_gender_y
0,The Tube,Tiësto,Tiësto,electronic,male,male
1,The Tube,Tiësto,Tiësto,trance,male,male
2,Sparkles,Tiësto,Tiësto,electronic,male,male
3,Sparkles,Tiësto,Tiësto,trance,male,male
4,Theme From Norefjell,Tiësto,Tiësto,electronic,male,male
...,...,...,...,...,...,...
3199,Mega Mix 2004,Paul Elstak,Paul Elstak,gabber,male,male
3200,Mega Mix 2004,Paul Elstak,Paul Elstak,happy hardcore,male,male
3201,Mega Mix 2004,Paul Elstak,Paul Elstak,hardcore,male,male
3202,The Promised Land (The Viper remix),Paul Elstak,Paul Elstak,dance,male,male


In [23]:
data = pd.read_csv("data_songs.csv")
data = data.rename(columns={'autor':'id'})
data

Unnamed: 0,title,id,artists,genre,artist_gender
0,The Tube,Tiësto,Tiësto,electronic,male
1,The Tube,Tiësto,Tiësto,trance,male
2,Sparkles,Tiësto,Tiësto,electronic,male
3,Sparkles,Tiësto,Tiësto,trance,male
4,Theme From Norefjell,Tiësto,Tiësto,electronic,male
...,...,...,...,...,...
3199,Mega Mix 2004,Paul Elstak,Paul Elstak,gabber,male
3200,Mega Mix 2004,Paul Elstak,Paul Elstak,happy hardcore,male
3201,Mega Mix 2004,Paul Elstak,Paul Elstak,hardcore,male
3202,The Promised Land (The Viper remix),Paul Elstak,Paul Elstak,dance,male


## Nodos

In [87]:
# Artistas
artists_df = pd.DataFrame(data['artists'].unique(), columns=['id'])

# Union dataframes
artists_nodes = pd.merge(artists_df, data[['artists', 'artist_gender']], left_on='id', right_on='artists').drop(columns=['artists'])
artists_nodes = artists_nodes.drop_duplicates().reset_index(drop=True)
artists_nodes = artists_nodes.rename(columns={'artist_gender':'label'})
artists_nodes.to_csv("nodos_artistas.csv", index=False)

# Generos
genre_df = pd.DataFrame(data['genre'].unique(), columns=['id'])
genre_df['id'] = genre_df['id'].apply(lambda x: x.upper())

genre_df.to_csv("genre_nodes.csv", index=False)

## Aristas

In [63]:
# id -> Artistas
link_artists = data[['title', 'id', 'artists']]
link_artists = link_artists.drop_duplicates().drop(columns=['title']).reset_index(drop=True)
link_artists = link_artists.groupby(['id', 'artists']).size().reset_index(name='weight').rename(columns={'id':'source',
                                                                                                        'artists':'target'})
link_artists['label'] = 'collaboration'
link_artists.to_csv('link_artists.csv', index=False)

In [86]:
# Artistas -> generos
link_genres = data[['id','genre']]
link_genres = link_genres.groupby(['id', 'genre']).size().reset_index(name='weight').rename(columns={'id':'source',
                                                                                                        'genre':'target'})
link_genres['label'] = 'musical_genre'
link_genres['target'] = link_genres['target'].apply(lambda x: x.upper())
link_genres.to_csv("link_genres.csv", index=False)