<a href="https://colab.research.google.com/github/giacomosansoni/Siamese_Network_Artist_Similarity/blob/main/tsne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
from tensorflow import keras as tfk
from keras import backend as K
from keras import layers as tfkl
from tqdm import tqdm
import time
#PCA
from sklearn.decomposition import PCA
#TSNE
import sklearn
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.colors as pc
from sklearn.metrics import pairwise_distances

from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/Thesis/Thesis/

def id_name (artist_id):
  return artists[artists['musicbrainz_id'] == artist_id]['name'].values[0]

# define the metrics and load the siamese model already trained
def cosine_similarity(x, y):
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return K.sum(x * y, axis=-1)

def distance(x, y, name):
    return tfkl.Lambda(lambda tensors: 1 - cosine_similarity(tensors[0], tensors[1]), name=name)([x, y])

@tf.function
def triplet_loss(y_true, y_pred):
    margin = 0.7
    ap_distance, an_distance = y_pred[:, 0], y_pred[:, 1]
    loss_value = ap_distance - an_distance + margin
    return tf.maximum(loss_value, 0.0)

def id_name (artist_id):
  return artists[artists['musicbrainz_id'] == artist_id]['name'].values[0]

# load the 61440 dimensional embeddings
def load_embeddings(file_path):
  with open(file_path, 'rb') as file:
    return pickle.load(file)

def plot_2d_nations(component1, component2, nationalities, colors, names, nationality_to_color, name):
    fig = go.Figure()

    # Get unique nationalities and their respective colors
    for nationality in set(nationalities):
        indices = [i for i, x in enumerate(nationalities) if x == nationality]
        fig.add_trace(go.Scatter(
            x=[component1[i] for i in indices],
            y=[component2[i] for i in indices],
            mode='markers',
            marker=dict(
                size=10,
                color=nationality_to_color[nationality],  # Set color specific to nationality
                line=dict(width=1)
            ),
            name=nationality,  # Use nationality as the name of the trace for legend
            text=[names[i] for i in indices],  # Include artist names in hover information
            hoverinfo='text+name'
        ))

    fig.update_layout(
        margin=dict(l=100, r=100, b=100, t=100),
        width=3000,
        height=1200,
        template='plotly_dark',
        legend_title_text='Nationality'
    )

    # Save the figure as an HTML file
    fig.write_html(os.path.join(root_dir, 'tsne_plots', f'{name}_nations.html'))

    fig.show()

def plot_2d_genre(component1, component2, nationalities, colors, names, nationality_to_color, name):
    fig = go.Figure()

    # Get unique nationalities and their respective colors
    for nationality in set(nationalities):
        indices = [i for i, x in enumerate(nationalities) if x == nationality]
        fig.add_trace(go.Scatter(
            x=[component1[i] for i in indices],
            y=[component2[i] for i in indices],
            mode='markers',
            marker=dict(
                size=10,
                color=nationality_to_color[nationality],  # Set color specific to nationality
                line=dict(width=1)
            ),
            name=nationality,  # Use nationality as the name of the trace for legend
            text=[names[i] for i in indices],  # Include artist names in hover information
            hoverinfo='text+name'
        ))

    fig.update_layout(
        margin=dict(l=100, r=100, b=100, t=100),
        width=3000,
        height=1200,
        template='plotly_dark',
        legend_title_text='Genres'
    )

    # Save the figure as an HTML file
    fig.write_html(os.path.join(root_dir, 'tsne_plots', f'{name}_genres.html'))

    fig.show()

Mounted at /gdrive
/gdrive/.shortcut-targets-by-id/1D4FFhBTYn6QfBuxB5OXjn7JgI3z14YxH/Thesis


### Create Embeddings_256

In [2]:
# load the 61440 dimensional embeddings

def load_embeddings(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)

root_dir = os.getcwd()
'''embeddings1 = load_embeddings(os.path.join(root_dir, 'embeddings_1.pkl'))
embeddings3 = load_embeddings(os.path.join(root_dir, 'embeddings_3.pkl'))
embeddings4 = load_embeddings(os.path.join(root_dir, 'embeddings_4.pkl'))
embeddings2 = load_embeddings(os.path.join(root_dir, 'embeddings_2.pkl'))'''
embeddings_img = load_embeddings(os.path.join(root_dir, 'embeddings_img.pkl'))
embeddings_captions1 = load_embeddings(os.path.join(root_dir, 'embeddings_captions1.pkl'))
embeddings_captions2 = load_embeddings(os.path.join(root_dir, 'embeddings_captions2.pkl'))
embeddings_captions3 = load_embeddings(os.path.join(root_dir, 'embeddings_captions3.pkl'))
embeddings_captions4 = load_embeddings(os.path.join(root_dir, 'embeddings_captions4.pkl'))

In [22]:
# define the metrics and load the siamese model already trained

def cosine_similarity(x, y):
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return K.sum(x * y, axis=-1)

def distance(x, y, name):
    return tfkl.Lambda(lambda tensors: 1 - cosine_similarity(tensors[0], tensors[1]), name=name)([x, y])

@tf.function
def triplet_loss(y_true, y_pred):
    margin = 0.7
    ap_distance, an_distance = y_pred[:, 0], y_pred[:, 1]
    loss_value = ap_distance - an_distance + margin
    return tf.maximum(loss_value, 0.0)

siamese_network = tf.keras.models.load_model(
    #os.path.join(root_dir, 'Triplets', 'full_model_img'),
    #os.path.join(root_dir, 'Triplets', 'full_model_caption1'),
    #os.path.join(root_dir, 'Triplets', 'full_model_caption2'),
    #os.path.join(root_dir, 'Triplets', 'full_model_caption3'),
    os.path.join(root_dir, 'Triplets', 'full_model_caption4'),
    custom_objects={'triplet_loss': triplet_loss})

siamese_network.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 anchor (InputLayer)         [(None, 30720, 1)]           0         []                            
                                                                                                  
 positive (InputLayer)       [(None, 30720, 1)]           0         []                            
                                                                                                  
 negative (InputLayer)       [(None, 30720, 1)]           0         []                            
                                                                                                  
 cnn (Functional)            (None, 256)                  340640    ['anchor[0][0]',              
                                                                     'positive[0][0]',        

In [23]:
# Extract the cnn layer from the siamese model and make it non trainable

cnn = siamese_network.get_layer('cnn')
cnn.trainable = False
cnn.summary()

Model: "cnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 30720, 1)]        0         
                                                                 
 conv1d_5 (Conv1D)           (None, 15360, 128)        512       
                                                                 
 max_pooling1d_5 (MaxPoolin  (None, 7680, 128)         0         
 g1D)                                                            
                                                                 
 batch_normalization_5 (Bat  (None, 7680, 128)         512       
 chNormalization)                                                
                                                                 
 conv1d_6 (Conv1D)           (None, 3840, 128)         49280     
                                                                 
 max_pooling1d_6 (MaxPoolin  (None, 1920, 128)         0       

In [None]:
# Let's do a check

anch = embeddings2['acb5a55d-2e39-4098-962e-acbc68adc663']
pos = embeddings2['eeacb319-8d4c-48e0-80a0-944e71c375bf']
neg = embeddings2['00c49f40-d715-4b79-b223-432048602cce']

out_anc = cnn.predict(anch)
out_pos = cnn.predict(pos)
out_neg = cnn.predict(neg)

print(cosine_similarity(out_anc, out_pos).numpy())
print(cosine_similarity(out_anc, out_neg).numpy())

[0.94197917]
[0.00227578]


In [24]:
# Let's use the cnn to create a dictionary like embeddings2 containing the 256 dim embeddings for every artist

embeddings_256 ={}
for artist_id in embeddings_captions4.keys(): #embeddings_captions1
  embeddings_256[artist_id] = cnn.predict(embeddings_captions4[artist_id]) #embeddings_captions1

assert embeddings_256['00c49f40-d715-4b79-b223-432048602cce'].shape == (1, 256), f"Error: The shape is {embeddings_256['00c49f40-d715-4b79-b223-432048602cce'].shape}!"

path = os.path.join(root_dir, 'embeddings_captions4_256.pkl') #embeddings2_256, embeddings_img_256, embeddings_captions_256
with open(path, 'wb') as file:
  pickle.dump(embeddings_256, file)



In [None]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# FULL MODEL 1

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(pos_avg / len(triplets_ids))
print(neg_avg / len(triplets_ids))

27367it [00:30, 892.39it/s]

[0.8094061]
[0.11073426]





In [None]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# FULL MODEL 2

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(pos_avg / len(triplets_ids))
print(neg_avg / len(triplets_ids))

27367it [01:30, 301.94it/s]

[0.8051956]
[0.11530831]





In [None]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# FULL MODEL 3

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(pos_avg / len(triplets_ids))
print(neg_avg / len(triplets_ids))

27367it [00:30, 885.93it/s]

[0.810641]
[0.12004229]





In [None]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# FULL MODEL 4

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(pos_avg / len(triplets_ids))
print(neg_avg / len(triplets_ids))

27367it [00:31, 876.19it/s]

[0.7982138]
[0.097976]





In [8]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# IMG ONLY

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(f'Average distance between positives: {pos_avg / len(triplets_ids)}')
print(f'Average distance between negatives: {neg_avg / len(triplets_ids)}')

27367it [00:30, 899.51it/s]

Average distance between positives: [0.8203987]
Average distance between negatives: [0.13166505]





In [13]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# CAPTION 1

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(f'Average distance between positives: {pos_avg / len(triplets_ids)}')
print(f'Average distance between negatives: {neg_avg / len(triplets_ids)}')

27367it [00:30, 890.47it/s]

Average distance between positives: [0.7617329]
Average distance between negatives: [0.16946533]





In [17]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# CAPTION 2

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(f'Average distance between positives: {pos_avg / len(triplets_ids)}')
print(f'Average distance between negatives: {neg_avg / len(triplets_ids)}')

27367it [00:30, 884.65it/s]

Average distance between positives: [0.8143417]
Average distance between negatives: [0.14573035]





In [21]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# CAPTION 3

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(f'Average distance between positives: {pos_avg / len(triplets_ids)}')
print(f'Average distance between negatives: {neg_avg / len(triplets_ids)}')

27367it [00:31, 873.18it/s]

Average distance between positives: [0.8218815]
Average distance between negatives: [0.13322063]





In [25]:
# Let's compute the average distance between the positive artists and the average distance between negatives artists

# CAPTION 4

triplets_ids = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
pos_avg = 0.0
neg_avg = 0.0
for index, row in tqdm(triplets_ids.iterrows()):
  pos_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[1]]).numpy()
  neg_sim = cosine_similarity(embeddings_256[row[0]], embeddings_256[row[2]]).numpy()
  pos_avg = pos_avg + pos_sim
  neg_avg = neg_avg + neg_sim

print(f'Average distance between positives: {pos_avg / len(triplets_ids)}')
print(f'Average distance between negatives: {neg_avg / len(triplets_ids)}')

27367it [00:31, 873.54it/s]

Average distance between positives: [0.72605234]
Average distance between negatives: [0.15683255]





### tSNE - Full Model_1

In [None]:
root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
embeddings2 = load_embeddings(os.path.join(root_dir, 'embeddings_1.pkl'))
embeddings2_256 = load_embeddings(os.path.join(root_dir, 'embeddings1_256.pkl'))
embeddings_256_list = [embeddings2_256[key] for key in embeddings2_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)
print(embeddings_256_array.shape)

(3008, 256)


Nations, Cosine 2D

In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings2_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings2_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color, 'full_model1')


[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.180s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.071872
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.687645
[t-SNE] KL divergence after 800 iterations: 0.787967


Genre 2D - Cosine

In [None]:
genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings2_256_filtered = {key: embeddings2_256[key] for key in musicbrainz_ids if key in embeddings2_256}

# Convert the filtered embeddings dictionary to a list
embeddings2_256_list_filtered = list(embeddings2_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings2_256_list_filtered)
print(embeddings_256_array_filtered.shape)

Number of artists: 1860
(1860, 256)


In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings2_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings2_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color, 'full_model1')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.086s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.081829
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.120064
[t-SNE] KL divergence after 800 iterations: 0.615424


### tSNE - Full Model_2

In [None]:
root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
embeddings2 = load_embeddings(os.path.join(root_dir, 'embeddings_2.pkl'))
embeddings2_256 = load_embeddings(os.path.join(root_dir, 'embeddings2_256.pkl'))
embeddings_256_list = [embeddings2_256[key] for key in embeddings2_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)
print(embeddings_256_array.shape)

(3008, 256)


In [None]:
# Average number of similar artists
triplets.groupby('anchor').size().agg(['min','max','mean'])

min      1.000000
max     53.000000
mean     9.780915
dtype: float64

In [None]:
embeddings_256_array[0].shape

(256,)

Nations, Cosine 2D

In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings2_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings2_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'full_model2')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.181s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.062892
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.805347
[t-SNE] KL divergence after 800 iterations: 0.788050


Nations, Cosine Distance, 3D

In [None]:
def plot_3d(component1, component2, component3, nationalities, colors, nationality_to_color):
    fig = go.Figure()

    # Get unique nationalities and their respective colors
    for nationality in set(nationalities):
        indices = [i for i, x in enumerate(nationalities) if x == nationality]
        fig.add_trace(go.Scatter3d(
            x=[component1[i] for i in indices],
            y=[component2[i] for i in indices],
            z=[component3[i] for i in indices],
            mode='markers',
            marker=dict(
                size=5,
                color=nationality_to_color[nationality],  # Set color specific to nationality
                line=dict(width=1)
            ),
            name=nationality  # Use nationality as the name of the trace for legend
        ))

    fig.update_layout(
        margin=dict(l=100, r=100, b=100, t=100),
        width=3000,
        height=1200,
        template='plotly_dark',
        legend_title_text='Nationality',
        scene=dict(
            xaxis_title='Component 1',
            yaxis_title='Component 2',
            zaxis_title='Component 3'
        )
    )

    fig.show()

tsne3 = TSNE(random_state=42, n_components=3, verbose=1, perplexity=50, n_iter=800,metric='cosine').fit_transform(embeddings_256_array)
plot_3d(tsne3[:, 0], tsne3[:, 1], tsne3[:, 2], nationalities, colors, nationality_to_color)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.350s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.062892
[t-SNE] KL divergence after 250 iterations with early exaggeration: 63.188538
[t-SNE] KL divergence after 800 iterations: 0.678183


Genre 2D - Cosine

In [None]:
genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings2_256_filtered = {key: embeddings2_256[key] for key in musicbrainz_ids if key in embeddings2_256}

# Convert the filtered embeddings dictionary to a list
embeddings2_256_list_filtered = list(embeddings2_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings2_256_list_filtered)
print(embeddings_256_array_filtered.shape)

Number of artists: 1860
(1860, 256)


In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings2_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings2_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'full_model2')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.087s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.076058
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.897930
[t-SNE] KL divergence after 800 iterations: 0.611227


Genres 3d - cosine

In [None]:
tsne2 = TSNE(random_state=42, n_components=3, verbose=1, metric='cosine', perplexity=50, n_iter=800).fit_transform(embeddings_256_array_filtered)
plot_3d(tsne2[:, 0], tsne2[:, 1], tsne2[:, 2],genres, colors, genre_to_color)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.094s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.076058
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.468582
[t-SNE] KL divergence after 800 iterations: 0.510427


NameError: name 'plot_3d' is not defined

### tSNE - Full Model_3

In [None]:
root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
embeddings2 = load_embeddings(os.path.join(root_dir, 'embeddings_3.pkl'))
embeddings2_256 = load_embeddings(os.path.join(root_dir, 'embeddings3_256.pkl'))
embeddings_256_list = [embeddings2_256[key] for key in embeddings2_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)
print(embeddings_256_array.shape)

(3008, 256)


Nations, Cosine 2D

In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings2_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings2_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'full_model3')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.183s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.071402
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.707508
[t-SNE] KL divergence after 800 iterations: 0.800433


Genre 2D - Cosine

In [None]:
genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings2_256_filtered = {key: embeddings2_256[key] for key in musicbrainz_ids if key in embeddings2_256}

# Convert the filtered embeddings dictionary to a list
embeddings2_256_list_filtered = list(embeddings2_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings2_256_list_filtered)
print(embeddings_256_array_filtered.shape)

Number of artists: 1860
(1860, 256)


In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings2_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings2_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'full_model3')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.088s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.084392
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.801262
[t-SNE] KL divergence after 800 iterations: 0.612394


### tSNE - Full Model_4

In [None]:
root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music_spot.csv'))
embeddings2 = load_embeddings(os.path.join(root_dir, 'embeddings_4.pkl'))
embeddings2_256 = load_embeddings(os.path.join(root_dir, 'embeddings4_256.pkl'))
embeddings_256_list = [embeddings2_256[key] for key in embeddings2_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)
print(embeddings_256_array.shape)

(3008, 256)


Nations, Cosine 2D

In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings2_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings2_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'full_model4')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.168s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.050968
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.450977
[t-SNE] KL divergence after 800 iterations: 0.709386


Genre 2D - Cosine

In [None]:
genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings2_256_filtered = {key: embeddings2_256[key] for key in musicbrainz_ids if key in embeddings2_256}

# Convert the filtered embeddings dictionary to a list
embeddings2_256_list_filtered = list(embeddings2_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings2_256_list_filtered)
print(embeddings_256_array_filtered.shape)

Number of artists: 1860
(1860, 256)


In [None]:
# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings2_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings2_256_filtered.keys()]

def plot_2d(component1, component2, nationalities, colors, names, nationality_to_color):
    fig = go.Figure()

    # Get unique nationalities and their respective colors
    for nationality in set(nationalities):
        indices = [i for i, x in enumerate(nationalities) if x == nationality]
        fig.add_trace(go.Scatter(
            x=[component1[i] for i in indices],
            y=[component2[i] for i in indices],
            mode='markers',
            marker=dict(
                size=10,
                color=nationality_to_color[nationality],  # Set color specific to nationality
                line=dict(width=1)
            ),
            name=nationality,  # Use nationality as the name of the trace for legend
            text=[names[i] for i in indices],  # Include artist names in hover information
            hoverinfo='text+name'
        ))

    fig.update_layout(
        margin=dict(l=100, r=100, b=100, t=100),
        width=3000,
        height=1200,
        template='plotly_dark',
        legend_title_text='Genres'
    )

    fig.show()

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'full_model4')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.081s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.061771
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.344048
[t-SNE] KL divergence after 800 iterations: 0.562721


### IMG & CAPTIONS ONLY

In [26]:
def load_embeddings(file_path):
  with open(file_path, 'rb') as file:
    return pickle.load(file)

In [27]:
# IMG -NATIONALITY

root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music.csv'))
embeddings = load_embeddings(os.path.join(root_dir, 'embeddings_img.pkl')) #embeddings2, embeddings_img, embeddings_captions1
embeddings_256 = load_embeddings(os.path.join(root_dir, 'embeddings_img_256.pkl'))
embeddings_256_list = [embeddings_256[key] for key in embeddings_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)

# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}
names = [id_name(mid) for mid in embeddings_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'img')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.002s...
[t-SNE] Computed neighbors for 3008 samples in 0.185s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.055870
[t-SNE] KL divergence after 250 iterations with early exaggeration: 62.547310
[t-SNE] KL divergence after 800 iterations: 0.774757


In [28]:
# IMG - GENRE

genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings_256_filtered = {key: embeddings_256[key] for key in musicbrainz_ids if key in embeddings_256}

# Convert the filtered embeddings dictionary to a list
embeddings_256_list_filtered = list(embeddings_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings_256_list_filtered)
print(f'Number of artists considered: {embeddings_256_array_filtered.shape}')

# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'img')

Number of artists: 1860
Number of artists considered: (1860, 256)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.098s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.059631
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.017357
[t-SNE] KL divergence after 800 iterations: 0.595878


In [29]:
# CAPTIONS1 - NATIONALITY

root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music.csv'))
embeddings = load_embeddings(os.path.join(root_dir, 'embeddings_captions1.pkl')) #embeddings2, embeddings_img, embeddings_captions1
embeddings_256 = load_embeddings(os.path.join(root_dir, 'embeddings_captions1_256.pkl')) #embeddings_captions1_256
embeddings_256_list = [embeddings_256[key] for key in embeddings_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)

# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'captions1')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.183s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.026257
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.454308
[t-SNE] KL divergence after 800 iterations: 0.777905


In [30]:
# CAPTIONS1 - GENRE

genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings_256_filtered = {key: embeddings_256[key] for key in musicbrainz_ids if key in embeddings_256}

# Convert the filtered embeddings dictionary to a list
embeddings_256_list_filtered = list(embeddings_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings_256_list_filtered)
print(f'Number of artists considered: {embeddings_256_array_filtered.shape}')

# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'captions1')

Number of artists: 1860
Number of artists considered: (1860, 256)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.082s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.026396
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.012260
[t-SNE] KL divergence after 800 iterations: 0.587877


In [31]:
# CAPTIONS2 - NATIONALITY

root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music.csv'))
embeddings = load_embeddings(os.path.join(root_dir, 'embeddings_captions2.pkl')) #embeddings2, embeddings_img, embeddings_captions1
embeddings_256 = load_embeddings(os.path.join(root_dir, 'embeddings_captions2_256.pkl')) #embeddings_captions1_256
embeddings_256_list = [embeddings_256[key] for key in embeddings_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)

# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}
names = [id_name(mid) for mid in embeddings_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'captions2')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.178s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.021240
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.737431
[t-SNE] KL divergence after 800 iterations: 0.738361


In [32]:
# CAPTIONS2 - GENRE

genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings_256_filtered = {key: embeddings_256[key] for key in musicbrainz_ids if key in embeddings_256}

# Convert the filtered embeddings dictionary to a list
embeddings_256_list_filtered = list(embeddings_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings_256_list_filtered)
print(f'Number of artists considered: {embeddings_256_array_filtered.shape}')

# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'captions2')

Number of artists: 1860
Number of artists considered: (1860, 256)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.077s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.024038
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.978035
[t-SNE] KL divergence after 800 iterations: 0.564164


In [33]:
# CAPTIONS3 - NATIONALITY

root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music.csv'))
embeddings = load_embeddings(os.path.join(root_dir, 'embeddings_captions3.pkl')) #embeddings2, embeddings_img, embeddings_captions1
embeddings_256 = load_embeddings(os.path.join(root_dir, 'embeddings_captions3_256.pkl')) #embeddings_captions1_256
embeddings_256_list = [embeddings_256[key] for key in embeddings_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)

# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'captions3')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.176s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.030925
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.612980
[t-SNE] KL divergence after 800 iterations: 0.699714


In [34]:
# CAPTIONS3 - GENRE

genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings_256_filtered = {key: embeddings_256[key] for key in musicbrainz_ids if key in embeddings_256}

# Convert the filtered embeddings dictionary to a list
embeddings_256_list_filtered = list(embeddings_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings_256_list_filtered)
print(f'Number of artists considered: {embeddings_256_array_filtered.shape}')

# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'captions3')

Number of artists: 1860
Number of artists considered: (1860, 256)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.002s...
[t-SNE] Computed neighbors for 1860 samples in 0.078s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.035902
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.794777
[t-SNE] KL divergence after 800 iterations: 0.517861


In [35]:
# CAPTIONS4 - NATIONALITY

root_dir = os.getcwd()
artists = pd.read_csv(os.path.join(root_dir, 'artists_mio_3000_updated.csv'))
triplets = pd.read_csv(os.path.join(root_dir, 'Triplets', 'triplets_ids_music.csv'))
embeddings = load_embeddings(os.path.join(root_dir, 'embeddings_captions4.pkl')) #embeddings2, embeddings_img, embeddings_captions1
embeddings_256 = load_embeddings(os.path.join(root_dir, 'embeddings_captions4_256.pkl')) #embeddings_captions1_256
embeddings_256_list = [embeddings_256[key] for key in embeddings_256]
embeddings_256_list = [embedding[0] for embedding in embeddings_256_list]
# Convert list of arrays to a single 2D numpy array
embeddings_256_array = np.vstack(embeddings_256_list)

# Create a mapping of musicbrainz_id to nationality
id_to_nationality = dict(zip(artists['musicbrainz_id'], artists['nationality']))

# Create a list of genres for the embeddings
nationalities = [id_to_nationality.get(key, 'unknown') for key in embeddings_256.keys()]

# Convert nationalities to a color
unique_nationalities = list(set(nationalities))
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']#pc.qualitative.Plotly[:len(unique_nationalities)]  # Generate discrete colors
nationality_to_color = {'es': '#E8F319','gb': '#f3f7f2','it': '#FFA15A','us': '#636EFA','de': '#EF553B','fr': '#00CC96'}#{nationality: discrete_colors[i] for i, nationality in enumerate(unique_nationalities)}
colors = [nationality_to_color[nationality] for nationality in nationalities]

# Create a legend for the discrete colors
color_scale = {i: nationality for i, nationality in enumerate(unique_nationalities)}

names = [id_name(mid) for mid in embeddings_256.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'captions4')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3008 samples in 0.001s...
[t-SNE] Computed neighbors for 3008 samples in 0.195s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3008
[t-SNE] Computed conditional probabilities for sample 2000 / 3008
[t-SNE] Computed conditional probabilities for sample 3000 / 3008
[t-SNE] Computed conditional probabilities for sample 3008 / 3008
[t-SNE] Mean sigma: 0.023702
[t-SNE] KL divergence after 250 iterations with early exaggeration: 64.184708
[t-SNE] KL divergence after 800 iterations: 0.885310


In [None]:
# CAPTIONS4 - NATIONALITY - EUCLIDEAN

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800).fit_transform(embeddings_256_array)
plot_2d_nations(tsne2[:, 0], tsne2[:, 1], nationalities, colors, names, nationality_to_color,'')

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 3007 samples in 0.001s...
[t-SNE] Computed neighbors for 3007 samples in 0.131s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3007
[t-SNE] Computed conditional probabilities for sample 2000 / 3007
[t-SNE] Computed conditional probabilities for sample 3000 / 3007
[t-SNE] Computed conditional probabilities for sample 3007 / 3007
[t-SNE] Mean sigma: 2.876350
[t-SNE] KL divergence after 250 iterations with early exaggeration: 66.137581
[t-SNE] KL divergence after 800 iterations: 0.973113


In [36]:
# CAPTIONS4 - GENRE

genres_of_interest = ['rock', 'pop', 'metal', 'hip hop', 'jazz', 'classical']

# Filter the DataFrame
filtered_artists = artists[artists['genre'].isin(genres_of_interest)]
print(f"Number of artists: {filtered_artists.shape[0]}")

#'musicbrainz_id' from the filtered DataFrame
musicbrainz_ids = filtered_artists['musicbrainz_id'].tolist()

# Filter the embeddings dictionary to only include keys that are in the musicbrainz_ids list
embeddings_256_filtered = {key: embeddings_256[key] for key in musicbrainz_ids if key in embeddings_256}

# Convert the filtered embeddings dictionary to a list
embeddings_256_list_filtered = list(embeddings_256_filtered.values())
embeddings_256_array_filtered = np.vstack(embeddings_256_list_filtered)
print(f'Number of artists considered: {embeddings_256_array_filtered.shape}')

# Create a mapping of musicbrainz_id to nationality
id_to_genre = dict(zip(filtered_artists['musicbrainz_id'], filtered_artists['genre']))

# Create a list of genres for the embeddings
genres = [id_to_genre.get(key, 'unknown') for key in embeddings_256_filtered.keys()]
unique_genres = list(set(genres))
num_colors_needed = len(unique_genres)
discrete_colors = ['#636EFA', '#f3f7f2', '#FFA15A', '#E8F319', '#EF553B', '#00CC96']
genre_to_color = {'hip hop': '#f3f7f2', 'jazz': '#E8F319', 'rock': '#EF553B','classical': '#00CC96','metal': '#636EFA','pop': '#FFA15A'} #{genre: discrete_colors[i] for i, genre in enumerate(unique_genres)}
colors = [genre_to_color[genre] for genre in genres]

# Create a legend for the discrete colors
color_scale = {i: genre for i, genre in enumerate(unique_genres)}

names = [id_name(mid) for mid in embeddings_256_filtered.keys()]

tsne2 = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50, n_iter=800, metric='cosine').fit_transform(embeddings_256_array_filtered)
plot_2d_genre(tsne2[:, 0], tsne2[:, 1], genres, colors, names, genre_to_color,'captions4')

Number of artists: 1860
Number of artists considered: (1860, 256)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1860 samples in 0.001s...
[t-SNE] Computed neighbors for 1860 samples in 0.074s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1860
[t-SNE] Computed conditional probabilities for sample 1860 / 1860
[t-SNE] Mean sigma: 0.025166
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.870911
[t-SNE] KL divergence after 800 iterations: 0.675716
