In [1]:
import pandas as pd
import numpy as np
from utils import ROOT_DIR

In [2]:
df_interactions = pd.read_csv(
    f"{ROOT_DIR}/data/mdp_interactions.csv"
)

df_interactions.head(3)

Unnamed: 0,pid,track_uri,pos
0,0,0UaMYEvWZi0ZqiDOoHU3YI,0
1,0,6I9VzXrHxO9rA9A5euc8Ak,1
2,0,0WqIKmW4BTrj3eJFmnCKMv,2


In [3]:
df_track_interactions = df_interactions.groupby('track_uri').agg(
    count = ('track_uri', 'count'),
    additive_pos = ('pos', 'sum'))

df_track_interactions.reset_index(inplace=True)
df_track_interactions.sort_values(by='count', ascending=False, inplace=True)

df_track_interactions

Unnamed: 0,track_uri,count,mean_pos,mode_pos,median_pos
2128659,7KXjTSCq5nL1LoYtL7XAwS,46574,54.675012,0,38.0
573135,1xznGGDReH1oQq0xzbwXa3,43447,44.401754,0,29.0
2257360,7yyRTcZmCiyzzJlNzGC9Ol,41309,44.400663,0,29.0
2085721,7BKLCZ1jbUBVqRi2FVlTVw,41079,47.979113,0,33.0
1041889,3a1lNhkSLSkpJE4MSHpDu9,39987,50.191437,4,34.0
...,...,...,...,...,...
496841,1hjKpLKdiuYRY082UF3quN,1,6.000000,6,6.0
496843,1hjLagaRWzJwVBOKuLL9nC,1,211.000000,211,211.0
1366164,4hHJof8n4Z3Hw5GkHxkZrJ,1,136.000000,136,136.0
496844,1hjMy1ztIFTqSDwNFBL6gl,1,6.000000,6,6.0


In [4]:
# Store track interactions into results
df_track_interactions.to_pickle(f"{ROOT_DIR}/results/df_track_interactions.pickle")

In [5]:
df_tracks = pd.read_csv(
    f"{ROOT_DIR}/data/mdp_tracks.csv"
)
df_tracks.artist_uri = df_tracks.artist_uri.apply(lambda x: x.replace('spotify:artist:',''))

df_tracks.head(5)

Unnamed: 0,track_uri,artist_name,artist_uri,track_name,album_uri,album_name,duration_ms
0,0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,The Cookbook,226863
1,6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,In The Zone,198800
2,0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,Dangerously In Love (Alben für die Ewigkeit),235933
3,1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,Justified,267266
4,1lzr43nnXAijIGYnCT8M8H,Shaggy,5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,Hot Shot,227600


In [6]:
# Levantar el pickle de perfiles de artistas
df_artists_profile = pd.read_pickle(
    f"{ROOT_DIR}/results/artists_profiles_all_genres.pickle"
)

df_artists_profile.head(5)


Unnamed: 0,artist_uri,dance pop,rock,pop,electro house,latin,hip hop,pop rap,edm,filmi,...,kyrgyz hip hop,t-pop girl group,baluchi folk,rap pernambucano,rock campineiro,emo trap en espanol,schrammelmusik,esperanto,malang punk,rap mineiro
0,11VJahxEh2Fs2pXXVxdYeo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4Dl8unj6USQNaEsaQdMegn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,409qREwneDD43Jr4dvzsB3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4pliqGKLKjF8vr9PXzNkog,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,13l03qv7mpBV432l5t8fpq,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_track_artist_with_interactions = pd.merge(
    df_tracks[['track_uri', 'artist_uri']],
    df_track_interactions[['track_uri', 'count']],
    on=['track_uri']
)
df_track_artist_with_interactions.rename(columns={'count': 'interactions'}, inplace=True)

df_track_artist_with_interactions.sort_values(by='interactions', ascending=False, inplace=True)

df_track_artist_with_interactions.head(5)

Unnamed: 0,track_uri,artist_uri,interactions
1335,7KXjTSCq5nL1LoYtL7XAwS,2YZyLoL8N0Wb9xBt1NhZWg,46574
3713,1xznGGDReH1oQq0xzbwXa3,3TVXtAsR1Inumwj472S9r4,43447
1396,7yyRTcZmCiyzzJlNzGC9Ol,5M0lbkGluOPXLeFjApw8r8,41309
1874,7BKLCZ1jbUBVqRi2FVlTVw,69GGBxA162lTqCwzJG5jLp,41079
2715,3a1lNhkSLSkpJE4MSHpDu9,246dkjvS1zLTtiykXe5h60,39987


In [8]:
np.quantile(
    df_track_artist_with_interactions.interactions,
    q=np.arange(0, 1, 0.05)
)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,
        3.,  4.,  5.,  7., 11., 20., 53.])

In [9]:
# Filter track & artists with at least 5 interaction
top_artists = df_track_artist_with_interactions.query("interactions > 5").artist_uri

In [10]:
# I will use artist genre profile only for those top_artists

artists_profile = pd.merge(
    df_artists_profile,
    top_artists
)

In [11]:
artists_profile.shape

(475279, 5455)

In [12]:
genres_list = df_artists_profile.columns.to_list()[1::]

In [13]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt


def create_SVD_variance_df(amenities_matrix, random_state, n_components=30):
    """
    Create dataframe with total variance by component and variance improvement by component

    :param amenities_matrix:
    :param random_state:
    :param n_components:
    :return:
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    X_reduced = svd.fit_transform(amenities_matrix)
    svd_variance_df = pd.DataFrame({
        'n_component':list(range(1, n_components + 1)),
        'variance': svd.explained_variance_ratio_.tolist()}
    )
    svd_variance_df['total_variance'] = svd_variance_df.variance.cumsum()
    svd_variance_df['total_variance_change'] = (svd_variance_df.total_variance - svd_variance_df.total_variance.shift(1))\
                                                 / svd_variance_df.total_variance.shift(1)
    return svd_variance_df


def select_SVD_n_components(svd_variance_df, min_total_variance, min_variance_change):
    """
    Select optimal number of components by minimum total variance and variance improvement.
    In the case that multiple components meet the criteria, the lowest number of components will be selected

    :param svd_variance_df:
    :param min_total_variance:
    :param min_variance_change:
    :return:
    """
    n_components = svd_variance_df.query(f"total_variance >= {min_total_variance} & total_variance_change <{min_variance_change}").n_component.min()
    total_variance = svd_variance_df.query(f"n_component == {n_components}").total_variance.iloc[0]
    return n_components, total_variance


def plot_SVD_variance(svd_variance_df, n_components):
    """
    Plot variance and total variance by component

    :param svd_variance_df:
    :param n_components:
    :return:
    """
    fig, axes = plt.subplots(nrows=2, figsize=(12, 10))
    svd_variance_df.plot.line(
        x='n_component',
        y='variance',
        style='-o',
        title=f"% variance explained by component",
        ax=axes[0]
    ).axvline(
        x=n_components,
        linestyle='--',
        color='black'
    )

    svd_variance_df.plot.line(
        x='n_component',
        y='total_variance',
        style='-o',
        title=f"% total variance explained",
        c='green',
        ax=axes[1]
    ).axvline(
        x=n_components,
        linestyle='--',
        color='black'
    )

    plt.show()

In [14]:
X = artists_profile.set_index("artist_uri")

In [15]:
N_COMPONENTS = 1500
RANDOM_STATE = 671993

In [None]:
svd_variance_df = create_SVD_variance_df(
    X,
    random_state=RANDOM_STATE,
    n_components=N_COMPONENTS
)

In [None]:
svd_variance_df.head()

In [None]:
plot_SVD_variance(svd_variance_df, 1000)

In [None]:
# Number of components for final matrix
N_COMPONENTS=700

In [None]:
# Generate SVD
svd = TruncatedSVD(
    n_components=N_COMPONENTS,
    random_state=RANDOM_STATE
)

In [None]:
# Embedding dataframe creation
X_reduced = svd.fit_transform(X)
EMDEDDING_COLS = [f"emb_{i}" for i in range(N_COMPONENTS)]
embedding_df = pd.DataFrame(X_reduced, columns=EMDEDDING_COLS)
embedding_df.index = X.index

In [None]:
# Total variance
sum(svd.explained_variance_ratio_)

In [None]:
from sklearn.metrics import pairwise_distances

def create_distance_matrix(selected_emb_df, metric):
    """
    Create distance matrix based on selected embeddings

    :param selected_emb_df:
    :param metric:
    :return:
    """
    distance_matrix = pd.DataFrame(
        pairwise_distances(
            selected_emb_df.values,
            metric=metric
        ),
        index=selected_emb_df.index,
        columns=selected_emb_df.index
    )
    return distance_matrix

In [None]:
distance_matrix = create_distance_matrix(embedding_df, 'cosine')

In [None]:
# Store distance matrix into results
distance_matrix.to_pickle(f"{ROOT_DIR}/results/artist_genres_distance_matrix.pickle")

In [None]:
artist_similarity_array = distance_matrix["3nFkdlSjzX9mRTtwJOzDYB"].sort_values()

In [None]:
artist_similarity_array.head(15)

In [None]:
df_track_artist_with_interactions.query("artist_uri=='0auu2itHTxEdAMRHvx7CyG'")

In [None]:
similar_artists = artist_similarity_array.head(20).index.to_list()

In [None]:
df_track_artist_with_interactions.query("artist_uri in @similar_artists")