In [1]:
# -*- coding: utf8

import sys
sys.path.append('../code/')

In [2]:
from amutils import build_graph
from amutils import build_reverse_index
from amutils import load_am_json_data

from scipy.stats import kendalltau
from scipy.stats import pearsonr
from scipy.stats import spearmanr

from sklearn.metrics.pairwise import rbf_kernel

import numpy as np
import pandas as pd

In [12]:
json_data = load_am_json_data('../data/artists.json.gz')
feats_df =  pd.read_csv('../data/deep_features.csv', index_col=0)
names_df = pd.read_csv('../data/billboard-to-spotify-songs.csv', index_col=0).dropna()
names_df = names_df[['track_id_spf', 'track_name_bb', 'artist_name_bb']]

In [4]:
S = rbf_kernel(feats_df, gamma=0.5)
S = pd.DataFrame(S, index=feats_df.index, columns=feats_df.index.values)

In [5]:
store = pd.HDFStore('../cache/dt_matrices.h5')
DT = store['song_dt']
store.close()

In [6]:
valid = S.index.intersection(DT.index)
S = S.loc[valid]
S = S[valid.values]
DT = DT.loc[valid]
DT = DT[valid.values]

assert(DT.shape == S.shape)

In [7]:
MED_RANDOM = 0.23

In [8]:
_6M = pd.Timedelta(6, 'M').to_timedelta64().astype('timedelta64[s]').astype('f')
A = S[DT > _6M]
A = A[A > MED_RANDOM]
A = A.fillna(0)

In [9]:
reverse_index = {}
for artist in json_data.keys():
    reverse_index[json_data[artist]['name']] = artist

In [13]:
song2artist = {}
keys_to_consider = set()
for spotify_id, _, artist in names_df.values:
    if artist in reverse_index:
        song2artist[spotify_id] = artist
        keys_to_consider.add(reverse_index[artist])

In [11]:
graph = set()
for row in A.index.values:
    edges = A.loc[row]
    edges = edges[edges > 0]
    for col in edges.index.values:
        if row in song2artist and col in song2artist:
            src_id = reverse_index[song2artist[row]]
            by_id = reverse_index[song2artist[col]]
            if src_id in keys_to_consider and by_id in keys_to_consider:
                graph.add((src_id, by_id))

In [12]:
def extract_id(txt):
    pos = txt.rfind('mn')
    return txt[pos:pos+12]

intersection = set()
for artist in keys_to_consider:
    by_set = set(map(extract_id, json_data[artist]['influencer']))
    for by in by_set:
        if artist in keys_to_consider and by in keys_to_consider:
            intersection.add(artist)
            intersection.add(by)

In [13]:
GT = nx.DiGraph()
for artist in intersection:
    by_set = set(map(extract_id, json_data[artist]['influencer']))
    for by in by_set:
        if by in intersection:
            GT.add_edge(artist, by)

In [14]:
GN = nx.DiGraph()
for src, by in graph:
    if src in GT.nodes and by in GT.nodes:
        GN.add_edge(src, by)

In [15]:
GT = nx.DiGraph()
for artist in GN.nodes:
    data = json_data[artist]
    by_set = set(map(extract_id, json_data[artist]['influencer']))
    for by in by_set:
        if by in GN.nodes:
            GT.add_edge(artist, by)

In [16]:
GN = nx.DiGraph()
for src, by in graph:
    if src in GT.nodes and by in GT.nodes:
        GN.add_edge(src_id, by_id)

In [17]:
len(GN)

2

In [18]:
len(GT)

1242

In [19]:
def compute_disruption(G):

    id_to_node = dict((i, n) for i, n in enumerate(G.nodes))
    in_count = dict(G.in_degree(G.nodes))
    out_count = dict(G.out_degree(G.nodes))
    
    F = nx.to_scipy_sparse_matrix(G, format='csr')
    T = nx.to_scipy_sparse_matrix(G, format='csc')
    D = np.zeros(shape=(F.shape[0], 4))

    for node_id in range(F.shape[0]):
        if in_count[id_to_node[node_id]] >= 1:
            ni = 0
            nj = 0
            nk = 0

            outgoing = F[node_id].nonzero()[1]
            incoming = T[:, node_id].nonzero()[0]
            outgoing_set = set(outgoing)

            for other_id in incoming:
                second_level = F[other_id].nonzero()[1]
                if len(outgoing_set.intersection(second_level)) == 0:
                    ni += 1
                else:
                    nj += 1

            # who mentions my influences
            who_mentions_my_influences = np.unique(T[:, outgoing].nonzero()[0])
            for other_id in who_mentions_my_influences:
                if F[other_id, node_id] == 0:  # do they mention me?! if no, add nk
                    nk += 1

            D[node_id, 0] = ni
            D[node_id, 1] = nj
            D[node_id, 2] = nk
            D[node_id, 3] = (ni - nj) / (ni + nj + nk)
        else:
            D[node_id, 0] = np.nan
            D[node_id, 1] = np.nan
            D[node_id, 2] = np.nan
            D[node_id, 3] = np.nan
            
    return pd.DataFrame(D, index=G.nodes, columns=['ni', 'nj', 'nk', 'disruption'])

In [20]:
d_gn = compute_disruption(GN)
d_gt = compute_disruption(GT)

In [21]:
d_gn.head()

Unnamed: 0,ni,nj,nk,disruption
mn0000409670,,,,
mn0002009123,1.0,0.0,0.0,1.0


In [22]:
d_gt.head()

Unnamed: 0,ni,nj,nk,disruption
mn0000935330,3.0,3.0,75.0,0.0
mn0000884962,0.0,2.0,28.0,-0.066667
mn0000606283,1.0,0.0,12.0,0.076923
mn0000279337,5.0,2.0,86.0,0.032258
mn0000290072,9.0,8.0,165.0,0.005495


In [23]:
from scipy.stats import pearsonr, kendalltau, spearmanr

In [24]:
x = d_gn['disruption'].fillna(0)
y = d_gt['disruption'].fillna(0)

In [25]:
pearsonr(x, y)

ValueError: operands could not be broadcast together with shapes (2,) (1242,) 

In [None]:
kendalltau(x, y)

In [None]:
spearmanr(x, y)

In [None]:
d_gn