In [9]:
# -*- coding: utf8

from scipy.special import gammaln
from scipy.special import psi


import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy.stats as ss

In [10]:
def extract_id(txt):
    pos = txt.rfind('mn')
    return txt[pos:pos+12]


def load_am_json_data():
    fpath = '../data/allmusic-data.json.gz'
    with gzip.open(fpath) as gzip_file:
        json_data = json.load(gzip_file)
        return json_data

In [11]:
def build_graph(json_data, nodes_to_consider=None, restrictive=False):
    
    G = nx.DiGraph()
    if nodes_to_consider is None:
        nodes_to_consider = set(json_data.keys())

    for artist in nodes_to_consider:
        data = json_data[artist]
        by_set = set(map(extract_id, data['influencer']))
        for by in by_set:
            if restrictive and by not in nodes_to_consider:
                continue
            
            if G.has_edge(artist, by):
                G[artist][by]['weight'] += 1
            else:
                G.add_weighted_edges_from([(artist, by, 1)])
                
    nx.set_node_attributes(G, json_data)
    return G

In [12]:
allmusic_json = load_am_json_data()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

In [None]:
allmusic_graphs = dict()
nodes_to_consider_allmusic = set()
print('Decade  Nodes Edges')
for decade in range(1940,2020,10):
    for artist in allmusic_json:
        if set(range(1940, decade + 1, 10)).intersection(allmusic_json[artist]['decades']):
            nodes_to_consider_allmusic.add(artist)
    allmusic_graphs[decade] = build_graph(allmusic_json,
                                          nodes_to_consider=nodes_to_consider_allmusic,
                                          restrictive=True)
    print(decade,': ',allmusic_graphs[decade].number_of_nodes(),' ',allmusic_graphs[decade].number_of_edges())

In [None]:
in_degree = dict()
in_degree['allmusic'] = dict(keys = range(1940, 2020, 10))
for decade in range(1940, 2020, 10):
    in_degree['allmusic'][decade] = dict(allmusic_graphs[decade].in_degree(weight = 'weight'))

In [None]:
out_degree = dict()
out_degree['allmusic'] = dict(keys = range(1940, 2020, 10))
for decade in range(1940, 2020, 10):
    out_degree['allmusic'][decade] = dict(allmusic_graphs[decade].out_degree(weight = 'weight'))

In [None]:
pageranks = dict()
pageranks['allmusic'] = dict(keys = range(1940, 2020, 10))
for decade in range(1940, 2020, 10):
    pageranks['allmusic'][decade] = nx.pagerank(allmusic_graphs[decade])

In [None]:
all_nodes = set(pageranks['allmusic'][2010].keys())
node2id = {}
id2node = {}
for node, value in pageranks['allmusic'][2010].items():
    node2id[node] = len(node2id)
    id2node[len(node2id) - 1] = node

In [None]:
names = {}
for key, val in allmusic_json.items():
    names[key] = val['name']

In [None]:
def get_rank_positions(decade, node2id, centrality):
    n = len(node2id)
    values = np.zeros(shape=(2, n))
    for node, value in centrality['allmusic'][decade].items():
        values[0, node2id[node]] = 1.0 / (value + 1)
        values[1, node2id[node]] = value + 1
    order = values.argsort(axis=1)
    ranks = order.argsort(axis=1)
    
    return ranks.T + 1

In [18]:
def rank_surprise_detector(observed, prior):
    '''
    Computes the rank based surprise.
    
    Arguments
    ---------
    observed: matrix with the number of nodes with centrality
              values greater then (observed[i][0]) or lower
              then (observed[i][1]) node i.
    prior:    The prior for each node. prior[i][0] is alpha, and
              prior[i][1] is beta. In our paper, we set the prior
              for each such that prior[i][0] / sum(prior[i])
              captures the fraction of other nodes with centrality
              greater than node i.
    
    Both arguments are o shape (n_nodes, 2). Organizing data like
    this make's it easy to compute the posterior = observed + prior.
    
    Returns
    -------
    The posterior and the surprise for each node
    
    References
    ----------
    
    [1]  Penny  WD  (2001):  “KL-Divergences  of  Normal,
         Gamma,  Dirichlet  and  Wishartdensities”.
         University College, London;
         URL: www.fil.ion.ucl.ac.uk/∼wpenny/publications/densities.ps.
    [2]  Kullback-Leibler Divergence Between Two Dirichlet
         (and Beta) Distributions. URL: http://bariskurt.com/
    '''
    posterior = prior + observed
    
    # from here the code is a dkl divergence of dirichlets. adapted and double
    # checked [1] and [2]
    
    d_obs = gammaln(posterior.sum(axis=1)) - np.sum(gammaln(posterior), axis=1)
    d_pri = gammaln(prior.sum(axis=1)) - np.sum(gammaln(prior), axis=1)
    individual_factors = (posterior - prior).T * \
            (psi(posterior).T - psi(posterior.sum(axis=1)))
    surprises = d_obs - d_pri + individual_factors.sum(axis=0)
    
    return posterior, surprises

In [19]:
prior = get_rank_positions(2010, node2id, pageranks)
prior

array([[14934,     1],
       [ 7141, 22663],
       [ 1820, 28059],
       ...,
       [16637,  8362],
       [29285,  3620],
       [29878, 12556]])

In [31]:
prior = get_rank_positions(2010, node2id, pageranks)
for decade in range(1940, 2000, 10):
    obs = get_rank_positions(decade, node2id, pageranks)
    print(2010, '->', decade + 10)
    
    posterior, surprises = rank_surprise_detector(obs, prior)
    top = surprises.argsort()[-20:][::-1]
    for ni in top:
        p = obs[ni][0] / obs[ni].sum()
        beta = ss.beta(a=posterior[ni][0], b=posterior[ni][1])
        print(names[id2node[ni]], surprises[ni], beta.ppf(p), sep='\t')
    print()

2010 -> 1950
Bo Diddley	11842.384511662203	0.337058155657353
Louis Armstrong	11736.257370479907	0.32492766126590344
Jelly Roll Morton	11727.683446581148	0.32495032106313
Lead Belly	11701.761873804338	0.3256430284765886
James P. Johnson	11696.674476163374	0.32500686960760033
The Carter Family	11692.390549742973	0.32508634720628304
Ray Charles	11678.52145632419	0.33020893302602006
Muddy Waters	11659.095302395974	0.3270056600760024
Louis Jordan	11651.392533330996	0.3254721703619725
Duke Ellington	11644.340672472412	0.3251540574064349
W.C. Handy	11633.535428338415	0.3252221041850291
Fletcher Henderson	11623.237983998042	0.3251424919027258
Kid Ory	11617.495501633151	0.3251651517155467
Hank Williams	11610.007522682223	0.32632363502921996
Lonnie Johnson	11609.673177969817	0.32529001635183136
Fats Waller	11601.596491219192	0.3252558583107358
Miles Davis	11596.792322146437	0.3278227740610473
Jimmy Reed	11589.531145659508	0.3336114665479847
Eubie Blake	11587.58469282867	0.3252329965335282
T-Bone

In [34]:
prior = get_rank_positions(1940, node2id, pageranks)
surprises = np.zeros(len(node2id))
for decade in range(1950, 2020, 10):
    obs = get_rank_positions(decade, node2id, pageranks)

    posterior, surprises = rank_surprise_detector(obs, prior)
    print(decade-10, '->', decade,
          surprises.sum() / allmusic_graphs[decade].number_of_nodes())
    top = surprises.argsort()[-20:][::-1]
    for ni in top:
        p = obs[ni][0] / obs[ni].sum()
        beta = ss.beta(a=posterior[ni][0], b=posterior[ni][1])
        print(names[id2node[ni]], surprises[ni], beta.ppf(p), sep='\t')
    print()
    prior = obs

1940 -> 1950 21.14105094682595
Joana Zimmer	79.74342597954092	0.4709712207626524
Thelonious Monster	79.20397775781021	0.4710709352721716
Childish Gambino	79.19087857165869	0.47107275970772267
Eldar	79.05164010844601	0.47109684697907456
Denis Dufour	77.84917469530774	0.5216925040785245
Stoney Edwards	77.00153374543879	0.4714735441924793
Jackie King	75.03663110580237	0.5209442353033611
Red Ants	74.78739934737678	0.520984393502301
Let's Eat Grandma	74.65746472858882	0.5209516742402512
Emanuel and the Fear	74.55019158634241	0.5209242876510551
The Rockin' Ramrods	74.42222633551864	0.5211513821356966
Tommy Conwell	74.21941297044395	0.5207653990912285
Kirka	74.13452464588045	0.5208417752186356
My Favorite	74.06393409116572	0.5208736233945649
Astronoid	73.93498899081897	0.5210347277688121
Dev	73.91826950331597	0.5209678123963275
The Crookes	73.88927338182111	0.5208425843781402
Jill Barber	73.85267025885696	0.520751337568838
Camera Shy	73.80143538169796	0.5217521404561598
U.S. Chaos	73.62721547