In [1]:
# -*- coding: utf8

from scipy.special import gammaln
from scipy.special import psi


import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy.stats as ss

In [2]:
def extract_id(txt):
    """Return the identifier that starts at the last ``'mn'`` in ``txt``.

    The identifier is the slice of at most 12 characters beginning at the
    right-most occurrence of ``'mn'`` (presumably an AllMusic artist id such
    as ``'mn0000123456'`` -- confirm against the data file).

    Arguments
    ---------
    txt: string that embeds the identifier, e.g. a URL or a path.

    Returns
    -------
    The extracted identifier, or an empty string when ``txt`` contains no
    ``'mn'`` marker.
    """
    pos = txt.rfind('mn')
    if pos < 0:
        # rfind() signals "not found" with -1; slicing from -1 would return
        # the last character of short strings instead of nothing.
        return ''
    return txt[pos:pos + 12]


def load_am_json_data(fpath='../data/allmusic-data.json.gz'):
    """Load a gzip-compressed JSON file and return the decoded object.

    Arguments
    ---------
    fpath: path of the ``.json.gz`` file. Defaults to the data file this
           notebook was written against, so calling the function without
           arguments behaves as before.

    Returns
    -------
    Whatever ``json.load`` decodes from the file.
    """
    # Binary mode is kept on purpose: json.load() accepts a binary file and
    # detects the text encoding itself.
    with gzip.open(fpath, 'rb') as gzip_file:
        return json.load(gzip_file)

In [3]:
def build_graph(json_data, nodes_to_consider=None, restrictive=False):
    """Build a weighted directed graph from per-artist records.

    For every artist in ``nodes_to_consider`` an edge ``artist -> other`` is
    created for each distinct id extracted (via ``extract_id``) from the
    artist's ``'influencer'`` entries.

    Arguments
    ---------
    json_data:         mapping from artist key to a record that has an
                       ``'influencer'`` iterable of strings.
    nodes_to_consider: artist keys to use as edge sources. Defaults to
                       every key of ``json_data``.
    restrictive:       when true, edges whose target is not in
                       ``nodes_to_consider`` are dropped.

    Returns
    -------
    A ``networkx.DiGraph`` whose edges carry a ``'weight'`` attribute.
    Artists that end up without any edge are not added as nodes.
    ``json_data`` is handed to ``nx.set_node_attributes`` before returning.
    """
    graph = nx.DiGraph()
    if nodes_to_consider is None:
        nodes_to_consider = set(json_data.keys())

    for artist in nodes_to_consider:
        record = json_data[artist]
        # De-duplicate the targets so each pair is visited once per artist.
        targets = {extract_id(entry) for entry in record['influencer']}
        for target in targets:
            outside_selection = target not in nodes_to_consider
            if restrictive and outside_selection:
                continue

            if not graph.has_edge(artist, target):
                graph.add_edge(artist, target, weight=1)
            else:
                graph[artist][target]['weight'] += 1

    nx.set_node_attributes(graph, json_data)
    return graph

In [4]:
# Decoded contents of the gzipped JSON data file; later cells look artists
# up in this mapping by key.
allmusic_json = load_am_json_data()

In [5]:
# Build one graph per decade. The node selection is cumulative: an artist is
# included from the first decade (1940 onwards) listed in its 'decades'
# entry and stays included in every later snapshot.
allmusic_graphs = dict()
nodes_to_consider_allmusic = set()
print('Decade  Nodes Edges')
for decade in range(1940, 2020, 10):
    # Decades from 1940 up to and including the current one. Built once per
    # decade rather than once per artist, since it does not depend on the
    # artist.
    decades_so_far = set(range(1940, decade + 1, 10))
    for artist in allmusic_json:
        if decades_so_far.intersection(allmusic_json[artist]['decades']):
            nodes_to_consider_allmusic.add(artist)

    # restrictive=True keeps only edges between selected artists.
    graph = build_graph(allmusic_json,
                        nodes_to_consider=nodes_to_consider_allmusic,
                        restrictive=True)
    allmusic_graphs[decade] = graph
    print(decade,': ',graph.number_of_nodes(),' ',graph.number_of_edges())

Decade  Nodes Edges
1940 :  1113   2981
1950 :  2284   7726
1960 :  4336   17901
1970 :  6652   28885
1980 :  10106   44395
1990 :  17376   72661
2000 :  26603   105658
2010 :  29878   115053


In [6]:
# Weighted in-degree of every node, keyed by source and then by decade:
# in_degree['allmusic'][decade][node] -> weighted in-degree.
#
# The inner mapping starts empty. dict(keys=range(...)) does not create
# decade keys; it creates the single string key 'keys' holding a range
# object, which would show up when iterating over the decades.
in_degree = {'allmusic': {}}
for decade in range(1940, 2020, 10):
    in_degree['allmusic'][decade] = dict(
        allmusic_graphs[decade].in_degree(weight='weight'))

In [7]:
# Weighted out-degree of every node, keyed by source and then by decade:
# out_degree['allmusic'][decade][node] -> weighted out-degree.
#
# The inner mapping starts empty. dict(keys=range(...)) does not create
# decade keys; it creates the single string key 'keys' holding a range
# object, which would show up when iterating over the decades.
out_degree = {'allmusic': {}}
for decade in range(1940, 2020, 10):
    out_degree['allmusic'][decade] = dict(
        allmusic_graphs[decade].out_degree(weight='weight'))

In [8]:
# PageRank score of every node, keyed by source and then by decade:
# pageranks['allmusic'][decade][node] -> PageRank value.
#
# The inner mapping starts empty. dict(keys=range(...)) does not create
# decade keys; it creates the single string key 'keys' holding a range
# object, which would show up when iterating over the decades.
pageranks = {'allmusic': {}}
for decade in range(1940, 2020, 10):
    pageranks['allmusic'][decade] = nx.pagerank(allmusic_graphs[decade])

In [9]:
# Give every node of the 2010 snapshot a dense integer index, following the
# iteration order of the 2010 PageRank mapping, and keep the inverse lookup.
all_nodes = set(pageranks['allmusic'][2010])
node2id = {}
id2node = {}
for index, node in enumerate(pageranks['allmusic'][2010]):
    node2id[node] = index
    id2node[index] = node

In [10]:
# Display name of every record in the data file, keyed like allmusic_json.
names = {artist_key: record['name']
         for artist_key, record in allmusic_json.items()}

In [11]:
def get_rank_positions(decade, node2id, centrality):
    """Turn the centrality scores of one decade into 1-based rank positions.

    Arguments
    ---------
    decade:     key selecting the snapshot in ``centrality['allmusic']``.
    node2id:    mapping from node to its row index in the result.
    centrality: nested mapping ``centrality['allmusic'][decade][node]``
                holding one score per node.

    Returns
    -------
    Integer array of shape ``(len(node2id), 2)``. Column 0 is the node's
    position when sorting by ``1 / (score + 1)`` ascending, so the highest
    score gets position 1. Column 1 is the position when sorting by
    ``score + 1`` ascending, so the lowest score gets position 1.
    """
    scores = np.zeros(shape=(2, len(node2id)))
    # NOTE(review): nodes of node2id that are missing from this decade keep
    # 0.0 in both rows, so they sort ahead of every node with a non-negative
    # score in *both* columns -- confirm this is the intended treatment.
    for node, value in centrality['allmusic'][decade].items():
        column = node2id[node]
        shifted = value + 1
        scores[0, column] = 1.0 / shifted
        scores[1, column] = shifted

    # The argsort of an argsort gives each entry's 0-based position in the
    # sorted order of its row.
    positions = scores.argsort(axis=1).argsort(axis=1)

    return positions.T + 1

In [12]:
def rank_surprise_detector(observed, prior):
    '''
    Computes the rank based surprise.

    The surprise of node i is the Kullback-Leibler divergence
    KL(posterior[i] || prior[i]) between two Dirichlet distributions
    (Beta distributions, when there are two parameters per node), where
    posterior = prior + observed.

    Arguments
    ---------
    observed: matrix with the number of nodes with centrality
              values greater than (observed[i][0]) or lower
              than (observed[i][1]) node i.
    prior:    The prior for each node. prior[i][0] is alpha, and
              prior[i][1] is beta. In our paper, we set the prior
              for each such that prior[i][0] / sum(prior[i])
              captures the fraction of other nodes with centrality
              greater than node i.

    Both arguments are of shape (n_nodes, 2) and may be numpy arrays or
    anything np.asarray accepts (e.g. nested lists). Organizing data like
    this makes it easy to compute the posterior = observed + prior.

    Returns
    -------
    The posterior, an array of shape (n_nodes, 2), and the surprise for
    each node, an array of shape (n_nodes,).

    References
    ----------

    [1]  Penny  WD  (2001):  "KL-Divergences  of  Normal,
         Gamma,  Dirichlet  and  Wishart densities".
         University College, London;
         URL: www.fil.ion.ucl.ac.uk/~wpenny/publications/densities.ps.
    [2]  Kullback-Leibler Divergence Between Two Dirichlet
         (and Beta) Distributions. URL: http://bariskurt.com/
    '''
    # Coerce first: with plain lists, `prior + observed` would concatenate
    # the rows instead of adding them element-wise.
    prior = np.asarray(prior)
    observed = np.asarray(observed)
    posterior = prior + observed

    # From here on the code is the KL divergence between two Dirichlets,
    # adapted from and double checked against [1] and [2].
    posterior_total = posterior.sum(axis=1)

    # Log normalising constants: ln Gamma(sum_k a_k) - sum_k ln Gamma(a_k).
    d_obs = gammaln(posterior_total) - np.sum(gammaln(posterior), axis=1)
    d_pri = gammaln(prior.sum(axis=1)) - np.sum(gammaln(prior), axis=1)

    # sum_k (posterior_k - prior_k) * (psi(posterior_k) - psi(sum_k posterior_k));
    # the transposes line the per-node totals up with the last axis.
    individual_factors = (posterior - prior).T * \
            (psi(posterior).T - psi(posterior_total))
    surprises = d_obs - d_pri + individual_factors.sum(axis=0)

    return posterior, surprises

In [13]:
# Rank positions computed from the 2010 PageRank scores; the next cell uses
# the same values as the prior.
prior = get_rank_positions(2010, node2id, pageranks)
prior  # bare expression so the notebook displays the array

array([[20025,     1],
       [  364, 29515],
       [23312, 17316],
       ...,
       [16603,  8324],
       [29572,  3681],
       [29878, 12498]])

In [14]:
# Surprise of each early snapshot measured against a prior taken from the
# 2010 PageRank ranks. For every snapshot the 20 most surprising nodes are
# printed as: name, surprise, posterior Beta quantile at the observed
# fraction obs[i][0] / sum(obs[i]).
prior = get_rank_positions(2010, node2id, pageranks)
for decade in range(1940, 2000, 10):
    obs = get_rank_positions(decade, node2id, pageranks)
    # NOTE(review): the header shows decade + 10 although the observation
    # above is computed for `decade` -- confirm which one is intended.
    print(2010, '->', decade + 10)

    posterior, surprises = rank_surprise_detector(obs, prior)
    # Node indices ordered from most to least surprising, cut to 20.
    most_surprising = surprises.argsort()[::-1][:20]
    for node_index in most_surprising:
        observed_counts = obs[node_index]
        fraction = observed_counts[0] / observed_counts.sum()
        posterior_dist = ss.beta(a=posterior[node_index][0],
                                 b=posterior[node_index][1])
        print(names[id2node[node_index]],
              surprises[node_index],
              posterior_dist.ppf(fraction),
              sep='\t')
    print()

2010 -> 1950
Bo Diddley	11795.897683501607	0.33601758798328163
Louis Armstrong	11736.257370479907	0.32492766126590344
Jelly Roll Morton	11727.683446581148	0.32495032106313
Lead Belly	11701.761873804338	0.3256430284765886
James P. Johnson	11696.674476163374	0.32500686960760033
The Carter Family	11692.390549742973	0.32508634720628304
Ray Charles	11678.52145632419	0.33020893302602006
Muddy Waters	11659.095302395974	0.3270056600760024
Louis Jordan	11651.392533330996	0.3254721703619725
Duke Ellington	11644.340672472412	0.3251540574064349
W.C. Handy	11633.535428338415	0.3252221041850291
Fletcher Henderson	11623.237983998042	0.3251424919027258
Kid Ory	11617.495501633151	0.3251651517155467
Hank Williams	11610.007522682223	0.32632363502921996
Lonnie Johnson	11609.673177969817	0.32529001635183136
Fats Waller	11601.596491219192	0.3252558583107358
Miles Davis	11596.792322146437	0.3278227740610473
Jimmy Reed	11590.8698314043	0.33364175741489854
Eubie Blake	11587.58469282867	0.3252329965335282
T-Bon

In [15]:
# Decade-over-decade surprise: the previous decade's PageRank ranks act as
# the prior and the current decade's ranks as the observation. The header
# line also prints the summed surprise divided by the number of nodes in the
# current decade's graph (the sum itself runs over every node in node2id).
for decade in range(1950, 2020, 10):
    previous_decade = decade - 10
    prior = get_rank_positions(previous_decade, node2id, pageranks)
    obs = get_rank_positions(decade, node2id, pageranks)

    posterior, surprises = rank_surprise_detector(obs, prior)
    surprise_per_node = (surprises.sum()
                         / allmusic_graphs[decade].number_of_nodes())
    print(previous_decade, '->', decade, surprise_per_node)

    # Node indices ordered from most to least surprising, cut to 20.
    most_surprising = surprises.argsort()[::-1][:20]
    for node_index in most_surprising:
        observed_counts = obs[node_index]
        fraction = observed_counts[0] / observed_counts.sum()
        posterior_dist = ss.beta(a=posterior[node_index][0],
                                 b=posterior[node_index][1])
        print(names[id2node[node_index]],
              surprises[node_index],
              posterior_dist.ppf(fraction),
              sep='\t')
    print()

1940 -> 1950 3.8650360135407276
Fats Domino	16.732463184773223	0.49496849265085396
Bill Haley	16.382403262388834	0.495575003798559
Clarence "Gatemouth" Brown	15.076960342077655	0.49542494679326565
The Clovers	15.057104760933726	0.49566655406603255
Bo Diddley	13.3488438467175	0.4960401929373018
Clarence Fountain	13.333244189714605	0.4959989435422384
Lula Reed	13.306284304882865	0.5070519045069587
Esther Phillips	12.556705116381636	0.4987726959032808
Mike Walbridge	11.407775266590761	0.4896125480262882
Lucia Dlugoszewski	11.298392272554338	0.5106680364074789
Doug Watkins	10.672291896582465	0.5113107091217168
Kenny Dorham	10.616795092078974	0.4963252999412312
François-Bernard Mâche	10.615620929500437	0.5103353179639374
Connie Converse	10.522883130433911	0.5107734671417403
The Tams	10.443878030389897	0.5116280550276857
Jimmy Reed	10.420962994998263	0.4926780492726743
Bill Potts	10.286131021377514	0.5115404557944991
Levon Helm	10.27659430528729	0.5129393288139686
Elvis Presley	10.2632330650