In [19]:
from itertools import chain, product, combinations
import json
from pprint import pprint

from scipy import sparse
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, Markdown, IFrame

import networkx as nx
import plotly.graph_objects as go

# Select seaborn's 'whitegrid' plot style.
sns.set_style('whitegrid')

In [2]:
# Load the artist and song tables; both files are read with pandas' "table" orient.
artists = pd.read_json(path_or_buf='../data/artists_table.json', orient='table')
songs = pd.read_json(path_or_buf='../data/songs_table.json', orient='table')

In [8]:
# List the columns available on the artists table.
pprint(list(artists.columns))

['name',
 'week_min',
 'week_max',
 'weeks_on_chart_total',
 'weeks_on_chart_longest',
 'weeks_on_chart_mean',
 'rank_min',
 'rank_mean5',
 'rank_mean',
 'rank_median',
 'rank_min_mean',
 'rank_min_mean5',
 'rank_min_median',
 'num_songs',
 'y_min',
 'y_max']


In [32]:
# Count how many artists pass each candidate filter.
print("Minimum rank <= 10:\t", (artists.rank_min <= 10).sum())
print("Median rank <= 16:\t", (artists.rank_median <= 16).sum())
# The label states the threshold that the expression actually applies (50).
print("Num songs >= 50:\t", (artists.num_songs >= 50).sum())

Minimum rank <= 10:	 579
Median rank <= 16:	 49
Num songs >= 30:	 13


In [33]:
# Boolean mask selecting artists with at least 50 songs.
artist_mask = artists['num_songs'].ge(50)

In [35]:
# Index labels of the artists kept by the mask.
filtered_artist_ids = artists.loc[artist_mask].index

Number of songs featuring at least one of the 13 most prolific artists (those with `num_songs >= 50`)

In [38]:
# Count the songs whose collaborator ids include at least one selected artist.
songs.apply(
    lambda song: any(
        artist_id in filtered_artist_ids for artist_id in song.collaborators_id
    ),
    axis=1,
).sum()

1116

TODO:
* Create graph
* Set artist positions
* Perhaps record, for each artist, how many artists NOT in the network they collaborated with

In [None]:
class GraphWrapper(object):
    """Collaboration graph built from the ``Collaborators`` column of a chart table.

    Every artist becomes a node keyed by an integer index (its position in the
    sorted list of artists), carrying the attributes ``artist`` (the original
    label) and ``count`` (number of de-duplicated songs the artist appears on).
    Two artists are linked by an edge when they appear together on a song.

    Connected components and cliques are computed on first access and cached.
    """

    def __init__(self, _df):
        """Build the graph.

        Parameters
        ----------
        _df : pandas.DataFrame
            Needs the columns ``Artists``, ``Name``, ``Weekly.rank``,
            ``Weeks.on.chart`` and ``Collaborators``.  Each ``Collaborators``
            value must be a sized iterable of hashable, sortable artist labels.
            NOTE(review): presumably one row per song per chart week - confirm.
        """
        songs = self._best_chart_rows(_df)
        songs['NumCollaborators'] = songs.Collaborators.apply(len)

        # The artist universe is taken from every input row, not only from the
        # de-duplicated songs.
        artists = set(chain.from_iterable(_df.Collaborators))

        # artist maps: sort once and reuse the list for both directions
        ind2artist = sorted(artists)
        artist2ind = {artist: ind for ind, artist in enumerate(ind2artist)}

        G = nx.Graph()
        for collaborators in songs.Collaborators:
            artist_indices = [artist2ind[artist] for artist in collaborators]

            # Register nodes before edges so each node is created with its
            # attributes instead of being added bare by add_edges_from.
            for ind, artist in zip(artist_indices, collaborators):
                if ind in G.nodes:
                    G.nodes[ind]['count'] += 1
                else:
                    G.add_node(ind, artist=artist, count=1)

            # add edges between each pair(2) of artists in the collaborators list of this song
            G.add_edges_from(combinations(artist_indices, 2))
            # TODO: add weights

        self.songs = songs
        self.artists = artists
        self.artist2ind = artist2ind
        self.ind2artist = ind2artist
        self.graph = G
        self._connected_components = None
        self._cliques = None

    @staticmethod
    def _best_chart_rows(_df):
        """Return one row per (``Artists``, ``Name``) pair.

        Rows are ordered by ascending ``Weekly.rank`` and, within equal ranks,
        by descending ``Weeks.on.chart``; the first row of each group is kept.
        A single multi-key sort is used because chaining two ``sort_values``
        calls relies on the second sort being stable, which pandas' default
        sort algorithm does not guarantee.
        """
        ordered = _df.sort_values(
            ['Weekly.rank', 'Weeks.on.chart'], ascending=[True, False]
        )
        return ordered.groupby(['Artists', 'Name']).first().reset_index()

    @property
    def connected_components(self):
        """List of node-index sets, one per connected component (cached)."""
        if self._connected_components is None:
            self._connected_components = list(nx.connected_components(self.graph))
        return self._connected_components

    @property
    def ccs(self):
        """Short alias for :attr:`connected_components`."""
        return self.connected_components

    @property
    def cliques(self):
        """List of the cliques yielded by ``nx.enumerate_all_cliques`` (cached).

        NOTE(review): this enumerates all cliques rather than only maximal
        ones, which may be very large on dense graphs - confirm this is intended.
        """
        if self._cliques is None:
            self._cliques = list(nx.enumerate_all_cliques(self.graph))
        return self._cliques