In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Load data
df_orig = pd.read_csv("originals.csv")
df_cov = pd.read_csv("covers.csv")

# Create the graph
G = nx.DiGraph()


In [6]:
import numpy as np

df_try=np.array(df_orig['org_art_name'])

print(df_try)
# Add artist nodes
for artist_name in df_try:
    G.add_node(artist_name, type='Artist')


['Sidney Bechet' 'Dan Hartman' 'Van Morrison' ...
 'Annie Ross & The Low Note Quintet' 'Teitur' 'Can']


In [10]:
len(G.nodes)

17975

In [17]:
import pandas as pd
# Merge on perf_id, keep only cov_art_id and org_art_id
df_merged = pd.merge(df_cov[['song_title', 'cov_art_id']], 
                     df_orig[['song_title', 'org_art_id']], 
                     on='song_title', 
                     how='inner')

print(df_merged.head())

        song_title  cov_art_id org_art_id
0     Petite fleur       [879]        [1]
1   Evil Gal Blues        [10]        [8]
2   Evil Gal Blues         [9]        [8]
3  Big Yellow Taxi  [20, 1912]     [5191]
4    Light My Fire        [21]       [16]


In [23]:
import pandas as pd
import ast

# Step 1: Parse stringified lists if needed
def parse_list(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except:
            return [val]
    return val

df_merged['cov_art_id'] = df_merged['cov_art_id'].apply(parse_list)
df_merged['org_art_id'] = df_merged['org_art_id'].apply(parse_list)

# Step 2: Explode both columns
df_exploded = df_merged.explode('cov_art_id').explode('org_art_id').reset_index(drop=True)
print(df_exploded)


                 song_title cov_art_id org_art_id
0              Petite fleur        879          1
1            Evil Gal Blues         10          8
2            Evil Gal Blues          9          8
3           Big Yellow Taxi         20       5191
4           Big Yellow Taxi       1912       5191
...                     ...        ...        ...
584171         Modern Music      35611      17062
584172      Surabaya Johnny      14339       2437
584173          Being Alive      14339      69125
584174          Being Alive      14339      69125
584175  Yesterday Once More        894       1389

[584176 rows x 3 columns]


In [24]:
edge_count = 0
for _, row in df_merged.iterrows():
    covering_artists = row['cov_art_id']
    original_artists = row['org_art_id']

    # Create edges for every pair (covering -> original)
    for cov_id in covering_artists:
        for org_id in original_artists:
            if cov_id != org_id:  # avoid self-loop
                G.add_edge(cov_id, org_id, relation='COVERED')
                edge_count += 1

In [25]:
len(G.edges)

499454

In [75]:
import collections

# Step 2: Count how many times each artist is covered
cover_count = collections.Counter()
print(cover_count)
for u, v, data in G.edges(data=True):
    if data.get('relation') == 'COVERED':
        cover_count[v] += 1  # v = original artist who was covered

# Step 3: Get top 10 most covered artists
top_10_covered = cover_count.most_common(10)
print(top_10_covered)

# Step 4: Print them
for artist, count in top_10_covered:
    print(f"{artist}: covered {count} times")

Counter()
[(41, 4252), (243, 2261), (319, 2184), (4305, 2116), (158, 2058), (11023, 1885), (1535, 1706), (2232, 1701), (61, 1556), (2575, 1551)]
41: covered 4252 times
243: covered 2261 times
319: covered 2184 times
4305: covered 2116 times
158: covered 2058 times
11023: covered 1885 times
1535: covered 1706 times
2232: covered 1701 times
61: covered 1556 times
2575: covered 1551 times


In [27]:
# Extract only the artist IDs (integers)
top_10_artist_ids = [artist_id for artist_id, count in top_10_covered]
print(top_10_artist_ids)

[41, 243, 319, 4305, 158, 11023, 1535, 2232, 61, 2575]


In [50]:
# no real reason for this other than to convert everything to a string
# Merge on perf_id, keep only cov_art_id and org_art_id
df_ids_and_names=df_orig[['org_art_name', 'org_art_id']]

# Step 1: Parse stringified lists if needed
def parse_list(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except:
            return [val]
    return val

df_ids_and_names['org_art_id'] = df_ids_and_names['org_art_id'].apply(parse_list)


# Step 2: Explode both columns
df_id_names_exploded = df_ids_and_names.explode('org_art_id').reset_index(drop=True)
print(df_id_names_exploded)

                            org_art_name org_art_id
0                          Sidney Bechet          1
1                            Dan Hartman       5483
2                           Van Morrison          6
3                       Dinah Washington          8
4                            The Regents         13
...                                  ...        ...
61807                  The Four Freshmen      14202
61808                          Judy Lynn     100685
61809  Annie Ross & The Low Note Quintet       2442
61810                             Teitur      50123
61811                                Can      17460

[61812 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ids_and_names['org_art_id'] = df_ids_and_names['org_art_id'].apply(parse_list)


In [73]:
artist_ids = [41, 243, 319, 4305, 158, 11023, 1535, 2232, 61, 2575]
artist_ids_str = [str(x) for x in artist_ids]

# Filter rows where org_art_id (as string) matches any artist_id string
matches = df_id_names_exploded[df_id_names_exploded['org_art_id'].astype(str).isin(artist_ids_str)]

# Drop duplicates to keep only the first instance of each artist_id
matches_unique = matches.drop_duplicates(subset='org_art_id')

# Select and rename columns
counts = [count for _, count in top_10_covered]
result_df = matches_unique[['org_art_name', 'org_art_id']].copy()
result_df.columns = ['original_name', 'artist_id']
result_df["times covered"]=counts
print(result_df)


                                  original_name artist_id  times covered
20                                  The Beatles        41           4252
35                                   Will Smith        61           2261
107                                   Bob Dylan       158           2184
330                                  Diana Ross     11023           2116
603                    Bing Crosby, Grace Kelly       243           2058
1050                                 The Troggs      1535           1885
1668   Mae West & Duke Ellington, His Orchestra      4305           1706
1696                               Fred Astaire      2232           1701
1760                              Frank Sinatra       319           1556
13581                               The Sundays      2575           1551


pagerank centrality metric
importance based on network structure

In [None]:
print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())


Nodes: 76356
Edges: 499454


In [92]:
# Compute PageRank
pagerank_scores = nx.pagerank(G, alpha=0.85) # is a default
# # Sort and get top 10
top_10_pagerank = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)[:10]
pagerank_results=np.array(["top ten pagerank"])

# Extract just the artist IDs into a list
pagerank_results= [artist_id for artist_id, score in top_10_pagerank]
print(pagerank_results)

[41, 158, 206, 616, 90, 297, 194, 604, 243, 277]


In [94]:
pagerank_results = [str(i) for i in pagerank_results]
print(pagerank_results)
# Filter rows where org_art_id (as string) matches any artist_id string
matches = df_id_names_exploded[df_id_names_exploded['org_art_id'].astype(str).isin(pagerank_results)]

# Drop duplicates to keep only the first instance of each artist_id
matches_unique = matches.drop_duplicates(subset='org_art_id')

# Select and rename columns
counts = [count for _, count in top_10_covered]
pagerank_result_new = matches_unique[['org_art_name', 'org_art_id']].copy()
pagerank_result_new.columns = ['original_name', 'artist_id']
print(pagerank_result_new)


['41', '158', '206', '616', '90', '297', '194', '604', '243', '277']
                 original_name artist_id
20                 The Beatles        41
48      The Velvet Underground        90
107                  Bob Dylan       158
127                David Bowie       194
150         The Rolling Stones       206
200                The Stooges       277
473                    Ramones       604
495               Depeche Mode       616
603   Bing Crosby, Grace Kelly       243
1410                        U2       297


In-degree Centrality

In [97]:
# Compute in-degree centrality (for directed graphs)
in_degree_centrality = nx.in_degree_centrality(G)

# Sort and get top 10 nodes by in-degree centrality
in_degree_centrality_results= sorted(in_degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]

# Extract just the artist IDs into a list
in_degree_centrality_results= [artist_id for artist_id, score in in_degree_centrality_results]
print(in_degree_centrality_results)

[41, 243, 319, 4305, 158, 11023, 1535, 2232, 61, 2575]


In [98]:
in_degree_centrality_results = [str(i) for i in in_degree_centrality_results]
print(in_degree_centrality_results)
# Filter rows where org_art_id (as string) matches any artist_id string
matches = df_id_names_exploded[df_id_names_exploded['org_art_id'].astype(str).isin(in_degree_centrality_results)]

# Drop duplicates to keep only the first instance of each artist_id
matches_unique = matches.drop_duplicates(subset='org_art_id')

# Select and rename columns
counts = [count for _, count in top_10_covered]
in_degree_centrality_results_new = matches_unique[['org_art_name', 'org_art_id']].copy()
in_degree_centrality_results_new.columns = ['original_name', 'artist_id']
print(in_degree_centrality_results_new)

['41', '243', '319', '4305', '158', '11023', '1535', '2232', '61', '2575']
                                  original_name artist_id
20                                  The Beatles        41
35                                   Will Smith        61
107                                   Bob Dylan       158
330                                  Diana Ross     11023
603                    Bing Crosby, Grace Kelly       243
1050                                 The Troggs      1535
1668   Mae West & Duke Ellington, His Orchestra      4305
1696                               Fred Astaire      2232
1760                              Frank Sinatra       319
13581                               The Sundays      2575


Betweenness Centrality

Nodes that control flow / connectors

Example in Your Music Network Context
Betweenness could highlight artists who link different genres or scenes by covering songs from multiple communities.

Artists who bridge otherwise disconnected groups or influence multiple clusters.

They might not be the most covered (high in-degree) but have strategic importance connecting parts of the network.



In [102]:
# Approximate betweenness using a sample of nodes
# note: run time is insane otherwise (could take days), thus k set low
betweenness = nx.betweenness_centrality(G, k=100)  # try k=500 or lower

# Sort nodes by betweenness centrality descending and take top 10
top_10_betweenness = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:10]

# Extract just the artist IDs into a list
betweenness_centrality_results= [artist_id for artist_id, score in top_10_betweenness]
print(betweenness_centrality_results)

[41, 158, 10, 319, 103, 206, 148, 243, 618, 6492]


In [103]:
betweenness_centrality_results = [str(i) for i in in_degree_centrality_results]
print(betweenness_centrality_results)
# Filter rows where org_art_id (as string) matches any artist_id string
matches = df_id_names_exploded[df_id_names_exploded['org_art_id'].astype(str).isin(betweenness_centrality_results)]

# Drop duplicates to keep only the first instance of each artist_id
matches_unique = matches.drop_duplicates(subset='org_art_id')

# Select and rename columns
counts = [count for _, count in top_10_covered]
betweenness_centrality_results_new = matches_unique[['org_art_name', 'org_art_id']].copy()
betweenness_centrality_results_new.columns = ['original_name', 'artist_id']
print(betweenness_centrality_results_new)

['41', '243', '319', '4305', '158', '11023', '1535', '2232', '61', '2575']
                                  original_name artist_id
20                                  The Beatles        41
35                                   Will Smith        61
107                                   Bob Dylan       158
330                                  Diana Ross     11023
603                    Bing Crosby, Grace Kelly       243
1050                                 The Troggs      1535
1668   Mae West & Duke Ellington, His Orchestra      4305
1696                               Fred Astaire      2232
1760                              Frank Sinatra       319
13581                               The Sundays      2575


Network Description

In [104]:
# Basic stats
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
density = nx.density(G)

# Components (weakly connected components for directed graph)
num_components = nx.number_weakly_connected_components(G)

# Average shortest path length (only valid on strongly connected graphs or components)
# We'll use the largest weakly connected component
largest_cc = max(nx.weakly_connected_components(G), key=len)
G_sub = G.subgraph(largest_cc)
try:
    avg_shortest_path = nx.average_shortest_path_length(G_sub)
except:
    avg_shortest_path = "N/A (graph not connected)"

# Create a table
metrics = {
    "Metric": [
        "Number of vertices (artists)",
        "Number of edges (song covers)",
        "Number of components",
        "Average shortest path length",
        "Density"
    ],
    "Value": [
        num_nodes,
        num_edges,
        num_components,
        avg_shortest_path,
        density
    ]
}

metrics_df = pd.DataFrame(metrics)
print(metrics_df.to_string(index=False))

                       Metric                     Value
 Number of vertices (artists)                     76356
Number of edges (song covers)                    499454
         Number of components                     18153
 Average shortest path length N/A (graph not connected)
                      Density                  0.000086
