In [2]:
import pandas as pd
import networkx as nx
import wikipediaapi

In [3]:
# 1. Load artist data
df = pd.read_csv("https://raw.githubusercontent.com/freiraum-bq/Music_Project/main/data/raw/neo4j_artists.csv")
artists = df[['common_name', 'wiki_url']]

# Drop rows where wiki_url is NaN or an empty string
artists = artists[artists['wiki_url'].notna() & (artists['wiki_url'] != '')]
wiki = wikipediaapi.Wikipedia(user_agent="MusicGraphExample (research@example.com)", language="en")

In [23]:
import wikipediaapi
from concurrent.futures import ProcessPoolExecutor, as_completed
import networkx as nx


artist_urls = dict(zip(artists['common_name'], artists['wiki_url']))
url_to_artist = {url: name for name, url in artist_urls.items()}

import wikipediaapi
import networkx as nx
from concurrent.futures import ThreadPoolExecutor
import threading
import time

artist_urls = dict(zip(artists['common_name'], artists['wiki_url']))
url_to_artist = {url: name for name, url in artist_urls.items()}

def get_page_links(url):
    tries = 3
    wiki = wikipediaapi.Wikipedia(user_agent="MusicGraphExample (research@example.com)", language="en")
    for attempt in range(tries):
        try:
            if '/wiki/' not in url:
                return url, set()
            page_title = url.split('/wiki/')[-1]
            page = wiki.page(page_title)
            if not page.exists():
                return url, set()
            links = page.links.keys()
            full_urls = {f"https://en.wikipedia.org/wiki/{link}" for link in links}
            return url, full_urls
        except Exception as e:
            if attempt == tries - 1:
                print(f"Exception in get_page_links for url {url}: {e}")
                return url, set()
            else:
                time.sleep(2 ** attempt)

def build_graph_threaded(artist_urls, url_to_artist, max_workers=10, batch_print=200):
    G = nx.DiGraph()
    artist_names = list(artist_urls.keys())
    total = len(artist_names)
    lock = threading.Lock()
    progress = {'count': 0}

    for artist in artist_names:
        G.add_node(artist)

    def worker(artist):
        url = artist_urls[artist]
        url, linked_urls = get_page_links(url)
        edges = []
        for linked_url in linked_urls:
            if linked_url in url_to_artist and linked_url != url:
                mentioned_artist = url_to_artist[linked_url]
                edges.append((artist, mentioned_artist))
        with lock:
            progress['count'] += 1
            if progress['count'] % batch_print == 0 or progress['count'] == total:
                print(f"Processed {progress['count']} / {total} artists...")
        return edges

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(worker, artist_names)

        for edges in results:
            for u, v in edges:
                G.add_edge(u, v, relation='MENTIONS')

    print(f"Graph built with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    return G

# Usage example:
G = build_graph_threaded(artist_urls, url_to_artist)



Processed 200 / 5828 artists...
Processed 400 / 5828 artists...
Processed 600 / 5828 artists...
Processed 800 / 5828 artists...
Processed 1000 / 5828 artists...
Processed 1200 / 5828 artists...
Processed 1400 / 5828 artists...
Processed 1600 / 5828 artists...
Processed 1800 / 5828 artists...
Processed 2000 / 5828 artists...
Processed 2200 / 5828 artists...
Processed 2400 / 5828 artists...
Processed 2600 / 5828 artists...
Processed 2800 / 5828 artists...
Processed 3000 / 5828 artists...
Processed 3200 / 5828 artists...
Processed 3400 / 5828 artists...
Processed 3600 / 5828 artists...
Processed 3800 / 5828 artists...
Processed 4000 / 5828 artists...
Processed 4200 / 5828 artists...
Processed 4400 / 5828 artists...
Processed 4600 / 5828 artists...
Processed 4800 / 5828 artists...
Processed 5000 / 5828 artists...
Processed 5200 / 5828 artists...
Processed 5400 / 5828 artists...
Processed 5600 / 5828 artists...
Processed 5800 / 5828 artists...
Processed 5828 / 5828 artists...
Graph built wi

In [None]:
# # loads entire wiki
# import pandas as pd
# import wikipediaapi
# from concurrent.futures import ThreadPoolExecutor
# import threading
# import time

# # Prepare artist names list
# artist_names = artists['common_name'].tolist()

# def fetch_all_pages(artist_names, wiki):
#     wiki_texts = {}
#     lock = threading.Lock()
#     progress = {'count': 0}

#     def fetch_page(artist):
#         tries = 3
#         for attempt in range(tries):
#             try:
#                 page = wiki.page(artist)
#                 text = page.text if page.exists() else None
#                 break
#             except Exception as e:
#                 if attempt == tries - 1:
#                     text = None
#                 else:
#                     time.sleep(2 ** attempt)
#         with lock:
#             wiki_texts[artist] = text
#             progress['count'] += 1
#             if progress['count'] % 200 == 0:
#                 print(f"Fetched Wikipedia pages for {progress['count']} artists out of {len(artist_names)}")
#         return

#     with ThreadPoolExecutor(max_workers=10) as executor:
#         executor.map(fetch_page, artist_names)

#     return wiki_texts

# print("Starting to fetch Wikipedia pages for all artists...")
# wiki_texts = fetch_all_pages(artist_names, wiki)
# print("Finished fetching Wikipedia pages.")

Starting to fetch Wikipedia pages for all artists...
Fetched Wikipedia pages for 200 artists out of 5871
Fetched Wikipedia pages for 400 artists out of 5871
Fetched Wikipedia pages for 600 artists out of 5871
Fetched Wikipedia pages for 800 artists out of 5871
Fetched Wikipedia pages for 1000 artists out of 5871
Fetched Wikipedia pages for 1200 artists out of 5871
Fetched Wikipedia pages for 1400 artists out of 5871
Fetched Wikipedia pages for 1600 artists out of 5871
Fetched Wikipedia pages for 1800 artists out of 5871
Fetched Wikipedia pages for 2000 artists out of 5871
Fetched Wikipedia pages for 2200 artists out of 5871
Fetched Wikipedia pages for 2400 artists out of 5871
Fetched Wikipedia pages for 2600 artists out of 5871
Fetched Wikipedia pages for 2800 artists out of 5871
Fetched Wikipedia pages for 3000 artists out of 5871
Fetched Wikipedia pages for 3200 artists out of 5871
Fetched Wikipedia pages for 3400 artists out of 5871
Fetched Wikipedia pages for 3600 artists out of 58

In [None]:
# Basic stats
print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

# Count mentions: who is mentioned most (incoming edges)
mention_count = Counter(v for _, v in G.edges())
top10 = mention_count.most_common(10)

print("Top-10 most-mentioned artists:")
for name, cnt in top10:
    print(f"{name}: mentioned by {cnt} pages")
    print ("eminem, yay. No taylor swift cuz she traaaash")


Nodes: 5828
Edges: 10086
Top-10 most-mentioned artists:
Adele: mentioned by 389 pages
Beyoncé: mentioned by 373 pages
U2: mentioned by 358 pages
Eminem: mentioned by 316 pages
Madonna: mentioned by 312 pages
Metallica: mentioned by 254 pages
Aerosmith: mentioned by 242 pages
Coldplay: mentioned by 242 pages
Rihanna: mentioned by 240 pages
Bono: mentioned by 225 pages
