## Project 3 - Neo4j Graph Prep
Semester: Spring 2025 - Section 9

Course: w205 – Fundamentals of Data Engineering

### Importing Libraries & Load Dataset

In [1]:
import math
import os

import numpy as np
import pandas as pd
import psycopg2
from neo4j import GraphDatabase
from psycopg2.extras import execute_values

In [2]:
# !pip install kaggle
# !apt-get update && apt-get install -y unzip

In [3]:
# !mkdir -p ~/.kaggle
# !cp /user/projects/project-3-kalafosaurus/code/kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [4]:
# !kaggle datasets download -d andrewmvd/spotify-playlists

In [5]:
# !unzip spotify-playlists.zip -d spotify_data

In [6]:
# Parse zero data rows so that only the header line is read, then show the
# column names exactly as they are spelled in the file.
cols = pd.read_csv(
    '/user/projects/project-3-kalafosaurus/code/spotify_data/spotify_dataset.csv',
    nrows=0,
)
raw_column_names = list(cols.columns)
print(raw_column_names)

['user_id', ' "artistname"', ' "trackname"', ' "playlistname"']


In [7]:
# Load the playlist CSV under clean column names. header=0 discards the file's
# own header row in favour of `names`; malformed lines are skipped instead of
# raising. Rows with any missing field are then removed (index labels of the
# surviving rows are kept as read).
# NOTE(review): read_csv's default NA parsing treats literal strings such as
# "NA" or "null" as missing, so rows carrying such names are dropped here too —
# confirm that this is intended.
df = pd.read_csv(
    '/user/projects/project-3-kalafosaurus/code/spotify_data/spotify_dataset.csv',
    header=0,
    names=['user_id', 'artistname', 'trackname', 'playlistname'],
    on_bad_lines='skip',
).dropna()
df.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


### EDA

In [8]:
# Each entry pairs a heading with the value printed beneath it; consecutive
# sections are separated by one blank line.
eda_sections = [
    ("Dataset shape:", df.shape),
    ("Missing values:", df.isna().sum()),
    ("Unique counts:", df.nunique()),
    ("Top 10 artists:", df['artistname'].value_counts().head(10)),
    ("Top 10 tracks:", df['trackname'].value_counts().head(10)),
    ("Top 10 playlist names:", df['playlistname'].value_counts().head(10)),
]

for position, (heading, value) in enumerate(eda_sections):
    if position > 0:
        print()
    print(heading)
    print(value)

Dataset shape:
(12856838, 4)

Missing values:
user_id         0
artistname      0
trackname       0
playlistname    0
dtype: int64

Unique counts:
user_id           15914
artistname       289603
trackname       2004523
playlistname     157320
dtype: int64

Top 10 artists:
Daft Punk             36086
Coldplay              35485
Radiohead             31429
The Rolling Stones    30814
Kanye West            29111
JAY Z                 28928
Eminem                28894
Queen                 28079
David Bowie           27791
Michael Jackson       26335
Name: artistname, dtype: int64

Top 10 tracks:
Intro          6675
Home           5600
Closer         3548
Runaway        3349
Hold On        3224
Radioactive    3188
Forever        3055
Stay           2992
Alive          2936
Wake Me Up     2793
Name: trackname, dtype: int64

Top 10 playlist names:
Starred                  1334319
Liked from Radio          180079
Favoritas de la radio      30425
Rock                       30107
2014          

### Postgres

In [9]:
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a SELECT through the module-level ``connection`` and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, ``connection.rollback()`` is called before the query runs.
    rollback_after_flag : bool
        When true, ``connection.rollback()`` is called after the query runs.

    Returns
    -------
    pandas.DataFrame
        The query result. Every ``float64`` column whose non-missing values are
        all finite whole numbers is converted to the nullable ``Int64`` dtype;
        all other columns are returned as read.
    """
    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    for column in df:
        if df[column].dtype == "float64":
            present = df[column].dropna()
            # Vectorized whole-number test. Infinite values are excluded up
            # front: they are not integers, and converting them with int()
            # raises OverflowError.
            is_whole = bool(np.isfinite(present).all()) and bool((present == np.floor(present)).all())
            if is_whole:
                df[column] = df[column].astype('Int64')

    return df

In [10]:
# Connection settings default to the values used by the course containers but
# can be overridden through environment variables, so real credentials never
# have to be edited into the notebook.
connection = psycopg2.connect(
    user=os.environ.get("W205_PG_USER", "postgres"),
    password=os.environ.get("W205_PG_PASSWORD", "ucb"),
    host=os.environ.get("W205_PG_HOST", "postgres"),
    port=os.environ.get("W205_PG_PORT", "5432"),
    database=os.environ.get("W205_PG_DATABASE", "postgres"),
)

cur = connection.cursor()

# Recreate the staging table from scratch so this cell can be re-run safely.
cur.execute('DROP TABLE IF EXISTS spotify_data;')
cur.execute('''
    CREATE TABLE spotify_data (
        user_id VARCHAR(100),
        artistname VARCHAR(500),
        trackname VARCHAR(500),
        playlistname VARCHAR(500)
    );
''')
connection.commit()

# Prepare bulk data: one plain tuple per DataFrame row, in column order.
data_tuples = list(df.itertuples(index=False, name=None))

# Fast bulk insert. execute_values expands the single %s placeholder into a
# multi-row VALUES list; page_size sets how many rows go into each statement
# (the library default is 100), so a larger page means far fewer round trips.
insert_query = '''
    INSERT INTO spotify_data (user_id, artistname, trackname, playlistname)
    VALUES %s;
'''
execute_values(cur, insert_query, data_tuples, page_size=10_000)
connection.commit()

In [None]:
# Roll back the connection both before and after the read (see
# my_select_query_pandas, which calls connection.rollback() for each flag).
rollback_before_flag = True
rollback_after_flag = True

# The artist_pairs CTE self-joins spotify_data on playlistname to count, for
# each ordered-by-name artist pair, the distinct playlist names the two share,
# and keeps the 2000 pairs with the most shared names (at least two).
# The outer query then counts dataset rows per (artistname, trackname) for the
# artists appearing in those pairs and keeps the 10000 largest counts.
# NOTE(review): playlists are matched by name only, so same-named playlists of
# different users (e.g. "Starred") are treated as one playlist — confirm intended.
# NOTE(review): play_count is COUNT(*) over spotify_data rows, i.e. the number
# of playlist entries for that artist/track, not a listen count.
# NOTE(review): the self-join is unindexed and runs over the full table;
# presumably expensive on the complete dataset — verify runtime.
query = '''
WITH artist_pairs AS (
  SELECT a1.artistname as artist1, a2.artistname as artist2, 
         COUNT(DISTINCT a1.playlistname) as shared_playlists
  FROM spotify_data a1
  JOIN spotify_data a2 ON a1.playlistname = a2.playlistname 
                      AND a1.artistname < a2.artistname
  GROUP BY a1.artistname, a2.artistname
  HAVING COUNT(DISTINCT a1.playlistname) > 1
  ORDER BY shared_playlists DESC
  LIMIT 2000
)
SELECT artistname, trackname, COUNT(*) AS play_count
FROM spotify_data
WHERE artistname IN (SELECT artist1 FROM artist_pairs UNION SELECT artist2 FROM artist_pairs)
GROUP BY artistname, trackname
ORDER BY play_count DESC
LIMIT 10000;
'''

# df_pairs holds one row per (artistname, trackname) with its play_count.
df_pairs = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df_pairs

### Neo4j Nodes and Edges

In [None]:
# Node tables: one row per distinct name, tagged with the node label.
df_artists = (
    df_pairs[['artistname']]
    .drop_duplicates()
    .rename(columns={'artistname': 'name'})
    .assign(label='Artist')
)
df_tracks = (
    df_pairs[['trackname']]
    .drop_duplicates()
    .rename(columns={'trackname': 'name'})
    .assign(label='Track')
)

# Edge table for Artist -[:PERFORMED]-> Track, weighted by play_count.
edge_column_names = {
    'artistname': 'source',
    'trackname': 'target',
    'play_count': 'weight',
}
df_edges = df_pairs.rename(columns=edge_column_names).assign(type='PERFORMED')

df_artists.head(), df_tracks.head(), df_edges.head()

In [None]:
# Write each table to its CSV file without the DataFrame index column.
csv_exports = {
    "/user/projects/project-3-kalafosaurus/code/artists.csv": df_artists,
    "/user/projects/project-3-kalafosaurus/code/tracks.csv": df_tracks,
    "/user/projects/project-3-kalafosaurus/code/edges.csv": df_edges,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)

### Neo4j Graphs

In [None]:
# Connection settings default to the course container's values but can be
# overridden through environment variables, so credentials do not have to be
# edited into the notebook.
neo4j_uri = os.environ.get("W205_NEO4J_URI", "bolt://neo4j:7687")
neo4j_user = os.environ.get("W205_NEO4J_USER", "neo4j")
neo4j_pass = os.environ.get("W205_NEO4J_PASSWORD", "ucb_mids_w205")

# One driver and one long-lived session are shared by every cell below.
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_pass))
session = driver.session(database="neo4j")

In [None]:
# test connection
# session.run("MATCH (n) RETURN n LIMIT 5").data()

In [None]:
# Rows sent to Neo4j per UNWIND statement.
batch_size = 1000


def _run_in_batches(cypher, rows, label=None):
    """Execute ``cypher`` once per chunk of ``rows``, bound as the ``$rows`` parameter.

    ``rows`` is a list; each chunk holds at most ``batch_size`` entries. Every
    result is consumed immediately so that a server-side failure is raised by
    the statement that caused it. When ``label`` is given, progress is printed
    after each chunk.
    """
    total = len(rows)
    for start in range(0, total, batch_size):
        session.run(cypher, rows=rows[start:start + batch_size]).consume()
        if label:
            print(f"Processed {min(start + batch_size, total)} of {total} {label}")


# Clean everything
session.run("MATCH (n) DETACH DELETE n").consume()
try:
    # With failIfMissing=false the procedure returns no row when the graph is absent.
    dropped = session.run(
        "CALL gds.graph.drop('spotify_graph', false) YIELD graphName RETURN graphName"
    ).data()
    if dropped:
        print("Dropped existing graph projection")
    else:
        print("No existing graph projection to drop")
except Exception as e:
    print(f"No existing graph projection to drop or error: {e}")

# Create Artist nodes from dataframe (one node per distinct name)
artists = df_pairs['artistname'].unique()
_run_in_batches("UNWIND $rows AS name CREATE (:Artist {name: name})", artists.tolist())

# Create Track nodes directly from dataframe (one node per distinct name)
tracks = df_pairs['trackname'].unique()
_run_in_batches("UNWIND $rows AS name CREATE (:Track {name: name})", tracks.tolist())

# Create indexes so the relationship MATCHes below can look nodes up by name,
# then wait (up to 300 s) for them to come online.
session.run("CREATE INDEX artist_name IF NOT EXISTS FOR (a:Artist) ON (a.name)").consume()
session.run("CREATE INDEX track_name IF NOT EXISTS FOR (t:Track) ON (t.name)").consume()
session.run("CREATE INDEX playlist_name IF NOT EXISTS FOR (p:Playlist) ON (p.name)").consume()
session.run("CALL db.awaitIndexes(300)").consume()

# Create PERFORMED relationships in batches (one UNWIND statement per batch)
performed_rows = [
    {"artist": artist, "track": track, "weight": float(play_count)}
    for artist, track, play_count in zip(
        df_pairs['artistname'], df_pairs['trackname'], df_pairs['play_count']
    )
]
_run_in_batches(
    """
    UNWIND $rows AS row
    MATCH (a:Artist {name: row.artist}), (t:Track {name: row.track})
    CREATE (a)-[:PERFORMED {weight: row.weight}]->(t)
    """,
    performed_rows,
    label="relationships",
)

# Add playlist data
# a. Create playlist nodes
playlist_names = df['playlistname'].unique()
loaded_playlists = playlist_names[:1000]  # Limit to first 1000 playlists for performance
_run_in_batches("UNWIND $rows AS name CREATE (:Playlist {name: name})", loaded_playlists.tolist())

# b. Create INCLUDED_IN relationships (Track to Playlist)
# Only pairs whose track AND playlist were loaded as nodes can ever match, so
# the frame is filtered first instead of sending every pair in the dataset.
playlist_data = df.loc[
    df['playlistname'].isin(loaded_playlists) & df['trackname'].isin(tracks),
    ['trackname', 'playlistname'],
].drop_duplicates()
included_rows = playlist_data.rename(
    columns={'trackname': 'track', 'playlistname': 'playlist'}
).to_dict('records')
_run_in_batches(
    """
    UNWIND $rows AS row
    MATCH (t:Track {name: row.track}), (p:Playlist {name: row.playlist})
    CREATE (t)-[:INCLUDED_IN]->(p)
    """,
    included_rows,
    label="playlist relationships",
)

# c. Create graph projection for algorithms
# SIMILAR_TO is deliberately not projected: no such relationship exists right
# after the load, and projecting a relationship type unknown to the database
# is rejected by GDS. INCLUDED_IN gets a default weight of 1.0 so that
# algorithms run with relationshipWeightProperty 'weight' find the property on
# every projected relationship type.
print("Creating graph projection...")
try:
    projection = session.run("""
    CALL gds.graph.project(
        'spotify_graph',
        ['Artist', 'Track', 'Playlist'],
        {
            PERFORMED: {
                orientation: 'NATURAL',
                properties: {
                    weight: {
                        property: 'weight',
                        defaultValue: 1.0
                    }
                }
            },
            INCLUDED_IN: {
                orientation: 'NATURAL',
                properties: {
                    weight: {
                        property: 'weight',
                        defaultValue: 1.0
                    }
                }
            }
        }
    )
    YIELD graphName, nodeCount, relationshipCount
    RETURN graphName, nodeCount, relationshipCount
    """).single()
    print("Graph projection created successfully")
    print(f"  Nodes: {projection['nodeCount']}, Relationships: {projection['relationshipCount']}")
except Exception as e:
    print(f"Error creating graph projection: {e}")

In [None]:
# Spot-check the load: fetch up to ten artists, tracks and PERFORMED pairs.
result_artists = session.run("MATCH (a:Artist) RETURN a.name LIMIT 10")
result_tracks = session.run("MATCH (t:Track) RETURN t.name LIMIT 10")
result_relationships = session.run("MATCH (a:Artist)-[:PERFORMED]->(t:Track) RETURN a.name, t.name LIMIT 10")

print("Artists in Neo4j:")
artist_names = [row["a.name"] for row in result_artists]
for artist_name in artist_names:
    print(artist_name)

print("\nTracks in Neo4j:")
track_names = [row["t.name"] for row in result_tracks]
for track_name in track_names:
    print(track_name)

print("\nRelationships (Artist -[:PERFORMED]-> Track):")
performed_pairs = [(row['a.name'], row['t.name']) for row in result_relationships]
for performer, title in performed_pairs:
    print(f"Artist: {performer} performed Track: {title}")


### Run Graph Algorithms

In [None]:
# Every write below is consumed immediately so that a failure is raised by the
# statement that caused it instead of surfacing on a later query.

# 1. PageRank
session.run("""
CALL gds.pageRank.write('spotify_graph', {
    writeProperty: 'pagerank',
    relationshipWeightProperty: 'weight',
    maxIterations: 20,
    dampingFactor: 0.85
})
""").consume()

# 2. Degree Centrality
session.run("""
CALL gds.degree.write('spotify_graph', {
    writeProperty: 'degree',
    relationshipWeightProperty: 'weight'
})
""").consume()

# 3. Community Detection with Louvain
session.run("""
CALL gds.louvain.write('spotify_graph', {
    writeProperty: 'community',
    relationshipWeightProperty: 'weight'
})
""").consume()

# 4. Create artist similarity network
session.run("MATCH ()-[r:SIMILAR_TO]->() DELETE r").consume()  # Clear existing

# Two artists are linked when both have a PERFORMED edge to the same Track
# node; the weight is the number of such shared Track nodes. Matching the
# shared-track pattern directly avoids building every Artist x Artist pair
# first. a1.name < a2.name keeps one row per unordered pair.
session.run("""
MATCH (a1:Artist)-[:PERFORMED]->(t1:Track)<-[:PERFORMED]-(a2:Artist)
WHERE a1.name < a2.name
WITH a1, a2, COUNT(t1) AS common_tracks
MERGE (a1)-[r:SIMILAR_TO]-(a2)
SET r.weight = common_tracks
""").consume()

# Verify algorithms ran successfully
count_result = session.run("""
MATCH (n)
WHERE n.pagerank IS NOT NULL AND n.degree IS NOT NULL AND n.community IS NOT NULL
RETURN count(n) AS count
""").single()["count"]

# A directed pattern counts each relationship once; the undirected form
# matches every relationship from both ends and would double the total.
similarity_count = session.run("MATCH ()-[r:SIMILAR_TO]->() RETURN COUNT(r) AS count").single()["count"]

print(f"Successfully ran algorithms on {count_result} nodes and created {similarity_count} similarity relationships")

In [None]:
# Create an artist-only projection for recommendations.
# Any projection left over from an earlier run is dropped first
# (failIfMissing=false makes this a no-op when none exists); projecting under
# a name that is already taken would fail.
session.run("CALL gds.graph.drop('artist_graph', false)").consume()

session.run("""
CALL gds.graph.project(
    'artist_graph',
    'Artist',
    {
        SIMILAR_TO: {
            orientation: 'UNDIRECTED',
            properties: {
                weight: {
                    property: 'weight',
                    defaultValue: 1.0
                }
            }
        }
    }
);
""").consume()

# Run PageRank on the artist graph; consuming the result surfaces any failure here.
session.run("""
CALL gds.pageRank.write('artist_graph', {
    writeProperty: 'artist_pagerank',
    relationshipWeightProperty: 'weight'
})
""").consume()

### Query Results

In [None]:
# 1. Top nodes by PageRank
pagerank_result = session.run("""
MATCH (n)
WHERE n.pagerank IS NOT NULL
RETURN labels(n)[0] AS type, n.name AS name, n.pagerank AS score
ORDER BY score DESC
LIMIT 10
""")

print("Top 10 Influential Nodes by PageRank:")
for record in pagerank_result:
    print(f"{record['type']}: {record['name']} (Score: {record['score']:.6f})")

# 2. Top nodes by Degree Centrality
degree_result = session.run("""
MATCH (n)
WHERE n.degree IS NOT NULL
RETURN labels(n)[0] AS type, n.name AS name, n.degree AS score
ORDER BY score DESC
LIMIT 10
""")

print("\nTop 10 Nodes by Degree Centrality:")
for record in degree_result:
    print(f"{record['type']}: {record['name']} (Score: {record['score']:.2f})")

# 3. Community detection results: the five largest communities with up to
# three member names each.
community_result = session.run("""
MATCH (n)
WHERE n.community IS NOT NULL
WITH n.community AS community, collect(n.name) AS members
RETURN community, size(members) AS size, members[0..3] AS sample_members
ORDER BY size DESC
LIMIT 5
""")

print("\nMusic Communities Detected:")
for record in community_result:
    print(f"Community {record['community']} (Size: {record['size']}): {record['sample_members']}")

# 4. Similar artist pairs
# The relationship is matched in its stored direction so each pair is returned
# once; an undirected match yields every pair twice (A-B and B-A), which would
# leave only five distinct pairs in a "top 10".
similar_result = session.run("""
MATCH (a1:Artist)-[r:SIMILAR_TO]->(a2:Artist)
RETURN a1.name AS artist1, a2.name AS artist2, r.weight AS commonTracks
ORDER BY commonTracks DESC
LIMIT 10
""")

print("\nTop 10 Similar Artist Pairs:")
for record in similar_result:
    print(f"{record['artist1']} and {record['artist2']} share {record['commonTracks']} tracks")

### Artist Recommendations

In [None]:
# Get top artists by track count (the three artists with the most PERFORMED edges)
top_artists = session.run("""
MATCH (a:Artist)-[:PERFORMED]->(t)
WITH a, COUNT(t) AS track_count
RETURN a.name AS name
ORDER BY track_count DESC
LIMIT 3
""")

top_artist_names = [record["name"] for record in top_artists]

print("Recommendations for Top Artists:")
for artist in top_artist_names:
    print(f"\nFor fans of {artist}:")

    # Find similar artists based on tracks they've performed: other artists
    # with a PERFORMED edge to the same Track node, ranked by shared count.
    similar = session.run("""
    MATCH (a:Artist {name: $artist})-[:PERFORMED]->(t:Track)<-[:PERFORMED]-(other:Artist)
    WHERE a <> other
    RETURN other.name AS name, COUNT(t) AS common_tracks
    ORDER BY common_tracks DESC
    LIMIT 3
    """, artist=artist)

    similar_results = list(similar)
    if similar_results:
        print("Similar artists:")
        for record in similar_results:
            print(f"- {record['name']} (Common tracks: {record['common_tracks']})")
    else:
        print("No similar artists found")

    # Recommend tracks from artists in the same community that the artist has
    # not performed. The exclusion is written as a plain pattern predicate
    # (NOT (a)-[...]->(t)), which does not rely on the exists(pattern)
    # function form.
    tracks = session.run("""
    MATCH (a:Artist {name: $artist})
    MATCH (other:Artist)
    WHERE other.community = a.community AND other <> a
    MATCH (other)-[:PERFORMED]->(t:Track)
    WHERE NOT (a)-[:PERFORMED]->(t)
    RETURN t.name AS track, other.name AS artist
    LIMIT 3
    """, artist=artist)

    track_results = list(tracks)
    if track_results:
        print("Recommended tracks:")
        for record in track_results:
            print(f"- {record['track']} by {record['artist']}")
    else:
        print("No track recommendations found")

### Summary

In [None]:
# Per-label statistics over every node that carries a pagerank value.
summary = session.run("""
MATCH (n)
WHERE n.pagerank IS NOT NULL
WITH 
    labels(n)[0] AS type,
    count(n) AS count,
    avg(n.pagerank) AS avg_pagerank,
    max(n.pagerank) AS max_pagerank,
    min(n.pagerank) AS min_pagerank,
    avg(n.degree) AS avg_degree,
    max(n.degree) AS max_degree,
    min(n.degree) AS min_degree
RETURN type, count, avg_pagerank, max_pagerank, min_pagerank, avg_degree, max_degree, min_degree
ORDER BY type
""")

print("Graph Analysis Summary:")
for stats in summary:
    report_lines = [
        f"Node type: {stats['type']}",
        f"  Count: {stats['count']}",
        f"  PageRank: Avg={stats['avg_pagerank']:.6f}, Max={stats['max_pagerank']:.6f}, Min={stats['min_pagerank']:.6f}",
        f"  Degree: Avg={stats['avg_degree']:.2f}, Max={stats['max_degree']:.2f}, Min={stats['min_degree']:.2f}",
    ]
    for report_line in report_lines:
        print(report_line)

# Number of distinct community ids written by community detection.
community_stats = session.run("""
MATCH (n)
WHERE n.community IS NOT NULL
RETURN count(DISTINCT n.community) AS communityCount
""")

community_total = community_stats.single()['communityCount']
print(f"\nNumber of communities: {community_total}")

# The session and driver are left open here; the closing calls stay disabled.
#session.close()
#driver.close()
print("Neo4j graph analysis complete!")

## Debugging

In [None]:
# Investigation Cell - Neo4j Graph Diagnostics

print("============ NEO4J GRAPH INVESTIGATION ============")

# 1. List every in-memory GDS projection with its size and schema.
print("\n1. CHECKING GRAPH PROJECTIONS:")
projection_info = session.run("""
CALL gds.graph.list()
YIELD graphName, nodeCount, relationshipCount, schema
RETURN graphName, nodeCount, relationshipCount, schema
""")

projections = [entry for entry in projection_info]
if not projections:
    print("No graph projections found.")
for entry in projections:
    print(f"Graph: {entry['graphName']}")
    print(f"  Nodes: {entry['nodeCount']}")
    print(f"  Relationships: {entry['relationshipCount']}")
    print(f"  Schema: {entry['schema']}")

# 2. Count stored nodes per first label and relationships per type.
print("\n2. CHECKING NODE AND RELATIONSHIP COUNTS:")
node_counts = session.run("""
MATCH (n)
RETURN labels(n)[0] AS type, count(n) AS count
""")

for node_row in node_counts:
    print(f"{node_row['type']}s: {node_row['count']}")

rel_counts = session.run("""
MATCH ()-[r]->()
RETURN type(r) AS type, count(r) AS count
""")

for rel_row in rel_counts:
    print(f"{rel_row['type']} relationships: {rel_row['count']}")

# 3. Summarize the weight property across PERFORMED relationships.
print("\n3. CHECKING RELATIONSHIP WEIGHT DISTRIBUTION:")
weight_stats = session.run("""
MATCH ()-[r:PERFORMED]->()
RETURN 
    count(r) AS total,
    avg(r.weight) AS avg_weight,
    min(r.weight) AS min_weight,
    max(r.weight) AS max_weight
""")

for weight_row in weight_stats:
    print(f"PERFORMED relationships: {weight_row['total']}")
    print(f"  Average weight: {weight_row['avg_weight']}")
    print(f"  Min weight: {weight_row['min_weight']}")
    print(f"  Max weight: {weight_row['max_weight']}")

# 4. Check connectivity between artists (how many share tracks)
# a1.name < a2.name keeps one row per unordered pair; with only a1 <> a2 every
# pair would be counted twice, once in each order.
print("\n4. CHECKING ARTIST CONNECTIVITY:")
artist_connectivity = session.run("""
MATCH (a1:Artist)-[:PERFORMED]->(t:Track)<-[:PERFORMED]-(a2:Artist)
WHERE a1.name < a2.name
RETURN count(DISTINCT [a1, a2]) AS artist_pairs
""")

for record in artist_connectivity:
    print(f"Artist pairs that share tracks: {record['artist_pairs']}")

# 5. Verify SIMILAR_TO relationships
# The directed pattern visits each stored relationship once, so `total` is the
# real relationship count (an undirected pattern would report twice as many).
print("\n5. CHECKING SIMILAR_TO RELATIONSHIPS:")
similar_stats = session.run("""
MATCH ()-[r:SIMILAR_TO]->()
RETURN
    count(r) AS total,
    avg(r.weight) AS avg_weight,
    min(r.weight) AS min_weight,
    max(r.weight) AS max_weight
""")

for record in similar_stats:
    print(f"SIMILAR_TO relationships: {record['total']}")
    if record['total'] > 0:
        print(f"  Average weight: {record['avg_weight']}")
        print(f"  Min weight: {record['min_weight']}")
        print(f"  Max weight: {record['max_weight']}")
# 6. Test creating a new graph projection
print("\n6. TESTING NEW GRAPH PROJECTION:")
# failIfMissing=false already makes the drop a no-op when the graph is absent,
# so any exception here is a real problem and is reported instead of hidden.
try:
    session.run("CALL gds.graph.drop('test_graph', false)").consume()
except Exception as e:
    print(f"Error dropping test projection: {e}")

try:
    new_projection = session.run("""
    CALL gds.graph.project(
        'test_graph',
        ['Artist', 'Track'],
        {
            PERFORMED: {
                orientation: 'NATURAL',
                properties: {
                    weight: {
                        property: 'weight',
                        defaultValue: 1.0
                    }
                }
            }
        }
    )
    YIELD graphName, nodeCount, relationshipCount
    RETURN graphName, nodeCount, relationshipCount
    """)

    record = new_projection.single()
    print(f"Created test projection: {record['graphName']}")
    print(f"  Nodes: {record['nodeCount']}")
    print(f"  Relationships: {record['relationshipCount']}")
except Exception as e:
    print(f"Error creating test projection: {e}")

# 7. Test artist-only projection
print("\n7. TESTING ARTIST-ONLY PROJECTION:")
try:
    session.run("CALL gds.graph.drop('artist_graph', false)").consume()
except Exception as e:
    print(f"Error dropping artist projection: {e}")

# First create SIMILAR_TO relationships if they don't exist. MERGE on the
# undirected pattern reuses a relationship stored in either direction, so
# re-running only refreshes the weight.
try:
    session.run("""
    MATCH (a1:Artist)-[:PERFORMED]->(t:Track)<-[:PERFORMED]-(a2:Artist)
    WHERE a1 <> a2 AND id(a1) < id(a2)
    WITH a1, a2, COUNT(t) AS commonTracks
    WHERE commonTracks > 0
    MERGE (a1)-[r:SIMILAR_TO]-(a2)
    SET r.weight = commonTracks
    """).consume()
except Exception as e:
    print(f"Error creating SIMILAR_TO relationships: {e}")

try:
    artist_projection = session.run("""
    CALL gds.graph.project(
        'artist_graph',
        'Artist',
        {
            SIMILAR_TO: {
                orientation: 'UNDIRECTED',
                properties: {
                    weight: {
                        property: 'weight',
                        defaultValue: 1.0
                    }
                }
            }
        }
    )
    YIELD graphName, nodeCount, relationshipCount
    RETURN graphName, nodeCount, relationshipCount
    """)

    record = artist_projection.single()
    print(f"Created artist projection: {record['graphName']}")
    print(f"  Nodes: {record['nodeCount']}")
    print(f"  Relationships: {record['relationshipCount']}")
except Exception as e:
    print(f"Error creating artist projection: {e}")

# 8. Write PageRank scores for the artist projection and show the top five.
print("\n8. TESTING PAGERANK ON ARTIST GRAPH:")
try:
    session.run("""
    CALL gds.pageRank.write('artist_graph', {
        writeProperty: 'artist_pagerank',
        relationshipWeightProperty: 'weight',
        maxIterations: 20,
        dampingFactor: 0.85
    })
    """)

    pagerank_result = session.run("""
    MATCH (a:Artist)
    WHERE a.artist_pagerank IS NOT NULL
    RETURN a.name AS name, a.artist_pagerank AS score
    ORDER BY score DESC
    LIMIT 5
    """)

    print("Top 5 artists by PageRank:")
    for ranked in pagerank_result:
        artist_name, artist_score = ranked['name'], ranked['score']
        print(f"  {artist_name}: {artist_score:.6f}")
except Exception as e:
    print(f"Error running PageRank on artist graph: {e}")

# 9. Try alternative community detection
print("\n9. TESTING LABEL PROPAGATION:")
try:
    session.run("""
    CALL gds.labelPropagation.write('artist_graph', {
        writeProperty: 'artist_community'
    })
    """).consume()

    community_result = session.run("""
    MATCH (a:Artist)
    WHERE a.artist_community IS NOT NULL
    RETURN a.artist_community AS community, count(*) AS size
    ORDER BY size DESC
    LIMIT 5
    """)
    # A driver result can be iterated only once, and it is needed twice below,
    # so the rows are materialized into a list first.
    community_rows = list(community_result)

    print("Top 5 artist communities:")
    for record in community_rows:
        print(f"  Community {record['community']}: {record['size']} artists")

    # Get sample members
    for record in community_rows:
        community_id = record['community']
        sample = session.run("""
        MATCH (a:Artist)
        WHERE a.artist_community = $community
        RETURN a.name AS name
        LIMIT 3
        """, community=community_id)

        names = [r['name'] for r in sample]
        print(f"  Sample from community {community_id}: {names}")
except Exception as e:
    print(f"Error running Label Propagation on artist graph: {e}")

# 10. Show the first ten entries returned by gds.list().
print("\n10. CHECKING AVAILABLE ALGORITHMS:")
try:
    algo_list = session.run("""
    CALL gds.list()
    YIELD name, description
    RETURN name, description
    LIMIT 10
    """)

    print("Sample of available algorithms:")
    for algo in algo_list:
        algo_name, algo_description = algo['name'], algo['description']
        print(f"  {algo_name}: {algo_description}")
except Exception as e:
    print(f"Error listing algorithms: {e}")

print("\n============ INVESTIGATION COMPLETE ============")
