# Preparazione del grafo

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import community as community_louvain  
import ast
import random
import tarfile
import json

# Load the raw collaboration edge list and the artist (node) table
edges = pd.read_csv('dataset/spoti/edges.csv')
nodes = pd.read_csv('dataset/spoti/nodes2.csv')
# One row per artist: when a spotify_id is repeated, the first occurrence
# wins (the default of drop_duplicates).
nodes_unique = nodes.drop_duplicates(subset='spotify_id')


In [None]:
# ========== SELECTION PARAMETERS ==========
# Notebook-wide configuration constants.
# TOPLIST and SELECTION_CRITERION are used by the export cell to build the
# output file name; presumably they are also the `n` and `criterion` passed
# to select_top_nodes() — TODO confirm, no such call is visible here.
TOPLIST = 2000
SELECTION_CRITERION = "degree"
# Presumably the nodes-table column holding Spotify popularity (matches the
# default of select_top_nodes' popularity_field) — TODO confirm.
POPULARITY_FIELD = "popularity"
COUNTRY_FILTER = "Italy"  # Options: "Italy", "France", "Germany", "Spain", "United Kingdom", None (all countries)
SEED = 42

# Seed both NumPy's and the stdlib's global random generators
np.random.seed(SEED)
random.seed(SEED)

In [None]:
def is_from_country(nationality, country_filter=None):
    """
    Tell whether an artist's nationality matches the requested country.

    Parameters:
    - nationality: the artist's country as text (e.g. "Italy", "United States");
      may be missing (None / NaN)
    - country_filter: country name to match (e.g. "Italy", "France"), or None
      to disable filtering

    Returns:
    - True when no filter is set, or when the two names are equal after
      lower-casing and trimming surrounding whitespace; False otherwise
      (including when the nationality is missing)
    """
    # No filter requested: every artist is accepted
    if country_filter is None:
        return True

    # A missing nationality can never match a concrete country
    if pd.isna(nationality):
        return False

    # Compare case-insensitively, ignoring leading/trailing whitespace
    actual, wanted = (
        str(value).lower().strip() for value in (nationality, country_filter)
    )
    return actual == wanted


In [None]:
def parse_genres(genres_field):
    """
    Convert a genres cell into a Python list.

    Parameters:
    - genres_field: either an actual list (returned as-is), a string holding
      the literal representation of a list (e.g. "['rock', 'pop']"), or a
      missing / unsupported value (None, NaN, numbers, ...)

    Returns:
    - The list of genres; an empty list when the value is missing, cannot be
      parsed, or parses to something that is not a list
    """
    # Check for a real list first: pd.isna() on a list returns an element-wise
    # array, whose truth value is ambiguous (raises for 2+ elements).
    if isinstance(genres_field, list):
        return genres_field

    # None, NaN and any other non-string value carry no genre information
    if not isinstance(genres_field, str):
        return []

    try:
        parsed = ast.literal_eval(genres_field)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Documented failure modes of literal_eval on malformed input
        return []

    return parsed if isinstance(parsed, list) else []



In [None]:
# ========== FUNZIONE DI SELEZIONE TOP NODI ==========
def select_top_nodes(graph, nodes_df, n=100, criterion="degree", popularity_field="popularity"):
    """
    Select the top n nodes of a graph according to a ranking criterion.

    Parameters:
    - graph: NetworkX graph
    - nodes_df: dataframe with the artists' attributes; must contain a
      'spotify_id' column (only read when criterion == "popularity")
    - n: number of nodes to select
    - criterion: ranking metric ("degree", "popularity", "betweenness",
      "closeness", "eigenvector", "random")
    - popularity_field: name of the popularity column in nodes_df

    Returns:
    - List of the selected nodes, best first (shuffled order for "random")

    Raises:
    - ValueError: if criterion is not one of the supported options
    """

    if criterion == "random":
        # Random selection: shuffle every node and keep the first n
        all_nodes = list(graph.nodes())
        random.shuffle(all_nodes)
        top_nodes = all_nodes[:n]

        # SEED is a notebook-level constant that is only echoed in the log;
        # look it up defensively so a missing global cannot abort the selection.
        seed = globals().get("SEED")

        print(f"\n[SELEZIONE TOP {n}]")
        print(f"Criterio: {criterion} (seed={seed})")
        print(f"Primi 5 artisti selezionati casualmente:")
        for i, node in enumerate(top_nodes[:5], 1):
            name = graph.nodes[node].get('name', node)
            print(f"  {i}. {name}")

        return top_nodes

    if criterion == "degree":
        # Degree centrality: number of collaborations
        metric = dict(graph.degree())
        description = "numero di collaborazioni"

    elif criterion == "popularity":
        # Spotify popularity. Build the id -> popularity lookup once instead
        # of scanning the dataframe for every node; on duplicated ids the
        # first row wins. Nodes missing from the table get 0.
        popularity_by_id = (
            nodes_df.drop_duplicates(subset='spotify_id', keep='first')
            .set_index('spotify_id')[popularity_field]
            .to_dict()
        )
        metric = {node: popularity_by_id.get(node, 0) for node in graph.nodes()}
        description = "popolarità Spotify"

    elif criterion == "betweenness":
        # Betweenness centrality: brokers between communities
        metric = nx.betweenness_centrality(graph)
        description = "centralità di intermediazione"

    elif criterion == "closeness":
        # Closeness centrality: average distance from the other nodes
        metric = nx.closeness_centrality(graph)
        description = "centralità di vicinanza"

    elif criterion == "eigenvector":
        # Eigenvector centrality: being connected to important nodes
        try:
            metric = nx.eigenvector_centrality(graph, max_iter=1000)
            description = "centralità di autovettore"
        except nx.NetworkXException:
            # Covers non-convergence and other NetworkX failures, without
            # swallowing unrelated errors such as KeyboardInterrupt.
            print("⚠️ Eigenvector centrality non convergente, uso degree")
            metric = dict(graph.degree())
            description = "grado (fallback)"

    else:
        raise ValueError(f"Criterio '{criterion}' non valido. Opzioni: degree, popularity, betweenness, closeness, eigenvector, random")

    # Rank by metric value, highest first, and keep the first n
    sorted_nodes = sorted(metric.items(), key=lambda item: item[1], reverse=True)
    top_nodes = [node for node, _ in sorted_nodes[:n]]

    print(f"\n[SELEZIONE TOP {n}]")
    print(f"Criterio: {criterion} ({description})")
    print(f"Top 5 artisti selezionati:")
    for i, (node, value) in enumerate(sorted_nodes[:5], 1):
        name = graph.nodes[node].get('name', node)
        print(f"  {i}. {name} (valore: {value:.3f})")

    return top_nodes

In [None]:
# ========== NATIONALITY FILTER ==========
# Keep only the artists whose nationality matches COUNTRY_FILTER
# (every artist is kept when COUNTRY_FILTER is None).
country_mask = nodes_unique['nationality'].apply(
    lambda nat: is_from_country(nat, COUNTRY_FILTER)
)
# .copy() makes nodes_country an independent DataFrame: later cells assign
# new columns to it, which on a filtered slice of nodes_unique triggers
# pandas' SettingWithCopyWarning.
nodes_country = nodes_unique[country_mask].copy()

print(f"✓ Artisti filtrati per nazionalità: {len(nodes_country)}")
if COUNTRY_FILTER:
    print(f"  Paese: {COUNTRY_FILTER}")
else:
    print(f"  Nessun filtro per nazionalità (globale)")

# Convert genres from string to list
def parse_genres(genres_str):
    """
    Convert a genres cell into a Python list.

    Parameters:
    - genres_str: either an actual list (returned as-is), a string holding
      the literal representation of a list (e.g. "['rock', 'pop']"), or a
      missing / unsupported value (None, NaN, numbers, ...)

    Returns:
    - The list of genres; an empty list when the value is missing, cannot be
      parsed, or parses to something that is not a list
    """
    # Check for a real list first: pd.isna() on a list returns an element-wise
    # array, whose truth value is ambiguous (raises for 2+ elements).
    if isinstance(genres_str, list):
        return genres_str

    # None, NaN and any other non-string value carry no genre information
    if not isinstance(genres_str, str):
        return []

    try:
        parsed = ast.literal_eval(genres_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Documented failure modes of literal_eval on malformed input
        return []

    # Callers take len() of the result to count genres, so anything that is
    # not a list (e.g. a bare string literal) must not leak through.
    return parsed if isinstance(parsed, list) else []

# Turn the raw 'genres' column into Python lists and count the artists that
# have at least one genre
nodes_country['genres'] = nodes_country['genres'].apply(parse_genres)
genres_with_data = sum(nodes_country['genres'].apply(len) > 0)
print(f"✓ Generi convertiti: {genres_with_data}/{len(nodes_country)} artisti con genere definito")

# Spotify ids of the artists that passed the nationality filter
country_ids = set(nodes_country['spotify_id'])
print(f"✓ Artisti {COUNTRY_FILTER if COUNTRY_FILTER else 'globali'} da analizzare: {len(country_ids)}")

# Keep only the collaborations whose two endpoints are both selected artists
print(f"\nFiltraggio collaborazioni...")
src_selected = edges['id_0'].isin(country_ids)
dst_selected = edges['id_1'].isin(country_ids)
tt = edges[src_selected & dst_selected]
print(f"✓ Collaborazioni interne: {len(tt)}")

# Build the undirected graph from the edge list; only artists that appear in
# at least one retained collaboration become nodes
print(f"\nCreazione grafo...")
G_country = nx.Graph()
G_country.add_edges_from(tt[['id_0', 'id_1']].values)

# Attach the node table's columns to the graph nodes as attributes
attr_dict = nodes_country.set_index('spotify_id').to_dict('index')
nx.set_node_attributes(G_country, attr_dict)

print(f"\n{'='*60}")
print(f"GRAFO {'DEL PAESE: ' + COUNTRY_FILTER.upper() if COUNTRY_FILTER else 'GLOBALE'}")
print(f"{'='*60}")
print(f"Nodi (artisti): {G_country.number_of_nodes()}")
print(f"Archi (collaborazioni): {G_country.number_of_edges()}")

# Density and average degree are only reported for a non-empty graph
n_nodes = G_country.number_of_nodes()
if n_nodes > 0:
    density = nx.density(G_country)
    avg_degree = 2 * G_country.number_of_edges() / n_nodes
    print(f"Densità: {density:.4f}")
    print(f"Grado medio: {avg_degree:.2f}")

# Show the nationality breakdown of the filtered artists (10 most frequent)
print(f"\n{'='*60}")
print(f"DISTRIBUZIONE NAZIONALITÀ NEGLI ARTISTI")
print(f"{'='*60}")
nationality_dist = nodes_country['nationality'].value_counts()
print(nationality_dist.head(10))

In [None]:
import ast
import pandas as pd

def parse_macro_genres(mg):
    """
    Convert a macro-genres cell into a Python list.

    Parameters:
    - mg: either an actual list (returned as-is), a string holding the
      literal representation of a list, or a missing / unsupported value
      (None, NaN, numbers, ...)

    Returns:
    - The list of macro-genres; an empty list when the value is missing,
      cannot be parsed, or parses to something that is not a list
    """
    # Check for a real list first: pd.isna() on a list returns an element-wise
    # array, whose truth value is ambiguous (raises for 2+ elements). This
    # happens when the conversion is applied to an already-converted column.
    if isinstance(mg, list):
        return mg

    # None, NaN and any other non-string value carry no information
    if not isinstance(mg, str):
        return []

    try:
        parsed = ast.literal_eval(mg)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Documented failure modes of literal_eval on malformed input
        return []

    # Callers take len() of / iterate the result as a list of tags
    return parsed if isinstance(parsed, list) else []

# Parse the raw 'macro_genres' column into Python lists
nodes_country['macro_genres'] = nodes_country['macro_genres'].apply(parse_macro_genres)

# Count the artists that have at least one macro-genre
has_macro = nodes_country['macro_genres'].apply(len) > 0
with_macro = sum(has_macro)
print(f"✓ Macro-generi convertiti: {with_macro}/{len(nodes_country)} artisti")

# Re-attach the node table to the graph so that the nodes carry the parsed
# macro-genre lists
attr_dict = nodes_country.set_index('spotify_id').to_dict('index')
nx.set_node_attributes(G_country, attr_dict)


# 1. General Network Analysis

## 1.1 Network Density
Analysis of the overall density of the artist collaboration network, highlighting how sparse or dense the connections are.

## 1.2 Mean Distance
Computation of the average shortest path length to evaluate how easily artists are reachable within the network.

## 1.3 Weighted Overall Clustering
Evaluation of the weighted clustering coefficient to measure the tendency of artists to form tightly connected collaboration groups.



In [None]:
### TODO

# 2. Node-Level Analysis

## 2.1 Centrality Measures
Analysis of node centrality to investigate the role of artists within the network and their relationship with popularity and followers.

**Research questions:**
- Does a higher number of collaborations (featuring) correspond to higher popularity?
- Can centrality metrics help identify emerging artists?

### 2.1.1 Degree Centrality
Measures the number of direct collaborations of each artist.

### 2.1.2 Eigenvector Centrality
Evaluates the influence of an artist by considering both direct collaborations and the importance of connected artists.

### 2.1.3 Closeness Centrality
Measures how close an artist is to all others in the network in terms of shortest paths.

### 2.1.4 Betweenness Centrality
Identifies artists acting as bridges between different parts of the network.


## 2.2 Assortativity Analysis
Study of collaboration patterns among artists based on structural and semantic attributes.

### 2.2.1 Assortativity by Degree
Analysis of whether artists tend to collaborate with others having a similar number of collaborations.

### 2.2.2 Assortativity by Followers
Evaluation of whether artists tend to collaborate with others having a similar number of followers.

### 2.2.3 Assortativity by Genre (Modularity)
Investigation of whether collaborations are influenced by musical genres, using modularity-based measures.


In [None]:
### TODO

# 3. Network-Level Structural Analysis

## 3.1 Community Detection
Analysis aimed at verifying whether artists tend to collaborate mainly with others belonging to the same musical genre.

### 3.1.1 Louvain Method
Detection of communities using the Louvain algorithm and evaluation of genre homogeneity.

### 3.1.2 Comparison with Genre Modularity
Cross-analysis between detected communities and genre-based assortativity results (Section 2.2.3).

### 3.1.3 Edge Betweenness (Girvan–Newman)
Community detection based on edge betweenness centrality to identify inter-genre bridging collaborations.


## 3.2 Degree Distribution
Analysis of the degree distribution of the collaboration network to highlight the presence of hubs and long-tailed behavior.

In [None]:
import community as community_louvain
from collections import Counter
import numpy as np
from networkx.algorithms.community import girvan_newman

print("\nEsecuzione Louvain community detection...")
# Louvain is randomised: passing the notebook-wide SEED explicitly makes the
# partition (and therefore the community ids printed below) reproducible,
# independently of how much of the global random state earlier cells used.
partition_louvain = community_louvain.best_partition(G_country, random_state=SEED)

# Store each node's community id on the graph as a node attribute
nx.set_node_attributes(G_country, partition_louvain, "community_louvain")

n_communities = len(set(partition_louvain.values()))
print(f"✓ Community trovate (Louvain): {n_communities}")


print("\nAnalisi omogeneità macro-generi per community (Louvain)...")

# community id -> flat list of the macro-genre tags of all its members
community_macro_stats = {}

for node, comm in partition_louvain.items():
    macro_genres = G_country.nodes[node].get("macro_genres", [])
    community_macro_stats.setdefault(comm, []).extend(macro_genres)

for comm, macros in community_macro_stats.items():
    # Communities whose members carry no macro-genre tag are not reported
    if macros:
        most_common = Counter(macros).most_common(3)
        # Share of the community's tags (not of its artists) taken by the
        # most frequent macro-genre
        purity = most_common[0][1] / len(macros)

        print(f"Community {comm}:")
        print(f"  Macro-generi principali: {most_common}")
        print(f"  Purezza macro-genere dominante: {purity:.2f}")



print("\nEsecuzione Edge Betweenness (Girvan–Newman)...")

# Only the first split produced by the Girvan–Newman generator is used
gn = girvan_newman(G_country)
first_split = next(gn)

communities_eb = list(map(list, first_split))
print(f"✓ Community trovate (Edge Betweenness): {len(communities_eb)}")

print("\nAnalisi macro-generi per community (Edge Betweenness)...")

for i, nodes_comm in enumerate(communities_eb):
    # Flat list of the macro-genre tags of every member of this community
    macros = [
        genre
        for node in nodes_comm
        for genre in G_country.nodes[node].get("macro_genres", [])
    ]

    # Communities without any macro-genre tag are not reported
    if not macros:
        continue

    most_common = Counter(macros).most_common(3)
    # Share of the community's tags taken by the most frequent macro-genre
    purity = most_common[0][1] / len(macros)

    print(f"Community EB {i}:")
    print(f"  Macro-generi principali: {most_common}")
    print(f"  Purezza macro-genere dominante: {purity:.2f}")

print("\nComputing Degree Distribution...")

degrees = [deg for _, deg in G_country.degree()]

if degrees:
    # Computed once and reused for the printed summary and both plots
    mean_degree = np.mean(degrees)

    print("\nDegree Distribution:")
    print(f"Minimum degree: {min(degrees)}")
    print(f"Maximum degree: {max(degrees)}")
    print(f"Average degree: {mean_degree:.2f}")

    # Same histogram drawn twice: linear y-axis first, then log-scaled y-axis
    histogram_variants = (
        (False, "Probability density", "Degree Distribution – Artist Collaborations"),
        (True, "Probability density (log scale)", "Degree Distribution (Log Scale)"),
    )
    for log_scale, y_label, title in histogram_variants:
        plt.figure(figsize=(9,5))
        plt.hist(degrees, bins=50, edgecolor='black', log=log_scale, density=True)
        plt.axvline(mean_degree, linestyle='--', linewidth=1, label='Average degree')
        plt.xlabel("Degree")
        plt.ylabel(y_label)
        plt.title(title)
        plt.grid(alpha=0.3)
        plt.legend()
        plt.show()
else:
    # min()/max() raise on an empty sequence, so an empty graph is reported
    # instead of crashing the cell
    print("\nDegree distribution skipped: the graph has no nodes.")



# Export della rete per analisi su Gephi

In [None]:
import os

# --- Prepare the network for export ---
# NOTE(review): G_top is not defined in any cell visible in this notebook;
# presumably it is the subgraph of the top-ranked artists — confirm that it
# is built before this cell runs.
# Container attributes (list, dict, tuple) are replaced by their string
# representation before writing the GEXF file.
for node_id, attrs in G_top.nodes(data=True):
    container_keys = [
        key for key, val in attrs.items() if isinstance(val, (list, dict, tuple))
    ]
    for key in container_keys:
        G_top.nodes[node_id][key] = str(attrs[key])

# Build the output directory and a file name that encodes the selection
# settings (country, criterion, top-N)
country = COUNTRY_FILTER if COUNTRY_FILTER else "all"
crit = SELECTION_CRITERION
topn = TOPLIST
export_dir = "exports_gexf"
os.makedirs(export_dir, exist_ok=True)
filename = f"{country}_{crit}_top{topn}.gexf"
filepath = os.path.join(export_dir, filename)

# Write the graph to disk in GEXF format
nx.write_gexf(G_top, filepath)

print(f"\n✓ Esportazione completata!")
print(f"  File GEXF salvato in: {filepath}")
print(f"  → Aprilo in Gephi per analisi/visualizzazione.")
