# Preparazione del grafo

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import community as community_louvain  
import ast
import random
import tarfile
import json

# Carica dati
edges = pd.read_csv('dataset/spoti/edges.csv')
nodes = pd.read_csv('dataset/spoti/nodes2.csv')
nodes_unique = nodes.drop_duplicates(subset=['spotify_id'], keep='first')


  nodes = pd.read_csv('dataset/spoti/nodes2.csv')


In [3]:
# ========== PARAMETRI DI SELEZIONE ==========
TOPLIST = 2000
SELECTION_CRITERION = "degree"
POPULARITY_FIELD = "popularity"
COUNTRY_FILTER = "Italy"  # Opzioni: "Italy", "France", "Germany", "Spain", "United Kingdom", None (per tutti)
SEED = 42

np.random.seed(SEED)
random.seed(SEED)

In [4]:
def is_from_country(nationality, country_filter=None):
    """
    Verifica se un artista appartiene a un paese specifico basandosi sulla nazionalità.
    
    Parametri:
    - nationality: stringa con il paese dell'artista (es. "Italy", "United States")
    - country_filter: nome del paese da filtrare (es. "Italy", "France") o None per nessun filtro
    
    Returns:
    - Boolean
    """
    if country_filter is None:
        return True
    
    if pd.isna(nationality):
        return False
    
    # Normalizza per il confronto (case-insensitive)
    nationality_lower = str(nationality).lower().strip()
    country_filter_lower = str(country_filter).lower().strip()
    
    return nationality_lower == country_filter_lower


In [5]:
def parse_genres(genres_field):
    """Converte stringa rappresentante lista in lista Python"""
    if pd.isna(genres_field):
        return []
    if isinstance(genres_field, str):
        try:
            parsed = ast.literal_eval(genres_field)
            return parsed if isinstance(parsed, list) else []
        except:
            return []
    if isinstance(genres_field, list):
        return genres_field
    return []



In [6]:
# ========== FUNZIONE DI SELEZIONE TOP NODI ==========
def select_top_nodes(graph, nodes_df, n=100, criterion="degree", popularity_field="popularity"):
    """
    Seleziona i top n nodi secondo diversi criteri di centralità.
    
    Parametri:
    - graph: grafo NetworkX
    - nodes_df: dataframe con attributi degli artisti
    - n: numero di nodi da selezionare
    - criterion: metrica di selezione ("degree", "popularity", "betweenness", "closeness", "eigenvector", "random")
    - popularity_field: nome colonna popolarità
    
    Returns:
    - Lista di nodi selezionati
    """
    
    if criterion == "degree":
        # Centralità di grado: numero di collaborazioni
        metric = dict(graph.degree())
        description = "numero di collaborazioni"
        
    elif criterion == "popularity":
        # Popolarità su Spotify
        metric = {}
        for node in graph.nodes():
            if node in nodes_df['spotify_id'].values:
                pop = nodes_df.loc[nodes_df['spotify_id'] == node, popularity_field].values
                metric[node] = pop[0] if len(pop) > 0 else 0
            else:
                metric[node] = 0
        description = "popolarità Spotify"
        
    elif criterion == "betweenness":
        # Centralità di intermediazione: broker tra comunità
        metric = nx.betweenness_centrality(graph)
        description = "centralità di intermediazione"
        
    elif criterion == "closeness":
        # Centralità di vicinanza: distanza media dagli altri
        metric = nx.closeness_centrality(graph)
        description = "centralità di vicinanza"
        
    elif criterion == "eigenvector":
        # Centralità di autovettore: connessione a nodi importanti
        try:
            metric = nx.eigenvector_centrality(graph, max_iter=1000)
            description = "centralità di autovettore"
        except:
            print("⚠️ Eigenvector centrality non convergente, uso degree")
            metric = dict(graph.degree())
            description = "grado (fallback)"
            
    elif criterion == "random":
        # Selezione casuale
        all_nodes = list(graph.nodes())
        random.shuffle(all_nodes)
        top_nodes = all_nodes[:n]
        
        print(f"\n[SELEZIONE TOP {n}]")
        print(f"Criterio: {criterion} (seed={SEED})")
        print(f"Primi 5 artisti selezionati casualmente:")
        for i, node in enumerate(top_nodes[:5], 1):
            name = graph.nodes[node].get('name', node)
            print(f"  {i}. {name}")
        
        return top_nodes
    else:
        raise ValueError(f"Criterio '{criterion}' non valido. Opzioni: degree, popularity, betweenness, closeness, eigenvector, random")
    
    # Ordina e seleziona top n (non per random, già gestito sopra)
    sorted_nodes = sorted(metric.items(), key=lambda x: x[1], reverse=True)
    top_nodes = [n for n, v in sorted_nodes[:n]]
    
    print(f"\n[SELEZIONE TOP {n}]")
    print(f"Criterio: {criterion} ({description})")
    print(f"Top 5 artisti selezionati:")
    for i, (node, value) in enumerate(sorted_nodes[:5], 1):
        name = graph.nodes[node].get('name', node)
        print(f"  {i}. {name} (valore: {value:.3f})")
    
    return top_nodes

In [7]:
# ========== FILTRO PER NAZIONALITÀ ==========
# Filtra artisti per paese
nodes_country = nodes_unique[
    nodes_unique['nationality'].apply(lambda nat: is_from_country(nat, COUNTRY_FILTER))
]

print(f"✓ Artisti filtrati per nazionalità: {len(nodes_country)}")
if COUNTRY_FILTER:
    print(f"  Paese: {COUNTRY_FILTER}")
else:
    print(f"  Nessun filtro per nazionalità (globale)")

# Converti generi da stringa a lista
def parse_genres(genres_str):
    """Converte stringa di generi in lista"""
    if pd.isna(genres_str):
        return []
    if isinstance(genres_str, list):
        return genres_str
    try:
        return ast.literal_eval(genres_str)
    except:
        return []

nodes_country['genres'] = nodes_country['genres'].apply(parse_genres)
genres_with_data = sum(nodes_country['genres'].apply(len) > 0)
print(f"✓ Generi convertiti: {genres_with_data}/{len(nodes_country)} artisti con genere definito")

# Estrai IDs degli artisti del paese selezionato
country_ids = set(nodes_country['spotify_id'])
print(f"✓ Artisti {COUNTRY_FILTER if COUNTRY_FILTER else 'globali'} da analizzare: {len(country_ids)}")

# Filtra solo collaborazioni tra artisti dello stesso paese
print(f"\nFiltraggio collaborazioni...")
tt = edges[edges['id_0'].isin(country_ids) & edges['id_1'].isin(country_ids)]
print(f"✓ Collaborazioni interne: {len(tt)}")

# Creazione grafo completo
print(f"\nCreazione grafo...")
G_country = nx.Graph()
G_country.add_edges_from(tt[['id_0', 'id_1']].values)

# Aggiungi attributi dei nodi (nome, nazionalità, generi, ecc.)
attr_dict = nodes_country.set_index('spotify_id').to_dict('index')
nx.set_node_attributes(G_country, attr_dict)

print(f"\n{'='*60}")
print(f"GRAFO {'DEL PAESE: ' + COUNTRY_FILTER.upper() if COUNTRY_FILTER else 'GLOBALE'}")
print(f"{'='*60}")
print(f"Nodi (artisti): {G_country.number_of_nodes()}")
print(f"Archi (collaborazioni): {G_country.number_of_edges()}")

if G_country.number_of_nodes() > 0:
    density = nx.density(G_country)
    avg_degree = 2 * G_country.number_of_edges() / G_country.number_of_nodes()
    print(f"Densità: {density:.4f}")
    print(f"Grado medio: {avg_degree:.2f}")

# Visualizza info nazionalità nel grafo
print(f"\n{'='*60}")
print(f"DISTRIBUZIONE NAZIONALITÀ NEGLI ARTISTI")
print(f"{'='*60}")
nationality_dist = nodes_country['nationality'].value_counts()
print(nationality_dist.head(10))

✓ Artisti filtrati per nazionalità: 2707
  Paese: Italy
✓ Generi convertiti: 1403/2707 artisti con genere definito
✓ Artisti Italy da analizzare: 2707

Filtraggio collaborazioni...
✓ Collaborazioni interne: 4307

Creazione grafo...

GRAFO DEL PAESE: ITALY
Nodi (artisti): 1656
Archi (collaborazioni): 4307
Densità: 0.0031
Grado medio: 5.20

DISTRIBUZIONE NAZIONALITÀ NEGLI ARTISTI
nationality
Italy    2707
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes_country['genres'] = nodes_country['genres'].apply(parse_genres)


In [8]:
import ast
import pandas as pd

def parse_macro_genres(mg):
    if pd.isna(mg):
        return []
    if isinstance(mg, list):
        return mg
    try:
        return ast.literal_eval(mg)
    except:
        return []

nodes_country['macro_genres'] = nodes_country['macro_genres'].apply(parse_macro_genres)

with_macro = sum(nodes_country['macro_genres'].apply(len) > 0)
print(f"✓ Macro-generi convertiti: {with_macro}/{len(nodes_country)} artisti")
attr_dict = nodes_country.set_index('spotify_id').to_dict('index')
nx.set_node_attributes(G_country, attr_dict)


✓ Macro-generi convertiti: 2609/2707 artisti


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes_country['macro_genres'] = nodes_country['macro_genres'].apply(parse_macro_genres)


# 1. General Network Analysis

## 1.1 Network Density
Analysis of the overall density of the artist collaboration network, highlighting how sparse or dense the connections are.

## 1.2 Mean Distance
Computation of the average shortest path length to evaluate how easily artists are reachable within the network.

## 1.3 Weighted Overall Clustering
Evaluation of the weighted clustering coefficient to measure the tendency of artists to form tightly connected collaboration groups.



In [None]:
### TODO

# 2. Node-Level Analysis

## 2.1 Centrality Measures
Analysis of node centrality to investigate the role of artists within the network and their relationship with popularity and followers.

**Research questions:**
- Does a higher number of collaborations (featuring) correspond to higher popularity?
- Can centrality metrics help identify emerging artists?

### 2.1.1 Degree Centrality
Measures the number of direct collaborations of each artist.

In [35]:
def calculate_degree_centrality(graph):
    print("CALCOLO DEGREE CENTRALITY")
    print("="*50)
    
    # Degree Centrality
    degree_centrality = nx.degree_centrality(graph)
    
    # Degree standard (numero assoluto)
    degree_dict = dict(graph.degree())
    
    print(f"Calcolata per {len(degree_centrality)} nodi")
    
    # Top 10 artisti per grado
    top_degree = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]
    print("\nTop 10 artisti per numero di collaborazioni:")
    for i, (node, deg) in enumerate(top_degree, 1):
        node_name = graph.nodes[node].get('name', node)[:30]
        print(f"  {i:2d}. {node_name:<30} → {deg} collaborazioni")
    
    return degree_centrality, degree_dict

# Chiamata della funzione
degree_centrality, degree_dict = calculate_degree_centrality(G_country)

CALCOLO DEGREE CENTRALITY
Calcolata per 1656 nodi

Top 10 artisti per numero di collaborazioni:
   1. Guè                            → 114 collaborazioni
   2. Andrea Bocelli                 → 103 collaborazioni
   3. Clementino                     → 84 collaborazioni
   4. Gemitaiz                       → 81 collaborazioni
   5. Night Skinny                   → 80 collaborazioni
   6. Don Joe                        → 79 collaborazioni
   7. Inoki                          → 73 collaborazioni
   8. Fabri Fibra                    → 72 collaborazioni
   9. Emis Killa                     → 71 collaborazioni
  10. Ennio Morricone                → 68 collaborazioni


### 2.1.2 Eigenvector Centrality
Evaluates the influence of an artist by considering both direct collaborations and the importance of connected artists.

In [34]:
def calculate_eigenvector_centrality(graph):
    print("CALCOLO EIGENVECTOR CENTRALITY")
    print("="*50)
    
    try:
        eigenvector_centrality = nx.eigenvector_centrality(
            graph, 
            max_iter=1000, 
            tol=1e-6
        )
        method = "eigenvector"
    except nx.PowerIterationFailedConvergence:
        print("Eigenvector non converge, uso PageRank")
        eigenvector_centrality = nx.pagerank(graph)
        method = "pagerank (fallback)"
    
    print(f"\nCalcolata con metodo: {method}")
    print(f"Calcolata per {len(eigenvector_centrality)} nodi")
    
    # Top 10 artisti per eigenvector centrality
    top_eigen = sorted(eigenvector_centrality.items(), 
                      key=lambda x: x[1], 
                      reverse=True)[:10]
    
    print("\nTop 10 artisti per centralità di autovettore:")
    for i, (node, eigen_val) in enumerate(top_eigen, 1):
        node_name = graph.nodes[node].get('name', node)[:30]
        print(f"  {i:2d}. {node_name:<30} → {eigen_val:.4f}")
    
    return eigenvector_centrality

# Chiamata della funzione
eigenvector_centrality = calculate_eigenvector_centrality(G_country)

CALCOLO EIGENVECTOR CENTRALITY

Calcolata con metodo: eigenvector
Calcolata per 1656 nodi

Top 10 artisti per centralità di autovettore:
   1. Guè                            → 0.2573
   2. Gemitaiz                       → 0.2069
   3. Emis Killa                     → 0.1904
   4. Night Skinny                   → 0.1903
   5. Fabri Fibra                    → 0.1717
   6. Marracash                      → 0.1677
   7. Jake La Furia                  → 0.1664
   8. Don Joe                        → 0.1650
   9. MadMan                         → 0.1533
  10. Lazza                          → 0.1425


### 2.1.3 Closeness Centrality
Measures how close an artist is to all others in the network in terms of shortest paths.

In [33]:
def calculate_closeness_centrality(graph):
    print("CALCOLO CLOSENESS CENTRALITY")
    print("="*50)
    
    closeness_centrality = nx.closeness_centrality(graph)
    
    print(f"Calcolata per {len(closeness_centrality)} nodi")
    
    # Calcola anche distanze medie (ℓi)
    mean_distances = {}
    for node in graph.nodes():
        if graph.nodes():
            # Calcola percorso più breve per tutti i nodi raggiungibili
            path_lengths = []
            for target in graph.nodes():
                if node != target:
                    try:
                        length = nx.shortest_path_length(graph, source=node, target=target)
                        path_lengths.append(length)
                    except nx.NetworkXNoPath:
                        continue
            if path_lengths:
                mean_distances[node] = np.mean(path_lengths)
    
    # Top 10 artisti per closeness centrality
    top_closeness = sorted(closeness_centrality.items(), 
                          key=lambda x: x[1], 
                          reverse=True)[:10]
    
    print("\nTop 10 artisti per centralità di vicinanza:")
    for i, (node, close_val) in enumerate(top_closeness, 1):
        node_name = graph.nodes[node].get('name', node)[:30]
        dist = mean_distances.get(node, np.nan)
        dist_str = f"{dist:.2f}" if not np.isnan(dist) else "N/A"
        print(f"  {i:2d}. {node_name:<30} → {close_val:.4f} (dist. media: {dist_str})")
    
    return closeness_centrality, mean_distances

# Chiamata della funzione
closeness_centrality, mean_distances = calculate_closeness_centrality(G_country)

CALCOLO CLOSENESS CENTRALITY
Calcolata per 1656 nodi

Top 10 artisti per centralità di vicinanza:
   1. Guè                            → 0.3677 (dist. media: 2.65)
   2. Clementino                     → 0.3576 (dist. media: 2.72)
   3. Gemitaiz                       → 0.3537 (dist. media: 2.75)
   4. Fabri Fibra                    → 0.3500 (dist. media: 2.78)
   5. J-AX                           → 0.3487 (dist. media: 2.79)
   6. Night Skinny                   → 0.3480 (dist. media: 2.80)
   7. Marracash                      → 0.3466 (dist. media: 2.81)
   8. Emis Killa                     → 0.3464 (dist. media: 2.81)
   9. Elisa                          → 0.3447 (dist. media: 2.82)
  10. Rocco Hunt                     → 0.3420 (dist. media: 2.85)


### 2.1.4 Betweenness Centrality
Identifies artists acting as bridges between different parts of the network.

In [32]:
def calculate_betweenness_centrality(graph, k=None):
    print("CALCOLO BETWEENNESS CENTRALITY")
    print("="*50)
    
    # Usa k nodi campione se il grafo è grande
    if k is None:
        k = min(1000, graph.number_of_nodes())
    
    betweenness_centrality = nx.betweenness_centrality(graph, k=k)
    
    print(f"Calcolata per {len(betweenness_centrality)} nodi (k={k})")
    
    # Top 10 artisti per betweenness centrality
    top_between = sorted(betweenness_centrality.items(), 
                        key=lambda x: x[1], 
                        reverse=True)[:10]
    
    print("\nTop 10 artisti per centralità di intermediazione:")
    for i, (node, between_val) in enumerate(top_between, 1):
        node_name = graph.nodes[node].get('name', node)[:30]
        print(f"  {i:2d}. {node_name:<30} → {between_val:.4f}")
    
    return betweenness_centrality

# Chiamata della funzione
betweenness_centrality = calculate_betweenness_centrality(G_country, k=1000)

CALCOLO BETWEENNESS CENTRALITY
Calcolata per 1656 nodi (k=1000)

Top 10 artisti per centralità di intermediazione:
   1. Andrea Bocelli                 → 0.1250
   2. Clementino                     → 0.0910
   3. Guè                            → 0.0642
   4. Elisa                          → 0.0611
   5. Ennio Morricone                → 0.0575
   6. DJ Matrix                      → 0.0558
   7. J-AX                           → 0.0538
   8. Inoki                          → 0.0509
   9. Jovanotti                      → 0.0452
  10. Cristina D'Avena               → 0.0422


## 2.2 Assortativity Analysis
Study of collaboration patterns among artists based on structural and semantic attributes.

### 2.2.1 Assortativity by Degree
Analysis of whether artists tend to collaborate with others having a similar number of collaborations.

In [30]:
def calculate_degree_assortativity(graph):
    print("ASSORTATIVITY BY DEGREE")
    print("="*50)
    
    # Coefficiente di assortatività
    assort_coeff = nx.degree_assortativity_coefficient(graph)
    
    # Matrice di mixing
    degree_mixing = nx.degree_mixing_matrix(graph)
    
    print(f"Coefficiente di assortatività: {assort_coeff:.4f}")
    
    # Interpretazione
    if assort_coeff > 0.1:
        print("Interpretazione: Rete ASSORTATIVA")
        print("→ I nodi ad alto grado tendono a connettersi con altri nodi ad alto grado")
    elif assort_coeff < -0.1:
        print("Interpretazione: Rete DISASSORTATIVA")
        print("→ I nodi ad alto grado tendono a connettersi con nodi a basso grado")
    else:
        print("Interpretazione: Rete NEUTRA o CASUALE")

    return assort_coeff, degree_mixing

# Chiamata della funzione
degree_assortativity, degree_mixing_matrix = calculate_degree_assortativity(G_country)

ASSORTATIVITY BY DEGREE
Coefficiente di assortatività: -0.1052
Interpretazione: Rete DISASSORTATIVA
→ I nodi ad alto grado tendono a connettersi con nodi a basso grado


### 2.2.2 Assortativity by Followers
Evaluation of collaboration preferences based on artist popularity.

In [28]:
def calculate_followers_assortativity(graph, nodes_df):
    print("ASSORTATIVITY BY FOLLOWERS")
    print("="*50)
    
    # Verifica presenza colonna followers
    if 'followers' not in nodes_df.columns:
        print("Colonna 'followers' non trovata nel DataFrame")
        return None
    
    # Prepara dizionario followers
    followers_dict = {}
    missing_count = 0
    
    for node in graph.nodes():
        if node in nodes_df['spotify_id'].values:
            followers_val = nodes_df.loc[nodes_df['spotify_id'] == node, 'followers'].iloc[0]
            if pd.isna(followers_val):
                followers_dict[node] = 0
                missing_count += 1
            else:
                followers_dict[node] = float(followers_val)
        else:
            followers_dict[node] = 0
            missing_count += 1
    
    if missing_count > 0:
        print(f"{missing_count} nodi senza dati followers (impostati a 0)")
    
    # Calcola assortatività numerica
    try:
        assort_coeff = nx.numeric_assortativity_coefficient(graph, 'followers')
        print(f"Coefficiente di assortatività per followers: {assort_coeff:.4f}")
    except:
        # Calcolo manuale
        print("Calcolo assortatività manuale...")
        edges_data = []
        for u, v in graph.edges():
            fol_u = followers_dict.get(u, 0)
            fol_v = followers_dict.get(v, 0)
            edges_data.append((fol_u, fol_v))
        
        if len(edges_data) > 1:
            fol_u_list, fol_v_list = zip(*edges_data)
            corr_matrix = np.corrcoef(fol_u_list, fol_v_list)
            assort_coeff = corr_matrix[0, 1]
            print(f"Coefficiente (correlazione) per followers: {assort_coeff:.4f}")
        else:
            assort_coeff = 0
            print("Non ci sono abbastanza archi per calcolare la correlazione")
    
    
    if assort_coeff is not None:
        edges_data = []
        for u, v in graph.edges():
            fol_u = followers_dict.get(u, 0)
            fol_v = followers_dict.get(v, 0)
            if fol_u > 0 and fol_v > 0:  # Filtra zeri
                edges_data.append((fol_u, fol_v))
        
        if edges_data:
            fol_u_list, fol_v_list = zip(*edges_data)
            
    
    # Analisi dettagliata
    print("\nANALISI DETTAGLIATA COLLABORAZIONI PER FOLLOWERS:")
    print("-"*50)
    
    if edges_data:
        # Categorizza archi
        high_high = 0
        high_low = 0
        low_low = 0
        
        # Definisci soglia (mediana)
        all_followers = [f for f in followers_dict.values() if f > 0]
        if all_followers:
            threshold = np.median(all_followers)
            
            for fol_u, fol_v in edges_data:
                if fol_u >= threshold and fol_v >= threshold:
                    high_high += 1
                elif fol_u < threshold and fol_v < threshold:
                    low_low += 1
                else:
                    high_low += 1
            
            total = high_high + high_low + low_low
            if total > 0:
                print(f"  • Archi High-High: {high_high}/{total} ({high_high/total*100:.1f}%)")
                print(f"  • Archi High-Low:  {high_low}/{total} ({high_low/total*100:.1f}%)")
                print(f"  • Archi Low-Low:   {low_low}/{total} ({low_low/total*100:.1f}%)")
                print(f"  • Soglia followers: {threshold:.0f}")
    
    return assort_coeff

# Chiamata della funzione
followers_assortativity = calculate_followers_assortativity(G_country, nodes_country)

ASSORTATIVITY BY FOLLOWERS
Coefficiente di assortatività per followers: 0.0724

ANALISI DETTAGLIATA COLLABORAZIONI PER FOLLOWERS:
--------------------------------------------------
  • Archi High-High: 3160/4290 (73.7%)
  • Archi High-Low:  1105/4290 (25.8%)
  • Archi Low-Low:   25/4290 (0.6%)
  • Soglia followers: 5122


### 2.2.3 Assortativity by Genre (Modularity)
Investigation of whether collaborations are influenced by musical genres, using modularity-based measures.

In [29]:
def calculate_genre_assortativity(graph, nodes_df):
    print("ASSORTATIVITY BY GENRE (MODULARITY)")
    print("="*50)
    
    if 'macro_genres' not in nodes_df.columns:
        print("Colonna 'macro_genres' non trovata")
        return None, None, None
    
    def parse_macro_genres_fixed(mg):
        try:
            # Se è già una lista
            if isinstance(mg, list):
                return mg
            
            # Se è NaN
            if pd.isna(mg):
                return []
            
            # Se è stringa
            mg_str = str(mg).strip()
            
            # Se è lista vuota
            if mg_str in ['[]', '']:
                return []
            
            # Prova a parsare come lista Python
            if mg_str.startswith('[') and mg_str.endswith(']'):
                import ast
                try:
                    parsed = ast.literal_eval(mg_str)
                    if isinstance(parsed, list):
                        return parsed
                except:
                    pass
            
            # Prova split semplice
            # Rimuovi parentesi quadre
            mg_str = mg_str.strip('[]')
            # Split per virgola
            parts = [p.strip().strip("'\"") for p in mg_str.split(',')]
            # Filtra elementi vuoti
            return [p for p in parts if p]
            
        except Exception as e:
            print(f"Errore parsing generi '{mg}': {e}")
            return []
    
    # Prepara attributi genere
    genre_dict = {}
    genre_main_dict = {}
    
    for node in graph.nodes():
        # Cerca il nodo nel DataFrame
        matches = nodes_df[nodes_df['spotify_id'] == node]
        
        if not matches.empty:
            # Prendi il PRIMO valore (non la Series)
            genres_value = matches['macro_genres'].iloc[0]
            genres_list = parse_macro_genres_fixed(genres_value)
            
            if genres_list:
                genre_dict[node] = genres_list
                # Prendi il primo genere come principale
                genre_main_dict[node] = genres_list[0]
            else:
                genre_dict[node] = []
                genre_main_dict[node] = "Unknown"
        else:
            genre_dict[node] = []
            genre_main_dict[node] = "Unknown"
    
    # DEBUG: Stampa alcuni esempi
    print(f"Nodi processati: {len(genre_main_dict)}")
    print(f"Nodi con genere: {sum(1 for g in genre_main_dict.values() if g != 'Unknown')}")
    
    # Esempi di parsing
    sample_nodes = list(graph.nodes())[:3]
    print("Esempi di parsing generi:")
    for node in sample_nodes:
        genres = genre_dict.get(node, [])
        main = genre_main_dict.get(node, "Unknown")
        node_name = graph.nodes[node].get('name', node)[:20]
        print(f"  {node_name}: {main} (tutti: {genres})")
    
    # Calcola assortatività categorica
    try:
        # Aggiungi attributo al grafo
        nx.set_node_attributes(graph, genre_main_dict, 'main_genre')
        
        assort_coeff = nx.attribute_assortativity_coefficient(graph, 'main_genre')
        print(f"Coefficiente di assortatività per genere: {assort_coeff:.4f}")
    except Exception as e:
        print(f"Errore nel calcolo assortatività: {e}")
        assort_coeff = None
    
    # Calcola Modularità
    print("\nCALCOLO MODULARITÀ BASATA SU GENERI:")
    print("-"*40)
    
    try:
        # Crea partizione per generi
        genre_communities = {}
        for node, main_genre in genre_main_dict.items():
            if main_genre not in genre_communities:
                genre_communities[main_genre] = set()
            genre_communities[main_genre].add(node)
        
        # Converti in lista di set
        partition = [c for c in genre_communities.values() if len(c) > 0]
        
        # Calcola modularità
        from networkx.algorithms.community import modularity
        Q = modularity(graph, partition)
        print(f"Modularità (Q): {Q:.4f}")
        
        # Interpretazione modularità
        if Q > 0.3:
            print("→ Forte struttura modulare: i generi influenzano molto le collaborazioni")
        elif Q > 0.1:
            print("→ Moderata struttura modulare")
        elif Q > 0:
            print("→ Debole struttura modulare")
        else:
            print("→ Nessuna struttura modulare evidente")
    
    except Exception as e:
        print(f"Errore nel calcolo modularità: {e}")
        Q = None
    
    # Analisi pattern mixing
    print("\nANALISI PATTERN DI COLLABORAZIONE PER GENERI:")
    print("-"*50)
    
    # Conta archi intra-genere vs inter-genere
    intra_genre_edges = 0
    inter_genre_edges = 0
    genre_pair_counts = {}
    
    for u, v in graph.edges():
        u_genres = genre_dict.get(u, [])
        v_genres = genre_dict.get(v, [])
        
        if u_genres and v_genres:
            # Se condividono almeno un genere
            if set(u_genres) & set(v_genres):
                intra_genre_edges += 1
            else:
                inter_genre_edges += 1
            
            # Conta coppie di generi principali
            u_main = genre_main_dict.get(u, "Unknown")
            v_main = genre_main_dict.get(v, "Unknown")
            
            pair = tuple(sorted([u_main, v_main]))
            genre_pair_counts[pair] = genre_pair_counts.get(pair, 0) + 1
        else:
            # Almeno uno dei due non ha genere
            inter_genre_edges += 1
    
    total_edges = intra_genre_edges + inter_genre_edges
    if total_edges > 0:
        intra_prop = intra_genre_edges / total_edges
        print(f"Archi intra-genere: {intra_genre_edges}/{total_edges} ({intra_prop:.1%})")
        print(f"Archi inter-genere: {inter_genre_edges}/{total_edges} ({1-intra_prop:.1%})")
        
        # Top coppie di generi
        print("TOP 10 COPPIE DI GENERI NELLE COLLABORAZIONI:")
        sorted_pairs = sorted(genre_pair_counts.items(), 
                             key=lambda x: x[1], 
                             reverse=True)[:10]
        
        for i, (pair, count) in enumerate(sorted_pairs, 1):
            if pair[0] == pair[1]:
                print(f"  {i:2d}. {pair[0]:<25} → {count} (collaborazioni interne)")
            else:
                print(f"  {i:2d}. {pair[0]:<15} + {pair[1]:<10} → {count}")
    
    return assort_coeff, Q, genre_pair_counts

# Chiamata della funzione
genre_assortativity, genre_modularity, genre_pairs = calculate_genre_assortativity(G_country, nodes_country)

ASSORTATIVITY BY GENRE (MODULARITY)
Nodi processati: 1656
Nodi con genere: 1594
Esempi di parsing generi:
  Sick Luke: Hip Hop / Rap (tutti: ['Hip Hop / Rap'])
  MACHETE: Hip Hop / Rap (tutti: ['Hip Hop / Rap'])
  Giancarlo Giannini: Pop (tutti: ['Pop'])
Coefficiente di assortatività per genere: 0.4778

CALCOLO MODULARITÀ BASATA SU GENERI:
----------------------------------------
Modularità (Q): 0.2987
→ Moderata struttura modulare

ANALISI PATTERN DI COLLABORAZIONE PER GENERI:
--------------------------------------------------
Archi intra-genere: 3466/4307 (80.5%)
Archi inter-genere: 841/4307 (19.5%)
TOP 10 COPPIE DI GENERI NELLE COLLABORAZIONI:
   1. Hip Hop / Rap             → 1829 (collaborazioni interne)
   2. Pop                       → 819 (collaborazioni interne)
   3. Hip Hop / Rap   + Pop        → 596
   4. Hip Hop / Rap   + Indie      → 104
   5. Elettronica / Dance       → 85 (collaborazioni interne)
   6. Classica / Orchestrale    → 72 (collaborazioni interne)
   7. Elettr

In [39]:
def export_network_results_txt(graph, nodes_df, output_file):

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("NETWORK ANALYSIS RESULTS\n")
        f.write("=" * 80 + "\n\n")

        # ==================================================
        # 1. INFO GENERALI
        # ==================================================
        f.write("1. INFORMAZIONI GENERALI\n")
        f.write("-" * 60 + "\n")
        f.write(f"Numero di nodi (artisti): {graph.number_of_nodes()}\n")
        f.write(f"Numero di archi (collaborazioni): {graph.number_of_edges()}\n\n")

        # ==================================================
        # 2. ASSORTATIVITÀ E MODULARITÀ
        # ==================================================
        f.write("2. ASSORTATIVITÀ E MODULARITÀ\n")
        f.write("-" * 60 + "\n")

        if 'degree_assortativity' in globals():
            f.write(f"Degree assortativity: {degree_assortativity:.4f}\n")

        if 'followers_assortativity' in globals() and followers_assortativity is not None:
            f.write(f"Followers assortativity: {followers_assortativity:.4f}\n")

        if 'genre_assortativity' in globals() and genre_assortativity is not None:
            f.write(f"Genre assortativity: {genre_assortativity:.4f}\n")

        if 'genre_modularity' in globals() and genre_modularity is not None:
            f.write(f"Genre modularity (Q): {genre_modularity:.4f}\n")

        f.write("\n")

        # ==================================================
        # 3. STATISTICHE CENTRALITÀ (GLOBALI)
        # ==================================================
        f.write("3. STATISTICHE DI CENTRALITÀ (GLOBALI)\n")
        f.write("-" * 60 + "\n")

        centralities = {
            "Degree centrality": degree_centrality,
            "Eigenvector centrality": eigenvector_centrality,
            "Closeness centrality": closeness_centrality,
            "Betweenness centrality": betweenness_centrality
        }

        for name, values in centralities.items():
            vals = [v for v in values.values() if v is not None]
            if vals:
                f.write(f"{name}:\n")
                f.write(f"  Media: {np.mean(vals):.4f}\n")
                f.write(f"  Min:   {np.min(vals):.4f}\n")
                f.write(f"  Max:   {np.max(vals):.4f}\n\n")

        # ==================================================
        # 4. TOP 10 ARTISTI PER CENTRALITÀ
        # ==================================================
        def write_top(metric_name, values):
            f.write(f"{metric_name}\n")
            f.write("-" * 40 + "\n")
            top = sorted(values.items(), key=lambda x: x[1], reverse=True)[:10]
            for i, (node, val) in enumerate(top, 1):
                row = nodes_df[nodes_df['spotify_id'] == node]
                name = row.iloc[0]['name'] if not row.empty else 'Unknown'
                f.write(f"{i:2d}. {name[:40]:<40} → {val:.4f}\n")
            f.write("\n")

        f.write("4. TOP 10 ARTISTI PER CENTRALITÀ\n")
        f.write("=" * 60 + "\n\n")

        write_top("Degree centrality", degree_centrality)
        write_top("Eigenvector centrality", eigenvector_centrality)
        write_top("Closeness centrality", closeness_centrality)
        write_top("Betweenness centrality", betweenness_centrality)

        # ==================================================
        # 5. TOP COPPIE DI GENERI (MIXING)
        # ==================================================
        if 'genre_pairs' in globals() and genre_pairs:
            f.write("5. TOP COPPIE DI GENERI NELLE COLLABORAZIONI\n")
            f.write("-" * 60 + "\n")
            top_pairs = sorted(genre_pairs.items(), key=lambda x: x[1], reverse=True)[:10]
            for i, ((g1, g2), count) in enumerate(top_pairs, 1):
                if g1 == g2:
                    f.write(f"{i:2d}. {g1:<30} → {count} (intra-genere)\n")
                else:
                    f.write(f"{i:2d}. {g1:<15} + {g2:<15} → {count}\n")

    print(f"Risultati salvati in: {output_file}")
    
export_network_results_txt(
    G_country,
    nodes_country,
    output_file="results/node-level-analysis.txt"
)


Risultati salvati in: results/node-level-analysis.txt


# 3. Network-Level Structural Analysis

## 3.1 Community Detection
Analysis aimed at verifying whether artists tend to collaborate mainly with others belonging to the same musical genre.

### 3.1.1 Louvain Method
Detection of communities using the Louvain algorithm and evaluation of genre homogeneity.

### 3.1.2 Comparison with Genre Modularity
Cross-analysis between detected communities and genre-based assortativity results (Section 2.2.3).

### 3.1.3 Edge Betweenness (Girvan–Newman)
Community detection based on edge betweenness centrality to identify inter-genre bridging collaborations.


## 3.3 Degree Distribution
Analysis of the degree distribution of the collaboration network to highlight the presence of hubs and long-tailed behavior.

In [None]:
import community as community_louvain
from collections import Counter
import numpy as np
from networkx.algorithms.community import girvan_newman

print("\nEsecuzione Louvain community detection...")
partition_louvain = community_louvain.best_partition(G_country)

nx.set_node_attributes(G_country, partition_louvain, "community_louvain")

n_communities = len(set(partition_louvain.values()))
print(f"✓ Community trovate (Louvain): {n_communities}")


print("\nAnalisi omogeneità macro-generi per community (Louvain)...")

community_macro_stats = {}

for node, comm in partition_louvain.items():
    macro_genres = G_country.nodes[node].get("macro_genres", [])
    community_macro_stats.setdefault(comm, []).extend(macro_genres)

for comm, macros in community_macro_stats.items():
    if macros:
        most_common = Counter(macros).most_common(3)
        purity = most_common[0][1] / len(macros)

        print(f"Community {comm}:")
        print(f"  Macro-generi principali: {most_common}")
        print(f"  Purezza macro-genere dominante: {purity:.2f}")



print("\nEsecuzione Edge Betweenness (Girvan–Newman)...")

gn = girvan_newman(G_country)
first_split = next(gn)

communities_eb = [list(c) for c in first_split]
print(f"✓ Community trovate (Edge Betweenness): {len(communities_eb)}")

print("\nAnalisi macro-generi per community (Edge Betweenness)...")

for i, nodes_comm in enumerate(communities_eb):
    macros = []
    for node in nodes_comm:
        macros.extend(G_country.nodes[node].get("macro_genres", []))

    if macros:
        most_common = Counter(macros).most_common(3)
        purity = most_common[0][1] / len(macros)

        print(f"Community EB {i}:")
        print(f"  Macro-generi principali: {most_common}")
        print(f"  Purezza macro-genere dominante: {purity:.2f}")

print("\nComputing Degree Distribution...")

degrees = [deg for _, deg in G_country.degree()]

print("\nDegree Distribution:")
print(f"Minimum degree: {min(degrees)}")
print(f"Maximum degree: {max(degrees)}")
print(f"Average degree: {np.mean(degrees):.2f}")

# -------------------------
# Linear scale histogram
# -------------------------
plt.figure(figsize=(9,5))
plt.hist(degrees, bins=50, edgecolor='black', density=True)
plt.axvline(np.mean(degrees), linestyle='--', linewidth=1, label='Average degree')
plt.xlabel("Degree")
plt.ylabel("Probability density")
plt.title("Degree Distribution – Artist Collaborations")
plt.grid(alpha=0.3)
plt.legend()
plt.show()

# -------------------------
# Log-scale histogram (y-axis)
# -------------------------
plt.figure(figsize=(9,5))
plt.hist(degrees, bins=50, edgecolor='black', log=True, density=True)
plt.axvline(np.mean(degrees), linestyle='--', linewidth=1, label='Average degree')
plt.xlabel("Degree")
plt.ylabel("Probability density (log scale)")
plt.title("Degree Distribution (Log Scale)")
plt.grid(alpha=0.3)
plt.legend()
plt.show()



# Export della rete per analisi su Gephi

In [None]:
import os

# --- Prepara la rete per export ---
# Converti tipi non supportati (list, dict, tuple) in stringa:
for n, data in G_top.nodes(data=True):
    for k, v in data.items():
        if isinstance(v, (list, dict, tuple)):
            G_top.nodes[n][k] = str(v)

# Prepara nome directory e file:
country = COUNTRY_FILTER if COUNTRY_FILTER else "all"
crit = SELECTION_CRITERION
topn = TOPLIST
export_dir = "exports_gexf"
os.makedirs(export_dir, exist_ok=True)
filename = f"{country}_{crit}_top{topn}.gexf"
filepath = os.path.join(export_dir, filename)

# Exporta il grafo
nx.write_gexf(G_top, filepath)

print(f"\n✓ Esportazione completata!")
print(f"  File GEXF salvato in: {filepath}")
print(f"  → Aprilo in Gephi per analisi/visualizzazione.")
