# Comparing Centrality Metrics

This notebook analyzes and compares different centrality metrics for a citation network.

It loads node and edge data from JSON files, calculates various centrality metrics (like degree, betweenness, closeness etc.), and compares them against ground truth measures like importance and document type.

The notebook defines constants for:
- Input data file paths
- Centrality metrics to analyze 
- Ground truth measures to compare against

It also defines TypedDict classes to type the network statistics and results.

**Note:** This is an analysis notebook. To modify the code that generates the network and calculates centralities, please refer to the Main section.



In [10]:
import pandas as pd
import numpy as np
import networkx as nx
import datetime
import dateutil.parser
import statistics
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import spearmanr
import seaborn as sns
import os
from typing import Dict, List, Union, TypedDict, Tuple


# Types

In [3]:
class NetworkStats(TypedDict):
    """Basic size figures reported for one analysed network."""
    n_nodes: int  # number of nodes reported for the network
    n_edges: int  # number of edges reported for the network

class BestCentralities(TypedDict):
    """Names of the two centrality columns selected for one ground truth."""
    high: str  # centrality chosen as best for high scores
    low: str  # centrality chosen as best for low scores

class AnalysisResults(TypedDict):
    """Structured result returned by ``analyze_network`` for a single network."""
    network_stats: NetworkStats  # node/edge counts
    correlations: Dict[str, Dict[str, Dict[tuple[str, str], float]]]  # ground_truth -> composite_function -> (centrality, ground_truth) -> correlation
    best_centralities: Dict[str, BestCentralities]  # ground_truth -> best centralities
    # NOTE(review): analyze_network fills the innermost mapping from Series.to_dict(),
    # whose keys are the DataFrame index labels - confirm these really are ECLIs.
    composite_rankings: Dict[str, Dict[str, Dict[str, float]]]  # ground_truth -> composite_function -> ecli -> rank
    dataframe: pd.DataFrame  # processed node table including centrality and composite columns

# Utils

In [4]:
# Map doctypebranch labels onto numeric codes
def categorise_total_branch_numerically(branches: pd.Series) -> pd.Series:
    """
    Translate textual branch labels into numeric codes.

    Labels are matched case-insensitively: GRANDCHAMBER -> 1, CHAMBER -> 2,
    COMMITTEE -> 3. Anything else (including missing values) becomes 0, and a
    warning listing the unrecognised labels is printed.

    :param branches: Series of branch labels given as strings.
    :return: Series of numeric branch codes, aligned with the input.
    """
    code_by_branch = {"GRANDCHAMBER": 1, "CHAMBER": 2, "COMMITTEE": 3}

    # Normalise case up front so "Chamber" and "CHAMBER" are treated alike
    normalised = branches.str.upper()

    # Tell the user about labels that have no numeric code
    unmapped = set(normalised.unique()) - set(code_by_branch)
    if unmapped:
        print(f"Warning: Found unmapped values: {unmapped}")

    # map() leaves unknown labels as NaN; those are encoded as 0
    codes = normalised.map(code_by_branch)
    return codes.fillna(0)

def prep_data(df: pd.DataFrame, include: list) -> pd.DataFrame:
    """
    Select the requested columns plus ``ecli`` and drop rows with an uncomputed metric.

    The last entry of ``include`` is treated as the metric column. Rows whose
    value in that column is below -1 (the -2 "uncomputed" sentinel) are removed;
    NaN values fail the ``>= -1`` comparison and are removed as well.

    :param df: The DataFrame to process.
    :param include: Columns to include; the last one is the metric column.
        When empty, only the ``ecli`` column is returned and no filtering is done.
    :return: The selected columns, in the order given (``ecli`` appended if absent).
    """
    # dict.fromkeys removes duplicates while keeping the caller's column order;
    # a set would make the column order depend on string hashing.
    headers = list(dict.fromkeys([*include, 'ecli']))
    data = df[headers]

    if not include:
        # No metric column was named, so there is nothing to filter on.
        return data

    # Filter out rows with uncomputed metric values (-2)
    metric_column = include[-1]
    return data[data[metric_column] >= -1]

# Centrality Calculation

In [5]:
def calculate_disruptions_new(graph: nx.DiGraph) -> dict:
    """
    Calculate the disruption score ``(i - j) / (i + j + k)`` for each node.

    An edge ``u -> v`` is read as "u cites v", so the predecessors of a node
    are its citers and its successors are its references. For a focal node:

    * ``j`` - citers of the node that also cite at least one of its references
    * ``i`` - citers of the node that cite none of its references
    * ``k`` - distinct other nodes that cite at least one of the node's
      references but do not cite the node itself

    Args:
        graph (nx.DiGraph): The input directed graph (must provide
            ``predecessors``/``successors``).

    Returns:
        dict: A dictionary with nodes as keys and their disruption scores as
        values; ``np.nan`` when ``i + j + k`` is zero.
    """
    disruptions = {}
    for node in graph.nodes:
        references = list(graph.successors(node))

        # j: citers that also cite at least one reference of the node
        j = sum(
            1
            for citer in graph.predecessors(node)
            if any(graph.has_edge(citer, reference) for reference in references)
        )

        # i: the remaining citers cite the node only
        i = graph.in_degree(node) - j

        # k: collect into a set so that a node citing several of the references
        # is counted once, not once per reference it cites
        bypassing = {
            citer
            for reference in references
            for citer in graph.predecessors(reference)
            if citer != node and not graph.has_edge(citer, node)
        }
        k = len(bypassing)

        total = i + j + k
        disruptions[node] = (i - j) / total if total else np.nan

    return disruptions


# Compute every centrality measure used in the analysis
def calculate_centrality_measures(graph):
    """
    Compute a fixed set of centrality measures for ``graph``.

    Args:
        graph: The NetworkX graph to analyse. It is also passed to
            ``calculate_disruptions_new``, which uses predecessors/successors.

    Returns:
        dict: Measure name -> {node: value}. Besides the NetworkX measures this
        holds ``relative_in_degree_centrality`` (in-degree centrality divided by
        the node count) and ``disruption``.
    """
    in_degree_centrality = nx.in_degree_centrality(graph)
    node_count = len(graph.nodes)

    # HITS yields hub and authority scores in a single call
    hubs, authorities = nx.hits(graph)

    # NOTE(review): nx.in_degree_centrality is already a normalised value, so
    # dividing by the node count normalises a second time - confirm intended.
    relative_in_degree = {
        node: score / node_count for node, score in in_degree_centrality.items()
    }

    measures = {
        'degree_centrality': nx.degree_centrality(graph),
        'in_degree_centrality': in_degree_centrality,
        'out_degree_centrality': nx.out_degree_centrality(graph),
        'betweenness_centrality': nx.betweenness_centrality(graph, normalized=True),
        'closeness_centrality': nx.closeness_centrality(graph),
        'core_number': nx.core_number(graph),
        'relative_in_degree_centrality': relative_in_degree,
        'eigenvector_centrality': nx.eigenvector_centrality(graph),
        'pagerank': nx.pagerank(graph),
        'hits_hub': hubs,
        'hits_authority': authorities,
        'harmonic_centrality': nx.harmonic_centrality(graph),
        'disruption': calculate_disruptions_new(graph),
    }
    return measures

# Composite Ranking

In [6]:
"""
This script analyzes the relationship between various centrality measures and ground truth scores
for legal cases. It aims to find the best centrality measures for predicting high and low relevance
scores, create a composite ranking, and evaluate its performance against individual centrality measures.

The main steps are:
1. Plot error bars for centrality measures vs. ground truth scores
2. Find the best centrality measures for predicting high and low scores
3. Create a composite ranking using the best measures
4. Calculate correlations between rankings and ground truth scores
5. Visualize and save the results
"""

def plot_error_bars(df, centrality, ground_truth):
    """
    Plot error bars for a given centrality measure against a ground truth score.

    Args:
    df (pd.DataFrame): The dataframe containing the data
    centrality (str): The name of the centrality measure column
    ground_truth (str): The name of the ground truth score column

    For every distinct ground truth score the mean centrality value is drawn on
    the x-axis with a horizontal error bar of one sample standard deviation.
    A score that occurs only once gets an error bar of 0, and rows whose ground
    truth score is missing are skipped.
    """
    # Prepare data (prep_data drops rows whose centrality was not computed)
    data = prep_data(df, [ground_truth, centrality])

    # Group centrality values by ground truth score in a single pass
    values_by_score = {}
    for value, score in zip(data[centrality], data[ground_truth]):
        if pd.isna(score):
            # A missing score cannot be placed on the y-axis
            continue
        values_by_score.setdefault(score, []).append(value)

    categories = sorted(values_by_score)
    means = [statistics.mean(values_by_score[score]) for score in categories]
    # statistics.stdev needs at least two samples; use 0 for singleton groups
    stds = [
        statistics.stdev(values_by_score[score]) if len(values_by_score[score]) > 1 else 0.0
        for score in categories
    ]

    # Draw graph
    plt.figure(figsize=(10, 6))
    title = f"{centrality.capitalize()} vs Average {ground_truth.capitalize()}"
    plt.suptitle(title, fontsize=22)
    plt.xlabel(f"{centrality.capitalize()}", fontsize=22)
    plt.ylabel(f"{ground_truth.capitalize()}", fontsize=22)
    plt.yticks(categories, fontsize=16)

    plt.errorbar(means, categories, xerr=stds, fmt='o')

    plt.show()

def find_best_centralities(df, centralities, ground_truth):
    """
    Find the best centrality measures for predicting high and low ground truth scores.
    TODO: Include considerations for multiple ground truth scores

    Args:
    df (pd.DataFrame): The dataframe containing the data
    centralities (list): List of centrality measure column names
    ground_truth (str): The name of the ground truth score column

    Returns:
    tuple: (best_high, best_low) - the names of the best centrality measures for high and low scores

    Raises:
    ValueError: If fewer than two distinct centrality measures are supplied

    This function calculates the Spearman correlation between each centrality measure and the ground truth,
    using 1 - |correlation| as an error metric. The centrality with the lowest error is chosen as best_high,
    and the second-lowest is chosen as best_low. A centrality whose correlation is undefined (NaN, e.g. a
    constant column or a column containing NaN) is given an infinite error so it is only picked as a last
    resort. Ties keep the order in which the centralities were listed.
    """
    errors = {}

    for centrality in centralities:
        # Calculate correlation across the full range
        corr, _ = stats.spearmanr(df[centrality], df[ground_truth])
        # NaN compares False against everything, which would make the
        # selection depend on list order; rank it explicitly as the worst.
        errors[centrality] = float('inf') if np.isnan(corr) else 1 - abs(corr)

    if len(errors) < 2:
        raise ValueError("At least two distinct centrality measures are required")

    # sorted() is stable, so equal errors keep their original order
    ranked = sorted(errors, key=errors.get)
    best_high, best_low = ranked[0], ranked[1]

    return best_high, best_low

def rank_cases(df, centrality):
    """
    Turn a centrality column into a ranking of the cases.

    Args:
    df (pd.DataFrame): The dataframe containing the data
    centrality (str): The name of the centrality measure column

    Returns:
    pd.Series: The rank of every case, indexed like ``df``

    Ranking is descending: the case with the largest centrality value gets
    rank 1. Tied values share the rank pandas assigns by default.
    """
    scores = df[centrality]
    return scores.rank(ascending=False)

def create_treashold_composite_ranking(df, high_centrality, low_centrality, ground_truth):
    """
    Build a composite score that switches between two centralities at a threshold.

    Both centralities are min-max scaled. For a given threshold, a case takes
    its scaled ``high_centrality`` value when that value exceeds the threshold
    and its scaled ``low_centrality`` value otherwise. 100 evenly spaced
    thresholds in [0, 1] are tried and the first one giving the largest
    absolute Spearman correlation with the ground truth is used.

    Args:
        df: DataFrame containing the centrality measures
        high_centrality: Centrality measure that performs best for high scores
        low_centrality: Centrality measure that performs best for low scores
        ground_truth: Name of the ground truth column

    Returns:
        Array of composite scores (result of ``np.where``), one per row of ``df``
    """
    def _rescale(column):
        # Min-max scaling of one column
        values = df[column]
        lowest = values.min()
        return (values - lowest) / (values.max() - lowest)

    scaled_high = _rescale(high_centrality)
    scaled_low = _rescale(low_centrality)
    target = df[ground_truth]

    def _blend(cutoff):
        # Above the cutoff trust the "high" measure, otherwise the "low" one
        return np.where(scaled_high > cutoff, scaled_high, scaled_low)

    # Scan candidate cutoffs, keeping the first one with the strongest correlation
    optimal_threshold, best_correlation = 0, -1
    for candidate in np.linspace(0, 1, 100):
        strength = abs(spearmanr(_blend(candidate), target)[0])
        if strength > best_correlation:
            best_correlation, optimal_threshold = strength, candidate

    print(f"Optimal threshold found: {optimal_threshold:.3f}")
    return _blend(optimal_threshold)

def create_composite_ranking(df, high_centrality, low_centrality, weight):
    """
    Blend the rankings of two centrality measures into one composite ranking.

    Args:
    df (pd.DataFrame): The dataframe containing the data
    high_centrality (str): The name of the centrality measure best for high scores
    low_centrality (str): The name of the centrality measure best for low scores
    weight (float): The weight given to the high_centrality ranking (0-1)

    Returns:
    pd.Series: A series of composite rankings for each case

    Each case's composite value is ``weight`` times its rank under
    ``high_centrality`` plus ``1 - weight`` times its rank under
    ``low_centrality``, with ranks taken from ``rank_cases``.
    """
    rank_by_high = rank_cases(df, high_centrality)
    rank_by_low = rank_cases(df, low_centrality)

    blended = weight * rank_by_high
    blended = blended + (1 - weight) * rank_by_low
    return blended

def find_optimal_weight(df, high_centrality, low_centrality, ground_truth):
    """
    Search for the weight that makes the composite ranking track the ground truth best.

    Args:
    df (pd.DataFrame): The dataframe containing the data
    high_centrality (str): The name of the centrality measure best for high scores
    low_centrality (str): The name of the centrality measure best for low scores
    ground_truth (str): The name of the ground truth score column

    Returns:
    float: The optimal weight (0-1) for the high_centrality ranking

    Weights from 0 to 1 in steps of 0.01 are tried in ascending order. The
    first weight reaching the largest absolute Spearman correlation between
    the composite ranking and the ground truth is returned; 0 is returned when
    no weight yields a defined correlation.
    """
    target = df[ground_truth]
    winner, strongest = 0, -1

    for candidate in np.arange(0, 1.01, 0.01):
        blended = create_composite_ranking(df, high_centrality, low_centrality, candidate)
        corr, _ = stats.spearmanr(blended, target)
        strength = abs(corr)
        # Strict comparison: earlier weights win ties, NaN never wins
        if strength > strongest:
            strongest, winner = strength, candidate

    return winner

def calculate_correlations(df, centralities, ground_truths, composite_ranking):
    """
    Correlate every centrality ranking and the composite column with each ground truth.

    Args:
    df (pd.DataFrame): The dataframe containing the data
    centralities (list): List of centrality measure column names
    ground_truths (list): List of ground truth score column names
    composite_ranking (str): The name of the composite ranking column

    Returns:
    dict: Maps (centrality, ground_truth) - and ('composite', ground_truth) -
    to the Spearman correlation coefficient

    Centrality columns are first converted to rankings with ``rank_cases``;
    the composite column is used as stored.
    """
    correlations = {}

    # Individual centralities: rank once, then compare with every ground truth
    for centrality in centralities:
        ranking = rank_cases(df, centrality)
        correlations.update({
            (centrality, ground_truth): stats.spearmanr(ranking, df[ground_truth])[0]
            for ground_truth in ground_truths
        })

    # Composite column, stored under the fixed label 'composite'
    correlations.update({
        ('composite', ground_truth): stats.spearmanr(df[composite_ranking], df[ground_truth])[0]
        for ground_truth in ground_truths
    })

    return correlations

def plot_correlations(correlations, ground_truth, output_file, best_high, best_low):
    """
    Plot correlations between rankings and ground truth scores.

    Args:
    correlations (dict): Dictionary of correlation coefficients keyed by
        (centrality, ground_truth); must contain ('composite', ground_truth)
        entries for every ground truth present
    ground_truth (str): The name of the current ground truth score
    output_file (str): The name of the output file for the plot
    best_high (str): The name of the best centrality for high scores
    best_low (str): The name of the best centrality for low scores

    This function creates a grouped bar plot with one group per ground truth and
    one bar per centrality measure, followed by the composite ranking. Bars and
    groups appear in the order in which they first occur in ``correlations``,
    so repeated runs produce the same layout. The figure is saved to
    ``output_file`` and closed.
    """
    # dict.fromkeys keeps first-seen order; a set would shuffle bars between runs
    centralities = list(dict.fromkeys(name for name, _ in correlations if name != 'composite'))
    ground_truths = list(dict.fromkeys(truth for _, truth in correlations))
    series = centralities + ['composite']

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        # Add text to show which centralities were used for the composite ranking
        ax.text(0.02, 0.98, f"Composite: {best_high} (high) + {best_low} (low)",
                transform=ax.transAxes, ha='left', va='top',
                bbox=dict(facecolor='white', edgecolor='gray', alpha=0.8))

        x = np.arange(len(ground_truths))
        # Every group spans 0.8 of its slot, shared equally between all bars
        width = 0.8 / len(series)

        for i, name in enumerate(series):
            offset = width * i - 0.4 + width / 2
            heights = [correlations[(name, truth)] for truth in ground_truths]
            ax.bar(x + offset, heights, width, label=name)

        ax.set_ylabel('Correlation Coefficient')
        ax.set_title(f'Correlations between Rankings and Ground Truths (optimized for {ground_truth})')
        ax.set_xticks(x, ground_truths)
        ax.legend(loc='best')

        fig.tight_layout()
        fig.savefig(output_file)
    finally:
        # Close this figure even when plotting or saving fails
        plt.close(fig)

def save_correlations_to_csv(correlations, ground_truth, output_file, best_high, best_low):
    """
    Save correlation results to a CSV file.

    Args:
    correlations (dict): Dictionary of correlation coefficients keyed by
        (centrality, ground_truth) pairs
    ground_truth (str): The name of the current ground truth score (not used here)
    output_file (str): The name of the output CSV file
    best_high (str): The name of the best centrality for high scores (not used here)
    best_low (str): The name of the best centrality for low scores (not used here)

    The file has the columns ``Correlation``, ``Centrality`` and ``Ground Truth``
    and one row per entry of ``correlations``. An empty dictionary produces a
    file containing only the header row.
    """
    # Build the rows directly in their final column order; this also works
    # when there are no correlations at all.
    rows = [
        (value, centrality, truth)
        for (centrality, truth), value in correlations.items()
    ]
    df_correlations = pd.DataFrame(rows, columns=['Correlation', 'Centrality', 'Ground Truth'])
    df_correlations.to_csv(output_file, index=False)

# Analyze Network Function
> The `analyze_network()` function performs comprehensive network analysis using various centrality measures and composite rankings.

## Input Parameters

- `nodes_df`: Pandas DataFrame containing node information, including ground truth scores and node attributes
- `edges_df`: Pandas DataFrame containing edge information (connections between nodes)  
- `ground_truths`: List of column names in nodes_df that contain ground truth scores to analyze
- `centralities`: List of centrality measures to calculate (e.g. degree, betweenness, etc.)
- `composite_functions`: List of functions that combine multiple centrality measures into composite rankings
- `output_path`: Directory path where analysis outputs will be saved

## Processing Steps

1. Creates output directory if it doesn't exist
2. Makes a copy of the input nodes DataFrame
3. For each ground truth measure:
   - Calculates an inverted version (max value - original value)
   - Stores inverted versions with "_inverted" suffix
4. Cleans data by:
   - Dropping rows with missing ECLI identifiers
   - Converting doctypebranch to numeric values if present
5. Calculates centrality measures specified
6. Creates composite rankings using provided functions
7. Computes correlations between:
   - Individual centrality measures and ground truths
   - Composite rankings and ground truths

## Return Value

Returns an `AnalysisResults` dictionary containing:
- `network_stats`: Basic statistics about the network (node and edge counts)
- `correlations`: Correlation coefficients between rankings and ground truths
- `best_centralities`: Best performing centrality measures for each ground truth
- `composite_rankings`: Results of composite ranking calculations
- `dataframe`: Final processed DataFrame with all measures included

## Output Files

Saves various analysis results to the specified output directory, including:
- Correlation plots
- CSV files with detailed results
- Network statistics (currently only returned in the results dictionary, not written to disk)


In [7]:
def _build_citation_graph(nodes: pd.DataFrame, edges_df: pd.DataFrame, ground_truths: list) -> nx.DiGraph:
    """Build the directed graph (ecli -> referenced ecli) restricted to the nodes in ``nodes``."""
    graph = nx.DiGraph()

    # Add nodes with their ground truth values as attributes
    for _, row in nodes.iterrows():
        node_attrs = {truth: row[truth] for truth in ground_truths if truth in row}
        graph.add_node(row['ecli'], **node_attrs)

    # Add edges, keeping only those whose two endpoints are known nodes
    valid_nodes = set(nodes['ecli'].values)
    for _, row in edges_df.iterrows():
        source = row['ecli']
        if source not in valid_nodes:
            continue
        targets = row['references']
        if not hasattr(targets, '__iter__'):
            # Missing reference lists (None / NaN) carry no edges
            continue
        for target in targets:
            if target and target in valid_nodes:
                graph.add_edge(source, target)

    # Materialise the self-loops first so the graph is not mutated while
    # the generator is still walking it
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))
    return graph


def analyze_network(nodes_df: pd.DataFrame, 
                edges_df: pd.DataFrame, 
                ground_truths: list, 
                centralities: list,
                composite_functions: list,
                output_path: str) -> AnalysisResults:
    """
    Analyze a network using various centrality measures and composite rankings.

    Steps: encode ``doctypebranch`` numerically (if present and not already
    numeric), add ``<truth>_inverted`` columns (column maximum minus value),
    drop rows without ``ecli``, build the citation graph, compute and merge the
    centrality measures, save one correlation bar plot per centrality, and for
    every ground truth / composite function pair build the composite column,
    compute its correlations and save them as a plot and a CSV file.

    Args:
        nodes_df: DataFrame containing node information (needs an ``ecli``
            column and the ground truth columns); it is not modified
        edges_df: DataFrame containing edge information, with an ``ecli``
            column (source) and a ``references`` column (iterable of targets)
        ground_truths: List of ground truth column names to analyze
        centralities: List of centrality measures to calculate
        composite_functions: List of composite ranking function names;
            ``'weight_composite_ranking'`` selects the weighted ranking, any
            other name selects the threshold composite
        output_path: Path where to save output files

    Returns:
        AnalysisResults: Dictionary containing:
        - network_stats: Node and edge counts of the graph that was analysed
        - correlations: Correlation results for each ground truth and composite function
        - best_centralities: Best performing centralities for each ground truth
        - composite_rankings: Composite ranking results
        - dataframe: The final processed dataframe with all measures
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Work on a copy so the caller's dataframe stays untouched
    total_df = nodes_df.copy()

    # Encode doctypebranch BEFORE inverting the ground truths: the inversion
    # below subtracts column values, which is impossible on string labels.
    if ('doctypebranch' in total_df.columns
            and not pd.api.types.is_numeric_dtype(total_df['doctypebranch'])):
        total_df['doctypebranch'] = categorise_total_branch_numerically(total_df['doctypebranch'])

    # Invert ground truth values (max - value)
    ground_truths_inverted = []
    for truth in ground_truths:
        inverted_col = f'{truth}_inverted'
        total_df[inverted_col] = total_df[truth].max() - total_df[truth]
        ground_truths_inverted.append(inverted_col)

    # Drop rows with missing ecli
    total_df = total_df.dropna(subset=['ecli'])

    # Build the graph and compute the centrality measures
    G = _build_citation_graph(total_df, edges_df, ground_truths)
    centrality_df = pd.DataFrame(calculate_centrality_measures(G))

    # Merge centrality measures with total_df
    total_df = pd.merge(total_df, centrality_df, left_on='ecli', right_index=True, how='left')

    # Plot initial correlations against the inverted ground truths
    numeric_cols = total_df.select_dtypes(include=[float, int]).columns
    for centrality in centralities:
        if centrality not in numeric_cols:
            continue
        correlation_results = total_df[ground_truths_inverted + [centrality]].corr()[centrality][ground_truths_inverted]

        fig = plt.figure()
        correlation_results.plot(kind='bar',
                            title=f'Correlation of {centrality} with Ground Truths')
        plt.ylabel('Correlation Coefficient')
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, f'correlation_{centrality}.png'))
        plt.close(fig)

    # Run composite rankings analysis and collect the results in one pass
    correlations_by_truth = {}
    best_by_truth = {}
    rankings_by_truth = {}

    for ground_truth in ground_truths:
        # The best pair depends only on the ground truth, not on the composite
        # function, so it is determined once per ground truth.
        best_high, best_low = find_best_centralities(total_df, centralities, ground_truth)
        best_by_truth[ground_truth] = {'high': best_high, 'low': best_low}
        correlations_by_truth[ground_truth] = {}
        rankings_by_truth[ground_truth] = {}

        for composite_function in composite_functions:
            print(f"\nAnalyzing ground truth: {ground_truth} with {composite_function}")

            if composite_function == 'weight_composite_ranking':
                optimal_weight = find_optimal_weight(total_df, best_high, best_low, ground_truth)
                composite_ranking = create_composite_ranking(total_df, best_high, best_low, optimal_weight)
            else:
                composite_ranking = create_treashold_composite_ranking(total_df, best_high, best_low, ground_truth)

            # Add composite ranking to dataframe
            ranking_col = f'composite_ranking_{ground_truth}_{composite_function}'
            total_df[ranking_col] = composite_ranking

            # Calculate and save correlations
            correlations = calculate_correlations(total_df, centralities, ground_truths, ranking_col)

            plot_correlations(
                correlations, ground_truth,
                os.path.join(output_path, f'correlations_plot_{ground_truth}_{composite_function}.png'),
                best_high, best_low)

            save_correlations_to_csv(
                correlations, ground_truth,
                os.path.join(output_path, f'correlations_{ground_truth}_{composite_function}.csv'),
                best_high, best_low)

            correlations_by_truth[ground_truth][composite_function] = correlations
            # NOTE(review): keys are the dataframe's index labels, not ECLIs
            rankings_by_truth[ground_truth][composite_function] = total_df[ranking_col].to_dict()

    # Report the size of the graph that was actually analysed (invalid
    # references and self-loops are not part of it)
    results = {
        'network_stats': {
            'n_nodes': G.number_of_nodes(),
            'n_edges': G.number_of_edges()
        },
        'correlations': correlations_by_truth,
        'best_centralities': best_by_truth,
        'composite_rankings': rankings_by_truth,
        'dataframe': total_df  # Include the final dataframe
    }

    return results

# Compare Function
This section implements network comparison functionality to analyze and compare results across different networks.

The main function `compare_networks()` takes results from multiple network analyses and performs the following comparisons:

1. Correlation Comparisons:
   - Compares how different centrality measures correlate with ground truth metrics across networks
   - Creates comparison tables showing correlation values for each network
   - Saves correlation comparisons to CSV files

2. Ranking Comparisons: 
   - Analyzes how centrality measures rank relative to each other in different networks
   - Converts absolute correlation values to rankings
   - Shows which centrality measures perform consistently well across networks
   - Saves ranking comparisons to CSV files

The comparisons are performed for each combination of:
- Ground truth metrics (e.g., importance, document type branch)
- Composite ranking functions (different ways of combining centrality measures)

This allows us to:
- Identify which centrality measures work best across different network types
- Understand how network structure affects centrality measure performance
- Compare the effectiveness of different composite ranking approaches


In [8]:
def compare_networks(network_results: "Dict[str, AnalysisResults]", output_path: str) -> Tuple[Dict[str, Dict[str, pd.DataFrame]], Dict[str, Dict[str, Dict[str, pd.DataFrame]]]]:
    """
    Compare results across different networks.

    For every ground truth / composite function pair found in the first
    network's results, three CSV files are written to ``output_path``:
    ``correlations_*.csv`` (all correlations per network), ``ranks_*.csv``
    (centralities ranked by absolute correlation with that ground truth, rank 1
    being the strongest) and ``rank_correlation_*.csv`` (Spearman correlation
    of those ranks between networks).

    Args:
        network_results: Dictionary with network names as keys and their analysis results as values.
            Every network is expected to hold the same ground truths and composite functions as the first.
        output_path: Path to save comparison results.

    Returns:
        A tuple containing:
        - correlation_comparisons: A dictionary of correlation DataFrames for each ground truth and composite function.
        - ranking_comparisons: A dictionary containing ranking DataFrames and rank correlations for each ground truth and composite function.
        Both dictionaries are empty when ``network_results`` is empty.
    """
    os.makedirs(output_path, exist_ok=True)

    correlation_comparisons = {}
    ranking_comparisons = {}

    if not network_results:
        # Nothing to compare
        return correlation_comparisons, ranking_comparisons

    # Get all ground truths and composite functions from first network
    reference = next(iter(network_results.values()))['correlations']
    ground_truths = list(reference)
    composite_functions = list(reference[ground_truths[0]]) if ground_truths else []

    for ground_truth in ground_truths:
        correlation_comparisons[ground_truth] = {}
        ranking_comparisons[ground_truth] = {}

        for composite_function in composite_functions:
            # Compare absolute correlations
            network_correlations = {
                network_name: results['correlations'][ground_truth][composite_function]
                for network_name, results in network_results.items()
            }

            # Create correlation comparison DataFrame
            corr_df = pd.DataFrame(network_correlations).round(3)
            corr_df.to_csv(f'{output_path}/correlations_{ground_truth}_{composite_function}.csv')

            # Compare relative rankings of centrality measures
            centrality_ranks = {}
            for network_name, correlations in network_correlations.items():
                # The correlation dictionaries hold entries for every ground
                # truth; rank only those for the ground truth being compared,
                # otherwise later ground truths overwrite earlier ones.
                centrality_corrs = {
                    centrality: abs(value)
                    for (centrality, truth), value in correlations.items()
                    if centrality != 'composite' and truth == ground_truth
                }
                centrality_ranks[network_name] = pd.Series(centrality_corrs).rank(ascending=False)

            # Create ranking comparison DataFrame
            rank_df = pd.DataFrame(centrality_ranks).round(3)
            rank_df.to_csv(f'{output_path}/ranks_{ground_truth}_{composite_function}.csv')

            # Calculate rank correlation between networks
            rank_correlation = rank_df.corr(method='spearman')
            rank_correlation.to_csv(f'{output_path}/rank_correlation_{ground_truth}_{composite_function}.csv')

            # Store results
            correlation_comparisons[ground_truth][composite_function] = corr_df
            ranking_comparisons[ground_truth][composite_function] = {
                'centrality_ranks': rank_df,
                'rank_correlation': rank_correlation
            }

    return correlation_comparisons, ranking_comparisons

# Main

In [9]:
# Centrality measure columns included in the comparison
CENTRALITIES = [
    'degree_centrality',
    'in_degree_centrality',
    'out_degree_centrality',
    'betweenness_centrality',
    'closeness_centrality',
    'core_number',
    'relative_in_degree_centrality',
    'eigenvector_centrality',
    'pagerank',
    'hits_hub',
    'hits_authority',
    'harmonic_centrality',
    'disruption',
]

# Ground truth columns, and the names of their "<name>_inverted" counterparts
GROUND_TRUTHS = ['importance', 'doctypebranch']
GROUND_TRUTHS_INVERTED = [f'{truth}_inverted' for truth in GROUND_TRUTHS]

In [None]:
# Analyze multiple networks
# Driver cell: runs analyze_network once per entry of `networks` and collects
# the results under the network's name.
network_results = {}
# NOTE(review): nodes_df1/edges_df1/nodes_df2/edges_df2 are not defined anywhere
# in the visible cells - they have to be loaded before this cell is run.
networks = {
    'network1': {'nodes': nodes_df1, 'edges': edges_df1},
    'network2': {'nodes': nodes_df2, 'edges': edges_df2},
    # ... more networks
}

for network_name, data in networks.items():
    # Each network writes its plots and CSV files to its own results/<name> folder
    results = analyze_network(
        nodes_df=data['nodes'],
        edges_df=data['edges'],
        ground_truths=GROUND_TRUTHS,
        centralities=CENTRALITIES,
        # analyze_network treats any name other than 'weight_composite_ranking'
        # as the threshold composite
        composite_functions=['weight_composite_ranking', 'treashold_composite_ranking'],
        output_path=f'results/{network_name}'
    )
    network_results[network_name] = results

# Compare results across networks
# Returns (correlation comparisons, ranking comparisons) and writes CSV files
# to results/comparisons
correlations, rankings = compare_networks(network_results, 'results/comparisons')