In [1]:
import os
import pandas as pd
from tabulate import tabulate
import networkx as nx


In [2]:
def load_networks(path, max=100):
    """
    Load networks from a directory structure. Can handle both:
    1. Direct network files (nodes.json and edges.json in the input path)
    2. Networks in subdirectories (searched recursively)

    A directory counts as a network directory when it contains both
    ``nodes.json`` and ``edges.json``; such a directory is not searched any
    deeper. Subdirectories are visited in sorted name order so that, when the
    ``max`` cap is hit, the same networks are loaded on every run.

    Args:
        path: Root directory to search for networks or direct path to a network
        max: Maximum number of networks to load (the name shadows the builtin
            but is kept so existing keyword callers keep working)

    Returns:
        dict: Dictionary of networks with structure
            {network_name: {'nodes': df, 'edges': df}}. Network names are the
            path relative to ``path`` with separators replaced by '-', or the
            directory's own name when ``path`` is itself a network directory.

    Raises:
        ValueError: If ``path`` does not exist.
    """
    networks = {}
    loaded_count = 0

    # Verify root directory exists and print absolute path
    abs_root = os.path.abspath(path)
    print(f"Looking for networks in: {abs_root}")

    if not os.path.exists(abs_root):
        raise ValueError(f"Directory {abs_root} does not exist")

    def is_network_dir(dir_path):
        """Return True if the directory contains both nodes.json and edges.json."""
        try:
            contents = os.listdir(dir_path)
        except OSError:
            # Unreadable or not a directory: treat as "not a network" rather
            # than swallowing every exception type.
            return False
        return 'nodes.json' in contents and 'edges.json' in contents

    def load_network(dir_path, network_name):
        """Load one network into ``networks``; return True on success."""
        print(f"\nChecking network: {network_name}")
        print(f"  Path: {dir_path}")

        nodes_file = os.path.join(dir_path, 'nodes.json')
        edges_file = os.path.join(dir_path, 'edges.json')

        try:
            # Load and validate data
            print("  Loading data files...")
            nodes_df = pd.read_json(nodes_file)
            edges_df = pd.read_json(edges_file)

            # Validate required columns
            required_node_cols = ['ecli', 'importance', 'doctypebranch']
            missing_cols = [col for col in required_node_cols if col not in nodes_df.columns]
            if missing_cols:
                print(f"  Warning: Missing columns in nodes.json: {missing_cols}")
                print(f"  Available columns: {nodes_df.columns.tolist()}")
                return False

            required_edge_cols = ['ecli', 'references']
            missing_edge_cols = [col for col in required_edge_cols if col not in edges_df.columns]
            if missing_edge_cols:
                print(f"  Warning: Missing columns in edges.json: {missing_edge_cols}")
                print(f"  Available columns: {edges_df.columns.tolist()}")
                return False

            networks[network_name] = {
                'nodes': nodes_df,
                'edges': edges_df
            }
            print(f"  Successfully loaded network with {len(nodes_df)} nodes and {len(edges_df)} edges")
            return True

        except Exception as e:
            # Best-effort loading: report the problem and move on to the
            # next network instead of aborting the whole scan.
            print(f"  Error loading network: {str(e)}")
            return False

    def process_directory(current_path):
        """Recursively process directories looking for networks"""
        nonlocal loaded_count
        if loaded_count >= max:
            return

        # First check if current directory is a network directory
        if is_network_dir(current_path):
            # Generate network name based on path
            rel_path = os.path.relpath(current_path, abs_root)
            network_name = rel_path.replace(os.sep, '-')
            if network_name == '.':  # Handle case where path is direct to network
                network_name = os.path.basename(current_path)

            if load_network(current_path, network_name):
                loaded_count += 1
            return

        # If not a network directory, search subdirectories
        try:
            # os.listdir returns entries in arbitrary order; sort so the
            # traversal (and therefore the effect of ``max``) is reproducible.
            entries = sorted(os.listdir(current_path))
        except OSError as e:
            print(f"Error accessing directory {current_path}: {str(e)}")
            return

        for item in entries:
            if loaded_count >= max:
                break

            item_path = os.path.join(current_path, item)
            if os.path.isdir(item_path):
                process_directory(item_path)

    # Start processing from root directory
    process_directory(abs_root)
    print(f"\nSuccessfully loaded {len(networks)} networks")
    return networks

In [3]:
def categorise_total_branch_numerically(branches: pd.Series) -> pd.Series:
    """
    Translate string branch labels into their numeric codes.

    Labels are matched case-insensitively (they are upper-cased first).
    Any label without a code is reported in a printed warning and comes
    out as NaN in the result.

    :param branches: Series of branch labels given as strings.
    :return: Series of the same length holding the numeric code per row.
    """
    branch_codes = {"GRANDCHAMBER": 1, "CHAMBER": 2, "COMMITTEE": 3}

    # Normalise the case before looking anything up.
    normalised = branches.str.upper()

    # Report labels the lookup table does not know about.
    unknown_labels = set(normalised.unique()).difference(branch_codes)
    if unknown_labels:
        print(f"Warning: Found unmapped values: {unknown_labels}")

    return normalised.map(branch_codes)

In [4]:
def analyze_network_stats(nodes_df, edges_df):
    """
    Calculate summary statistics for a single network.

    The input frames are not modified: all cleaning happens on a copy, so the
    function can be called repeatedly on the same data.

    Args:
        nodes_df: DataFrame with at least 'ecli' and 'importance' columns.
            'doctypebranch' (string labels) and 'judgementdate' (strings whose
            first space-separated token is a dd/mm/YYYY date) are used when
            present.
        edges_df: DataFrame with 'ecli' (source node) and 'references'
            (collection of target node ids) columns.

    Returns:
        dict with the keys:
            total_initial_nodes: number of rows in the input ``nodes_df``.
            total_valid_nodes: number of distinct ECLIs left after dropping
                rows with a missing ECLI or an unmapped doctypebranch.
            nodes_removed: total_initial_nodes - total_valid_nodes.
            total_edges: number of distinct directed edges between valid
                nodes, self-loops excluded (i.e. the edges of the graph the
                other statistics are computed on).
            connected_nodes / non_connected_nodes: nodes with degree > 0 / = 0.
            num_components: weakly connected components with at least 2 nodes.
            biggest_component_size: size of the largest such component (0 if none).
            density: ``nx.density`` of the graph.
            doctypebranch_dist, importance_dist: value counts over valid rows.
            importance_dist_pre_1998 / importance_dist_post_1998: importance
                value counts for rows dated before / on-or-after 1 Nov 1998
                (empty dicts when 'judgementdate' is absent).
    """
    # Count before any filtering so 'nodes_removed' reflects what was dropped.
    total_initial_nodes = len(nodes_df)
    has_branch = 'doctypebranch' in nodes_df.columns

    # Work on a copy: the caller's frame must keep its original string
    # doctypebranch values, otherwise a second call would fail.
    nodes = nodes_df.copy()

    # Extract year from ECLI (format: ECLI:CE:ECHR:1998:0122JUD002620995)
    nodes['year'] = pd.to_numeric(
        nodes['ecli'].str.extract(r':(\d{4}):', expand=False), errors='coerce'
    )
    nodes['importance'] = pd.to_numeric(nodes['importance'], errors='coerce')

    # Rows need an ECLI and, when the column exists, a recognised branch.
    required = ['ecli']
    if has_branch:
        nodes['doctypebranch'] = categorise_total_branch_numerically(nodes['doctypebranch'])
        required.append('doctypebranch')
    nodes = nodes.dropna(subset=required)

    # Create graph and add nodes with attributes
    G = nx.DiGraph()
    valid_nodes = set(nodes['ecli'])
    branch_values = nodes['doctypebranch'] if has_branch else [None] * len(nodes)
    for ecli, importance, branch, year in zip(
        nodes['ecli'], nodes['importance'], branch_values, nodes['year']
    ):
        G.add_node(ecli, importance=importance, doctypebranch=branch, year=year)

    # Add edges between existing nodes
    for source, targets in zip(edges_df['ecli'], edges_df['references']):
        if source not in valid_nodes:
            continue
        if isinstance(targets, str):
            # A bare string is treated as a single reference rather than
            # being iterated character by character.
            targets = [targets]
        elif not isinstance(targets, (list, tuple, set)):
            # Missing reference list (None / NaN): nothing to add.
            continue
        for target in targets:
            # Self-loops are skipped here instead of being added and removed.
            if target and target != source and target in valid_nodes:
                G.add_edge(source, target)

    # Basic network stats
    total_valid_nodes = len(valid_nodes)
    # Read the edge count from the graph so it agrees with density and the
    # component statistics (no duplicates, no self-loops).
    total_edges = G.number_of_edges()
    density = nx.density(G)

    # Connected nodes analysis
    connected_nodes = sum(1 for _, degree in G.degree() if degree > 0)
    non_connected = total_valid_nodes - connected_nodes

    # Connected components with 2+ nodes
    component_sizes = [
        len(comp) for comp in nx.weakly_connected_components(G) if len(comp) >= 2
    ]
    num_components = len(component_sizes)
    biggest_component_size = max(component_sizes, default=0)

    # Class distributions
    doctypebranch_dist = nodes['doctypebranch'].value_counts().to_dict() if has_branch else {}
    importance_dist = nodes['importance'].value_counts().to_dict()

    # Split importance distributions at 1 November 1998
    if 'judgementdate' in nodes.columns:
        # Parse once; only the part before the first space is the date.
        judgement_dates = pd.to_datetime(
            nodes['judgementdate'].str.split(' ').str[0], format='%d/%m/%Y'
        )
        cutoff = pd.to_datetime('01/11/1998', format='%d/%m/%Y')
        # Rows with a missing date satisfy neither comparison and are
        # therefore left out of both distributions.
        importance_dist_pre_1998 = (
            nodes.loc[judgement_dates < cutoff, 'importance'].value_counts().to_dict()
        )
        importance_dist_post_1998 = (
            nodes.loc[judgement_dates >= cutoff, 'importance'].value_counts().to_dict()
        )
    else:
        print("Warning: 'judgementdate' column missing; skipping pre/post-1998 split")
        importance_dist_pre_1998 = {}
        importance_dist_post_1998 = {}

    return {
        'total_initial_nodes': total_initial_nodes,
        'total_valid_nodes': total_valid_nodes,
        'nodes_removed': total_initial_nodes - total_valid_nodes,
        'total_edges': total_edges,
        'connected_nodes': connected_nodes,
        'non_connected_nodes': non_connected,
        'num_components': num_components,
        'biggest_component_size': biggest_component_size,
        'density': density,
        'doctypebranch_dist': doctypebranch_dist,
        'importance_dist': importance_dist,
        'importance_dist_pre_1998': importance_dist_pre_1998,
        'importance_dist_post_1998': importance_dist_post_1998
    }

In [5]:
def create_network_summary(network_dirs=None, output_csv='network_statistics_updated.csv'):
    """
    Load every network, compute its statistics, print a table and save a CSV.

    Args:
        network_dirs: Iterable of directories passed to ``load_networks``.
            Defaults to the six ``networks/merged-article-edges/*`` variants
            used by this notebook (relative to the working directory).
        output_csv: Path of the CSV file the summary table is written to.

    Side effects:
        Prints the loader's progress messages and a grid-formatted table,
        and writes ``output_csv``. Returns None.
    """
    if network_dirs is None:
        network_dirs = [
            'networks/merged-article-edges/full-balanced-importance',
            'networks/merged-article-edges/full-unbalanced',
            'networks/merged-article-edges/full-balanced-doctypebranch',
            'networks/merged-article-edges/split-balanced-importance',
            'networks/merged-article-edges/split-balanced-doctypebranch',
            'networks/merged-article-edges/split-unbalanced'
        ]

    def format_dist(dist):
        """Render a {value: count} dict as 'value: count, value: count'."""
        return ', '.join(f"{k}: {v}" for k, v in dist.items())

    # Load all networks, prefixing each name with its source directory name
    networks = {}
    for dir_path in network_dirs:
        loaded_networks = load_networks(dir_path)
        # normpath drops any trailing separator so the prefix is never empty,
        # and basename honours the platform's path separator.
        dir_name = os.path.basename(os.path.normpath(dir_path))
        for network_name, network_data in loaded_networks.items():
            networks[f"{dir_name}-{network_name}"] = network_data

    # Analyze each network
    results = []
    for network_name, data in networks.items():
        stats = analyze_network_stats(data['nodes'], data['edges'])
        results.append([
            network_name,
            stats['total_initial_nodes'],
            stats['total_valid_nodes'],
            stats['nodes_removed'],
            stats['total_edges'],
            stats['connected_nodes'],
            stats['non_connected_nodes'],
            stats['num_components'],
            stats['biggest_component_size'],
            f"{stats['density']:.8f}",
            format_dist(stats['doctypebranch_dist']),
            format_dist(stats['importance_dist']),
            format_dist(stats['importance_dist_pre_1998']),
            format_dist(stats['importance_dist_post_1998'])
        ])

    headers = [
        'Network',
        'Initial Nodes',
        'Valid Nodes',
        'Removed Nodes',
        'Total Edges',
        'Connected Nodes',
        'Non-Connected Nodes',
        'Connected Components',
        'Biggest Component Size',
        'Density',
        'Doctypebranch Distribution',
        'Importance Distribution',
        'Importance Pre-1998',
        'Importance Post-1998'
    ]

    # Sort results by valid nodes (column index 2), largest first
    results.sort(key=lambda row: row[2], reverse=True)

    # Display table
    print(tabulate(results, headers=headers, tablefmt='grid'))

    # Save to CSV for further analysis
    df = pd.DataFrame(results, columns=headers)
    df.to_csv(output_csv, index=False)

In [6]:
# Run the full summary: prints the statistics table and writes the CSV file.
create_network_summary()

Looking for networks in: /Users/davidwickerhf/Projects/work/maastrichtuniversity/rankings/networks/merged-article-edges/full-balanced-importance

Checking network: full-balanced-importance
  Path: /Users/davidwickerhf/Projects/work/maastrichtuniversity/rankings/networks/merged-article-edges/full-balanced-importance
  Loading data files...
  Successfully loaded network with 6450 nodes and 6450 edges

Successfully loaded 1 networks
Looking for networks in: /Users/davidwickerhf/Projects/work/maastrichtuniversity/rankings/networks/merged-article-edges/full-unbalanced

Checking network: full-unbalanced
  Path: /Users/davidwickerhf/Projects/work/maastrichtuniversity/rankings/networks/merged-article-edges/full-unbalanced
  Loading data files...
  Successfully loaded network with 27801 nodes and 27801 edges

Successfully loaded 1 networks
Looking for networks in: /Users/davidwickerhf/Projects/work/maastrichtuniversity/rankings/networks/merged-article-edges/full-balanced-doctypebranch

Checking