# Pre-processing

## Checking metric coverage for cleaning and clustering

In [8]:
import yaml

def load_yaml(file_path):
    """Parse the UTF-8 YAML file at ``file_path`` and return the loaded object."""
    with open(file_path, mode='r', encoding='utf-8') as stream:
        parsed = yaml.safe_load(stream)
    return parsed

def extract_names_and_synonyms(data, key):
    """
    Extract names and synonyms from a clustered YAML structure.

    The item definitions of each cluster are read from the key obtained by
    dropping the ``clustered_`` prefix ('clustered_metrics' -> 'metrics'),
    which is the layout the clustering step of this notebook reads
    (``cluster['metrics']`` / ``cluster['methods']``).

    Args:
        data (dict): Data structure containing clustered data.
        key (str): Key for the cluster, e.g., 'clustered_metrics' or 'clustered_methods'.

    Returns:
        set: A set of all names and synonyms.
    """
    prefix = 'clustered_'
    items_key = key[len(prefix):] if key.startswith(prefix) else key
    # The previous implementation looked items up under key[:-1]
    # ('clustered_metric'); keep it as a fallback so data in that layout
    # still works.
    legacy_key = key[:-1]

    items = set()
    # "or []" covers keys that are present but empty (YAML loads them as None).
    for cluster in data.get(key) or []:
        definitions = cluster.get(items_key)
        if definitions is None:
            definitions = cluster.get(legacy_key)
        for item in definitions or []:
            name = item.get('name')
            if name is not None:
                items.add(name)
            items.update(item.get('synonyms') or [])
    return items

def extract_items_from_papers(data, item_type):
    """
    Extract all unique items (metrics or methods) from the 'papers' section.

    Args:
        data (dict): Data structure containing the 'papers' section.
        item_type (str): 'metrics' or 'methods'.

    Returns:
        set: A set of unique items found in the papers.
    """
    found = set()
    # A key that exists but has no value loads from YAML as None; "or []"
    # treats that the same as a missing key instead of raising TypeError.
    for paper in data.get('papers') or []:
        found.update(paper.get(item_type) or [])
    return found

def main():
    """List the metrics used by the papers and those absent from the metrics dictionary."""
    dictionary_file = "dictionaries/metrics.yml"
    papers_file = "papers.yml"

    # Read the dictionary first, then the papers.
    dictionary = load_yaml(dictionary_file)
    papers = load_yaml(papers_file)

    # Names the papers mention vs. names (and synonyms) the dictionary knows.
    used = extract_items_from_papers(papers, 'metrics')
    known = extract_names_and_synonyms(dictionary, 'clustered_metrics')
    unknown = used - known

    # Each report is a heading followed by the sorted names, one per line.
    reports = (
        ("All Metrics (from papers):", used),
        ("\nMetrics not in the YAML:", unknown),
    )
    for heading, names in reports:
        print(heading)
        print("\n".join(sorted(names)))

if __name__ == "__main__":
    main()

All Metrics (from papers):
Actual amount of foreign capital used in the current year
Actual amount of foreign investment used in the year
Actual amount of foreign investment utilized
Actual utilization of foreign direct investment amount
Added value of the secondary industry as a proportion of GDP
Added value of the tertiary industry as a proportion of GDP
Administrative area land area
Advanced industrial structure index
Advanced level of industrial structure
Amount of foreign capital utilized
Amount of foreign capital utilized in the current year
Annual growth rate of fixed asset investment
Annual output value of tertiary industry
Annual percentage growth rate of GDP
Average wage of employees
Average wage of on-the-job workers
Balance of savings deposits per capita
Degree of resource depletion
Dependence on foreign trade
Dependence on foreign trade (Total imports and exports/GDP)
Deposit balance of financial institutions
Development intensity of construction land per 10,000 GDP
Digita

## Checking method coverage for cleaning and clustering

In [9]:
import yaml

def load_yaml(file_path):
    """Read ``file_path`` as UTF-8 and return the safely parsed YAML content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = yaml.safe_load(handle)
    return content

def extract_names_and_synonyms(data, key):
    """
    Extract names and synonyms from a clustered YAML structure.

    Item definitions are looked up per cluster under the key with the
    ``clustered_`` prefix removed ('clustered_methods' -> 'methods'), the
    same layout the clustering step of this notebook indexes with
    ``cluster['methods']`` / ``cluster['metrics']``.

    Args:
        data (dict): Data structure containing clustered data.
        key (str): Key for the cluster, e.g., 'clustered_metrics' or 'clustered_methods'.

    Returns:
        set: A set of all names and synonyms.
    """
    marker = 'clustered_'
    primary_key = key[len(marker):] if key.startswith(marker) else key
    # Older lookup used key[:-1] ('clustered_method'); honoured as a fallback.
    fallback_key = key[:-1]

    collected = set()
    for cluster in data.get(key) or []:  # tolerate a missing or empty section
        definitions = cluster.get(primary_key)
        if definitions is None:
            definitions = cluster.get(fallback_key)
        for definition in definitions or []:
            name = definition.get('name')
            if name is not None:
                collected.add(name)
            # "synonyms:" with no value loads as None; treat it as no synonyms.
            collected.update(definition.get('synonyms') or [])
    return collected

def extract_items_from_papers(data, item_type):
    """
    Extract all unique items (metrics or methods) from the 'papers' section.

    Args:
        data (dict): Data structure containing the 'papers' section.
        item_type (str): 'metrics' or 'methods'.

    Returns:
        set: A set of unique items found in the papers.
    """
    unique_items = set()
    for paper in data.get('papers') or []:
        # An empty YAML value is None, not []; skip it rather than fail.
        entries = paper.get(item_type) or []
        unique_items.update(entries)
    return unique_items

def main():
    """List the methods used by the papers and those absent from the methods dictionary."""
    dictionary_file = "dictionaries/methods.yml"
    papers_file = "papers.yml"

    # Read the dictionary first, then the papers.
    dictionary = load_yaml(dictionary_file)
    papers = load_yaml(papers_file)

    # Names the papers mention vs. names (and synonyms) the dictionary knows.
    used = extract_items_from_papers(papers, 'methods')
    known = extract_names_and_synonyms(dictionary, 'clustered_methods')
    unknown = used - known

    # Each report is a heading followed by the sorted names, one per line.
    reports = (
        ("All Methods (from papers):", used),
        ("\nMethods not in the YAML:", unknown),
    )
    for heading, names in reports:
        print(heading)
        print("\n".join(sorted(names)))

if __name__ == "__main__":
    main()

All Methods (from papers):
Adaptive cycle theory
ArcGIS for resilience visualization
Bartlett's test of sphericity
CRITIC and EWM combined weighting model
CRITIC–TOPSIS weighted evaluation model
Coefficient of variation method (CV)
Combination weight method
Combined weight model
Complex network approach
Coupling coordination degree (CCD) analysis
Coupling coordination degree (CCD) model
Coupling coordination degree model
Coupling coordination model
Dagum's Gini coefficient
Driving-Pressure-State-Response (DPSR) model
Entropy method
Entropy method for three-level indicator weighting
Entropy method for urban resilience index calculation
Entropy value method
Entropy weight method
Entropy weight method (EWM)
Entropy weight method (Shannon, 1948)
Entropy weight method with time series weights
Entropy weight-TOPSIS method
Evaluation functions for persistence, adaptability, and transformation
Exploratory Spatial Data Analysis (ESDA)
Extreme value entropy method
Fuzzy logic reasoning method
Ge

## Clean and cluster papers

In [10]:
from collections import defaultdict
import yaml

# --- Utility Functions --- #
def load_yaml(file_path):
    """Return the object produced by safely parsing the UTF-8 YAML file ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        loaded = yaml.safe_load(source)
    return loaded

def save_yaml(data, file_path):
    """Write ``data`` to ``file_path`` as UTF-8 YAML, keeping key order and non-ASCII text."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        yaml.dump(data, sink, sort_keys=False, allow_unicode=True)

# --- Map Items to Categories --- #
def map_items_to_categories(cluster_data, items_list, item_key):
    """
    Map items to their categories based on cluster data.

    Matching is case-insensitive against each definition's name and
    synonyms. A matched item is replaced by the definition's primary name;
    an unmatched item is kept verbatim under "Uncategorized". When several
    definitions share a spelling, the first one in cluster order wins.

    Args:
        cluster_data (list): Cluster definitions with categories.
        items_list (list): List of items to categorize.
        item_key (str): Key in the cluster data ('metrics' or 'methods').

    Returns:
        dict: Categorized items as {category: [items]}.
    """
    # Build the lowercase lookup once instead of re-normalizing every name
    # and synonym for every item.
    lookup = {}
    for cluster in cluster_data:
        category = cluster['category']
        for definition in cluster.get(item_key) or []:
            primary_name = definition['name']
            # "synonyms:" with no value loads from YAML as None.
            spellings = [primary_name, *(definition.get('synonyms') or [])]
            for spelling in spellings:
                # setdefault preserves first-match-wins across definitions.
                lookup.setdefault(spelling.lower(), (category, primary_name))

    categorized_items = defaultdict(list)
    for item in items_list:
        category, label = lookup.get(item.lower(), ("Uncategorized", item))
        categorized_items[category].append(label)

    return categorized_items

# --- Process Papers --- #
def process_papers(clustered_metrics_path, clustered_methods_path, papers_path, output_path):
    """
    Categorize every paper's metrics and methods and write the result.

    Each paper's flat 'metrics' / 'methods' list is replaced by a list of
    ``{'category': ..., 'items': [...]}`` groups.

    Args:
        clustered_metrics_path (str): Path to clustered metrics YAML.
        clustered_methods_path (str): Path to clustered methods YAML.
        papers_path (str): Path to papers YAML.
        output_path (str): Path to save updated papers YAML.

    Raises:
        ValueError: If either dictionary has no cluster definitions.
    """
    # Inputs are read in this order: metrics dictionary, methods dictionary, papers.
    metrics_doc = load_yaml(clustered_metrics_path)
    methods_doc = load_yaml(clustered_methods_path)
    papers_doc = load_yaml(papers_path)

    clusters_by_field = {
        "metrics": metrics_doc.get('clustered_metrics', []),
        "methods": methods_doc.get('clustered_methods', []),
    }
    if not all(clusters_by_field.values()):
        raise ValueError("Clustered metrics or methods data is missing in YAML files.")

    for paper in papers_doc.get('papers', []):
        for field, clusters in clusters_by_field.items():
            if field not in paper:
                continue
            grouped = map_items_to_categories(clusters, paper[field], field)
            # Swap the flat list for one entry per category.
            paper[field] = [
                {'category': category, 'items': members}
                for category, members in grouped.items()
            ]

    save_yaml(papers_doc, output_path)
    print(f"Updated papers saved to {output_path}")

# --- Main --- #
def main():
    """Categorize ``papers.yml`` with both dictionaries and write ``cleared_papers.yml``."""
    process_papers(
        clustered_metrics_path="dictionaries/metrics.yml",
        clustered_methods_path="dictionaries/methods.yml",
        papers_path="papers.yml",
        output_path="cleared_papers.yml",
    )

if __name__ == "__main__":
    main()

Updated papers saved to cleared_papers.yml


# Analysis

In [8]:
import os
import yaml
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from collections import Counter
from itertools import combinations

# Create the "plots" output directory before any figure is saved;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs("plots", exist_ok=True)

def load_papers(file_path):
    """Load papers from YAML file.

    Args:
        file_path: Path to a YAML document with a top-level "papers" key.

    Returns:
        The value stored under "papers".

    Raises:
        KeyError: If the document has no "papers" key.
    """
    # Decode explicitly as UTF-8: the clustering step writes this file as
    # UTF-8 with allow_unicode=True, while open() without an encoding falls
    # back to the platform locale and can fail on non-ASCII names.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data["papers"]

def count_methods_and_metrics(papers):
    """Tally item and category frequencies for methods and metrics.

    Returns:
        tuple: ``(methods, metrics, method_categories, metric_categories)``,
        each a ``Counter``. Item counters count every listed item; category
        counters count one per category group in a paper.
    """
    # One (items, categories) counter pair per field, filled in a single pass.
    tallies = {field: (Counter(), Counter()) for field in ("methods", "metrics")}

    for paper in papers:
        for field, (item_counts, category_counts) in tallies.items():
            for group in paper.get(field, []):
                category_counts[group["category"]] += 1
                item_counts.update(group["items"])

    methods, method_categories = tallies["methods"]
    metrics, metric_categories = tallies["metrics"]
    return methods, metrics, method_categories, metric_categories

def get_cooccurrences(papers, item_type="methods"):
    """Count, for each unordered pair of categories, the papers containing both.

    Pairs are tuples in sorted order, so ("A", "B") and ("B", "A") share one key.
    """
    pair_counts = Counter()
    for paper in papers:
        distinct = sorted({group["category"] for group in paper.get(item_type, [])})
        # Every 2-combination of a paper's distinct categories counts once.
        pair_counts.update(combinations(distinct, 2))
    return pair_counts

def create_cooccurrence_matrix(cooccurrences, all_categories):
    """Build a symmetric count matrix from category-pair counts.

    Rows and columns follow the sorted keys of ``all_categories``; pairs
    naming a category outside that set are skipped.

    Returns:
        tuple: ``(matrix, labels)``.
    """
    labels = sorted(all_categories.keys())
    size = len(labels)
    index_of = dict(zip(labels, range(size)))
    matrix = np.zeros((size, size))

    for pair, count in cooccurrences.items():
        first, second = pair
        if first not in index_of or second not in index_of:
            continue
        row, col = index_of[first], index_of[second]
        # Write both triangles so the matrix is symmetric.
        matrix[row, col] = matrix[col, row] = count

    return matrix, labels

def plot_bar_chart(items, title, figsize=(16, 6), save_path=None):
    """Draw a bar chart of ``(label, count)`` pairs in a blue gradient.

    Bars are ordered alphabetically by label and annotated with their
    integer height. Nothing is drawn when ``items`` is empty. The figure is
    saved and closed when ``save_path`` is given, otherwise shown.
    """
    if not items:
        return

    ordered = sorted(items, key=lambda pair: pair[0])
    labels = tuple(label for label, _ in ordered)
    counts = tuple(count for _, count in ordered)
    shades = plt.cm.Blues(np.linspace(0.9, 0.4, len(labels)))

    plt.figure(figsize=figsize)
    for bar in plt.bar(labels, counts, color=shades):
        value = bar.get_height()
        # Centre the count slightly above the top of the bar.
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            value + 0.1,
            f'{int(value)}',
            ha='center',
            va='bottom',
        )

    plt.title(title, fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, max(counts) * 1.15)  # headroom for the value labels
    plt.tight_layout()

    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, dpi=300)
    plt.close()

def plot_heatmap(matrix, labels, title, figsize=(16, 8), save_path=None):
    """Render ``matrix`` as an annotated blue heatmap, without a colorbar.

    Positive off-diagonal cells are labelled with their integer value. The
    figure is saved and closed when ``save_path`` is given, otherwise shown.
    """
    plt.figure(figsize=figsize)
    peak = int(np.max(matrix))

    # Blues colormap restricted to its 0-0.9 range.
    palette = mcolors.ListedColormap(plt.cm.Blues(np.linspace(0, 0.9, 256)))
    plt.imshow(matrix, cmap=palette, interpolation='nearest')

    positions = range(len(labels))
    plt.xticks(positions, labels, rotation=45, ha='right', fontsize=10)
    plt.yticks(positions, labels, fontsize=10)
    plt.title(title, fontsize=14)

    for row in positions:
        for col in positions:
            cell = matrix[row, col]
            if row == col or not cell > 0:
                continue  # diagonal and empty cells stay unlabelled
            # White text on cells above 60% of the maximum, black otherwise.
            shade = 'white' if cell > peak * 0.6 else 'black'
            plt.text(col, row, f'{int(cell)}',
                     ha='center', va='center', fontsize=9,
                     color=shade)

    plt.tight_layout()

    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, dpi=300)
    plt.close()

def print_stats(papers, methods, metrics):
    """Print corpus totals followed by the ten most common methods and metrics."""
    print("=== PAPER ANALYSIS ===")
    print(f"Total papers: {len(papers)}")
    print(f"Unique methods: {len(methods)}")
    print(f"Unique metrics: {len(metrics)}")

    rankings = (
        ("\nTop 10 Methods:", methods),
        ("\nTop 10 Metrics:", metrics),
    )
    for heading, counter in rankings:
        print(heading)
        for name, count in counter.most_common(10):
            print(f"  {name}: {count}")

def analyze_papers():
    """Print summary statistics for ``cleared_papers.yml`` and save all charts under ``plots/``."""
    papers = load_papers("cleared_papers.yml")
    methods, metrics, method_cats, metric_cats = count_methods_and_metrics(papers)

    print_stats(papers, methods, metrics)

    print("\nCreating visualizations...")

    # Bar charts: (data, title, output file).
    bar_charts = (
        (methods.most_common(10), "Common Methods", "plots/common_methods.png"),
        (metrics.most_common(10), "Common Metrics", "plots/common_metrics.png"),
        (list(metric_cats.items()), "Metric Categories", "plots/metric_categories.png"),
    )
    for items, title, target in bar_charts:
        plot_bar_chart(items, title, save_path=target)

    # Category co-occurrence counts for both item types.
    method_cooc = get_cooccurrences(papers, "methods")
    metric_cooc = get_cooccurrences(papers, "metrics")

    # Heatmaps are drawn only when at least one category pair exists.
    heatmaps = (
        (method_cooc, method_cats, "Method Category Co-occurrence",
         "plots/method_cooccurrence.png"),
        (metric_cooc, metric_cats, "Metric Category Co-occurrence",
         "plots/metric_cooccurrence.png"),
    )
    for pairs, category_counts, title, target in heatmaps:
        if not pairs:
            continue
        matrix, labels = create_cooccurrence_matrix(pairs, category_counts)
        plot_heatmap(matrix, labels, title, save_path=target)

    print("\nAnalysis complete!")

if __name__ == "__main__":
    analyze_papers()

=== PAPER ANALYSIS ===
Total papers: 22
Unique methods: 45
Unique metrics: 124

Top 10 Methods:
  Entropy weight method: 12
  Coupling coordination model: 6
  Kernel density estimation: 3
  Obstacle degree model: 2
  Exploratory Spatial Data Analysis (ESDA): 2
  Principal Component Analysis (PCA): 2
  Combined weight method: 2
  TOPSIS method: 2
  Urban resilience evaluation model: 1
  Urban resilience capacity index: 1

Top 10 Metrics:
  GDP per capita: 22
  Proportion of the tertiary industry added value in GDP: 12
  Fixed asset investment: 10
  Urban per capita disposable income of residents: 8
  Per capita deposits in savings: 8
  Actual amount of foreign capital used: 7
  Per capita retail sales of consumer goods: 7
  Financial revenue: 6
  Dependence on foreign trade: 6
  Education expenditure: 5

Creating visualizations...

Analysis complete!


In [9]:
import os
import yaml
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd
from scipy.stats import gaussian_kde

def load_papers(file_path):
    """Load papers from YAML file.

    Args:
        file_path: Path to a YAML document with a top-level "papers" key.

    Returns:
        The value stored under "papers".

    Raises:
        KeyError: If the document has no "papers" key.
    """
    # The file is produced as UTF-8 by the clustering step; without an
    # explicit encoding open() uses the locale default, which can mis-decode
    # or reject non-ASCII characters.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data["papers"]

def get_years_data(papers):
    """Extract all years from papers' temporal scopes.

    Papers without a "temporal_scope" entry (missing or empty) contribute
    no years instead of raising.

    Returns:
        numpy.ndarray: All years concatenated in paper order.
    """
    years = []
    for paper in papers:
        # .get + "or []" covers both a missing key and an empty YAML value.
        years.extend(paper.get("temporal_scope") or [])
    return np.array(years)

def create_timeline_plot(years, save_path=None):
    """Create gradient violin plot showing timeline distribution.

    A Gaussian KDE of ``years`` is drawn mirrored around a horizontal centre
    line, with each segment shaded by its relative density.

    Args:
        years: 1-D sequence or array of years.
        save_path: If given, the figure is saved there at 300 dpi and closed;
            otherwise it is shown.
    """
    years = np.asarray(years)  # accept plain lists as well as arrays

    plt.figure(figsize=(16, 6))
    ax = plt.gca()

    # Calculate density
    kde = gaussian_kde(years)
    min_year = years.min()
    max_year = years.max()
    x_points = np.linspace(min_year, max_year, 300)
    density = kde(x_points)

    # Scale density so the widest point reaches +/- max_width
    max_width = 0.4
    scaled_density = density / density.max() * max_width

    # Create custom blues colormap (0.4-0.9 slice of Blues)
    blues_slice = mcolors.ListedColormap(plt.cm.Blues(np.linspace(0.4, 0.9, 256)))

    # Draw gradient-filled violin: one quadrilateral per pair of adjacent
    # sample points, mirrored around y = 0
    for i in range(len(x_points) - 1):
        x_coords = [x_points[i], x_points[i+1], x_points[i+1], x_points[i]]
        y_coords = [-scaled_density[i], -scaled_density[i+1],
                    scaled_density[i+1], scaled_density[i]]

        # Colour by the segment's mean density relative to the peak
        color_value = (density[i] + density[i+1]) / (2 * density.max())
        color = blues_slice(color_value)

        ax.fill(x_coords, y_coords, color=color, edgecolor=color)

    # Add center line
    ax.plot([min_year, max_year], [0, 0], color='black', linewidth=1)

    # Format plot
    ax.set_title("Timeline of Papers' Temporal Coverage", fontsize=16)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Density", fontsize=12)
    ax.grid(axis="x", linestyle="--", alpha=0.7)
    ax.set_xticks(np.arange(min_year, max_year + 1, 1))

    plt.tight_layout()

    if save_path:
        # dirname is '' for a bare filename and os.makedirs('') raises, so
        # only create the directory when there is one.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, dpi=300)
        plt.close()
    else:
        plt.show()

def main():
    """Plot the temporal coverage of the cleared papers to ``plots/timeline.png``."""
    all_years = get_years_data(load_papers("cleared_papers.yml"))
    create_timeline_plot(all_years, save_path="plots/timeline.png")

if __name__ == "__main__":
    main()

In [10]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import numpy as np
import textwrap

def load_papers(file_path):
    """Load papers from YAML file.

    Args:
        file_path: Path to a YAML document that may contain a "papers" key.

    Returns:
        The value stored under "papers", or an empty list if the key is absent.
    """
    # Read as UTF-8 to match how the clustering step writes the file; the
    # locale default used by a bare open() is not guaranteed to be UTF-8.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data.get("papers", [])

def get_metrics_data(papers):
    """Extract metrics and years from papers.

    Papers without a truthy "year" are skipped.

    Returns:
        pandas.DataFrame: One row per (metric item, paper) with columns
        "Metric" and "Year". The columns are present even when no rows are
        produced, so downstream column access does not fail on empty input.
    """
    records = []
    for paper in papers:
        year = paper.get("year")
        if not year:
            continue
        # "or []" treats empty YAML values (None) like missing keys.
        for metric in paper.get("metrics") or []:
            for item in metric.get("items") or []:
                records.append({"Metric": item, "Year": year})
    return pd.DataFrame(records, columns=["Metric", "Year"])

def prepare_data(df, top_n=10):
    """Count the ``top_n`` most frequent metrics per year, with yearly shares.

    Returns:
        DataFrame with columns Metric, Year, Count, Year_Total and
        Percentage, where Percentage is Count relative to all metric rows of
        that year (not only the top metrics), rounded to one decimal.
    """
    # Metrics ranked by overall frequency; keep the leading top_n.
    leading = df["Metric"].value_counts().head(top_n).index
    subset = df[df["Metric"].isin(leading)]

    per_metric_year = (
        subset.groupby(["Metric", "Year"])
        .size()
        .reset_index(name="Count")
    )
    per_year = df.groupby("Year").size().reset_index(name="Year_Total")

    merged = per_metric_year.merge(per_year, on="Year")
    share = merged["Count"] / merged["Year_Total"] * 100
    merged["Percentage"] = share.round(1)

    return merged

def create_plot(data, save_path=None):
    """Create scatter plot with seaborn, optionally save to file.

    Metrics are on the x-axis and years on the y-axis; marker size and
    colour both encode the "Percentage" column.

    Args:
        data: DataFrame with "Metric", "Year" and "Percentage" columns.
        save_path: If given, the figure is saved there at 300 dpi and closed;
            otherwise it is shown.
    """
    fig, ax = plt.subplots(figsize=(16, 7))
    blues_slice = mcolors.ListedColormap(plt.cm.Blues(np.linspace(0.4, 0.9, 256)))

    sns.scatterplot(
        data=data,
        x="Metric",
        y="Year",
        size="Percentage",
        sizes=(100, 2500),
        hue="Percentage",
        palette=blues_slice,
        alpha=0.8,
        edgecolor="white",
        linewidth=0.5,
        ax=ax
    )

    # Wrap long labels. Fix the tick positions first: set_xticklabels on an
    # axis whose ticks are not fixed emits matplotlib's "set_ticklabels()
    # should only be used with a fixed number of ticks" warning. Categories
    # sit at 0..n-1 in order of first appearance, matching unique().
    tick_labels = ["\n".join(textwrap.wrap(label, 15)) for label in data["Metric"].unique()]
    ax.set_xticks(range(len(tick_labels)))
    ax.set_xticklabels(tick_labels, rotation=45, ha="right")

    # Move legend to the right side
    legend_handles, legend_labels = ax.get_legend_handles_labels()
    ax.legend(
        legend_handles, legend_labels,
        title="Frequency (%)",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        frameon=False,
        borderpad=2,
        labelspacing=4,
        handletextpad=2,
        borderaxespad=0.5
    )

    # Format plot
    ax.set_title("Distribution of Common Metrics Over Time (Normalized by Year)")
    ax.set_xlabel("Metrics")
    ax.set_ylabel("Year")
    ax.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        # dirname is '' for a bare filename and os.makedirs('') raises.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, dpi=300)
        plt.close()
    else:
        plt.show()

def main():
    """Plot the yearly share of the most common metrics to ``plots/metrics_over_time.png``."""
    metric_rows = get_metrics_data(load_papers("cleared_papers.yml"))
    create_plot(prepare_data(metric_rows), save_path="plots/metrics_over_time.png")

if __name__ == "__main__":
    main()

  ax.set_xticklabels(labels, rotation=45, ha="right")


In [11]:
import os
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
from community import community_louvain
import yaml
import numpy as np
import matplotlib.colors as mcolors
from matplotlib.ticker import FormatStrFormatter

def load_papers(filename):
    """Load papers from a YAML file.

    Args:
        filename: Path to a YAML document with a top-level "papers" key.

    Returns:
        The value stored under "papers".

    Raises:
        KeyError: If the document has no "papers" key.
    """
    # Explicit UTF-8 matches the encoding the clustering step writes with;
    # the implicit locale default may not be UTF-8.
    with open(filename, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data["papers"]

def create_bipartite_graph(papers):
    """Build an undirected graph linking each paper id to every method it lists."""
    graph = nx.Graph()
    for paper in papers:
        paper_id = paper["id"]
        for group in paper.get("methods", []):
            for method in group["items"]:
                graph.add_edge(paper_id, method)
    return graph

def project_graph(B):
    """Project ``B`` onto its integer nodes (the paper ids); papers sharing a neighbour become linked."""
    papers_only = list(filter(lambda node: isinstance(node, int), B.nodes))
    return nx.bipartite.projected_graph(B, papers_only)

def visualize_graph(papers_graph, save_path=None):
    """
    Visualize the paper graph with nodes colored and sized by degree.

    A Louvain partition is computed as well but is not used for the drawing.
    If save_path is provided, save the figure at 300 dpi and close it.
    Otherwise, show interactively. An empty graph only prints a notice.

    Args:
        papers_graph: Graph whose nodes are papers.
        save_path: Optional output file path.
    """
    if papers_graph.number_of_nodes() == 0:
        print("Graph is empty; nothing to plot.")
        return

    # Community partition (unused for color here, but kept for potential extensions)
    _communities = community_louvain.best_partition(papers_graph)

    # Layout and node metrics (fixed seed keeps the layout reproducible)
    pos = nx.spring_layout(papers_graph, seed=42, k=5, iterations=300)
    degrees = [papers_graph.degree(node) for node in papers_graph.nodes]

    # Color map by degree
    norm = Normalize(vmin=min(degrees), vmax=max(degrees))
    blues_slice = mcolors.ListedColormap(plt.cm.Blues(np.linspace(0.4, 0.9, 256)))
    node_colors = [blues_slice(norm(d)) for d in degrees]
    node_sizes = [2000 + 100 * d for d in degrees]

    # Draw
    plt.figure(figsize=(16, 8))
    nx.draw_networkx_edges(papers_graph, pos, alpha=0.3, edge_color="gray")
    nx.draw_networkx_nodes(papers_graph, pos, node_size=node_sizes, node_color=node_colors, alpha=0.9)

    labels = {n: f"Paper {n}" if isinstance(n, int) else n for n in papers_graph.nodes}
    nx.draw_networkx_labels(papers_graph, pos, labels, font_size=8, font_color="white")

    # Colorbar for degree
    fig = plt.gcf()
    sm = ScalarMappable(cmap=blues_slice, norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=plt.gca(), orientation="vertical")
    cbar.set_label("Node Degree")
    cbar.ax.yaxis.set_major_formatter(FormatStrFormatter('%d'))

    plt.title("Network of Papers Based on Shared Methods", fontsize=16)
    plt.axis("off")
    plt.tight_layout()

    if save_path:
        # dirname is '' for a bare filename and os.makedirs('') raises, so
        # only create the directory when the path actually has one.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, dpi=300)
        plt.close()
    else:
        plt.show()

def main():
    """Build the shared-method paper network and save it to ``plots/paper_network.png``."""
    paper_method_graph = create_bipartite_graph(load_papers("cleared_papers.yml"))
    visualize_graph(project_graph(paper_method_graph), save_path="plots/paper_network.png")

if __name__ == "__main__":
    main()

In [12]:
import os
import re
import yaml
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import Normalize

def load_papers(file_path):
    """Load papers from YAML file.

    Args:
        file_path: Path to a YAML document that may contain a "papers" key.

    Returns:
        The value stored under "papers", or an empty list if the key is absent.
    """
    # Decode as UTF-8, the encoding the clustering step saves with, instead
    # of relying on the platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data.get("papers", [])

def extract_categories(papers):
    """Return the sorted list of distinct metric categories across all papers."""
    return sorted({
        group["category"]
        for paper in papers
        for group in paper.get("metrics", [])
    })

def create_paper_network(papers, category):
    """Link papers that share at least one metric item within ``category``.

    Returns:
        The paper-only projection of the paper/item graph, or ``None`` when
        no paper has items in ``category``.
    """
    bipartite_graph = nx.Graph()
    for paper in papers:
        paper_id = paper["id"]
        for group in paper.get("metrics", []):
            if group["category"] != category:
                continue
            bipartite_graph.add_edges_from((paper_id, item) for item in group["items"])

    if bipartite_graph.number_of_edges() == 0:
        return None

    # Integer nodes are paper ids; the remaining nodes are metric items.
    paper_ids = [node for node in bipartite_graph.nodes if isinstance(node, int)]
    return nx.bipartite.projected_graph(bipartite_graph, paper_ids)

def calculate_smart_ticks(max_value, max_ticks=8):
    """Return colorbar tick values from 0 to ``max_value``.

    Every integer is used while ``max_value <= max_ticks``; above that the
    step grows with the range, and ``max_value`` is always the last tick.
    """
    if max_value <= max_ticks:
        return list(range(max_value + 1))

    # (inclusive upper bound, step) thresholds; larger ranges fall through
    # to a step derived from max_ticks.
    for upper_bound, candidate in ((16, 2), (30, 5), (50, 10)):
        if max_value <= upper_bound:
            step = candidate
            break
    else:
        step = max(1, max_value // max_ticks)

    ticks = list(range(0, max_value + 1, step))
    if ticks[-1] != max_value:
        ticks.append(max_value)
    return ticks

def setup_plot_elements(graph):
    """Compute layout, degree-based colors and node sizes for drawing ``graph``.

    Returns:
        dict with keys "pos", "degrees", "max_degree", "norm", "colormap",
        "node_colors" and "node_sizes".
    """
    layout = nx.spring_layout(graph, seed=42, k=5, iterations=300)
    degrees = [graph.degree(node) for node in graph.nodes]
    max_degree = max(degrees, default=0)

    # The color scale starts at 0; an all-zero graph gets a 0-1 range.
    upper = max_degree if max_degree != 0 else 1
    norm = Normalize(vmin=0, vmax=upper)
    blues_slice = mcolors.ListedColormap(plt.cm.Blues(np.linspace(0.4, 0.9, 256)))

    node_colors = [blues_slice(norm(degree)) for degree in degrees]
    node_sizes = [2000 + 100 * degree for degree in degrees]

    return {
        "pos": layout,
        "degrees": degrees,
        "max_degree": max_degree,
        "norm": norm,
        "colormap": blues_slice,
        "node_colors": node_colors,
        "node_sizes": node_sizes,
    }

def _slugify(text):
    """Turn ``text`` into a lowercase filename fragment.

    Whitespace runs become underscores; characters other than word
    characters, '-', '_' and '.' are dropped. An empty result becomes
    "category".
    """
    underscored = re.sub(r"\s+", "_", text.strip())
    cleaned = re.sub(r"[^\w\-_.]", "", underscored).lower()
    return cleaned if cleaned else "category"

def create_network_plot(graph, category, save_dir="plots", show=False):
    """Draw the paper network of one category and save it as a PNG in ``save_dir``.

    Nodes are colored and sized by degree. A missing or empty graph only
    prints a notice. The figure is shown after saving when ``show`` is true,
    otherwise closed.
    """
    if graph is None or graph.number_of_nodes() == 0:
        print(f"No network found for category: {category}")
        return

    style = setup_plot_elements(graph)
    layout = style["pos"]
    fig, ax = plt.subplots(figsize=(10, 8))

    # Edges first, then nodes, then labels on top.
    nx.draw_networkx_edges(graph, layout, alpha=0.3, edge_color="gray", ax=ax)
    nx.draw_networkx_nodes(
        graph,
        layout,
        node_size=style["node_sizes"],
        node_color=style["node_colors"],
        alpha=0.9,
        ax=ax,
    )
    captions = {node: f"Paper {node}" for node in graph.nodes}
    nx.draw_networkx_labels(
        graph, layout, captions, font_size=8, font_color="white", ax=ax
    )

    # Degree colorbar with integer ticks.
    mappable = plt.cm.ScalarMappable(cmap=style["colormap"], norm=style["norm"])
    mappable.set_array([])
    cbar = fig.colorbar(mappable, ax=ax)
    cbar.set_label("Node Degree")

    if style["max_degree"] == 0:
        tick_locations = [0, 1]
    else:
        tick_locations = calculate_smart_ticks(style["max_degree"])
    cbar.set_ticks(tick_locations)
    cbar.set_ticklabels([str(int(tick)) for tick in tick_locations])

    ax.set_title(f"Paper Network: {category}", fontsize=14)
    ax.axis("off")
    plt.tight_layout()

    # Write the figure, creating the target directory if needed.
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, f"paper_network_{_slugify(category)}.png")
    plt.savefig(out_path, dpi=300)
    print(f"Saved: {out_path}")

    if show:
        plt.show()
    else:
        plt.close()

    print(f"  - {len(graph.nodes)} papers, {len(graph.edges)} connections")

def analyze_networks(show=False):
    """Create and save one paper network per metric category in ``cleared_papers.yml``."""
    papers = load_papers("cleared_papers.yml")
    categories = extract_categories(papers)
    print(f"Found {len(categories)} metric categories")

    for name in categories:
        print(f"Creating network for: {name}")
        network = create_paper_network(papers, name)
        create_network_plot(network, name, save_dir="plots", show=show)

def main():
    """Run the per-category network analysis without displaying figures."""
    # Pass show=True to also display each figure after saving.
    analyze_networks(show=False)

if __name__ == "__main__":
    main()

Found 10 metric categories
Creating network for: Economic & Investment
Saved: plots/paper_network_economic__investment.png
  - 22 papers, 210 connections
Creating network for: Environment & Resources
Saved: plots/paper_network_environment__resources.png
  - 4 papers, 1 connections
Creating network for: Finance & Revenue
Saved: plots/paper_network_finance__revenue.png
  - 20 papers, 42 connections
Creating network for: Industrial & Structural
Saved: plots/paper_network_industrial__structural.png
  - 20 papers, 72 connections
Creating network for: Miscellaneous & Others
Saved: plots/paper_network_miscellaneous__others.png
  - 7 papers, 0 connections
Creating network for: Population & Employment
Saved: plots/paper_network_population__employment.png
  - 10 papers, 4 connections
Creating network for: Retail & Consumption
Saved: plots/paper_network_retail__consumption.png
  - 11 papers, 22 connections
Creating network for: Social & Living Standards
Saved: plots/paper_network_social__living_s

In [7]:
import yaml
import pandas as pd

def load_yaml(filename):
    """Load the YAML file and return the list of papers.

    Args:
        filename: Path to a YAML document that may contain a "papers" key.

    Returns:
        The value stored under "papers", or an empty list if the key is absent.
    """
    # Read as UTF-8 (the encoding the clustering step writes) so author and
    # item names with non-ASCII characters decode the same on every platform.
    with open(filename, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    return data.get("papers", [])

def extract_method_categories(methods):
    """
    Join the categories of the given method groups with "; ".
    Groups without a "category" key are skipped.
    """
    names = []
    for entry in methods:
        if "category" in entry:
            names.append(entry["category"])
    return "; ".join(names)

def extract_metric_categories(metrics):
    """
    Join the categories of the given metric groups with "; ".
    Groups without a "category" key are skipped.
    """
    return "; ".join(entry["category"] for entry in metrics if "category" in entry)

def extract_timespan(temporal_scope):
    """
    Format a list of years as "first - last".
    A single year is returned on its own; an empty or missing scope gives "N/A".
    """
    if not temporal_scope:
        return "N/A"
    if len(temporal_scope) < 2:
        # Only one year is present, so show that year.
        return str(temporal_scope[0])
    first, last = temporal_scope[0], temporal_scope[-1]
    return f"{first} - {last}"

def build_review_table(papers):
    """
    Build a pandas DataFrame suited for a systematic literature review.
    Columns: ID, Author, Method Category, Metric Category, Timespan.
    """
    def to_row(paper):
        # One table row per paper; absent fields fall back to empty values.
        return {
            "ID": paper.get("id", ""),
            "Author": ", ".join(paper.get("authors", [])),
            "Method Category": extract_method_categories(paper.get("methods", [])),
            "Metric Category": extract_metric_categories(paper.get("metrics", [])),
            "Timespan": extract_timespan(paper.get("temporal_scope", [])),
        }

    return pd.DataFrame([to_row(paper) for paper in papers])

def main():
    """Print the literature-review table built from ``cleared_papers.yml``."""
    # Load the categorized papers and tabulate them.
    review_table = build_review_table(load_yaml("cleared_papers.yml"))

    # Render as plain text without the index column.
    print(review_table.to_string(index=False))

    # Optionally, export the table to a CSV file:
    # review_table.to_csv("review_table.csv", index=False)

if __name__ == "__main__":
    main()


 ID                                                                                             Author                                                                                          Method Category                                                                                                                                                                                                                               Metric Category    Timespan
  1                                                                    Na Chen, Hongxia Guo, Hui Xiang                                              Weighting Models; Theory-based Methods; Index-based Methods                                                                                                                   Social & Living Standards; Population & Employment; Economic & Investment; Industrial & Structural; Technology & Innovation 2010 - 2020
  2                                                                   Haiyue Fu, Nan