In [None]:
import os
import json
import pandas as pd
import re

In [None]:
input_dir = "../../all_pathway_collages"  # downloaded from the MetaCyc database
output_dir = "../../graphs"

# A node id is cut at the first "<7 digits>--" marker; the text before that
# marker is kept as the cleaned node name.
node_suffix_pattern = re.compile(r'\d{7}--')

# One processed edge DataFrame per collage file
all_edges_dfs = []

# Iterate in sorted order: os.listdir() returns entries in arbitrary order,
# which would make the row order of the merged CSV differ between runs/machines.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".json"):  # Process only JSON files
        continue
    file_path = os.path.join(input_dir, filename)

    # Read the JSON file with an explicit encoding instead of the platform default
    with open(file_path, 'r', encoding='utf-8') as handle:
        graph_data = json.load(handle)

    # Skip files that have no 'edges' entry or an empty edge list: an empty
    # DataFrame has no 'source'/'target' columns, so the steps below would fail.
    elements = graph_data.get("elements", {})
    if "edges" not in elements or not elements["edges"]:
        continue

    edges_df = pd.DataFrame([edge['data'] for edge in elements['edges']])

    # The pathway is the text after the first "--" of the source id; when the
    # source id has no "--" part, fall back to the one found in the target id.
    source_pathway = edges_df['source'].str.extract(r'--(.+)$', expand=False)
    target_pathway = edges_df['target'].str.extract(r'--(.+)$', expand=False)
    edges_df['pathway'] = source_pathway.fillna(target_pathway)

    # Node names without the "<7 digits>--..." tail
    edges_df['source_clean'] = edges_df['source'].apply(lambda x: node_suffix_pattern.split(x)[0])
    edges_df['target_clean'] = edges_df['target'].apply(lambda x: node_suffix_pattern.split(x)[0])

    # Keep the originating file name as an identifier
    edges_df['filename'] = filename

    all_edges_dfs.append(edges_df)

# Fail with an actionable message instead of pandas' "No objects to concatenate"
if not all_edges_dfs:
    raise ValueError(f"No edges found in any .json file under {input_dir!r}")

# Merge all per-file DataFrames into a single DataFrame
merged_edges_df = pd.concat(all_edges_dfs, ignore_index=True)

# Save the merged DataFrame to a CSV file, creating the output directory if needed
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "merged_edges_df.csv")
merged_edges_df.to_csv(output_file, index=False)

# Display summary
print(f"Processed {len(all_edges_dfs)} files.")
print(f"Output saved to: {output_file}")


In [None]:
# Load the pathway-counts CSV; its first column becomes the index and its
# first row supplies the column names.
healthy_path = '../../data/only_healthy_data'
pathways = pd.read_csv(
    f'{healthy_path}/batchfix_paths_healthy_samples_pathway_counts.csv',
    sep=',',
    index_col=0,
    header=0,
)


In [None]:
def rename_pathways_based_on_prefix(df, pathways_index):
    """
    Rename the 'pathway' column in the DataFrame based on matching prefixes
    with the given pathways index.

    A value of ``df['pathway']`` is replaced by the first entry of
    ``pathways_index`` whose text before the first ':' is equal to that value.
    Values without a match, and missing values, are left unchanged.

    Note: the 'pathway' column of ``df`` is overwritten in place; the same
    DataFrame object is returned for convenience.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'pathway' column.
    - pathways_index (Index or list): The index or list containing the original pathway names.
      Entries that are not strings (e.g. NaN labels) are ignored.

    Returns:
    - pd.DataFrame: Updated DataFrame with renamed pathways.
    """
    # Build the prefix -> full name lookup once instead of rescanning the whole
    # index for every row. setdefault keeps the FIRST name seen for a prefix,
    # which is the entry a linear scan over pathways_index would return.
    prefix_to_name = {}
    for name in pathways_index:
        if not isinstance(name, str):
            continue  # a non-string label has no prefix to split off
        prefix_to_name.setdefault(name.split(':')[0], name)

    def find_matching_name(pathway):
        if pd.isna(pathway):
            return pathway  # Skip if NaN
        return prefix_to_name.get(pathway, pathway)  # Full name, or original when unmatched

    # Apply the renaming function
    df['pathway'] = df['pathway'].apply(find_matching_name)
    return df

# Swap the short pathway ids for the full names found in the pathway-counts
# index, then show the resulting table as the cell output.
merged_edges_df = rename_pathways_based_on_prefix(
    df=merged_edges_df,
    pathways_index=pathways.index,
)
merged_edges_df

In [None]:
scAAnet_path = '../../data/scAAnet_output'
heatmap_file = f'{scAAnet_path}/full_sorted_top20_state53_heatmapdata.csv'
heatmap_df = pd.read_csv(heatmap_file, index_col=0)
unique_pathways = heatmap_df.index

# Index positions 0-19, 20-39 and 40-59 hold the top-20 pathways of
# type 1, type 2 and type 3 respectively.
t1_top20, t2_top20, t3_top20 = (
    unique_pathways[start:start + 20] for start in (0, 20, 40)
)

# For each type, keep (as an independent copy) only the edges whose 'pathway'
# is one of that type's top-20 pathways.
edges_df_t1, edges_df_t2, edges_df_t3 = (
    merged_edges_df.loc[merged_edges_df['pathway'].isin(top20)].copy()
    for top20 in (t1_top20, t2_top20, t3_top20)
)


In [None]:
import networkx as nx
from pyvis.network import Network
import matplotlib.colors as mcolors
from collections import defaultdict
import random


def generate_colors(n, seed=None):
    """
    Generate `n` random RGB colors.

    Each channel is drawn uniformly from [0.0, 1.0). The colors are random,
    so they are very likely but not guaranteed to be visually distinct.

    Parameters:
        n (int): Number of colors to generate.
        seed (int, optional): Seed for reproducibility. When given, the same
            seed always yields the same list and the module-level ``random``
            state is left untouched. Defaults to None, in which case the
            module-level generator is used.

    Returns:
        list: List of RGB tuples (three floats each).
    """
    # A private Random(seed) produces the same sequence as random.seed(seed)
    # followed by random.random(), but does not reseed the process-wide
    # generator behind the back of other code that relies on it.
    rng = random if seed is None else random.Random(seed)
    return [(rng.random(), rng.random(), rng.random()) for _ in range(n)]


def generate_network_graph(edges_df, labels_to_remove, output_html="network_graph.html"):
    """
    Generate a network graph from a DataFrame of edges and visualize it using PyVis.

    For every row, each remaining label of 'left' is connected to each
    remaining label of 'right'. Nodes are named "<label>_<pathway>", so the
    same label occurring in several pathways yields one node per pathway;
    those nodes are additionally linked by gray dashed "Duplicate node" edges.
    One fixed-position legend node per pathway is appended to the drawing.

    Parameters:
    - edges_df (pd.DataFrame): Input DataFrame with columns ['pathway', 'left', 'right'].
      Cells of 'left'/'right' are expected to be lists of dicts with a 'label'
      key; any non-list cell is treated as an empty list. Rows with a missing
      'pathway' are ignored. The input frame itself is not modified.
    - labels_to_remove (set): Set of labels to exclude from 'left' and 'right' columns.
    - output_html (str): Path to save the generated graph as an HTML file.

    Returns:
    - None: The function generates and saves the graph.
    """

    def strip_removed_labels(entries):
        # Non-list cells (e.g. NaN) carry no usable labels.
        if not isinstance(entries, list):
            return []
        return [d for d in entries if d['label'] not in labels_to_remove]

    # dropna/reset_index return a new frame, so the column assignments below
    # do not touch the caller's DataFrame.
    clean_edges_df = edges_df.dropna(subset=['pathway']).reset_index(drop=True)
    clean_edges_df['left'] = clean_edges_df['left'].apply(strip_removed_labels)
    clean_edges_df['right'] = clean_edges_df['right'].apply(strip_removed_labels)

    # Assign one color per pathway; the fixed seed keeps colors stable across runs.
    unique_pathways = clean_edges_df['pathway'].unique()
    palette = generate_colors(len(unique_pathways), seed=42)
    pathway_colors = {
        pathway: mcolors.rgb2hex(rgb) for pathway, rgb in zip(unique_pathways, palette)
    }

    # Build the graph pathway by pathway. Node names carry the pathway, so each
    # pathway forms its own group of nodes inside the single main graph.
    main_graph = nx.DiGraph()
    node_to_subgraph_nodes = defaultdict(list)  # original label -> namespaced node names

    for pathway in unique_pathways:
        pathway_color = pathway_colors[pathway]
        pathway_edges = clean_edges_df[clean_edges_df['pathway'] == pathway]

        for left_entries, right_entries in zip(pathway_edges['left'], pathway_edges['right']):
            left_labels = [d['label'] for d in left_entries]
            right_labels = [d['label'] for d in right_entries]

            # Right-hand nodes are only created together with an edge, i.e.
            # when the row still has at least one left-hand label.
            for left_label in left_labels:
                left_node_name = f"{left_label}_{pathway}"
                main_graph.add_node(left_node_name, color=pathway_color, title=left_label)
                node_to_subgraph_nodes[left_label].append(left_node_name)

                for right_label in right_labels:
                    right_node_name = f"{right_label}_{pathway}"
                    main_graph.add_node(right_node_name, color=pathway_color, title=right_label)
                    node_to_subgraph_nodes[right_label].append(right_node_name)
                    main_graph.add_edge(left_node_name, right_node_name, color=pathway_color)

    # Link every pair of nodes that stand for the same label in different pathways.
    for original_label, namespaced_nodes in node_to_subgraph_nodes.items():
        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would order the names by string hash, which changes between
        # interpreter runs and would flip the direction of these edges.
        unique_namespaced_nodes = list(dict.fromkeys(namespaced_nodes))
        for i, first_node in enumerate(unique_namespaced_nodes):
            for second_node in unique_namespaced_nodes[i + 1:]:
                main_graph.add_edge(
                    first_node,
                    second_node,
                    color="gray",
                    style="dashed",
                    title=f"Duplicate node: {original_label}"
                )

    # Visualize using PyVis
    net = Network(notebook=True)
    for node, data in main_graph.nodes(data=True):
        net.add_node(
            node,
            title=data.get('title', node),
            color=data.get('color', 'gray')
        )
    for source, target, data in main_graph.edges(data=True):
        net.add_edge(
            source,
            target,
            color=data.get('color', 'black'),
            title=data.get('title', ''),
            dashes=(data.get('style') == "dashed")
        )

    # Add legend for pathways: one node per pathway, stacked vertically at a
    # fixed x position and excluded from the physics simulation.
    legend_y_offset = 80
    legend_x_position = 5000
    for i, (pathway, color) in enumerate(pathway_colors.items()):
        net.add_node(
            f"legend_{i}",
            label=pathway,
            color=color,
            x=legend_x_position,
            y=i * legend_y_offset,
            physics=False,
            shape='dot',
            size=15
        )

    net.show_buttons(filter_=['physics'])
    net.show(output_html)
    print(f"Network graph saved to {output_html}")

In [None]:
# Labels excluded from both sides of every edge before the graphs are drawn.
labels_to_remove = {
    "H+", "phosphate", "ATP", "ADP", "H2O",
    "NADP+", "NADPH", "NADH", "NAD+", "CO2",
    "coenzyme A", "AMP", "dioxygen", "hydrogen carbonate", "diphosphate",
}

# One HTML graph per type, rendered in order: type 1, type 2, type 3.
graph_jobs = [
    (edges_df_t1, f"{output_dir}/type1_top20.html"),
    (edges_df_t2, f"{output_dir}/type2_top20.html"),
    (edges_df_t3, f"{output_dir}/type3_top20.html"),
]
for type_edges_df, html_path in graph_jobs:
    generate_network_graph(type_edges_df, labels_to_remove, output_html=html_path)