Reference — NetworkX `connected_components` documentation: https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import py4cytoscape as p4c
import os

In [None]:
# --- Load data files ---
# Entity table; later cells read its 'node_id', 'category' and 'Beta' columns.
entity_df = pd.read_csv('processed_entities.csv')

# Relationship table; later cells read its 'source' and 'edge' columns.
relation_df = pd.read_csv('processed_relationships.csv')

# Entity weights; 'Phecode' is forced to text instead of letting pandas
# infer a numeric dtype for it.
entity_weight_df = pd.read_csv('entity_weight.csv', dtype={'Phecode': str})

In [None]:
# Build an undirected graph from the entity and relationship tables.
G = nx.Graph()

# Every entity becomes a node, even if no relationship mentions it.
G.add_nodes_from(entity_df['node_id'])

# Each relationship row contributes one edge between its 'source' value and
# its 'edge' value (the 'edge' column is used as the second endpoint).
G.add_edges_from(zip(relation_df['source'], relation_df['edge']))

In [None]:
# Pick the connected component with the most nodes and take an independent
# copy of the subgraph it induces.
largest_cc = max(nx.connected_components(G), key=len)
largest_cc_subgraph = G.subgraph(largest_cc).copy()

# Flatten that component's edges into a two-column ('from', 'to') table.
edges = [edge for edge in largest_cc_subgraph.edges()]
edges_df = pd.DataFrame(edges, columns=['from', 'to'])
print(f"Largest connected component edges: {edges_df.shape}")

In [None]:
# Keep only the edges whose two endpoints both appear in entity_df['node_id'].
entity_node = set(entity_df['node_id'])
from_is_entity = edges_df['from'].isin(entity_node)
to_is_entity = edges_df['to'].isin(entity_node)
filtered_relationships = edges_df[from_is_entity & to_is_entity]

# Report how many edges survived the filter next to the starting count.
print(f"Filtered relationships: {filtered_relationships.shape[0]}")
print(f"Original edges: {edges_df.shape[0]}")

In [None]:
# Persist the filtered edge list ('from', 'to' columns, no index column).
filtered_relationships.to_csv(
    'processed_lcc.csv',
    index=False,
)

In [None]:
# Count the entities that carry a (non-missing) Beta value.
print(f"Entities with beta values: {entity_df['Beta'].notna().sum()}")

# --- Add HS-gene relationships based on current knowledge ---
# Genes related to Hidradenitis (DOID:2280, DOID:2282):
# * HGNC:45689 (RNU7-155P)
# * HGNC:51646 (MYL6P5)
# * HGNC:42721 (LINC00393)
# * HGNC:23151 (FERMT3)

# The gene list is HS-related according to:
# https://www.ebi.ac.uk/gwas/publications/37494057
gene_list = ['HGNC:45689', 'HGNC:51646', 'HGNC:42721', 'HGNC:23151']

# Show which of these genes are present in the entity table.
print(entity_df[entity_df['node_id'].isin(gene_list)])

# Add new relationship: pair every HS disease ID with every gene in
# gene_list (all rows for the first disease ID, then all rows for the second).
hs_disease_ids = ['DOID:2280', 'DOID:2282']
add_relationship = {
    'from': [disease_id for disease_id in hs_disease_ids for _ in gene_list],
    'to': gene_list * len(hs_disease_ids),
}
add_relationship_df = pd.DataFrame(add_relationship)

# NOTE(review): lcc_df was not defined by any earlier cell. It is loaded here
# from 'processed_lcc.csv' because that file is written with the same
# 'from'/'to' columns — confirm this is the intended source.
lcc_df = pd.read_csv('processed_lcc.csv')

# Edge list of the LCC with the curated HS-gene pairs appended at the end.
lcc_update_df = pd.concat([lcc_df, add_relationship_df], ignore_index=True)

In [None]:
# --- Find HS related nodes in knowledge pool ---
# Collect the 'edge' value of every relationship whose 'source' is one of the
# two HS disease IDs.
# NOTE(review): relation_all_df is not defined in the cells shown here —
# confirm where it is loaded before running this cell.
hs_source_mask = relation_all_df['source'].isin(['DOID:2280', 'DOID:2282'])
related_nodes = relation_all_df.loc[hs_source_mask, 'edge']
print(f"Found {len(related_nodes)} HS-related nodes from relationship data")

# --- Find HS related nodes in largest connected component ---
# Count LCC rows that start at DOID:2280 and end at one of the related nodes.
# lcc_df is expected to hold the LCC edge list with 'from'/'to' columns.
# NOTE(review): only rows with 'DOID:2280' in the 'from' column are matched;
# rows that store it in 'to' are not counted — confirm this is intended.
starts_at_hs = lcc_df['from'] == 'DOID:2280'
ends_at_related = lcc_df['to'].isin(related_nodes)
hs_lcc_connections = lcc_df[ends_at_related & starts_at_hs]
print(f"Found {len(hs_lcc_connections)} HS-related connections in LCC")

# --- Create graph ---
# Map node_id -> Beta for the entities that have a Beta value; for a repeated
# node_id the last row wins.
beta_df = entity_df[entity_df['Beta'].notna()]
beta_list = dict(zip(beta_df['node_id'], beta_df['Beta']))
print(f"Beta list length: {len(beta_list)}")

In [None]:
# --- Build the propagation graph and its row-normalised adjacency matrix ---
G = nx.Graph()

# Add nodes and edges.
# Every entity becomes a node carrying its category ('node_type') and its
# Beta value ('weight'; NaN where the entity has no Beta).
G.add_nodes_from(
    (node_id, {'node_type': category, 'weight': beta})
    for node_id, category, beta in zip(
        entity_df['node_id'], entity_df['category'], entity_df['Beta']
    )
)
# Edges come from the LCC edge list only, so entities outside it stay isolated.
G.add_edges_from(zip(lcc_df['from'], lcc_df['to']))

# Create adjacency matrix (rows/columns follow the order of G.nodes).
adjacency = nx.adjacency_matrix(G)

# Row-normalise: W[i, j] = A[i, j] / degree(i). Nodes without any edge get an
# all-zero row instead of a 0/0 division, and the matrix stays sparse rather
# than being expanded to a dense n x n array.
degree = np.asarray(adjacency.sum(axis=1), dtype=float).ravel()
inv_degree = np.divide(1.0, degree, out=np.zeros_like(degree), where=degree > 0)
W = adjacency.multiply(inv_degree[:, np.newaxis]).tocsr()

In [None]:
# --- Network propagation ---
# Iterates y <- alpha * W @ y + (1 - alpha) * y_initial, starting from the
# Beta values, until successive iterates differ by less than `tol` (L2 norm)
# or `max_iter` iterations have run.

# Initialize node values: Beta where known, 0 elsewhere. The vector follows
# the node order of G.nodes, which is also the row order of W.
node_list = list(G.nodes)
y_initial = np.array(
    [beta_list.get(node, 0.0) for node in node_list], dtype=float
)

# Propagation parameters
alpha = 0.85      # share taken from the neighbours' values at each step
max_iter = 40     # upper bound on the number of iterations
tol = 1e-6        # convergence threshold on the L2 norm of the update
y = y_initial.copy()

# Iterative propagation
for i in range(max_iter):
    # Flatten the product so y stays 1-D even if W is a matrix-like type.
    neighbour_term = np.asarray(W.dot(y)).ravel()
    y_new = alpha * neighbour_term + (1 - alpha) * y_initial
    step = np.linalg.norm(y_new - y)
    # Always keep the newest iterate, including the one that met `tol`.
    y = y_new
    if step < tol:
        print(f"Converged after {i+1} iterations")
        break
else:
    # The loop ran out of iterations without meeting the tolerance.
    print(f"Did not converge within {max_iter} iterations")

# Final propagated values
y_propagated = y

# Adding propagated values back to nodes
for node, propagated_value in zip(node_list, y_propagated):
    G.nodes[node]['value'] = propagated_value

In [None]:
# --- GPSnet result processing ---
# Directory and file names of the tab-separated GPSnet outputs to load.
path = '/GPSnet_result/yesSmooth'
files = ['run1004_0.001.txt', 'run1004_0.005.txt']

# Each file is read into a DataFrame and published as a notebook-level
# variable whose name is derived from the file name
# (e.g. 'run1004_0.001.txt' -> df_yes_0_001). The names are collected in
# file_names so the frames can be looked up again through globals().
file_names = []
for result_file in files:
    frame_name = result_file.replace('run1004_', 'df_yes_')
    frame_name = frame_name.replace('.txt', '')
    frame_name = frame_name.replace('0.', '0_')
    globals()[frame_name] = pd.read_csv(
        os.path.join(path, result_file), delimiter="\t"
    )
    print(f"Loaded {result_file} into {frame_name}")
    file_names.append(frame_name)

# Summarise the shape of every loaded frame.
for frame_name in file_names:
    print(frame_name, globals()[frame_name].shape)

# Add HS node to network: append one row per HS disease ID to the 'gene'
# column of every loaded GPSnet frame, renumbering the index.
# NOTE: re-running this cell appends the two rows again.
HS_node = [{'gene': 'DOID:2280'}, {'gene': 'DOID:2282'}]
hs_rows = pd.DataFrame(HS_node)
for file_name in file_names:
    # pd.concat is the public replacement for the private DataFrame._append.
    globals()[file_name] = pd.concat(
        [globals()[file_name], hs_rows], ignore_index=True
    )

# Restrict the export to a single GPSnet result frame.
file_names = ['df_yes_0_001']

# --- Connect to Cytoscape ---
# Check that a Cytoscape instance is reachable before exporting.
p4c.cytoscape_ping()

for file_name in file_names:
    # Name of the GraphML file written for this result frame.
    graphml_filename = f"{file_name}_0411_all_cohort_no_update_llm.graphml"

    # Subgraph of G induced by the identifiers in the frame's 'gene' column.
    selected_genes = globals()[file_name]['gene']
    subgraph = G.subgraph(selected_genes)

    # Write the subgraph to disk.
    # NOTE(review): this cell only writes the file; nothing here imports it
    # into Cytoscape — confirm whether an import step is expected.
    nx.write_graphml(subgraph, graphml_filename)

    print(f"Processed {file_name} and saved as {graphml_filename} into Cytoscape.")