In [19]:
import networkx as nx
import matplotlib.pyplot as plt

def read_node_file(file_path):
    nodes = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            node_id = parts[0]
            node_name = parts[1]
            node_type = parts[2]
            nodes.append((node_id, node_name, node_type))
    return nodes

def read_edge_file(file_path):
    edges = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            source = parts[0]
            target = parts[1]
            metaedge = parts[2]
            edges.append((source, target, metaedge))
    return edges

# File paths for nodes and edges
node_file_path = 'hetionet-v1.0-nodes.tsv'
edge_file_path = 'edges.sif'

# Read node and edge data
nodes_data = read_node_file(node_file_path)
edges_data = read_edge_file(edge_file_path)

edges_data = edges_data[1:]
edges_data
nodes_data = nodes_data[1:]  
nodes_data


In [17]:
d = [x[0] for x in nodes_data]
v = [x[2] for x in nodes_data]

nodes_stripped = [(x, y) for x, y in zip(d, v)]

nodes_stripped

[('Anatomy::UBERON:0000002', 'Anatomy'),
 ('Anatomy::UBERON:0000004', 'Anatomy'),
 ('Anatomy::UBERON:0000006', 'Anatomy'),
 ('Anatomy::UBERON:0000007', 'Anatomy'),
 ('Anatomy::UBERON:0000010', 'Anatomy'),
 ('Anatomy::UBERON:0000011', 'Anatomy'),
 ('Anatomy::UBERON:0000013', 'Anatomy'),
 ('Anatomy::UBERON:0000020', 'Anatomy'),
 ('Anatomy::UBERON:0000026', 'Anatomy'),
 ('Anatomy::UBERON:0000029', 'Anatomy'),
 ('Anatomy::UBERON:0000033', 'Anatomy'),
 ('Anatomy::UBERON:0000038', 'Anatomy'),
 ('Anatomy::UBERON:0000042', 'Anatomy'),
 ('Anatomy::UBERON:0000043', 'Anatomy'),
 ('Anatomy::UBERON:0000045', 'Anatomy'),
 ('Anatomy::UBERON:0000053', 'Anatomy'),
 ('Anatomy::UBERON:0000054', 'Anatomy'),
 ('Anatomy::UBERON:0000056', 'Anatomy'),
 ('Anatomy::UBERON:0000057', 'Anatomy'),
 ('Anatomy::UBERON:0000165', 'Anatomy'),
 ('Anatomy::UBERON:0000178', 'Anatomy'),
 ('Anatomy::UBERON:0000211', 'Anatomy'),
 ('Anatomy::UBERON:0000473', 'Anatomy'),
 ('Anatomy::UBERON:0000474', 'Anatomy'),
 ('Anatomy::UBER

In [11]:
# Create a graph
G = nx.MultiDiGraph()

# Add nodes with types
for node_id, node_type in nodes_stripped:
    G.add_node(node_id, type=node_type)

# Add edges with types
for source, edge_type, target in edges_data:
    G.add_edge(source, target, type=edge_type)

# Count different types for nodes
node_types = {node_attrs['type'] for _, node_attrs in G.nodes(data=True)}

# Count different types for edges
edge_types = {edge_attrs['type'] for _, _, edge_attrs in G.edges(data=True)}

H = G.to_directed()

# Print counts
print("Different types for nodes:", len(node_types))
print("Different types for edges:", len(edge_types))

Different types for nodes: 11
Different types for edges: 24


In [12]:

# Count nodes associated with each type
node_counts = {}
for node, node_attrs in G.nodes(data=True):
    node_type = node_attrs['type']
    if node_type not in node_counts:
        node_counts[node_type] = 1
    else:
        node_counts[node_type] += 1

# Count edges associated with each type
edge_counts = {}
for _, _, edge_attrs in G.edges(data=True):
    edge_type = edge_attrs['type']
    if edge_type not in edge_counts:
        edge_counts[edge_type] = 1
    else:
        edge_counts[edge_type] += 1

# Print counts
#List the Total Number of Entities for Each Unique Kind
print("Nodes associated with each type:")
for node_type, count in node_counts.items():
    print(f"{node_type}: {count}")

print("\nEdges associated with each type:")
for edge_type, count in edge_counts.items():
    print(f"{edge_type}: {count}")

Nodes associated with each type:
Anatomy: 402
Biological Process: 11381
Cellular Component: 1391
Compound: 1552
Disease: 137
Gene: 20945
Molecular Function: 2884
Pathway: 1822
Pharmacologic Class: 345
Side Effect: 5734
Symptom: 438

Edges associated with each type:
AdG: 102240
AeG: 526407
AuG: 97848
CrC: 6486
CtD: 755
CbG: 11571
CuG: 18756
CcSE: 138944
CdG: 21102
CpD: 390
DdG: 7623
DpS: 3357
DlA: 3602
DrD: 543
DaG: 12623
DuG: 7731
GiG: 147164
GpCC: 73566
GpBP: 559504
GcG: 61690
GpMF: 97222
GpPW: 84372
Gr>G: 265672
PCiC: 1029


In [13]:
# H is our direct graph
from collections import defaultdict

# Initialize dictionaries to sum degrees and count nodes for each type
in_degrees = defaultdict(int)
out_degrees = defaultdict(int)
counts = defaultdict(int)

for node, attrs in H.nodes(data=True):
    node_type = attrs['type']
    counts[node_type] += 1
    in_degrees[node_type] += H.in_degree(node)
    out_degrees[node_type] += H.out_degree(node)

# Calculate and print average in/out-degrees
avg_in_degrees = {node_type: in_deg / counts[node_type] for node_type, in_deg in in_degrees.items()}
avg_out_degrees = {node_type: out_deg / counts[node_type] for node_type, out_deg in out_degrees.items()}

print("Average In-Degree for each node type:")
for node_type, avg_in in avg_in_degrees.items():
    print(f"{node_type}: {avg_in}")

print("\nAverage Out-Degree for each node type:")
for node_type, avg_out in avg_out_degrees.items():
    print(f"{node_type}: {avg_out}")

Average In-Degree for each node type:
Anatomy: 8.960199004975124
Biological Process: 49.16123363500571
Cellular Component: 52.887131560028756
Compound: 4.842139175257732
Disease: 12.321167883211679
Gene: 61.1328240630222
Molecular Function: 33.71081830790569
Pathway: 46.30735455543359
Pharmacologic Class: 0.0
Side Effect: 24.231600976630624
Symptom: 7.664383561643835

Average Out-Degree for each node type:
Anatomy: 1807.2014925373135
Biological Process: 0.0
Cellular Component: 0.0
Compound: 127.5798969072165
Disease: 258.97080291970804
Gene: 61.55120553831463
Molecular Function: 0.0
Pathway: 0.0
Pharmacologic Class: 2.982608695652174
Side Effect: 0.0
Symptom: 0.0


In [18]:
# Filter out the disease nodes
disease_nodes = [node for node, attrs in H.nodes(data=True) if attrs['type'] == 'Disease']

# Compute in-degrees and out-degrees for each disease node
disease_degrees = [(node, H.in_degree(node), H.out_degree(node)) for node in disease_nodes]

# Sort the list by in-degree and out-degree to find the most connected diseases
most_connected_by_in_degree = sorted(disease_degrees, key=lambda x: x[0], reverse=True)
most_connected_by_out_degree = sorted(disease_degrees, key=lambda x: x[1], reverse=True)

# Display the top 5 most connected diseases by in-degree, assuming the name is in position 1
print("Top 5 diseases by in-degree (most incoming connections):")
for disease, in_degree, _ in most_connected_by_in_degree[:5]:
    # Assuming the name is stored under index 1 in the node attributes
    disease_name = H.nodes[disease][0] if 0 in H.nodes[disease] else "Unknown"
    print(f"Disease: {disease_name}, In-Degree: {in_degree}")

# Display the top 5 most connected diseases by out-degree, with the same assumption
print("\nTop 5 diseases by out-degree (most outgoing connections):")
for disease, _, out_degree in most_connected_by_out_degree[:5]:
    disease_name = H.nodes[disease][0] if 0 in H.nodes[disease] else "Unknown"
    print(f"Disease: {disease_name}, Out-Degree: {out_degree}")


Top 5 diseases by in-degree (most incoming connections):
Disease: Unknown, In-Degree: 14
Disease: Unknown, In-Degree: 0
Disease: Unknown, In-Degree: 1
Disease: Unknown, In-Degree: 3
Disease: Unknown, In-Degree: 5

Top 5 diseases by out-degree (most outgoing connections):
Disease: Unknown, Out-Degree: 699
Disease: Unknown, Out-Degree: 1076
Disease: Unknown, Out-Degree: 1115
Disease: Unknown, Out-Degree: 277
Disease: Unknown, Out-Degree: 595


In [34]:
# Inspect the first few nodes to see how their attributes are stored
for node, attrs in list(H.nodes(data=True))[:5]:
    print(node, attrs)


Anatomy::UBERON:0000002 {'type': 'Anatomy'}
Anatomy::UBERON:0000004 {'type': 'Anatomy'}
Anatomy::UBERON:0000006 {'type': 'Anatomy'}
Anatomy::UBERON:0000007 {'type': 'Anatomy'}
Anatomy::UBERON:0000010 {'type': 'Anatomy'}
