In [3]:
import csv
from operator import itemgetter
import networkx as nx
from networkx.algorithms import community

# Read data

In [10]:
# Load the crawl's node table. The first CSV row is a header; each following
# row describes one node, with the node name in the first column.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly on every platform.
with open('www.massgeneral.org - Nodes (crawl depth = 3).csv', 'r', newline='') as nodecsv:
    nodereader = csv.reader(nodecsv)
    next(nodereader, None)  # Skip the header row (no-op when the file is empty)
    # csv.reader yields [] for blank lines; drop them so n[0] below cannot raise IndexError
    nodes = [n for n in nodereader if n]

node_names = [n[0] for n in nodes]  # Keep only the node names (first column)

In [40]:
# Load the internal-link edge table as a list of tuples, one tuple per CSV row.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly on every platform.
with open('www.massgeneral.org - Edges (Internal, crawl depth = 3).csv', 'r', newline='') as edgecsv:
    edgereader = csv.reader(edgecsv)
    next(edgereader, None)  # Skip the header row up front instead of slicing it off afterwards
    # Blank lines come back as [] and would turn into empty tuples, which are not valid edges
    edges = [tuple(e) for e in edgereader if e]

# Build Network

In [43]:
# Build an undirected graph: register every node name first, then add the
# edges read from the edge CSV.
G = nx.Graph()
G.add_nodes_from(node_names)
G.add_edges_from(edges)

In [44]:
# nx.info() was deprecated in NetworkX 2.7 and removed in 3.0, so the same
# summary is assembled from the graph's own accessors instead.
n_nodes = G.number_of_nodes()
print("Name:", G.name)
print("Type:", type(G).__name__)
print("Number of nodes:", n_nodes)
print("Number of edges:", G.number_of_edges())
if n_nodes > 0:  # The average degree is undefined for an empty graph
    print("Average degree: %8.4f" % (sum(d for _, d in G.degree()) / n_nodes))

Name: 
Type: Graph
Number of nodes: 1301
Number of edges: 117605
Average degree: 180.7917


In [45]:
# Map every node to its degree and store that value on the node as 'degree'.
degree_dict = dict(G.degree())
nx.set_node_attributes(G, values=degree_dict, name='degree')

# Metrics

In [11]:
# Compute the density of G with networkx and report it.
density = nx.density(G)
print("Network density:", density)

Network density: 0.022642353941987308


In [12]:
# The graph's transitivity (as computed by networkx) is reported here under
# the label "triadic closure".
triadic_closure = nx.transitivity(G)
print("Triadic closure:", triadic_closure)

Triadic closure: 0.07445814081487942


In [13]:
# Compute two centrality measures for G. Both results are used further down
# as per-node mappings (attached as node attributes, and sorted via .items()).
betweenness_dict = nx.betweenness_centrality(G) 
eigenvector_dict = nx.eigenvector_centrality(G)

In [14]:
# Store both centrality measures on the nodes under the attribute names
# 'betweenness' and 'eigenvector'.
nx.set_node_attributes(G, betweenness_dict, 'betweenness')
nx.set_node_attributes(G, eigenvector_dict, 'eigenvector')

In [15]:
# Rank the (node, betweenness) pairs from highest to lowest betweenness.
sorted_betweenness = sorted(
    betweenness_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 20 nodes by betweenness centrality:")
for entry in sorted_betweenness[:20]:
    print(entry)

Top 20 nodes by betweenness centrality:
('https://www.stvincenthospital.com/privacy-policy', 0.03911058902347307)
('https://www.stvincenthospital.com/health-professionals', 0.03795252315543992)
('https://www.stvincenthospital.com/patients', 0.03795252315543992)
('https://www.stvincenthospital.com/careers', 0.03795252315543992)
('https://www.stvincenthospital.com/financial-resources', 0.03795252315543992)
('https://www.stvincenthospital.com/find-a-doctor', 0.03795252315543992)
('https://www.stvincenthospital.com/healthy-living', 0.03730195162568863)
('https://www.stvincenthospital.com/about', 0.03679378242384733)
('https://www.stvincenthospital.com/terms-of-use', 0.03679378242384733)
('https://www.stvincenthospital.com/health-assessments', 0.03679378242384733)
('https://www.stvincenthospital.com/portal', 0.03679378242384733)
('https://www.stvincenthospital.com/news', 0.03679378242384733)
('https://www.stvincenthospital.com/accessibility-statement', 0.03679378242384733)
('https://www.stv

In [16]:
# Keep the 20 highest-ranked (node, betweenness) pairs.
top_betweenness = sorted_betweenness[:20]

# Report each of those nodes together with its degree, looked up in degree_dict.
for node_name, score in top_betweenness:
    print("Name:", node_name, "| Betweenness Centrality:", score, "| Degree:", degree_dict[node_name])

Name: https://www.stvincenthospital.com/privacy-policy | Betweenness Centrality: 0.03911058902347307 | Degree: 1127
Name: https://www.stvincenthospital.com/health-professionals | Betweenness Centrality: 0.03795252315543992 | Degree: 1126
Name: https://www.stvincenthospital.com/patients | Betweenness Centrality: 0.03795252315543992 | Degree: 1126
Name: https://www.stvincenthospital.com/careers | Betweenness Centrality: 0.03795252315543992 | Degree: 1126
Name: https://www.stvincenthospital.com/financial-resources | Betweenness Centrality: 0.03795252315543992 | Degree: 1126
Name: https://www.stvincenthospital.com/find-a-doctor | Betweenness Centrality: 0.03795252315543992 | Degree: 1126
Name: https://www.stvincenthospital.com/healthy-living | Betweenness Centrality: 0.03730195162568863 | Degree: 1126
Name: https://www.stvincenthospital.com/about | Betweenness Centrality: 0.03679378242384733 | Degree: 1125
Name: https://www.stvincenthospital.com/terms-of-use | Betweenness Centrality: 0.036

# Community Detection

In [17]:
# Detect communities in G with networkx's greedy modularity routine. The
# result is iterated below as a sequence of node collections, one per community.
communities = community.greedy_modularity_communities(G)

In [18]:
# Map every node to the index of the community that contains it.
modularity_dict = {
    member: class_index
    for class_index, members in enumerate(communities)
    for member in members
}

# Store the community index on each node, alongside the other metrics.
nx.set_node_attributes(G, values=modularity_dict, name='modularity')

In [19]:
# Collect the nodes assigned to modularity class 0.
class0 = [n for n, attrs in G.nodes(data=True) if attrs['modularity'] == 0]

# Look up the eigenvector centrality stored on each of those nodes.
class0_eigenvector = {}
for n in class0:
    class0_eigenvector[n] = G.nodes[n]['eigenvector']

# Order the class members from most to least central.
class0_sorted_by_eigenvector = sorted(
    class0_eigenvector.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the five most central members.
print("Modularity Class 0 Sorted by Eigenvector Centrality:")
for node_name, centrality in class0_sorted_by_eigenvector[:5]:
    print("Name:", node_name, "| Eigenvector Centrality:", centrality)

Modularity Class 0 Sorted by Eigenvector Centrality:
Name: https://www.stvincenthospital.com/healthy-living | Eigenvector Centrality: 0.14385144512231968
Name: https://www.stvincenthospital.com/privacy-policy | Eigenvector Centrality: 0.14362078664748892
Name: https://www.stvincenthospital.com/careers | Eigenvector Centrality: 0.14361654502408516
Name: https://www.stvincenthospital.com/terms-of-use | Eigenvector Centrality: 0.14361230365121264
Name: https://www.stvincenthospital.com/health-assessments | Eigenvector Centrality: 0.14361230365121264


In [20]:
# List the members of every community that has more than two nodes.
for class_index, members in enumerate(communities):
    if len(members) <= 2:
        continue  # Skip modularity classes with 2 or fewer nodes
    print('Class ' + str(class_index) + ':', list(members))

Class 1: ['https://www.stvincenthospital.com/strike-news/latest-strike-news/newsroom/saint-vincent-hospital-focuses-on-continued-care-for-community-after-mna-rejects-offer', 'https://www.stvincenthospital.com/news/newsroom/saint-vincent-hospital-marks-another-day-of-quality-care', 'https://www.stvincenthospital.com/patients/what-to-expect', 'https://www.carecalendar.org/', 'https://www.cdc.gov/', 'https://www.stvincenthospital.com/healthy-living/corporate-content/holidays-better-safe-than-covid', 'https://svh.simpleepay.com/', 'https://www.stvincenthospital.com/strike-news/latest-strike-news/newsroom/a-letter-from-the-saint-vincent-hospital-governing-board', 'https://www.stvincenthospital.com/patients/interpretive-language-service', 'https://www.youtube.com/embed/1LGk1z0ssaM', 'https://www.stvincenthospital.com/services/physical-therapy/aquatic-therapy', 'https://www.stvincenthospital.com/health-professionals/med-safety-quality-informatics-fellowship', 'https://www.stvincenthospital.co

In [46]:
# Export the graph to a GEXF file in the current working directory.
nx.write_gexf(G, 'massgeneral_internal.gexf')