## Knowledge Graphs

In [60]:
import numpy as np
import pandas as pd

import networkx as nx
import pyvis
from pyvis.network import Network
import matplotlib.pyplot as plt

from json import dumps

In [61]:
# Result tables produced by the regular and the elderly full-text searches.
regular_results_csv = '/Users/David/Desktop/McGill Project/Datasets/Comorbidity/regular_full_text_search/results_df.csv'
elderly_results_csv = '/Users/David/Desktop/McGill Project/Datasets/Comorbidity/elderly_full_text_search/results_df.csv'

# Load each table into its own frame (_r = regular search, _e = elderly search).
df_r = pd.read_csv(regular_results_csv)
df_e = pd.read_csv(elderly_results_csv)

Replace NaN values with 'N/A' and convert the '!'-delimited comorbidity strings back into tuples

In [62]:
def _as_comorbidity_tuple(value):
    """Convert a '!'-delimited comorbidity string into a tuple of its parts.

    Args:
        value: Either the '!'-joined string form read from the CSV, or a tuple
            that has already been converted.

    Returns:
        A tuple of the '!'-separated parts. A value that is already a tuple is
        returned unchanged, so re-running this cell on the same kernel does not
        raise AttributeError (tuples have no ``split``).
    """
    if isinstance(value, tuple):
        return value
    return tuple(value.split("!"))


# Replace every missing value (in all columns) with the 'N/A' placeholder, then
# rebuild the comorbidity tuples. Note that a missing comorbidity becomes the
# one-element tuple ('N/A',) because 'N/A' contains no '!' separator.
df_r = df_r.fillna('N/A')
df_r['comorbidity'] = df_r['comorbidity'].apply(_as_comorbidity_tuple)

df_e = df_e.fillna('N/A')
df_e['comorbidity'] = df_e['comorbidity'].apply(_as_comorbidity_tuple)

### Comorbidity counts by (unique) article

In [63]:
# comorbid_counts = df.groupby(by=['pmc_id', 'comorbidity']).size().reset_index().groupby(by=['comorbidity']).size().sort_values(ascending=False).to_frame().rename(columns={0:'num_articles'})

In [64]:
def _count_articles_per_comorbidity(results):
    """Count, for each comorbidity value, the number of distinct articles containing it.

    Args:
        results: Frame with (at least) the columns 'pmc_id' and 'comorbidity'.

    Returns:
        A one-column frame ('num_articles') indexed by 'comorbidity' and sorted
        from the most to the least frequent.
    """
    # Collapse repeated mentions within one article to a single (pmc_id, comorbidity) row.
    per_article = results.groupby(by=['pmc_id', 'comorbidity']).size().reset_index()
    # One row per article remains, so the group size is the number of articles.
    articles_per_pair = per_article.groupby(by=['comorbidity']).size()
    ranked = articles_per_pair.sort_values(ascending=False)
    # The unnamed size Series becomes column 0 in to_frame(); give it a real name.
    return ranked.to_frame().rename(columns={0: 'num_articles'})


comorbid_counts_r = _count_articles_per_comorbidity(df_r)
comorbid_counts_e = _count_articles_per_comorbidity(df_e)

In [65]:
# # add distinct columns for the comorbid diseases 
# comorbid_counts['comorbidity_1'] = comorbid_counts.index.to_frame()['comorbidity'].apply(lambda x: x[0])
# comorbid_counts['comorbidity_2'] = comorbid_counts.index.to_frame()['comorbidity'].apply(lambda x: x[1])

# # remove COVID-19 (dataset is pre-pandemic) and 'N/A' values
# comorbid_counts = comorbid_counts[(~comorbid_counts['comorbidity_1'].isin(['COVID-19', 'N/A'])) & (~comorbid_counts['comorbidity_2'].isin(['COVID-19', 'N/A']))].reset_index(drop=True)

In [66]:
# Labels whose edges are dropped from the graph: COVID-19 (dataset is
# pre-pandemic), the 'N/A' placeholder used for missing values, and 'Disease'
# (presumably a generic catch-all label -- TODO confirm).
EXCLUDED_DISEASES = ['COVID-19', 'N/A', 'Disease']


def _split_pairs_and_filter(counts, excluded=EXCLUDED_DISEASES):
    """Add one column per disease of each comorbidity pair and drop excluded pairs.

    Args:
        counts: Frame of article counts. On the first call its index holds the
            comorbidity tuples; on later calls it already carries the
            'comorbidity_1' / 'comorbidity_2' columns.
        excluded: Disease labels; a row is removed when either of its two
            diseases is in this collection.

    Returns:
        A new frame with columns 'comorbidity_1' and 'comorbidity_2' added,
        excluded rows removed, and a fresh 0..k-1 index (the tuple index is
        discarded).
    """
    # Only derive the columns while the tuple index is still present; this keeps
    # the cell safe to re-run after the index has been reset.
    if 'comorbidity_1' not in counts.columns:
        pairs = list(counts.index)
        counts = counts.assign(
            comorbidity_1=[pair[0] for pair in pairs],
            # A tuple with a single element (e.g. ('N/A',) from a missing value)
            # has no second disease; mark it 'N/A' instead of raising IndexError.
            comorbidity_2=[pair[1] if len(pair) > 1 else 'N/A' for pair in pairs],
        )

    unwanted = counts['comorbidity_1'].isin(excluded) | counts['comorbidity_2'].isin(excluded)
    return counts[~unwanted].reset_index(drop=True)


comorbid_counts_r = _split_pairs_and_filter(comorbid_counts_r)
comorbid_counts_e = _split_pairs_and_filter(comorbid_counts_e)

### Create Graph

Using networkX

In [67]:
# Name of the edge-attribute column and number of leading rows to include.
count_col = 'num_articles'
n = 50

# networkx graph for the regular search: each of the first n rows becomes one
# edge between its two diseases, with the article count stored as an edge attribute.
top_pairs_r = comorbid_counts_r.head(n)
G_r = nx.from_pandas_edgelist(
    df=top_pairs_r,
    source='comorbidity_1',
    target='comorbidity_2',
    edge_attr=count_col,
)

# nx.draw(G, with_labels=True)

# # nicer looking in some ways
# pos = nx.spring_layout(G, k=7)  # For better example looking
# nx.draw(G, pos, with_labels=True)
# labels = {e: G.edges[e][count_col] for e in G.edges}
# nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
# plt.show()

In [53]:
# Name of the edge-attribute column and number of leading rows to include.
count_col = 'num_articles'
n = 50

# networkx graph for the elderly search: each of the first n rows becomes one
# edge between its two diseases, with the article count stored as an edge attribute.
top_pairs_e = comorbid_counts_e.head(n)
G_e = nx.from_pandas_edgelist(
    df=top_pairs_e,
    source='comorbidity_1',
    target='comorbidity_2',
    edge_attr=count_col,
)

# nx.draw(G, with_labels=True)

# # nicer looking in some ways
# pos = nx.spring_layout(G, k=7)  # For better example looking
# nx.draw(G, pos, with_labels=True)
# labels = {e: G.edges[e][count_col] for e in G.edges}
# nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
# plt.show()

Using pyvis

In [66]:
# Convert the regular-search networkx graph to pyvis and write it out as HTML.
G_nx_r = Network(height='1000px', width='1000px')
G_nx_r.from_nx(G_r)

regular_graph_html = '/Users/David/Desktop/McGill Project/Results/regular_graph_50.html'
G_nx_r.show(regular_graph_html)

In [58]:
# Convert the elderly-search networkx graph to pyvis and write it out as HTML.
G_nx_e = Network(height='1000px', width='1000px')
G_nx_e.from_nx(G_e)

elderly_graph_html = '/Users/David/Desktop/McGill Project/Results/elderly_graph_50.html'
G_nx_e.show(elderly_graph_html)

Using pyvis with a weighted graph (each edge carries its article count as its value)

In [84]:
G_pv_r = Network(height='1000px', width='1000px')
n = 50

# Take the first n comorbidity pairs of the regular search; each row yields
# one (disease, disease, article count) triple.
top_pairs_r = comorbid_counts_r.head(n)
edge_rows = zip(
    top_pairs_r['comorbidity_1'],
    top_pairs_r['comorbidity_2'],
    top_pairs_r[count_col],
)

for src, dst, w in edge_rows:
    # label: text displayed with the node; title: text shown when hovering over it
    for disease in (src, dst):
        G_pv_r.add_node(disease, label=disease, title=disease)
    # the article count is attached to the edge as its value
    G_pv_r.add_edge(src, dst, value=w)

# # create a map of the neighbours of each node
# neighbor_map = G_pv.get_adj_list()
# # add neighbor data to node hover data
# for node in G_pv.nodes:
#     node['title'] += ' Neighbors:\n' + '\n'.join(neighbor_map[node['id']])
#     node['value'] = len(neighbor_map[node['id']])

In [85]:
# G_pv_r.set_options(options=dumps({
#   "physics": {
#     "forceAtlas2Based": {
#       "springLength": 100
#     },
#     "minVelocity": 0.75,
#     "solver": "forceAtlas2Based"
#   }
# }))

In [86]:
# G_pv_e.show_buttons(filter_=['physics'])
# Write the weighted regular-search graph to HTML; n is embedded in the file name.
regular_weighted_html = f'/Users/David/Desktop/McGill Project/Results/html/regular_weighted_graph_{n}_barnes.html'
G_pv_r.show(regular_weighted_html)

In [87]:
G_pv_e = Network(height='1000px', width='1000px')
n = 50

# Take the first n comorbidity pairs of the elderly search; each row yields
# one (disease, disease, article count) triple.
top_pairs_e = comorbid_counts_e.head(n)
edge_rows = zip(
    top_pairs_e['comorbidity_1'],
    top_pairs_e['comorbidity_2'],
    top_pairs_e[count_col],
)

for src, dst, w in edge_rows:
    # label: text displayed with the node; title: text shown when hovering over it
    for disease in (src, dst):
        G_pv_e.add_node(disease, label=disease, title=disease)
    # the article count is attached to the edge as its value
    G_pv_e.add_edge(src, dst, value=w)

# # create a map of the neighbours of each node
# neighbor_map = G_pv.get_adj_list()
# # add neighbor data to node hover data
# for node in G_pv.nodes:
#     node['title'] += ' Neighbors:\n' + '\n'.join(neighbor_map[node['id']])
#     node['value'] = len(neighbor_map[node['id']])

In [88]:
# G_pv_e.set_options(options=dumps({
#   "physics": {
#     "forceAtlas2Based": {
#       "springLength": 100
#     },
#     "minVelocity": 0.75,
#     "solver": "forceAtlas2Based"
#   }
# }))

In [89]:
# G_pv_e.show_buttons(filter_=['physics'])
# Write the weighted elderly-search graph to HTML; n is embedded in the file name.
elderly_weighted_html = f'/Users/David/Desktop/McGill Project/Results/html/elderly_weighted_graph_{n}_barnes.html'
G_pv_e.show(elderly_weighted_html)