In [1]:
# | code-summary: "Imports and setup"
# | code-fold: true

import networkx as nx
import numpy as np
from bokeh.io import output_notebook, show
from bokeh.models import (
    Circle,
    MultiLine,
)
from bokeh.palettes import Magma256
from bokeh.plotting import figure, from_networkx
from bokeh.transform import linear_cmap

from rs_graph.data import load_rs_graph_upstream_deps_dataset

In [2]:
# | code-summary: "Dataset pre-processing"
# | code-fold: true
# Load the upstream dependency dataset
df = load_rs_graph_upstream_deps_dataset()

# Give every distinct name an integer node id. Upstream dependencies are
# numbered first; repositories continue from where the dependencies stop.
# A name that appears in both columns keeps the (later) repository id.
upstream_names = df.upstream_dep_name.unique()
repo_names = df.repo.unique()
node_ids = {name: idx for idx, name in enumerate(upstream_names)}
node_ids.update(
    {name: idx for idx, name in enumerate(repo_names, start=len(upstream_names))}
)
# Next unused id, kept available for any later code that continues numbering
node_id = len(upstream_names) + len(repo_names)

# Attach the integer endpoints to every relationship row
df["source_node_id"] = df.repo.map(lambda name: node_ids[name])
df["target_node_id"] = df.upstream_dep_name.map(lambda name: node_ids[name])

# Keep only relationships whose dependency comes from the pip registry
is_pip = df.upstream_dep_registry.eq("pip")
df = df[is_pip]

In [3]:
# | code-summary: "Basic graph creation with networkx"
# | code-fold: true
# Build the graph: each row of df contributes one edge between its
# source_node_id (repo) and target_node_id (upstream dependency).
graph = nx.from_pandas_edgelist(df, "source_node_id", "target_node_id")

# Number of relationship rows that point at each upstream dependency
upstream_dep_value_counts = df.target_node_id.value_counts()
maximum_count = upstream_dep_value_counts.max()
unique_upstreams = df.target_node_id.unique()

# Build constant-time lookups once, instead of scanning the DataFrame (and a
# NumPy array membership test) again for every node in the graph.
dependent_counts = upstream_dep_value_counts.to_dict()
upstream_names_by_id = (
    df.drop_duplicates("target_node_id")
    .set_index("target_node_id")["upstream_dep_name"]
    .to_dict()
)
repo_names_by_id = (
    df.drop_duplicates("source_node_id").set_index("source_node_id")["repo"].to_dict()
)

# Per-node attributes:
#   dependents     - how many rows depend on the node (1 for pure repos)
#   log_dependents - display size derived from the dependents count
#   node_names     - human readable package / repo name
dependents = {}
log_dependents = {}
node_names = {}
for node in graph.nodes():
    if node in dependent_counts:
        # Node is an upstream dependency; store a built-in int rather than a
        # NumPy scalar so the attribute is a plain Python value.
        upstream_deps_count = int(dependent_counts[node])
        dependents[node] = upstream_deps_count
        # The log is truncated *before* scaling, so sizes move in steps of 10
        # on top of the base size of 5 (5, 15, 25, ...).
        log_dependents[node] = int(np.log(upstream_deps_count)) * 10 + 5
        node_names[node] = upstream_names_by_id[node]
    else:
        # Node only appears as a repo: no dependents for now
        dependents[node] = 1
        log_dependents[node] = 5
        node_names[node] = repo_names_by_id[node]

# Attach the computed attributes to the graph nodes
for attr_name, attr_values in (
    ("package", node_names),
    ("dependents", dependents),
    ("log_dependents", log_dependents),
):
    nx.set_node_attributes(graph, name=attr_name, values=attr_values)

In [4]:
# | code-summary: "Plotting with Bokeh"
# | code-fold: true
# Bokeh output to notebook
output_notebook()

# Create the figure; the hover tooltip reads the "package" and "dependents"
# node attributes that were attached to the graph.
plot = figure(
    tooltips=[("Package", "@package"), ("Dependents", "@dependents")],
    tools="pan,wheel_zoom,reset,hover",
    active_scroll="wheel_zoom",
)

# Create a network graph object.
# spring_layout starts from random positions, so pin the seed: without it the
# rendered layout changes on every Restart & Run All.
LAYOUT_SEED = 42
network_graph = from_networkx(
    graph,
    nx.spring_layout,
    scale=10,
    iterations=50,
    center=(0, 0),
    seed=LAYOUT_SEED,
)

# Set node size and color: both are driven by "log_dependents", and the color
# map spans exactly the observed range of that attribute.
node_sizes = network_graph.node_renderer.data_source.data["log_dependents"]
minimum_value_color = min(node_sizes)
maximum_value_color = max(node_sizes)
network_graph.node_renderer.glyph = Circle(
    size="log_dependents",
    line_alpha=0,
    fill_alpha=0.5,
    # Reversed Magma palette for the node fill color
    fill_color=linear_cmap(
        "log_dependents", Magma256[::-1], minimum_value_color, maximum_value_color
    ),
)

# Set edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(
    line_alpha=0.3, line_width=1, line_color="lightgrey"
)

# Add network graph to the plot
plot.renderers.append(network_graph)

# Render
show(plot)