In [1]:
import math

import networkx as nx
from bokeh.io import output_notebook, show
from bokeh.models import Circle, MultiLine, Range1d
from bokeh.palettes import Blues8
from bokeh.plotting import figure, from_networkx
from bokeh.transform import linear_cmap

from rs_graph.data import load_rs_graph_upstream_deps_dataset

###############################################################################

SAMPLE_SIZE = 1000

###############################################################################

# Bokeh output to notebook
output_notebook()

In [2]:
# Load dataset sample
df = load_rs_graph_upstream_deps_dataset()

# Create integer ids for each node
node_ids = {}
node_id = 0
for upstream_dep in df.upstream_dep_name.unique():
    node_ids[upstream_dep] = node_id
    node_id += 1

for repo in df.repo.unique():
    node_ids[repo] = node_id
    node_id += 1

# Add integer ids to each relationship
df["source_node_id"] = df.repo.apply(lambda r: node_ids[r])
df["target_node_id"] = df.upstream_dep_name.apply(lambda r: node_ids[r])

# Filter out non-pip registries
df = df.loc[df.upstream_dep_registry == "pip"]

# Most machines can't render full dataset, reduce
sampled_df = df.sample(SAMPLE_SIZE)

In [3]:
# Create graph
graph = nx.from_pandas_edgelist(sampled_df, "source_node_id", "target_node_id")

# Add dep_count_of_upstream_dep to node attrs
upstream_dep_value_counts = sampled_df.target_node_id.value_counts()
maximum_count = upstream_dep_value_counts.max()

# Create dependent dict
unique_upstreams = sampled_df.target_node_id.unique()
dependents = {}
normalized_dependents = {}
node_names = {}
for node in graph.nodes():
    if node in unique_upstreams:
        # calculate dependents
        upstream_deps_count = upstream_dep_value_counts[node]
        dependents[node] = upstream_deps_count
        normalized_dependents[node] = math.ceil(
            (upstream_deps_count / maximum_count) * 20
        )

        # add node name
        node_names[node] = sampled_df.loc[sampled_df.target_node_id == node, "upstream_dep_name"].iloc[0]
    else:
        # no dependents for now
        dependents[node] = 1
        normalized_dependents[node] = 1

        # add node name
        node_names[node] = sampled_df.loc[sampled_df.source_node_id == node, "repo"].iloc[0]

# Add extra attrs
# nx.set_node_attributes(
#     graph,
#     name="package",
#     values=node_names,
# )
nx.set_node_attributes(
    graph,
    name="dependents",
    values=dependents,
)
nx.set_node_attributes(
    graph,
    name="normalized_dependents",
    values=normalized_dependents,
)

# Create a plot — set dimensions, toolbar, and title
plot = figure(
    tooltips=[("Package", "@package")],
    tools="pan,wheel_zoom,reset,hover",
    active_scroll="wheel_zoom",
)

# Create a network graph object
network_graph = from_networkx(graph, nx.spring_layout, scale=10, center=(0, 0))

# Set node size and color
# Set node sizes and colors according to node degree
# (color as spectrum of color palette)
minimum_value_color = min(
    network_graph.node_renderer.data_source.data["normalized_dependents"]
)
maximum_value_color = max(
    network_graph.node_renderer.data_source.data["normalized_dependents"]
)
network_graph.node_renderer.glyph = Circle(
    size="normalized_dependents",
    line_alpha=0,
    fill_color=linear_cmap(
        "normalized_dependents", Blues8, minimum_value_color, maximum_value_color
    ),
)

# Set edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(
    line_alpha=0.2, line_width=1, line_color="lightgrey"
)

# Add network graph to the plot
plot.renderers.append(network_graph)

# Render
show(plot)

ValueError: failed to validate ColumnDataSource(id='p1028', ...).data: expected a dict of type ColumnData(String, Seq(Any)), got a dict with invalid values for keys: package