# 01: Data Exploration

This notebook explores the dataset: distributions, keyword frequencies, correlations, and missing data.


In [None]:
import pickle
from pathlib import Path

import pandas as pd
import networkx as nx

In [41]:
# Where the graph artifacts live and which snapshot to load.
GRAPH_DATA_DIR = Path("../data/graph")
GRAPH_NAME = "graph_top_n200_20250717"

edges_path = GRAPH_DATA_DIR / f"{GRAPH_NAME}_edges.csv"
gpickle_path = GRAPH_DATA_DIR / f"{GRAPH_NAME}.gpickle"


def _format_attrs(attrs, limit=None):
    """Render an attribute mapping as "key=value" pairs joined by ", ".

    Only the first ``limit`` entries (in the mapping's iteration order) are
    rendered; ``limit=None`` renders every entry.
    """
    pairs = list(attrs.items())[:limit]
    return ", ".join(f"{key}={value}" for key, value in pairs)


# Edge list: read the CSV into a DataFrame and preview the first rows.
edges_df = pd.read_csv(edges_path)
print("Sample edges:")
print(edges_df.head())

# Graph object: deserialize the pickled graph.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# load .gpickle files that come from a trusted source.
with open(gpickle_path, "rb") as f:
    G = pickle.load(f)

print(f"\nGraph: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges, directed={G.is_directed()}")

# Preview five nodes, showing at most three attributes per node.
print("\nSample node metadata:")
sample_nodes = list(G.nodes(data=True))[:5]
for node, data in sample_nodes:
    print(f"  {node}: {_format_attrs(data, limit=3)}")

# Preview five edges, showing every attribute of each edge.
print("\nSample edge metadata:")
sample_edges = list(G.edges(data=True))[:5]
for u, v, data in sample_edges:
    print(f"  {u} -> {v}: {_format_attrs(data)}")

Sample edges:
     source           target              kind  optional
0      cffi        pycparser           runtime     False
1  pygments         colorama  windows-terminal      True
2      lxml        cssselect         cssselect      True
3      lxml  lxml_html_clean        html-clean      True
4      lxml         html5lib             html5      True

Graph: 559 nodes, 1,151 edges, directed=True

Sample node metadata:
  aiohappyeyeballs: rank=17, stars=31, forks=15
  cffi: rank=20, stars=150, forks=43
  pycparser: rank=21, stars=3255, forks=612
  pygments: rank=26, stars=1989, forks=727
  colorama: rank=24, stars=3664, forks=261

Sample edge metadata:
  cffi -> pycparser: kind=runtime, optional=False
  pygments -> colorama: kind=windows-terminal, optional=True
  lxml -> cssselect: kind=cssselect, optional=True
  lxml -> lxml_html_clean: kind=html-clean, optional=True
  lxml -> html5lib: kind=html5, optional=True


### Missing Metadata

In [49]:
def show_nodes_with_missing_metadata(G, max_display=20):
    """Print the nodes of ``G`` whose ``missing_metadata`` attribute is truthy.

    Args:
        G: Graph-like object whose ``nodes(data=True)`` yields
            ``(node, attribute_dict)`` pairs.
        max_display: Maximum number of node names to list. ``None`` lists
            every flagged node; negative values are treated as 0.

    Returns:
        None. The report is written to stdout.
    """
    flagged_nodes = [node for node, attrs in G.nodes(data=True) if attrs.get('missing_metadata')]

    if not flagged_nodes:
        print("All nodes have complete general metadata.")
        return

    # Normalise the limit up front: a negative value would otherwise slice
    # from the end of the list and inflate the "... more" count, and None
    # cannot be compared against an int.
    limit = len(flagged_nodes) if max_display is None else max(max_display, 0)
    hidden_count = len(flagged_nodes) - limit

    print(f"\nNodes with missing general metadata: {len(flagged_nodes):,}")
    print("-" * 40)
    for i, node in enumerate(flagged_nodes[:limit], 1):
        print(f"{i:3d}. {node}")
    if hidden_count > 0:
        print(f"... and {hidden_count} more.")
    print("-" * 40)

# Report which nodes of the loaded graph G are flagged with missing metadata.
show_nodes_with_missing_metadata(G)


Nodes with missing general metadata: 17
----------------------------------------
  1. build[uv,virtualenv]
  2. build[uv]
  3. botocore[crt]
  4. pyrfc3339
  5. coverage[toml]
  6. oauthlib[signedtoken]
  7. pytest-xdist[psutil]
  8. uvicorn[standard]
  9. aiobotocore[awscli]
 10. aiobotocore[boto3]
 11. nox[uv]
 12. ray[data,default]
 13. pygit2
 14. dask[dataframe,test]
 15. moto[server]
 16. anyio[trio]
 17. pyjwt[crypto]
----------------------------------------
