# Press Ego-Network Search (Footy Core, Inline Matplotlib)

This notebook is a **fast, inline-only** ego-network explorer for your press graph.

Design choices:

- Restricts to the **football subgraph**: nodes in `press_footy_centrality.parquet`
- Uses **NetworkX + Matplotlib** for drawing (no PyVis, no HTML, no iframes)
- Lets you:
  - Search by team / player / journalist name
  - Select a node
  - Draw its **ego network** (radius 1–2) directly inside the notebook


In [1]:
\
# 1. Imports and file paths

import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from IPython.display import display
from ipywidgets import Text, Button, Dropdown, IntSlider, VBox, HBox, Output

# ------------------------------------------------------------------
# File paths
# ------------------------------------------------------------------
# All inputs live in one directory. To run the notebook on another machine,
# set the PRESS_DATA_DIR environment variable before starting the kernel
# instead of editing three separate absolute paths.
DATA_DIR = Path(
    os.environ.get(
        "PRESS_DATA_DIR",
        r"C:\Users\dshog\Prem NLP Project\network_outputs",
    )
)

PRESS_ENTITIES_PATH   = DATA_DIR / "press_entities_2016_2025.parquet"
PRESS_CENTRALITY_PATH = DATA_DIR / "press_footy_centrality.parquet"
PRESS_GRAPH_PATH      = DATA_DIR / "press_graph_full.pkl"


In [2]:
# 2. Load data

# Tabular inputs: one row per entity mention, and one row per football node.
entities = pd.read_parquet(PRESS_ENTITIES_PATH)
centrality = pd.read_parquet(PRESS_CENTRALITY_PATH)

# NOTE: unpickling can execute arbitrary code — only load graph files you
# produced yourself or otherwise trust.
with open(PRESS_GRAPH_PATH, "rb") as graph_file:
    G_full = pickle.load(graph_file)

# Quick sanity summary of what was loaded.
print(f"Entities shape: {entities.shape}")
print(f"Centrality shape: {centrality.shape}")
print(f"Full graph: {G_full.number_of_nodes()} nodes, {G_full.number_of_edges()} edges")

display(entities.head(), centrality.head())


Entities shape: (1950601, 3)
Centrality shape: (1707, 6)
Full graph: 234520 nodes, 49245109 edges


Unnamed: 0,doc_id,entity_text,entity_label
0,guardian_football/2016/sep/30/arsene-wenger-pe...,Allardyce,GPE
1,guardian_football/2016/sep/30/arsene-wenger-pe...,Allardyce,ORG
2,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsenal,ORG
3,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsenal Asked,ORG
4,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsène Wenger,PERSON


Unnamed: 0,node,degree,strength,component_id,component_size,type
0,Álvaro Fernández,172,367,0,1707,player
1,Josh Benson,302,433,0,1707,player
2,Jairo Riedewald,459,1203,0,1707,player
3,Mateo Kovacic,959,10025,0,1707,player
4,Carlos Sánchez,360,911,0,1707,player


## Build `nodes_df` and the football-only graph `G_footy`

We:

1. Make a unique `(entity_text, entity_label)` table from the entities file  
2. Merge it with the centrality table on `node` ↔ `entity_text`  
3. Create `G_footy` as the subgraph of `G_full` induced by `centrality['node']`


In [3]:
# One row per distinct (entity_text, entity_label) pair seen in the press data
entities_unique = entities.loc[:, ["entity_text", "entity_label"]].drop_duplicates()

# Left join keeps every centrality node, even one with no matching entity row.
# A node whose text carries several labels appears once per label.
nodes_df = pd.merge(
    centrality,
    entities_unique,
    how="left",
    left_on="node",
    right_on="entity_text",
)

print(f"Combined nodes_df shape: {nodes_df.shape}")
display(nodes_df.head())

# Football-only graph: the subgraph of G_full induced by the centrality nodes.
# .copy() detaches it from G_full so it can be annotated independently.
footy_nodes = set(nodes_df["node"].drop_duplicates())
G_footy = G_full.subgraph(footy_nodes).copy()

print(f"Footy graph: {G_footy.number_of_nodes()} nodes, {G_footy.number_of_edges()} edges")


Combined nodes_df shape: (2522, 8)


Unnamed: 0,node,degree,strength,component_id,component_size,type,entity_text,entity_label
0,Álvaro Fernández,172,367,0,1707,player,Álvaro Fernández,PERSON
1,Josh Benson,302,433,0,1707,player,Josh Benson,PERSON
2,Jairo Riedewald,459,1203,0,1707,player,Jairo Riedewald,PERSON
3,Mateo Kovacic,959,10025,0,1707,player,Mateo Kovacic,PERSON
4,Carlos Sánchez,360,911,0,1707,player,Carlos Sánchez,PERSON


Footy graph: 1707 nodes, 410063 edges


## Attach attributes to `G_footy`

We attach centrality + entity metadata to each node and set a readable label:

- Prefer `entity_text`
- Fall back to the raw node id


In [None]:
def attach_node_attributes_from_df(G, nodes_df, key_col="node"):
    """Copy per-node columns from `nodes_df` onto the nodes of `G`, in place.

    Parameters
    ----------
    G : graph
        Graph whose node IDs match the values in `nodes_df[key_col]`.
    nodes_df : pandas.DataFrame
        Table of node metadata. It may hold several rows per node; they are
        collapsed to one row by taking the first non-null value per column.
    key_col : str, default "node"
        Column of `nodes_df` holding the node IDs.

    Returns
    -------
    The same graph `G`, so the call can be chained.

    Notes
    -----
    - Missing values (NaN / None) are NOT written to the node data. NaN is
      truthy in Python, so a stored NaN would defeat downstream fallbacks of
      the form ``attrs.get("entity_text") or node_id``.
    - Every node gets a ``"label"`` attribute: the stripped `entity_text` when
      it is a non-blank string, otherwise ``str(node_id)``.
    """

    def _is_missing(value):
        # pd.isna returns an array for array-likes; treat those as present.
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    # Collapse to one row per node, then index the rows by node ID.
    per_node = nodes_df.groupby(key_col, as_index=True).agg("first")
    mapping = per_node.to_dict(orient="index")

    for n in G.nodes():
        node_data = G.nodes[n]

        row = mapping.get(n)
        if row:
            node_data.update(
                {col: val for col, val in row.items() if not _is_missing(val)}
            )

        ent_text = node_data.get("entity_text")
        if isinstance(ent_text, str) and ent_text.strip():
            node_data["label"] = ent_text.strip()
        else:
            # Fall back to the node ID
            node_data["label"] = str(n)

    return G


## Helper: search for nodes

We search across:

- `entity_text`
- `entity_label`
- `node` (as string)

and only keep nodes that actually exist in `G_footy`.


In [10]:
def find_matching_nodes(term: str, limit: int = 50):
    """Return rows of `nodes_df` whose text fields contain `term`.

    The match is a case-insensitive *literal* substring test against
    `entity_text`, `entity_label` (when those columns exist) and `node`.
    The term is not interpreted as a regular expression, so input such as
    "(" or "a.c" is safe and matches only those exact characters.

    Parameters
    ----------
    term : str
        Search text; surrounding whitespace is ignored.
    limit : int, default 50
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        At most `limit` rows, one per node, restricted to nodes present in
        `G_footy` and sorted by `degree` (descending) when that column
        exists. Empty when `term` is blank or nothing matches.
    """
    term = term.lower().strip()
    if not term:
        return nodes_df.iloc[0:0].copy()

    def _contains(column):
        values = nodes_df[column]
        hit = values.astype(str).str.lower().str.contains(term, regex=False, na=False)
        # astype(str) renders missing values as the text "nan"; mask them out
        # so searching for "nan" does not match rows with missing metadata.
        return values.notna() & hit

    columns = [c for c in ("entity_text", "entity_label") if c in nodes_df.columns]
    columns.append("node")

    mask = _contains(columns[0])
    for column in columns[1:]:
        mask = mask | _contains(column)

    matches = nodes_df.loc[mask].copy()
    if matches.empty:
        return matches

    # nodes_df can hold several rows per node (one per entity label).
    matches = matches.drop_duplicates(subset="node")

    if "degree" in matches.columns:
        matches = matches.sort_values("degree", ascending=False)

    # Keep only nodes present in G_footy
    matches = matches[matches["node"].isin(list(G_footy.nodes()))]

    return matches.head(limit)


## Ego-network visualization (inline Matplotlib)

In [None]:
def _node_display_name(attrs, fallback):
    """Return the node's non-blank `entity_text`, else `str(fallback)`.

    A plain ``attrs.get("entity_text") or fallback`` is not enough: a missing
    value stored as NaN is truthy and would be rendered as the text "nan".
    """
    text = attrs.get("entity_text")
    if isinstance(text, str) and text.strip():
        return text.strip()
    return str(fallback)


def visualize_ego_matplotlib(
    node_id,
    radius: int = 1,
    max_nodes: int = 250,
    min_weight: int = 2,
    figsize=(14, 10),
    save_path: str | None = None,
    dpi: int = 300,
    max_labels: int = 18,
):
    """Draw the ego network of `node_id` from `G_footy` inline with Matplotlib.

    Steps: build the ego graph, drop edges lighter than `min_weight`, cap the
    node count, lay the graph out with a seeded spring layout, and draw it.

    Parameters
    ----------
    node_id
        Centre node; must exist in `G_footy`, otherwise a message is printed
        and nothing is drawn.
    radius : int
        Ego-graph radius passed to ``nx.ego_graph``.
    max_nodes : int
        Upper bound on drawn nodes. When exceeded, the centre plus the
        ``max_nodes - 1`` highest-degree other nodes are kept.
    min_weight : int
        Edges whose ``weight`` (default 1 when absent) is below this are
        dropped. A node is kept if it retains at least one such edge, even
        when that edge does not touch the centre; the centre is always kept.
    figsize, dpi
        Figure size, and resolution used when saving.
    save_path : str or None
        When given, the figure is also written to this path.
    max_labels : int
        Total number of text labels: the centre plus the ``max_labels - 1``
        highest-degree other nodes.

    Notes
    -----
    Node colours are read from the ``entity_label`` node attribute
    (ORG / PERSON / GPE) and label text from ``entity_text``. Nodes without
    those attributes are drawn in the fallback colour and labelled with their
    node ID.
    """
    if node_id not in G_footy.nodes:
        print(f"Node ID '{node_id}' not found in footy graph.")
        return

    # --- Build full ego graph first ---
    G_ego_full = nx.ego_graph(G_footy, node_id, radius=radius)
    print(f"Raw ego graph: {len(G_ego_full)} nodes, {G_ego_full.number_of_edges()} edges")

    # --- Filter edges by weight ---
    H = nx.Graph()
    for u, v, d in G_ego_full.edges(data=True):
        if d.get("weight", 1) >= min_weight:
            H.add_edge(u, v, **d)

    # add_edge creates bare nodes, so copy their attributes across.
    for n in H.nodes():
        H.nodes[n].update(G_footy.nodes[n])

    # Always keep the centre node, even if it lost all its edges.
    if node_id not in H:
        H.add_node(node_id, **G_footy.nodes[node_id])

    print(f"After weight filter (>= {min_weight}): {len(H)} nodes, {H.number_of_edges()} edges")

    G_ego = H

    # --- Cap by max_nodes: centre + highest-degree others ---
    if len(G_ego) > max_nodes:
        full_deg = dict(G_ego.degree())
        ranked = sorted(
            (n for n in G_ego.nodes() if n != node_id),
            key=full_deg.get,
            reverse=True,
        )
        keep_nodes = set(ranked[: max(max_nodes - 1, 0)])
        keep_nodes.add(node_id)
        G_ego = G_ego.subgraph(keep_nodes).copy()
        print(f"Capped ego graph to {len(G_ego)} nodes.")

    # --- Layout (seeded so repeated draws are identical) ---
    pos = nx.spring_layout(G_ego, k=0.5, iterations=60, seed=42)

    # --- Node sizes by degree within the drawn graph ---
    deg = dict(G_ego.degree())
    max_deg = max(deg.values(), default=0)
    if max_deg > 0:
        sizes = {n: 120 + 380 * (d / max_deg) for n, d in deg.items()}
    else:
        # Single isolated node: no degree to scale by.
        sizes = dict.fromkeys(deg, 250)

    # --- Colors by entity type (ORG / PERSON / GPE) ---
    type_colors = {
        "ORG": "tab:blue",
        "PERSON": "tab:orange",
        "GPE": "tab:green",
    }
    node_colors = [
        "red" if n == node_id
        else type_colors.get(G_ego.nodes[n].get("entity_label"), "skyblue")
        for n in G_ego.nodes()
    ]

    # --- Labels: the centre plus the most-connected other nodes ---
    others_by_degree = sorted(
        (n for n in G_ego.nodes() if n != node_id),
        key=deg.get,
        reverse=True,
    )
    labelled = {node_id}
    labelled.update(others_by_degree[: max(max_labels - 1, 0)])

    fig, ax = plt.subplots(figsize=figsize)
    try:
        # Lighter edges so they don't dominate
        nx.draw_networkx_edges(
            G_ego,
            pos,
            ax=ax,
            alpha=0.12,
            width=0.7,
            edge_color="gray",
        )

        nx.draw_networkx_nodes(
            G_ego,
            pos,
            ax=ax,
            node_size=[sizes[n] for n in G_ego.nodes()],
            node_color=node_colors,
            alpha=0.98,
            linewidths=0.5,
            edgecolors="black",
        )

        # White background boxes keep the text readable over edges.
        for n, (x, y) in pos.items():
            if n not in labelled:
                continue
            ax.text(
                x,
                y,
                _node_display_name(G_ego.nodes[n], n),
                fontsize=8,
                ha="center",
                va="center",
                color="black",
                bbox=dict(
                    facecolor="white",
                    edgecolor="none",
                    alpha=0.8,
                    pad=0.5,
                ),
            )

        center_name = _node_display_name(G_ego.nodes[node_id], node_id)
        ax.set_title(f"Ego network (radius={radius}, min_weight={min_weight}) for: {center_name}")
        ax.axis("off")

        if save_path is not None:
            # Save via the figure object rather than pyplot's "current figure".
            fig.savefig(save_path, dpi=dpi, bbox_inches="tight")
            print(f"Saved figure to {save_path}")

        plt.show()
    finally:
        # Release the figure so repeated draws do not accumulate open figures.
        plt.close(fig)


## Widgets: search + visualize

- Type part of a **club / player / journalist** name
- Click **Find**
- Pick a match in the dropdown
- Click **Visualize** to see the ego network inline


### How to read this network graph

- **Nodes (circles)**  
  Each node is a named entity from Guardian/Independent articles – clubs, players, or places.  
  - Blue = clubs / teams (`ORG`)  
  - Orange = players / people (`PERSON`)  
  - Green = places (`GPE`)  
  The red node is the focal entity (e.g., Mohamed Salah or Erling Haaland).

- **Edges (lines)**  
  A line means the two entities are mentioned together in the same article.  
  Only edges whose weight is at least `min_weight` (default 2) are drawn, so the weakest co-mentions are dropped and only the stronger, more frequent connections remain.

- **Positions (layout)**  
  The layout is *force-directed*: entities that are often mentioned together are pulled closer; those with fewer or weaker links are pushed toward the edges.  
  Clusters of nodes represent groups of entities that frequently appear together in the press narrative around the focal player or club.

In [None]:
# --- Search controls ---
search_box = Text(placeholder="Type club/player/journalist…", description="Search:")
search_button = Button(button_style="info", description="Find")

# Options are filled in by the search callback; wide so long labels fit.
matches_dropdown = Dropdown(
    description="Matches:",
    options=[],
    layout=dict(width="700px"),
)

# --- Drawing controls ---
radius_slider = IntSlider(description="Radius", value=1, min=1, max=2, step=1)
max_nodes_slider = IntSlider(description="Max nodes", value=200, min=50, max=400, step=25)

viz_button = Button(button_style="success", description="Visualize")

# Output area for status messages from the callbacks.
status_out = Output()


def on_search_clicked(b):
    """Handle the **Find** button: search and fill the matches dropdown.

    Reads the term from `search_box`, looks it up with `find_matching_nodes`
    and sets `matches_dropdown.options` to ``(display_label, node_id)`` pairs.
    Status messages are printed into `status_out`. `b` is the clicked button
    passed by ipywidgets and is unused.
    """
    limit = 50  # maximum number of matches offered in the dropdown
    term = search_box.value
    with status_out:
        status_out.clear_output()
        if not term.strip():
            matches_dropdown.options = []
            print("Type at least one character to search.")
            return

        matches = find_matching_nodes(term, limit=limit)
        if matches.empty:
            matches_dropdown.options = []
            print("No matches found for that search term.")
            return

        options = []
        for _, row in matches.iterrows():
            node_id = row["node"]
            # A missing entity_text arrives as NaN, which is truthy, so test
            # for a real non-blank string instead of using `or`.
            ent_text = row.get("entity_text")
            if isinstance(ent_text, str) and ent_text.strip():
                label = ent_text.strip()
            else:
                label = str(node_id)
            typ = row.get("type", "")
            deg = row.get("degree", "")
            display_label = f"{label} | node={node_id} | type={typ} | degree={deg}"
            options.append((display_label, node_id))

        matches_dropdown.options = options
        print(f"Found {len(options)} matches (showing up to {limit}).")


def on_viz_clicked(b):
    """Handle the **Visualize** button: draw the selected node's ego network.

    Reads the selection from `matches_dropdown` and the settings from
    `radius_slider` / `max_nodes_slider`, then calls
    `visualize_ego_matplotlib`. `b` is the clicked button passed by
    ipywidgets and is unused.

    Everything, including the drawing call, runs inside the `status_out`
    context. Output produced by a widget callback outside an Output widget is
    not reliably shown in the notebook, and clearing `status_out` first
    replaces the previous plot instead of stacking plots.
    """
    node_id = matches_dropdown.value
    radius = radius_slider.value
    max_nodes = max_nodes_slider.value

    with status_out:
        status_out.clear_output()

        if node_id is None:
            print("Select a node from 'Matches' before visualizing.")
            return

        print(f"Visualizing ego-network around node: {node_id} (radius={radius}, max_nodes={max_nodes})")

        visualize_ego_matplotlib(
            node_id=node_id,
            radius=radius,
            max_nodes=max_nodes,
        )


# Attach centrality + entity metadata to the graph nodes before the UI is used.
# The drawing code colours nodes by their `entity_label` attribute and labels
# them from `entity_text`; without this call those attributes are never set and
# every node falls back to the default colour and its raw node ID.
# Re-running is harmless: it just rewrites the same attributes.
attach_node_attributes_from_df(G_footy, nodes_df)

# Wire the buttons to their callbacks.
search_button.on_click(on_search_clicked)
viz_button.on_click(on_viz_clicked)

# Layout: search row, matches, sliders, action button, then status/plot area.
ui = VBox([
    HBox([search_box, search_button]),
    matches_dropdown,
    HBox([radius_slider, max_nodes_slider]),
    HBox([viz_button]),
    status_out
])

display(ui)
