# Premier League Transfer Rumor Networks – Press vs Reddit

This notebook constructs two co-occurrence networks:

- **Press network** from Guardian + Independent football articles (2016–2025)
- **Reddit network** from EPL transfer-related posts & comments (2018–2025, sampled)

Nodes are named entities (players, clubs, etc.). Edges represent co-occurrence
in the same document. We then restrict to **players and clubs** using a transfer
truth table and compute centrality measures for each source.


In [None]:
import os
import pandas as pd
import networkx as nx
import pickle
import itertools
from pathlib import Path
from tqdm.auto import tqdm
from IPython.display import display
import spacy


PROJECT_ROOT_NAME = "Prem NLP Project"

def get_project_root() -> Path:
    """
    Resolve the project root directory in a user-agnostic way.

    Priority:
    1) If PREM_NLP_PROJECT_ROOT env var is set, use that.
    2) Otherwise, walk up from the current working directory until we find
       a folder whose name matches PROJECT_ROOT_NAME.
    """
    env_root = os.getenv("PREM_NLP_PROJECT_ROOT")
    if env_root:
        return Path(env_root).expanduser().resolve()

    here = Path.cwd().resolve()
    for p in (here, *here.parents):
        if p.name == PROJECT_ROOT_NAME:
            return p

    raise RuntimeError(
        f"Could not find project root '{PROJECT_ROOT_NAME}' starting from {here}. "
        "Either run the notebook from inside the project folder, or set the "
        "PREM_NLP_PROJECT_ROOT environment variable."
    )

BASE_DIR = get_project_root()
print("Detected BASE_DIR:", BASE_DIR)

OUT_DIR = BASE_DIR / "network_outputs"
OUT_DIR.mkdir(exist_ok=True, parents=True)


# Transfer truth
TRANSFER_PATH = BASE_DIR / "transfer_data" / "pl_transfers_2016_2022.csv"

# Press raw data (Guardian + Independent)
GUARDIAN_DIR = BASE_DIR / "Guardian Data" / "processed"
INDEP_PATH   = BASE_DIR / "The Independent Data" / "independent_football_2016_2025.csv"

# Press outputs
PRESS_ENTITIES_PATH      = OUT_DIR / "press_entities_2016_2025.parquet"
PRESS_GRAPH_PATH         = OUT_DIR / "press_graph_full.pkl"
PRESS_FOOTY_CENTRAL_PATH = OUT_DIR / "press_footy_centrality.parquet"

# Reddit raw data
REDDIT_DIR          = BASE_DIR / "Reddit data"
REDDIT_POSTS_ZIP    = REDDIT_DIR / "epl_transfer_posts_2018_2025_keywords_EPL_ONLY.zip"
REDDIT_COMMENTS_ZIP = REDDIT_DIR / "epl_transfer_comments_2018_2025_keywords_EPL_ONLY.zip"

# Reddit outputs
REDDIT_ENTITIES_PATH      = OUT_DIR / "reddit_entities_sample.parquet"
REDDIT_FOOTY_CENTRAL_PATH = OUT_DIR / "reddit_footy_centrality_sample.parquet"


Detected BASE_DIR: C:\Users\dshog\Prem NLP Project


In [2]:
transfers = pd.read_csv(TRANSFER_PATH)
print("Transfers loaded:", transfers.shape)
display(transfers.head())

player_set = set(
    transfers["player_name"]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
)
club_set = set(
    pd.concat(
        [
            transfers["arrival_club"].dropna().astype(str).str.strip(),
            transfers["departure_club"].dropna().astype(str).str.strip(),
        ]
    ).unique()
)

print("Unique players in transfers:", len(player_set))
print("Unique clubs in transfers:", len(club_set))


Transfers loaded: (2546, 11)


Unnamed: 0,player_name,arrival_club,departure_club,age,position,transfer_period,year,season,transfer_type,fee_eur_m,fee
0,Islam Slimani,Leicester City,Sporting CP,28.0,Centre-Forward,Summer,2016,2016/2017,Transfer,31.0,€31.00m
1,Ahmed Musa,Leicester City,CSKA Moscow,23.0,Right Winger,Summer,2016,2016/2017,Transfer,19.5,€19.50m
2,Nampalys Mendy,Leicester City,OGC Nice,24.0,Defensive Midfield,Summer,2016,2016/2017,Transfer,15.5,€15.50m
3,Bartosz Kapustka,Leicester City,Cracovia,19.0,Attacking Midfield,Summer,2016,2016/2017,Transfer,5.0,€5.00m
4,Ron-Robert Zieler,Leicester City,Hannover 96,27.0,Goalkeeper,Summer,2016,2016/2017,Transfer,3.5,€3.50m


Unique players in transfers: 1402
Unique clubs in transfers: 537


Build press corpus (Guardian + Independent)

In [None]:
guardian_files = sorted(GUARDIAN_DIR.glob("guardian_*.csv"))
print("Guardian files found:", len(guardian_files))

guardian_list = []
for f in guardian_files:
    df_g = pd.read_csv(f)
    df_g["source"] = "guardian"
    guardian_list.append(df_g)

guardian_all = pd.concat(guardian_list, ignore_index=True)
print("Guardian total rows:", guardian_all.shape)
display(guardian_all.head())

# Independent
indep = pd.read_csv(INDEP_PATH)
indep["source"] = "independent"
print("Independent rows:", indep.shape)
display(indep.head())

# Harmonize columns: we'll build a common 'full_text'
def make_full_text(df, title_col="title", trail_col="trailText",
                   body_col="body", headline_col="headline"):
    parts = []
    for col in [headline_col, title_col, trail_col, body_col]:
        if col in df.columns:
            parts.append(df[col].fillna(""))
    if parts:
        return (" \n".join(parts))
    else:
        return ""

guardian_all["full_text"] = (
    guardian_all.get("headline", "").fillna("") + " \n" +
    guardian_all.get("title", "").fillna("") + " \n" +
    guardian_all.get("trailText", "").fillna("") + " \n" +
    guardian_all.get("body", "").fillna("")
)

indep["full_text"] = (
    indep.get("headline", "").fillna("") + " \n" +
    indep.get("title", "").fillna("") + " \n" +
    indep.get("trailText", "").fillna("") + " \n" +
    indep.get("body", "").fillna("")
)

# Build unified press corpus
guardian_all["doc_id"] = "guardian_" + guardian_all["id"].astype(str)
indep["doc_id"]        = "independent_" + indep["id"].astype(str)

press_corpus = pd.concat(
    [
        guardian_all[["doc_id", "source", "webPublicationDate", "full_text"]],
        indep[["doc_id", "source", "webPublicationDate", "full_text"]],
    ],
    ignore_index=True,
)

print("Press corpus:", press_corpus.shape)
display(press_corpus.head())


Guardian files found: 19
Guardian total rows: (34927, 14)


Unnamed: 0,id,webPublicationDate,sectionName,webUrl,title,trailText,body,byline,headline,wordcount,year,month,day,source
0,football/2016/sep/30/arsene-wenger-perfect-eng...,2016-09-30T22:26:00Z,Football,https://www.theguardian.com/football/2016/sep/...,FA chief: Wenger has ‘perfect criteria’ for En...,Martin Glenn has said that successful internat...,Arsène Wenger is one of a handful of managers ...,Owen Gibson,FA chief: Wenger has ‘perfect criteria’ for En...,926,2016,9,30,guardian
1,football/2016/sep/30/mauricio-pochettino-pep-g...,2016-09-30T21:39:00Z,Football,https://www.theguardian.com/football/2016/sep/...,Mauricio Pochettino to face Guardiola as an eq...,"Mauricio Pochettino, as a player and a manager...",Mauricio Pochettino remembers the way that the...,David Hytner,Mauricio Pochettino to face Guardiola as an eq...,1205,2016,9,30,guardian
2,football/2016/sep/30/chelsea-antonio-conte-no-...,2016-09-30T21:34:00Z,Football,https://www.theguardian.com/football/2016/sep/...,Chelsea’s Antonio Conte says he has no magic w...,Antonio Conte has said he cannot wave ‘a magic...,Antonio Conte has warned there can be no quick...,Dominic Fifield,Chelsea’s Antonio Conte says he has no magic w...,787,2016,9,30,guardian
3,football/2016/sep/30/david-moyes-sunderland-im...,2016-09-30T21:30:01Z,Football,https://www.theguardian.com/football/2016/sep/...,David Moyes denies Sunderland is an ‘impossibl...,David Moyes is on a mission to prove that mana...,David Moyes is on a mission to prove that mana...,Louise Taylor,David Moyes denies Sunderland is an ‘impossibl...,544,2016,9,30,guardian
4,football/2016/sep/30/pep-guardiola-manchester-...,2016-09-30T21:30:01Z,Football,https://www.theguardian.com/football/2016/sep/...,Pep Guardiola tells Manchester City players ef...,Pep Guardiola has warned Manchester City’s pla...,Pep Guardiola has warned Manchester City’s pla...,Jamie Jackson,Pep Guardiola tells Manchester City players ef...,803,2016,9,30,guardian


Independent rows: (4950, 11)


Unnamed: 0,id,webPublicationDate,sectionName,webUrl,title,trailText,body,byline,headline,wordcount,source
0,https://www.independent.co.uk/sport/football/j...,"Mon, 17 Nov 2025 22:30:55 GMT",Independent Football,https://www.independent.co.uk/sport/football/j...,Jarrod Bowen ready to rival Phil Foden and Col...,<p>Bowen is trying to give England manager Tho...,Notifications can be managed in browser prefer...,,Jarrod Bowen ready to rival Phil Foden and Col...,661,independent
1,https://www.independent.co.uk/sport/football/n...,"Mon, 17 Nov 2025 22:17:23 GMT",Independent Football,https://www.independent.co.uk/sport/football/n...,Northern Ireland bounce back from Slovakia def...,<p><strong>Northern Ireland 1-0 Luxembourg:</s...,Notifications can be managed in browser prefer...,,Northern Ireland bounce back from Slovakia def...,763,independent
2,https://www.independent.co.uk/sport/football/w...,"Mon, 17 Nov 2025 22:11:21 GMT",Independent Football,https://www.independent.co.uk/sport/football/w...,When are the World Cup qualifying play-offs an...,<p>The 12 group runners-up will be joined by f...,Notifications can be managed in browser prefer...,,When are the World Cup qualifying play-offs an...,589,independent
3,https://www.independent.co.uk/sport/football/w...,"Mon, 17 Nov 2025 22:07:31 GMT",Independent Football,https://www.independent.co.uk/sport/football/w...,"World Cup 2026 draw: Date, start time, format ...","<p>The tournament in USA, Canada and Mexico wi...",Notifications can be managed in browser prefer...,,"World Cup 2026 draw: Date, start time, format ...",713,independent
4,https://www.independent.co.uk/sport/football/w...,"Mon, 17 Nov 2025 22:07:16 GMT",Independent Football,https://www.independent.co.uk/sport/football/w...,Who can qualify for the World Cup this week?,<p>With Norway returning to a men’s World Cup ...,Notifications can be managed in browser prefer...,,Who can qualify for the World Cup this week?,1395,independent


Press corpus: (39877, 4)


Unnamed: 0,doc_id,source,webPublicationDate,full_text
0,guardian_football/2016/sep/30/arsene-wenger-pe...,guardian,2016-09-30T22:26:00Z,FA chief: Wenger has ‘perfect criteria’ for En...
1,guardian_football/2016/sep/30/mauricio-pochett...,guardian,2016-09-30T21:39:00Z,Mauricio Pochettino to face Guardiola as an eq...
2,guardian_football/2016/sep/30/chelsea-antonio-...,guardian,2016-09-30T21:34:00Z,Chelsea’s Antonio Conte says he has no magic w...
3,guardian_football/2016/sep/30/david-moyes-sund...,guardian,2016-09-30T21:30:01Z,David Moyes denies Sunderland is an ‘impossibl...
4,guardian_football/2016/sep/30/pep-guardiola-ma...,guardian,2016-09-30T21:30:01Z,Pep Guardiola tells Manchester City players ef...


In [None]:
PRESS_CORPUS_PATH = OUT_DIR / "press_corpus_2016_2025.parquet"

press_corpus.to_parquet(PRESS_CORPUS_PATH, index=False)
print("Saved press_corpus to:", PRESS_CORPUS_PATH)


Saved press_corpus to: C:\Users\dshog\Prem NLP Project\network_outputs\press_corpus_2016_2025.parquet


install en_core_web_sm

In [None]:
import sys
import subprocess

# Make sure spaCy is installed
try:
    import spacy
except ModuleNotFoundError:
    subprocess.run([sys.executable, "-m", "pip", "install", "spacy"], check=True)
    import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("en_core_web_sm not found. Installing it now...")
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True
    )
    nlp = spacy.load("en_core_web_sm")

ENTITY_LABELS = {"PERSON", "ORG", "GPE"}


NER + press knowledge graph

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
ENTITY_LABELS = {"PERSON", "ORG", "GPE"}

def extract_entities(texts, batch_size=16):
    docs = nlp.pipe(texts, batch_size=batch_size)
    for doc in docs:
        ents = []
        for ent in doc.ents:
            if ent.label_ in ENTITY_LABELS:
                txt = ent.text.strip()
                if txt:
                    ents.append((txt, ent.label_))
        yield ents

def build_entity_graph(df, text_col="full_text", doc_id_col="doc_id"):
    doc_ids = df[doc_id_col].tolist()
    texts   = df[text_col].fillna("").tolist()

    rows = []
    G = nx.Graph()

    print(f"Running NER on {len(df)} press documents...")
    for doc_id, ents in tqdm(zip(doc_ids, extract_entities(texts)), total=len(doc_ids)):
        unique_ents = sorted(set(ents))

        for ent_text, ent_label in unique_ents:
            rows.append(
                {
                    "doc_id": doc_id,
                    "entity_text": ent_text,
                    "entity_label": ent_label,
                }
            )
            if not G.has_node(ent_text):
                G.add_node(ent_text, label=ent_label)

        for (a_text, _), (b_text, _) in itertools.combinations(unique_ents, 2):
            if G.has_edge(a_text, b_text):
                G[a_text][b_text]["weight"] += 1
            else:
                G.add_edge(a_text, b_text, weight=1)

    entity_df = pd.DataFrame(rows)
    return entity_df, G

if PRESS_ENTITIES_PATH.exists() and PRESS_GRAPH_PATH.exists():
    print("Loading existing press_entities and press_graph...")
    press_entities = pd.read_parquet(PRESS_ENTITIES_PATH)
    with open(PRESS_GRAPH_PATH, "rb") as f:
        press_G = pickle.load(f)
else:
    press_entities, press_G = build_entity_graph(press_corpus)
    press_entities.to_parquet(PRESS_ENTITIES_PATH, index=False)
    with open(PRESS_GRAPH_PATH, "wb") as f:
        pickle.dump(press_G, f)
    print("Saved press_entities and press_graph.")

print("press_entities:", press_entities.shape)
display(press_entities.head())

print(
    "press_G:",
    press_G.number_of_nodes(), "nodes,",
    press_G.number_of_edges(), "edges"
)


Loading existing press_entities and press_graph...
press_entities: (1950601, 4)


Unnamed: 0,doc_id,entity_text,entity_label,entity_text_norm
0,guardian_football/2016/sep/30/arsene-wenger-pe...,Allardyce,GPE,Allardyce
1,guardian_football/2016/sep/30/arsene-wenger-pe...,Allardyce,ORG,Allardyce
2,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsenal,ORG,Arsenal
3,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsenal Asked,ORG,Arsenal Asked
4,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsène Wenger,PERSON,Arsène Wenger


press_G: 234444 nodes, 48831216 edges


Club alias dictionary & normalization helper

In [None]:
CLUB_ALIASES = {
    # Arsenal
    "arsenal": "Arsenal",
    "arsenal fc": "Arsenal",

    # Aston Villa
    "aston villa": "Aston Villa",
    "aston villa fc": "Aston Villa",

    # Bournemouth
    "bournemouth": "AFC Bournemouth",
    "afc bournemouth": "AFC Bournemouth",

    # Brentford
    "brentford": "Brentford",
    "brentford fc": "Brentford",

    # Brighton
    "brighton": "Brighton & Hove Albion",
    "brighton & hove": "Brighton & Hove Albion",
    "brighton & hove albion": "Brighton & Hove Albion",
    "bha": "Brighton & Hove Albion",

    # Burnley
    "burnley": "Burnley",
    "burnley fc": "Burnley",

    # Chelsea
    "chelsea": "Chelsea",
    "chelsea fc": "Chelsea",
    "cfc": "Chelsea",

    # Crystal Palace
    "crystal palace": "Crystal Palace",
    "crystal palace fc": "Crystal Palace",

    # Everton
    "everton": "Everton",
    "everton fc": "Everton",

    # Fulham
    "fulham": "Fulham",
    "fulham fc": "Fulham",

    # Leeds
    "leeds": "Leeds United",
    "leeds united": "Leeds United",
    "leeds united fc": "Leeds United",

    # Leicester
    "leicester": "Leicester City",
    "leicester city": "Leicester City",
    "leicester city fc": "Leicester City",

    # Liverpool
    "liverpool": "Liverpool",
    "liverpool fc": "Liverpool",
    "lfc": "Liverpool",

    # Manchester City
    "man city": "Manchester City",
    "manchester city": "Manchester City",
    "manchester city fc": "Manchester City",
    "mcfc": "Manchester City",

    # Manchester United
    "man utd": "Manchester United",
    "manchester utd": "Manchester United",
    "manchester united": "Manchester United",
    "manchester united fc": "Manchester United",
    "mufc": "Manchester United",

    # Newcastle
    "newcastle": "Newcastle United",
    "newcastle united": "Newcastle United",
    "newcastle united fc": "Newcastle United",

    # Norwich
    "norwich": "Norwich City",
    "norwich city": "Norwich City",
    "norwich city fc": "Norwich City",

    # Nottingham Forest
    "nottingham forest": "Nottingham Forest",
    "forest": "Nottingham Forest",  # football-context assumption

    # Sheffield United
    "sheffield united": "Sheffield United",
    "sheffield utd": "Sheffield United",

    # Southampton
    "southampton": "Southampton",
    "southampton fc": "Southampton",

    # Spurs / Tottenham
    "spurs": "Tottenham Hotspur",
    "tottenham": "Tottenham Hotspur",
    "tottenham hotspur": "Tottenham Hotspur",
    "tottenham hotspur fc": "Tottenham Hotspur",

    # Watford
    "watford": "Watford",
    "watford fc": "Watford",

    # West Ham
    "west ham": "West Ham United",
    "west ham united": "West Ham United",
    "west ham united fc": "West Ham United",

    # Wolves
    "wolves": "Wolverhampton Wanderers",
    "wolverhampton": "Wolverhampton Wanderers",
    "wolverhampton wanderers": "Wolverhampton Wanderers",
    
    # Other
    "west brom": "West Bromwich Albion",
    "cardiff": "Cardiff City",
    "swansea": "Swansea City",
    "huddersfield": "Huddersfield Town",
    "hull": "Hull City",
    "stoke": "Stoke City"
}

CLUB_ALIAS_MAP = {k.lower(): v for k, v in CLUB_ALIASES.items()}


def canonical_entity(name: str) -> str:
    """
    Normalize an entity string to a canonical club name when possible.
    Non-club entities (players, managers, orgs) are left as-is (stripped).
    """
    if not isinstance(name, str):
        return name
    key = name.strip().lower()
    return CLUB_ALIAS_MAP.get(key, name.strip())


# --- Normalize press_entities -----------------------------------------------

press_entities = press_entities.copy()
press_entities["entity_text_norm"] = press_entities["entity_text"].apply(canonical_entity)

display(
    press_entities[["doc_id", "entity_text", "entity_text_norm", "entity_label"]]
    .head(10)
)


# --- Normalize nodes in press_G (collapse aliases) --------------------------

import networkx as nx

def normalize_press_graph(G: nx.Graph) -> nx.Graph:
    newG = nx.Graph()

    for node, data in G.nodes(data=True):
        cn = canonical_entity(node)
        if cn not in newG:
            newG.add_node(cn, **data)

    for u, v, data in G.edges(data=True):
        cu = canonical_entity(u)
        cv = canonical_entity(v)
        if cu == cv:
            continue

        w = data.get("weight", 1)
        if newG.has_edge(cu, cv):
            newG[cu][cv]["weight"] += w
        else:
            newG.add_edge(cu, cv, weight=w)

    return newG


press_G = normalize_press_graph(press_G)

print(
    "Normalized press_G:",
    press_G.number_of_nodes(), "nodes,",
    press_G.number_of_edges(), "edges"
)


Unnamed: 0,doc_id,entity_text,entity_text_norm,entity_label
0,guardian_football/2016/sep/30/arsene-wenger-pe...,Allardyce,Allardyce,GPE
1,guardian_football/2016/sep/30/arsene-wenger-pe...,Allardyce,Allardyce,ORG
2,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsenal,Arsenal,ORG
3,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsenal Asked,Arsenal Asked,ORG
4,guardian_football/2016/sep/30/arsene-wenger-pe...,Arsène Wenger,Arsène Wenger,PERSON
5,guardian_football/2016/sep/30/arsene-wenger-pe...,BBC Panorama,BBC Panorama,ORG
6,guardian_football/2016/sep/30/arsene-wenger-pe...,Bolton Wanderers,Bolton Wanderers,PERSON
7,guardian_football/2016/sep/30/arsene-wenger-pe...,England,England,GPE
8,guardian_football/2016/sep/30/arsene-wenger-pe...,England’s Euro,England’s Euro,ORG
9,guardian_football/2016/sep/30/arsene-wenger-pe...,Euros,Euros,PERSON


Normalized press_G: 234444 nodes, 48831216 edges


save the results

In [8]:
press_entities.to_parquet(PRESS_ENTITIES_PATH, index=False)
with open(PRESS_GRAPH_PATH, "wb") as f:
    pickle.dump(press_G, f)

print("Saved normalized press_entities and press_G.")
print("Entities path:", PRESS_ENTITIES_PATH)
print("Graph path:", PRESS_GRAPH_PATH)


Saved normalized press_entities and press_G.
Entities path: C:\Users\dshog\Prem NLP Project\network_outputs\press_entities_2016_2025.parquet
Graph path: C:\Users\dshog\Prem NLP Project\network_outputs\press_graph_full.pkl


In [7]:
from pathlib import Path

# Force recompute press_footy_centrality with the normalized graph
path = Path(PRESS_FOOTY_CENTRAL_PATH)
if path.exists():
    path.unlink()
    print("Deleted old press_footy_centrality file so it can be recomputed.")
else:
    print("No existing press_footy_centrality file found; will compute fresh.")


Deleted old press_footy_centrality file so it can be recomputed.


Football-only press subgraph + centrality

In [9]:
def compute_light_centrality(G: nx.Graph, player_set, club_set) -> pd.DataFrame:
    components = list(nx.connected_components(G))
    comp_id_map = {}
    comp_size_map = {}
    for i, comp in enumerate(components):
        size = len(comp)
        for node in comp:
            comp_id_map[node] = i
            comp_size_map[node] = size

    deg = dict(G.degree())
    strength = dict(G.degree(weight="weight"))

    df = pd.DataFrame(
        {
            "node": list(G.nodes()),
            "degree": [deg[n] for n in G.nodes()],
            "strength": [strength[n] for n in G.nodes()],
            "component_id": [comp_id_map[n] for n in G.nodes()],
            "component_size": [comp_size_map[n] for n in G.nodes()],
        }
    )

    df["type"] = df["node"].apply(
        lambda x: "player" if x in player_set else ("club" if x in club_set else "other")
    )
    return df

if PRESS_FOOTY_CENTRAL_PATH.exists():
    press_footy_centrality = pd.read_parquet(PRESS_FOOTY_CENTRAL_PATH)
    print("Loaded press_footy_centrality:", press_footy_centrality.shape)
else:
    relevant_nodes = [
        n for n in press_G.nodes
        if n in player_set or n in club_set
    ]
    print("Football nodes in press_G:", len(relevant_nodes))

    press_footy_G = press_G.subgraph(relevant_nodes).copy()
    print(
        "press_footy_G:",
        press_footy_G.number_of_nodes(), "nodes,",
        press_footy_G.number_of_edges(), "edges"
    )

    press_footy_centrality = compute_light_centrality(press_footy_G, player_set, club_set)
    press_footy_centrality.to_parquet(PRESS_FOOTY_CENTRAL_PATH, index=False)
    print("Saved press_footy_centrality.")

display(
    press_footy_centrality
    .sort_values("strength", ascending=False)
    .head(10)
)


Football nodes in press_G: 1683
press_footy_G: 1683 nodes, 392764 edges
Saved press_footy_centrality.


Unnamed: 0,node,degree,strength,component_id,component_size,type
184,Tottenham Hotspur,1588,289136,0,1683,club
1335,Liverpool,1615,267780,0,1683,club
797,Chelsea,1607,249450,0,1683,club
348,Manchester United,1596,211129,0,1683,club
910,Manchester City,1603,194962,0,1683,club
48,Everton,1568,169552,0,1683,club
555,Arsenal,1570,149741,0,1683,club
473,Newcastle United,1555,127955,0,1683,club
1407,Brighton & Hove Albion,1514,122193,0,1683,club
1202,Aston Villa,1541,115807,0,1683,club


In [10]:
press_footy_centrality.query("node.str.contains('Liverpool')", engine="python").head()


Unnamed: 0,node,degree,strength,component_id,component_size,type
1335,Liverpool,1615,267780,0,1683,club


Load Reddit posts + comments and build corpus

In [11]:
def load_reddit_posts(posts_zip_path: Path) -> pd.DataFrame:
    posts = pd.read_csv(posts_zip_path, compression="zip")
    print(f"Loaded Reddit POSTS: {len(posts)} rows")
    display(posts.head())
    return posts

def load_reddit_comments(comments_zip_path: Path) -> pd.DataFrame:
    comments = pd.read_csv(comments_zip_path, compression="zip")
    print(f"Loaded Reddit COMMENTS: {len(comments)} rows")
    display(comments.head())
    return comments

def build_reddit_corpus(posts_zip_path: Path, comments_zip_path: Path) -> pd.DataFrame:
    posts    = load_reddit_posts(posts_zip_path)
    comments = load_reddit_comments(comments_zip_path)

    posts = posts.rename(
        columns={
            "id": "post_id",
            "title": "title_text",
            "selftext": "body_text",
        }
    )
    comments = comments.rename(
        columns={
            "comment_id": "comment_id",
            "body": "body_text",
        }
    )

    posts_doc = posts.copy()
    posts_doc["doc_id"] = "post_" + posts_doc["post_id"].astype(str)
    posts_doc["source"] = "reddit_post"
    posts_doc["full_text"] = (
        posts_doc["title_text"].fillna("") + "\n\n" +
        posts_doc["body_text"].fillna("")
    )

    comments_doc = comments.copy()
    comments_doc["doc_id"] = "comment_" + comments_doc["comment_id"].astype(str)
    comments_doc["source"] = "reddit_comment"
    comments_doc["title_text"] = ""
    comments_doc["full_text"]  = comments_doc["body_text"].fillna("")

    common_cols = [
        "doc_id", "source", "subreddit", "author", "created_iso",
        "season", "window", "title_text", "body_text", "full_text",
    ]

    for col in common_cols:
        if col not in posts_doc.columns:
            posts_doc[col] = None
        if col not in comments_doc.columns:
            comments_doc[col] = None

    posts_doc    = posts_doc[common_cols].copy()
    comments_doc = comments_doc[common_cols].copy()

    reddit_corpus = pd.concat([posts_doc, comments_doc], ignore_index=True)
    print("Unified Reddit corpus:", reddit_corpus.shape)
    display(
        reddit_corpus[["doc_id", "source", "subreddit", "season", "window"]]
        .head(10)
    )
    return reddit_corpus

reddit_corpus = build_reddit_corpus(REDDIT_POSTS_ZIP, REDDIT_COMMENTS_ZIP)


Loaded Reddit POSTS: 139871 rows


Unnamed: 0,post_id,subreddit,author,created_utc,created_iso,title,selftext,url,permalink,score,upvote_ratio,num_comments,link_flair_text,is_self,is_transfer_related,window,season
0,7uazvq,Brentford,Lard_Baron,1517415283,2018-01-31T16:14:43+00:00,Bournemouth return with improved offer for Bre...,,https://www.getwestlondon.co.uk/sport/football...,/r/Brentford/comments/7uazvq/bournemouth_retur...,3,,6,,False,True,winter,2017-18
1,8rqgrx,Brentford,Lard_Baron,1529235246,2018-06-17T11:34:06+00:00,Brentford 'reject £10m Bournemouth bid' for Ch...,,http://www.skysports.com/transfer/news/12691/1...,/r/Brentford/comments/8rqgrx/brentford_reject_...,7,,11,,False,True,summer,2018-19
2,8zsbwl,Brentford,entregg,1531888101,2018-07-18T04:28:21+00:00,Dean Smith accepts offer for Florian Jozefzoon,,https://twitter.com/BrentfordFC/status/1019326...,/r/Brentford/comments/8zsbwl/dean_smith_accept...,4,,3,,False,True,summer,2018-19
3,905isz,Brentford,[deleted],1532004319,2018-07-19T12:45:19+00:00,Bee's want £15m for Ollie Watkins. Crystal Pal...,[deleted],https://www.standard.co.uk/sport/football/bren...,/r/Brentford/comments/905isz/bees_want_15m_for...,5,,0,,False,True,summer,2018-19
4,9367tt,Brentford,peeps3298,1532972613,2018-07-30T17:43:33+00:00,Do you want to discuss the EFL? Transfers Play...,,https://discord.gg/CfreSjj,/r/Brentford/comments/9367tt/do_you_want_to_di...,2,,0,,False,True,summer,2018-19


Loaded Reddit COMMENTS: 6980109 rows


Unnamed: 0,comment_id,link_id,post_id,parent_id,subreddit,author,created_utc,created_iso,body,score,is_submitter,is_transfer_related_comment,window,season
0,ds4i021,t3_7ntrjl,7ntrjl,t3_7ntrjl,Brentford,jamesbm54,1514981537,2018-01-03T12:12:17+00:00,We can’t be disappointed by that though 9/12 p...,1,False,True,winter,2017-18
1,dspk7hp,t3_7qbzxm,7qbzxm,t1_dspjz9p,Brentford,JotathantheSun,1516018664,2018-01-15T12:17:44+00:00,"Yeah, completely agree. For someone nicknamed ...",1,False,True,winter,2017-18
2,dt9jclq,t3_7t2ldk,7t2ldk,t3_7t2ldk,Brentford,bixer25,1516957875,2018-01-26T09:11:15+00:00,I'm not sure if someone else Googling might ha...,1,False,True,winter,2017-18
3,dtivanw,t3_7uazvq,7uazvq,t3_7uazvq,Brentford,Lard_Baron,1517415328,2018-01-31T16:15:28+00:00,We hardly got to know him and he's taken away.,2,True,True,winter,2017-18
4,dtiw7nt,t3_7uazvq,7uazvq,t1_dtivanw,Brentford,pm_me_jk_dont,1517416221,2018-01-31T16:30:21+00:00,Not if it continues to be rejected,2,False,True,winter,2017-18


Unified Reddit corpus: (7119980, 10)


Unnamed: 0,doc_id,source,subreddit,season,window
0,post_7uazvq,reddit_post,Brentford,2017-18,winter
1,post_8rqgrx,reddit_post,Brentford,2018-19,summer
2,post_8zsbwl,reddit_post,Brentford,2018-19,summer
3,post_905isz,reddit_post,Brentford,2018-19,summer
4,post_9367tt,reddit_post,Brentford,2018-19,summer
5,post_94ccns,reddit_post,Brentford,2018-19,summer
6,post_953q3j,reddit_post,Brentford,2018-19,summer
7,post_95dnxm,reddit_post,Brentford,2018-19,summer
8,post_95e436,reddit_post,Brentford,2018-19,summer
9,post_95yeks,reddit_post,Brentford,2018-19,summer


In [47]:
REDDIT_CORPUS_PATH = OUT_DIR / "reddit_corpus_2018_2025.parquet"

reddit_corpus.to_parquet(REDDIT_CORPUS_PATH, index=False)
print("Saved reddit_corpus to:", REDDIT_CORPUS_PATH)


Saved reddit_corpus to: C:\Users\dshog\Prem NLP Project\network_outputs\reddit_corpus_2018_2025.parquet


Random Reddit sample (10,000 docs, random_state=42) + NER

In [None]:
REDDIT_SAMPLE_SIZE = 10000 

reddit_sample = reddit_corpus.sample(REDDIT_SAMPLE_SIZE, random_state=42)
print("Reddit sample size:", reddit_sample.shape)
display(reddit_sample.head())

def build_reddit_entity_graph(df, text_col="full_text", doc_id_col="doc_id"):
    """
    Run NER over Reddit sample and build a co-occurrence graph.
    Uses canonical_entity(...) so that club aliases (e.g., 'Liverpool FC', 'LFC')
    are collapsed into a single node in the graph.
    """
    doc_ids = df[doc_id_col].tolist()
    texts   = df[text_col].fillna("").tolist()

    rows = []
    G = nx.Graph()

    print(f"Running NER on {len(df)} Reddit documents (sample)...")
    for doc_id, ents in tqdm(zip(doc_ids, extract_entities(texts)), total=len(doc_ids)):
        raw_unique_ents = sorted(set(ents))

        canonical_ents = []
        for ent_text, ent_label in raw_unique_ents:
            norm_text = canonical_entity(ent_text)
            canonical_ents.append((ent_text, norm_text, ent_label))

            rows.append(
                {
                    "doc_id": doc_id,
                    "entity_text": ent_text,     
                    "entity_text_norm": norm_text, 
                    "entity_label": ent_label,
                }
            )

            if not G.has_node(norm_text):
                G.add_node(norm_text, label=ent_label)

        for (_, a_norm, _), (_, b_norm, _) in itertools.combinations(canonical_ents, 2):
            if a_norm == b_norm:
                continue
            if G.has_edge(a_norm, b_norm):
                G[a_norm][b_norm]["weight"] += 1
            else:
                G.add_edge(a_norm, b_norm, weight=1)

    entity_df = pd.DataFrame(rows)
    return entity_df, G

if REDDIT_ENTITIES_PATH.exists() and REDDIT_FOOTY_CENTRAL_PATH.exists():
    print("Loading existing Reddit entities and football centrality...")
    reddit_entities = pd.read_parquet(REDDIT_ENTITIES_PATH)
    reddit_footy_centrality = pd.read_parquet(REDDIT_FOOTY_CENTRAL_PATH)
else:
    reddit_entities, reddit_G = build_reddit_entity_graph(reddit_sample)

    print("Reddit entity graph:",
          reddit_G.number_of_nodes(), "nodes,",
          reddit_G.number_of_edges(), "edges")

    reddit_entities.to_parquet(REDDIT_ENTITIES_PATH, index=False)

    # Football-only Reddit subgraph
    relevant_nodes = [
        n for n in reddit_G.nodes
        if n in player_set or n in club_set
    ]
    print("Relevant (player/club) nodes in reddit_G:", len(relevant_nodes))

    reddit_footy_G = reddit_G.subgraph(relevant_nodes).copy()
    print(
        "Reddit football subgraph:",
        reddit_footy_G.number_of_nodes(), "nodes,",
        reddit_footy_G.number_of_edges(), "edges"
    )

    reddit_footy_centrality = compute_light_centrality(
        reddit_footy_G, player_set, club_set
    )
    reddit_footy_centrality.to_parquet(REDDIT_FOOTY_CENTRAL_PATH, index=False)
    print("Saved reddit_entities and reddit_footy_centrality.")

print("reddit_entities_sample:", reddit_entities.shape)
display(reddit_entities.head())

print("reddit_footy_centrality_sample:", reddit_footy_centrality.shape)
display(reddit_footy_centrality.head())


Reddit sample size: (10000, 10)


Unnamed: 0,doc_id,source,subreddit,author,created_iso,season,window,title_text,body_text,full_text
5526958,comment_ew8gjyf,reddit_comment,reddevils,K_Uger_Industries,2019-08-07T20:20:57+00:00,2019-20,summer,,He gained weight and turned into Ben Cheddar,He gained weight and turned into Ben Cheddar
5403456,comment_escxbmb,reddit_comment,reddevils,dWaldizzle,2019-06-29T20:49:46+00:00,2019-20,summer,,Neither would start over Shaw lol.,Neither would start over Shaw lol.
4089696,comment_j50qcon,reddit_comment,chelseafc,pointlessbanter1,2023-01-19T16:26:18+00:00,2022-23,winter,,"I’m not worried bro, Boehly is a mad man and h...","I’m not worried bro, Boehly is a mad man and h..."
1209817,comment_ib297zb,reddit_comment,Gunners,PierreEmerickMorgan,2022-06-03T17:53:04+00:00,2022-23,summer,,"When fit it's a great team, 80 points a season...","When fit it's a great team, 80 points a season..."
2530347,comment_epzo4rx,reddit_comment,LiverpoolFC,R3dbeardLFC,2019-06-04T17:30:21+00:00,2019-20,summer,,"No, I get that, but if he can't get the playin...","No, I get that, but if he can't get the playin..."


Loading existing Reddit entities and football centrality...
reddit_entities_sample: (13650, 4)


Unnamed: 0,doc_id,entity_text,entity_text_norm,entity_label
0,comment_ew8gjyf,Ben Cheddar,Ben Cheddar,PERSON
1,comment_escxbmb,Shaw,Shaw,ORG
2,comment_j50qcon,Boehly,Boehly,PERSON
3,comment_epzo4rx,SHOULD,SHOULD,ORG
4,comment_fege3kx,Fred,Fred,PERSON


reddit_footy_centrality_sample: (299, 6)


Unnamed: 0,node,degree,strength,component_id,component_size,type
0,Huddersfield,4,4,0,205,club
1,Ethan Ampadu,1,1,0,205,player
2,Yerry Mina,0,0,1,1,player
3,Joel Campbell,0,0,2,1,player
4,Tottenham Hotspur,58,178,0,205,club


In [None]:
if REDDIT_ENTITIES_PATH.exists() and REDDIT_FOOTY_CENTRAL_PATH.exists():
    print("Loading existing Reddit entities and football centrality...")
    reddit_entities = pd.read_parquet(REDDIT_ENTITIES_PATH)
    reddit_footy_centrality = pd.read_parquet(REDDIT_FOOTY_CENTRAL_PATH)
else:
    reddit_entities, reddit_G = build_reddit_entity_graph(reddit_sample)

    print("Reddit entity graph:",
          reddit_G.number_of_nodes(), "nodes,",
          reddit_G.number_of_edges(), "edges")

    reddit_entities.to_parquet(REDDIT_ENTITIES_PATH, index=False)

    relevant_nodes = [
        n for n in reddit_G.nodes
        if n in player_set or n in club_set
    ]
    print("Relevant (player/club) nodes in reddit_G:", len(relevant_nodes))

    reddit_footy_G = reddit_G.subgraph(relevant_nodes).copy()
    print(
        "Reddit football subgraph:",
        reddit_footy_G.number_of_nodes(), "nodes,",
        reddit_footy_G.number_of_edges(), "edges"
    )

    reddit_footy_centrality = compute_light_centrality(
        reddit_footy_G, player_set, club_set
    )
    reddit_footy_centrality.to_parquet(REDDIT_FOOTY_CENTRAL_PATH, index=False)
    print("Saved reddit_entities and reddit_footy_centrality.")

print("reddit_entities_sample:", reddit_entities.shape)
display(reddit_entities.head())

print("reddit_footy_centrality_sample:", reddit_footy_centrality.shape)
display(reddit_footy_centrality.head())


Loading existing Reddit entities and football centrality...
reddit_entities_sample: (13650, 4)


Unnamed: 0,doc_id,entity_text,entity_text_norm,entity_label
0,comment_ew8gjyf,Ben Cheddar,Ben Cheddar,PERSON
1,comment_escxbmb,Shaw,Shaw,ORG
2,comment_j50qcon,Boehly,Boehly,PERSON
3,comment_epzo4rx,SHOULD,SHOULD,ORG
4,comment_fege3kx,Fred,Fred,PERSON


reddit_footy_centrality_sample: (299, 6)


Unnamed: 0,node,degree,strength,component_id,component_size,type
0,Huddersfield,4,4,0,205,club
1,Ethan Ampadu,1,1,0,205,player
2,Yerry Mina,0,0,1,1,player
3,Joel Campbell,0,0,2,1,player
4,Tottenham Hotspur,58,178,0,205,club


In [14]:
def normalize_club_name(name: str) -> str:
    """
    Wrapper for canonical_entity so we can reuse the same normalization
    logic when working with club labels in tables.

    This keeps compatibility with existing code that expects
    normalize_club_name(...) to exist.
    """
    if not isinstance(name, str):
        return name
    return canonical_entity(name)


In [15]:
reddit_clubs = (
    reddit_footy_centrality
    .query("type == 'club'")
    .copy()
)

reddit_clubs["club_raw"] = reddit_clubs["node"].astype(str)
reddit_clubs["club"] = reddit_clubs["club_raw"].apply(normalize_club_name)

print("Reddit clubs (raw vs normalized) example after 10k sample:")
display(
    reddit_clubs[["club_raw", "club", "degree", "strength"]]
    .head(15)
)


Reddit clubs (raw vs normalized) example after 10k sample:


Unnamed: 0,club_raw,club,degree,strength
0,Huddersfield,Huddersfield Town,4,4
4,Tottenham Hotspur,Tottenham Hotspur,58,178
7,Montpellier,Montpellier,2,2
13,Fiorentina,Fiorentina,3,3
14,West Ham United,West Ham United,18,24
22,Sassuolo,Sassuolo,0,0
37,Inter,Inter,6,10
39,Brentford,Brentford,6,9
40,Blackburn,Blackburn,8,12
43,Peterborough,Peterborough,0,0


In [16]:
reddit_clubs[["club_raw", "club"]].drop_duplicates().head(30)


Unnamed: 0,club_raw,club
0,Huddersfield,Huddersfield Town
4,Tottenham Hotspur,Tottenham Hotspur
7,Montpellier,Montpellier
13,Fiorentina,Fiorentina
14,West Ham United,West Ham United
22,Sassuolo,Sassuolo
37,Inter,Inter
39,Brentford,Brentford
40,Blackburn,Blackburn
43,Peterborough,Peterborough


compute main PL coverage for Reddit

In [17]:
# Press clubs from updated press_footy_centrality
press_clubs = (
    press_footy_centrality
    .query("type == 'club'")
    .copy()
)

press_clubs["club_raw"] = press_clubs["node"].astype(str)
press_clubs["club"] = press_clubs["club_raw"].apply(normalize_club_name)

print("Press clubs (raw vs normalized):")
display(
    press_clubs[["club_raw", "club", "degree", "strength"]]
    .head(15)
)


Press clubs (raw vs normalized):


Unnamed: 0,club_raw,club,degree,strength
0,PSV Eindhoven,PSV Eindhoven,60,82
20,FC Emmen,FC Emmen,183,198
25,Vasco da Gama,Vasco da Gama,176,359
30,Alcorcón,Alcorcón,74,151
35,Southend United,Southend United,464,1617
36,Celta de Vigo,Celta de Vigo,40,67
42,Lincoln City,Lincoln City,817,5954
48,Everton,Everton,1568,169552
50,Partizan,Partizan,325,946
53,Paços Ferreira,Paços Ferreira,28,35


In [18]:
# Canonical set of "main" Premier League clubs
pl_main_clubs = {
    "Arsenal",
    "Aston Villa",
    "AFC Bournemouth",
    "Brentford",
    "Brighton & Hove Albion",
    "Burnley",
    "Chelsea",
    "Crystal Palace",
    "Everton",
    "Fulham",
    "Leeds United",
    "Leicester City",
    "Liverpool",
    "Manchester City",
    "Manchester United",
    "Newcastle United",
    "Norwich City",
    "Nottingham Forest",
    "Sheffield United",
    "Southampton",
    "Tottenham Hotspur",
    "Watford",
    "West Ham United",
    "Wolverhampton Wanderers",
}


In [19]:
clubs_press      = set(press_clubs["club"])
clubs_reddit     = set(reddit_clubs["club"])
clubs_transfers  = {normalize_club_name(c) for c in club_set}

pl_main_in_transfers = pl_main_clubs & clubs_transfers
pl_main_in_press     = pl_main_clubs & clubs_press
pl_main_in_reddit    = pl_main_clubs & clubs_reddit

print("Main PL clubs in transfers:", len(pl_main_in_transfers))
print("Main PL clubs in PRESS graph:", len(pl_main_in_press))
print("Main PL clubs in REDDIT graph (10k):", len(pl_main_in_reddit))

print("\nMain PL clubs missing from REDDIT graph (10k):")
print(sorted(pl_main_in_transfers - pl_main_in_reddit))


Main PL clubs in transfers: 24
Main PL clubs in PRESS graph: 24
Main PL clubs in REDDIT graph (10k): 23

Main PL clubs missing from REDDIT graph (10k):
['Crystal Palace']


Clubs in both networks

In [20]:
# Clubs that are main PL AND appear in both press + Reddit graphs
clubs_both = pl_main_in_press & pl_main_in_reddit
print("Clubs in BOTH press & Reddit:", len(clubs_both))
sorted(list(clubs_both))

Clubs in BOTH press & Reddit: 23


['AFC Bournemouth',
 'Arsenal',
 'Aston Villa',
 'Brentford',
 'Brighton & Hove Albion',
 'Burnley',
 'Chelsea',
 'Everton',
 'Fulham',
 'Leeds United',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Norwich City',
 'Nottingham Forest',
 'Sheffield United',
 'Southampton',
 'Tottenham Hotspur',
 'Watford',
 'West Ham United',
 'Wolverhampton Wanderers']

In [21]:
display(press_clubs[["club_raw", "club", "degree", "strength"]].head(10))
display(reddit_clubs[["club_raw", "club", "degree", "strength"]].head(10))


Unnamed: 0,club_raw,club,degree,strength
0,PSV Eindhoven,PSV Eindhoven,60,82
20,FC Emmen,FC Emmen,183,198
25,Vasco da Gama,Vasco da Gama,176,359
30,Alcorcón,Alcorcón,74,151
35,Southend United,Southend United,464,1617
36,Celta de Vigo,Celta de Vigo,40,67
42,Lincoln City,Lincoln City,817,5954
48,Everton,Everton,1568,169552
50,Partizan,Partizan,325,946
53,Paços Ferreira,Paços Ferreira,28,35


Unnamed: 0,club_raw,club,degree,strength
0,Huddersfield,Huddersfield Town,4,4
4,Tottenham Hotspur,Tottenham Hotspur,58,178
7,Montpellier,Montpellier,2,2
13,Fiorentina,Fiorentina,3,3
14,West Ham United,West Ham United,18,24
22,Sassuolo,Sassuolo,0,0
37,Inter,Inter,6,10
39,Brentford,Brentford,6,9
40,Blackburn,Blackburn,8,12
43,Peterborough,Peterborough,0,0


In [None]:
import pandas as pd

# 1) Aggregate PRESS by canonical club
press_agg = (
    press_clubs
    .groupby("club", as_index=False)
    .agg(
        press_degree=("degree", "sum"),
        press_strength=("strength", "sum"),
        # keep track of which raw names got merged
        club_raw_press=("club_raw", lambda s: ", ".join(sorted(set(s)))),
        press_component_id=("component_id", "max"),
        press_component_size=("component_size", "max"),
    )
)

print("press_agg shape:", press_agg.shape)
display(press_agg.head())


press_agg shape: (353, 6)


Unnamed: 0,club,press_degree,press_strength,club_raw_press,press_component_id,press_component_size
0,AC Milan,944,7366,AC Milan,0,1683
1,ADO Den Haag,43,74,ADO Den Haag,0,1683
2,AEK Athens,429,1238,AEK Athens,0,1683
3,AFC Bournemouth,1508,72620,AFC Bournemouth,0,1683
4,AFC Wimbledon,872,6580,AFC Wimbledon,0,1683


In [23]:
# 2) Aggregate REDDIT by canonical club 

reddit_agg = ( 
    reddit_clubs 
    .groupby("club", as_index=False) 
    .agg( 
        reddit_degree=("degree", "sum"), 
        reddit_strength=("strength", "sum"), 
        club_raw_reddit=("club_raw", lambda s: ", ".join(sorted(set(s)))), 
        reddit_component_id=("component_id", "max"), 
        reddit_component_size=("component_size", "max"), ) ) 

print("reddit_agg shape:", reddit_agg.shape) 
display(reddit_agg.head())

reddit_agg shape: (96, 6)


Unnamed: 0,club,reddit_degree,reddit_strength,club_raw_reddit,reddit_component_id,reddit_component_size
0,AC Milan,8,8,AC Milan,0,205
1,AFC Bournemouth,23,32,AFC Bournemouth,0,205
2,Ajax,17,22,Ajax,0,205
3,Arsenal,48,136,Arsenal,0,205
4,Aston Villa,14,28,Aston Villa,0,205


In [None]:
# 3) Clubs present in both networks + in your main PL list
clubs_both = (
    set(press_agg["club"])
    & set(reddit_agg["club"])
    & pl_main_clubs 
)
print("Clubs in BOTH networks + main PL:", len(clubs_both))
print(sorted(list(clubs_both)))

Clubs in BOTH networks + main PL: 23
['AFC Bournemouth', 'Arsenal', 'Aston Villa', 'Brentford', 'Brighton & Hove Albion', 'Burnley', 'Chelsea', 'Everton', 'Fulham', 'Leeds United', 'Leicester City', 'Liverpool', 'Manchester City', 'Manchester United', 'Newcastle United', 'Norwich City', 'Nottingham Forest', 'Sheffield United', 'Southampton', 'Tottenham Hotspur', 'Watford', 'West Ham United', 'Wolverhampton Wanderers']


In [25]:
# 4) Merge aggregated press + reddit tables
club_compare_basic = press_agg.merge(
    reddit_agg,
    on="club",
    how="outer"
)

In [26]:
# 5) Normalize strengths to [0, 1] *after* aggregation
for col in ["press_strength", "reddit_strength"]:
    max_val = club_compare_basic[col].max()
    club_compare_basic[col + "_norm"] = club_compare_basic[col] / max_val

print("club_compare_basic shape:", club_compare_basic.shape)
display(
    club_compare_basic
    .sort_values("press_strength_norm", ascending=False)
    .head(15)[
        ["club", "club_raw_press", "club_raw_reddit",
         "press_strength", "press_strength_norm",
         "reddit_strength", "reddit_strength_norm"]
    ]
)

club_compare_basic shape: (353, 13)


Unnamed: 0,club,club_raw_press,club_raw_reddit,press_strength,press_strength_norm,reddit_strength,reddit_strength_norm
310,Tottenham Hotspur,Tottenham Hotspur,Tottenham Hotspur,289136,1.0,178.0,0.618056
181,Liverpool,Liverpool,Liverpool,267780,0.926139,273.0,0.947917
67,Chelsea,Chelsea,Chelsea,249450,0.862743,288.0,1.0
189,Manchester United,Manchester United,Manchester United,211129,0.730207,75.0,0.260417
188,Manchester City,Manchester City,Manchester City,194962,0.674292,121.0,0.420139
98,Everton,Everton,Everton,169552,0.586409,93.0,0.322917
19,Arsenal,Arsenal,Arsenal,149741,0.517891,136.0,0.472222
207,Newcastle United,Newcastle United,Newcastle United,127955,0.442543,91.0,0.315972
50,Brighton & Hove Albion,Brighton & Hove Albion,Brighton & Hove Albion,122193,0.422614,112.0,0.388889
20,Aston Villa,Aston Villa,Aston Villa,115807,0.400528,28.0,0.097222


In [27]:
# 6) Final network table: only the clubs in both networks
club_network = club_compare_basic[
    club_compare_basic["club"].isin(clubs_both)
].copy()

print("club_network shape:", club_network.shape)
display(
    club_network[
        ["club", "press_strength_norm", "reddit_strength_norm"]
    ].sort_values("press_strength_norm", ascending=False)
)


club_network shape: (23, 13)


Unnamed: 0,club,press_strength_norm,reddit_strength_norm
310,Tottenham Hotspur,1.0,0.618056
181,Liverpool,0.926139,0.947917
67,Chelsea,0.862743,1.0
189,Manchester United,0.730207,0.260417
188,Manchester City,0.674292,0.420139
98,Everton,0.586409,0.322917
19,Arsenal,0.517891,0.472222
207,Newcastle United,0.442543,0.315972
50,Brighton & Hove Albion,0.422614,0.388889
20,Aston Villa,0.400528,0.097222


Connectivity metrics on club_network

In [28]:
import numpy as np

# 1) Giant-component style connectivity: how big is the component a club sits in?

max_press_comp_size  = club_network["press_component_size"].max()
max_reddit_comp_size = club_network["reddit_component_size"].max()

club_network["press_comp_share"] = (
    club_network["press_component_size"] / max_press_comp_size
)
club_network["reddit_comp_share"] = (
    club_network["reddit_component_size"] / max_reddit_comp_size
)

club_network["press_in_giant"] = club_network["press_component_size"] == max_press_comp_size
club_network["reddit_in_giant"] = club_network["reddit_component_size"] == max_reddit_comp_size

In [29]:
# 2) Z-scores of normalized strengths (how far above/below average)

for col in ["press_strength_norm", "reddit_strength_norm"]:
    mean_val = club_network[col].mean()
    std_val  = club_network[col].std(ddof=0)  # population std; ddof=1 also fine
    z_col    = col.replace("_norm", "_z")     # e.g. press_strength_z

    if std_val == 0:
        club_network[z_col] = 0.0
    else:
        club_network[z_col] = (club_network[col] - mean_val) / std_val

In [30]:
# 3) Press vs Reddit narrative balance (ratio of centrality)

club_network["press_to_reddit_strength_ratio"] = np.where(
    club_network["reddit_strength_norm"] > 0,
    club_network["press_strength_norm"] / club_network["reddit_strength_norm"],
    np.nan,
)

print("club_network with connectivity metrics:", club_network.shape)
display(
    club_network[
        [
            "club",
            "press_strength_norm",
            "reddit_strength_norm",
            "press_strength_z",
            "reddit_strength_z",
            "press_comp_share",
            "reddit_comp_share",
            "press_in_giant",
            "reddit_in_giant",
            "press_to_reddit_strength_ratio",
        ]
    ]
    .sort_values("press_strength_norm", ascending=False)
    .head(15)
)

club_network with connectivity metrics: (23, 20)


Unnamed: 0,club,press_strength_norm,reddit_strength_norm,press_strength_z,reddit_strength_z,press_comp_share,reddit_comp_share,press_in_giant,reddit_in_giant,press_to_reddit_strength_ratio
310,Tottenham Hotspur,1.0,0.618056,2.343285,1.320838,1.0,1.0,True,True,1.617978
181,Liverpool,0.926139,0.947917,2.033982,2.528888,1.0,1.0,True,True,0.977025
67,Chelsea,0.862743,1.0,1.768504,2.719633,1.0,1.0,True,True,0.862743
189,Manchester United,0.730207,0.260417,1.213493,0.011058,1.0,1.0,True,True,2.803993
188,Manchester City,0.674292,0.420139,0.979343,0.596008,1.0,1.0,True,True,1.604926
98,Everton,0.586409,0.322917,0.611324,0.239951,1.0,1.0,True,True,1.815977
19,Arsenal,0.517891,0.472222,0.324397,0.786753,1.0,1.0,True,True,1.096711
207,Newcastle United,0.442543,0.315972,0.008866,0.214519,1.0,1.0,True,True,1.400574
50,Brighton & Hove Albion,0.422614,0.388889,-0.074587,0.481561,1.0,1.0,True,True,1.086722
20,Aston Villa,0.400528,0.097222,-0.167077,-0.586609,1.0,1.0,True,True,4.119714


Create the club_network

In [31]:
from pathlib import Path

outputs_dir = BASE_DIR / "outputs"
outputs_dir.mkdir(parents=True, exist_ok=True)

network_out_path = outputs_dir / "club_network_centrality_2016_2022.csv"

# Delete file if it already exists
if network_out_path.exists():
    network_out_path.unlink()  # remove the file

# Now save again
club_network.to_csv(network_out_path, index=False)
print("Saved club_network to:", network_out_path)


Saved club_network to: C:\Users\dshog\Prem NLP Project\outputs\club_network_centrality_2016_2022.csv


In [32]:
import pandas as pd

# BASE_DIR already comes from get_project_root() in the first cell

club_network_path = BASE_DIR / "outputs" / "club_network_centrality_2016_2022.csv"

club_network = pd.read_csv(club_network_path)

print(club_network_path)
print(club_network.shape)
display(club_network.head())



C:\Users\dshog\Prem NLP Project\outputs\club_network_centrality_2016_2022.csv
(23, 20)


Unnamed: 0,club,press_degree,press_strength,club_raw_press,press_component_id,press_component_size,reddit_degree,reddit_strength,club_raw_reddit,reddit_component_id,reddit_component_size,press_strength_norm,reddit_strength_norm,press_comp_share,reddit_comp_share,press_in_giant,reddit_in_giant,press_strength_z,reddit_strength_z,press_to_reddit_strength_ratio
0,AFC Bournemouth,1508,72620,AFC Bournemouth,0,1683,23.0,32.0,AFC Bournemouth,0.0,205.0,0.251162,0.111111,1.0,1.0,True,True,-0.792563,-0.535744,2.260459
1,Arsenal,1570,149741,Arsenal,0,1683,48.0,136.0,Arsenal,0.0,205.0,0.517891,0.472222,1.0,1.0,True,True,0.324397,0.786753,1.096711
2,Aston Villa,1541,115807,Aston Villa,0,1683,14.0,28.0,Aston Villa,0.0,205.0,0.400528,0.097222,1.0,1.0,True,True,-0.167077,-0.586609,4.119714
3,Brentford,1435,73189,Brentford,0,1683,6.0,9.0,Brentford,0.0,205.0,0.25313,0.03125,1.0,1.0,True,True,-0.784322,-0.828219,8.10016
4,Brighton & Hove Albion,1514,122193,Brighton & Hove Albion,0,1683,31.0,112.0,Brighton & Hove Albion,0.0,205.0,0.422614,0.388889,1.0,1.0,True,True,-0.074587,0.481561,1.086722


Build transfer features (premier-league.csv)

In [33]:
TRANSFER_PATH = BASE_DIR / "transfer_data" / "premier-league.csv"
prem_tx = pd.read_csv(TRANSFER_PATH)

# Use 2016+ to match your network window
prem_tx = prem_tx[prem_tx["year"] >= 2016].copy()

# Canonical club name
prem_tx["club"] = (
    prem_tx["club_name"]
    .astype(str)
    .str.strip()
    .apply(normalize_club_name)
)


In [34]:
import numpy as np
fee_col = "fee_cleaned" if "fee_cleaned" in prem_tx.columns else "fee"
prem_tx["fee_m"] = prem_tx[fee_col].fillna(0.0)

mov = prem_tx["transfer_movement"].astype(str).str.lower()
prem_tx["spend_eur_m"]  = np.where(mov.eq("in"),  prem_tx["fee_m"], 0.0)
prem_tx["income_eur_m"] = np.where(mov.eq("out"), prem_tx["fee_m"], 0.0)


In [35]:
club_transfers = (
    prem_tx
    .groupby("club", as_index=False)
    .agg(
        n_transfers=("player_name", "count"),
        total_spend_eur_m=("spend_eur_m", "sum"),
        total_income_eur_m=("income_eur_m", "sum"),
    )
)

club_transfers["net_spend_eur_m"] = (
    club_transfers["total_spend_eur_m"] - club_transfers["total_income_eur_m"]
)


In [36]:
display(
    club_transfers[club_transfers["club"].isin(club_network["club"])]
    .sort_values("net_spend_eur_m", ascending=False)
    .head(10)
)


Unnamed: 0,club,n_transfers,total_spend_eur_m,total_income_eur_m,net_spend_eur_m
17,Manchester United,195,1167.99,266.18,901.81
7,Chelsea,382,1623.79,791.216,832.574
1,Arsenal,212,952.58,309.45,643.13
16,Manchester City,342,1227.91,587.95,639.96
19,Newcastle United,202,550.0,110.495,439.505
30,West Ham United,210,684.2,250.46,433.74
27,Tottenham Hotspur,162,739.8,312.12,427.68
31,Wolverhampton Wanderers,263,529.957,218.77,311.187
2,Aston Villa,157,490.81,186.77,304.04
15,Liverpool,209,748.1,483.15,264.95


Build finance features (Premier_League_Finances.xlsx)

In [37]:
FIN_PATH = BASE_DIR / "Premier_League_Finances.xlsx"
fin_raw = pd.read_excel(FIN_PATH)

print("Finances shape:", fin_raw.shape)
print("Finance columns:", fin_raw.columns.tolist())
display(fin_raw.head())

metric_col = fin_raw.columns[0]    # e.g., 'Unnamed: 0'
value_cols = fin_raw.columns[1:]

fin_long = fin_raw.melt(
    id_vars=[metric_col],
    value_vars=value_cols,
    var_name="club_raw_fin",
    value_name="value"
)

fin_long["club"] = fin_long["club_raw_fin"].astype(str).apply(normalize_club_name)

fin_pivot = (
    fin_long
    .pivot_table(
        index="club",
        columns=metric_col,
        values="value",
        aggfunc="first"
    )
    .reset_index()
)

fin_pivot.columns.name = None
fin_pivot = fin_pivot.rename(
    columns={
        "Revenue": "revenue",
        "Operating Expenses": "operating_expenses",
        "Operating Profit": "operating_profit",
        "Player Trading Profit": "player_trading_profit",
    }
)

print("fin_pivot:", fin_pivot.shape)
display(
    fin_pivot[fin_pivot["club"].isin(club_network["club"])]
)


Finances shape: (4, 30)
Finance columns: ['Unnamed: 0', 'Arsenal', 'Aston Villa', 'AFC Bournemouth', 'Brentford', 'Brighton & Hove Albion', 'Burnley', 'Cardiff City', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield Town', 'Ipswich Town', 'Leeds United', 'Leicester City', 'Liverpool', 'Luton Town', 'Manchester City', 'Manchester United', 'Newcastle United', 'Norwich City', 'Nottingham Forest', 'Sheffield United', 'Southampton', 'Tottenham Hotspur', 'Watford', 'West Bromwich Albion', 'West Ham United', 'Wolverhampton Wanderers']


Unnamed: 0.1,Unnamed: 0,Arsenal,Aston Villa,AFC Bournemouth,Brentford,Brighton & Hove Albion,Burnley,Cardiff City,Chelsea,Crystal Palace,...,Newcastle United,Norwich City,Nottingham Forest,Sheffield United,Southampton,Tottenham Hotspur,Watford,West Bromwich Albion,West Ham United,Wolverhampton Wanderers
0,Revenue,435000,236000,65584,172000,257200,135200,24200,586000,222642,...,220000,96310,128000,74600,167300,444000,80000,80000,312000,204000
1,Operating Expenses,544136,327500,212000,117000,269000,172100,61902,891400,219600,...,193060,202500,180570,110230,192600,607000,193040,93035,319000,332715
2,Operating Profit,-76826,-107500,-70000,25100,-50000,48800,-36929,-148800,-29520,...,-92492,3645,-65860,-27870,-5214,-63000,-30480,-15240,27000,-87726
3,Player Trading Profit,29970,120189,21027,5900,24100,65000,5200,154000,-97400,...,-90000,-26700,5000,13700,-4600,24100,19100,21500,1400,-64200


fin_pivot: (29, 5)


Unnamed: 0,club,operating_expenses,operating_profit,player_trading_profit,revenue
0,AFC Bournemouth,212000,-70000,21027,65584
1,Arsenal,544136,-76826,29970,435000
2,Aston Villa,327500,-107500,120189,236000
3,Brentford,117000,25100,5900,172000
4,Brighton & Hove Albion,269000,-50000,24100,257200
5,Burnley,172100,48800,65000,135200
7,Chelsea,891400,-148800,154000,586000
9,Everton,270200,-38000,-15600,246558
10,Fulham,144300,-72900,-15900,88975
13,Leeds United,246200,-41800,1200,234260


Build performance features (fixtures CSV)

Creating performance features from fixture

In [38]:
FIX_PATH = BASE_DIR / "Fixture data 2019-2025" / "fbref_pl_fixtures_2019_2025.csv"
fix = pd.read_csv(FIX_PATH)

print("Fixtures shape:", fix.shape)
print(fix.columns.tolist())
display(fix.head())

# Normalize club names
fix["home_club"] = fix["home"].astype(str).apply(normalize_club_name)
fix["away_club"] = fix["away"].astype(str).apply(normalize_club_name)

# Points for home team
fix["home_points"] = np.select(
    [
        fix["home_goals"] > fix["away_goals"],
        fix["home_goals"] == fix["away_goals"],
        fix["home_goals"] < fix["away_goals"],
    ],
    [3, 1, 0],
    default=0,
)

# Points for away team
fix["away_points"] = np.select(
    [
        fix["away_goals"] > fix["home_goals"],
        fix["away_goals"] == fix["home_goals"],
        fix["away_goals"] < fix["home_goals"],
    ],
    [3, 1, 0],
    default=0,
)

# Long form: one row per (club, match, season)
home_rows = fix[["season", "home_club", "home_goals", "away_goals", "home_points"]].copy()
home_rows = home_rows.rename(
    columns={
        "home_club": "club",
        "home_goals": "goals_for",
        "away_goals": "goals_against",
        "home_points": "points",
    }
)

away_rows = fix[["season", "away_club", "away_goals", "home_goals", "away_points"]].copy()
away_rows = away_rows.rename(
    columns={
        "away_club": "club",
        "away_goals": "goals_for",
        "home_goals": "goals_against",
        "away_points": "points",
    }
)

fix_long = pd.concat([home_rows, away_rows], ignore_index=True)

# Season-level club performance
club_perf = (
    fix_long
    .groupby(["club", "season"], as_index=False)
    .agg(
        matches=("points", "size"),
        total_points=("points", "sum"),
        goals_for=("goals_for", "sum"),
        goals_against=("goals_against", "sum"),
    )
)

club_perf["points_per_game"] = club_perf["total_points"] / club_perf["matches"]
club_perf["goal_diff"] = club_perf["goals_for"] - club_perf["goals_against"]

print("club_perf:", club_perf.shape)
display(
    club_perf[club_perf["club"].isin(club_network["club"])]
    .sort_values(["season", "points_per_game"], ascending=[False, False])
    .head(10)
)


Fixtures shape: (2660, 9)
['date', 'home', 'away', 'home_goals', 'away_goals', 'attendance', 'venue', 'season', 'game_id']


Unnamed: 0,date,home,away,home_goals,away_goals,attendance,venue,season,game_id
0,2019-08-09,Liverpool,Norwich City,4.0,1.0,53333.0,Anfield,1920,928467bd
1,2019-08-10,Bournemouth,Sheffield Utd,1.0,1.0,10714.0,Vitality Stadium,1920,d402cacd
2,2019-08-10,Burnley,Southampton,3.0,0.0,19784.0,Turf Moor,1920,34b99058
3,2019-08-10,Crystal Palace,Everton,0.0,0.0,25151.0,Selhurst Park,1920,a802f51e
4,2019-08-10,Tottenham,Aston Villa,3.0,1.0,60407.0,Tottenham Hotspur Stadium,1920,404ee5d3


club_perf: (140, 8)


Unnamed: 0,club,season,matches,total_points,goals_for,goals_against,points_per_game,goal_diff
11,Arsenal,2526,38,25,18.0,3.0,0.657895,15.0
86,Manchester City,2526,38,19,20.0,8.0,0.5,12.0
4,AFC Bournemouth,2526,38,18,17.0,14.0,0.473684,3.0
78,Liverpool,2526,38,18,18.0,14.0,0.473684,4.0
42,Chelsea,2526,38,17,18.0,11.0,0.447368,7.0
93,Manchester United,2526,38,17,17.0,16.0,0.447368,1.0
122,Tottenham Hotspur,2526,38,17,17.0,8.0,0.447368,9.0
18,Aston Villa,2526,38,15,9.0,10.0,0.394737,-1.0
30,Brighton & Hove Albion,2526,38,15,17.0,15.0,0.394737,2.0
23,Brentford,2526,38,13,14.0,16.0,0.342105,-2.0


Create Season label

In [39]:
def format_season(code):
    s = str(int(code)).zfill(4)   # e.g. 1920 -> "1920"
    return f"{s[:2]}/{s[2:]}"     # "19/20"

club_perf["season_label"] = club_perf["season"].apply(format_season)

display(
    club_perf
    .sort_values(["season", "club"])
    .head(10)
)


Unnamed: 0,club,season,matches,total_points,goals_for,goals_against,points_per_game,goal_diff,season_label
0,AFC Bournemouth,1920,38,34,40.0,65.0,0.894737,-25.0,19/20
5,Arsenal,1920,38,56,56.0,48.0,1.473684,8.0,19/20
12,Aston Villa,1920,38,35,41.0,67.0,0.921053,-26.0,19/20
24,Brighton & Hove Albion,1920,38,41,39.0,54.0,1.078947,-15.0,19/20
31,Burnley,1920,38,54,43.0,50.0,1.421053,-7.0,19/20
36,Chelsea,1920,38,66,69.0,54.0,1.736842,15.0,19/20
43,Crystal Palace,1920,38,43,31.0,50.0,1.131579,-19.0,19/20
50,Everton,1920,38,49,44.0,56.0,1.289474,-12.0,19/20
67,Leicester City,1920,38,62,67.0,41.0,1.631579,26.0,19/20
72,Liverpool,1920,38,99,85.0,33.0,2.605263,52.0,19/20


Merge everything

In [40]:
from pathlib import Path

# ---- Club-level static features (network + transfers + finance) ----
# Restrict transfers/finance to clubs that are in the network table
club_transfers_sub = club_transfers[
    club_transfers["club"].isin(club_network["club"])
].copy()

fin_sub = fin_pivot[
    fin_pivot["club"].isin(club_network["club"])
].copy()

print("club_transfers_sub:", club_transfers_sub.shape)
print("fin_sub:", fin_sub.shape)

# Merge: network centrality + transfers + finance
club_features = (
    club_network
    .merge(
        club_transfers_sub[
            ["club",
             "n_transfers",
             "total_spend_eur_m",
             "total_income_eur_m",
             "net_spend_eur_m"]
        ],
        on="club",
        how="left"
    )
    .merge(
        fin_sub,
        on="club",
        how="left"
    )
)

print("club_features:", club_features.shape)
display(
    club_features
    .sort_values("press_strength_norm", ascending=False)
    .head(15)
)

club_transfers_sub: (23, 5)
fin_sub: (23, 5)
club_features: (23, 28)


Unnamed: 0,club,press_degree,press_strength,club_raw_press,press_component_id,press_component_size,reddit_degree,reddit_strength,club_raw_reddit,reddit_component_id,...,reddit_strength_z,press_to_reddit_strength_ratio,n_transfers,total_spend_eur_m,total_income_eur_m,net_spend_eur_m,operating_expenses,operating_profit,player_trading_profit,revenue
19,Tottenham Hotspur,1588,289136,Tottenham Hotspur,0,1683,58.0,178.0,Tottenham Hotspur,0.0,...,1.320838,1.617978,162,739.8,312.12,427.68,607000,-63000,24100,444000
11,Liverpool,1615,267780,Liverpool,0,1683,82.0,273.0,Liverpool,0.0,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
6,Chelsea,1607,249450,Chelsea,0,1683,72.0,288.0,Chelsea,0.0,...,2.719633,0.862743,382,1623.79,791.216,832.574,891400,-148800,154000,586000
13,Manchester United,1596,211129,Manchester United,0,1683,44.0,75.0,Manchester United,0.0,...,0.011058,2.803993,195,1167.99,266.18,901.81,667778,-115510,53800,700000
12,Manchester City,1603,194962,Manchester City,0,1683,53.0,121.0,Manchester City,0.0,...,0.596008,1.604926,342,1227.91,587.95,639.96,778000,-22000,75178,815000
7,Everton,1568,169552,Everton,0,1683,42.0,93.0,Everton,0.0,...,0.239951,1.815977,264,708.07,457.66,250.41,270200,-38000,-15600,246558
1,Arsenal,1570,149741,Arsenal,0,1683,48.0,136.0,Arsenal,0.0,...,0.786753,1.096711,212,952.58,309.45,643.13,544136,-76826,29970,435000
14,Newcastle United,1555,127955,Newcastle United,0,1683,43.0,91.0,Newcastle United,0.0,...,0.214519,1.400574,202,550.0,110.495,439.505,193060,-92492,-90000,220000
4,Brighton & Hove Albion,1514,122193,Brighton & Hove Albion,0,1683,31.0,112.0,Brighton & Hove Albion,0.0,...,0.481561,1.086722,305,383.32,259.52,123.8,269000,-50000,24100,257200
2,Aston Villa,1541,115807,Aston Villa,0,1683,14.0,28.0,Aston Villa,0.0,...,-0.586609,4.119714,157,490.81,186.77,304.04,327500,-107500,120189,236000


In [41]:
club_perf_sub = club_perf[
    club_perf["club"].isin(club_features["club"])
].copy()

print("club_perf_sub:", club_perf_sub.shape)

club_season_panel = (
    club_perf_sub
    .merge(
        club_features,
        on="club",
        how="left"
    )
)

print("club_season_panel:", club_season_panel.shape)
display(
    club_season_panel
    .sort_values(["season", "points_per_game"], ascending=[False, False])
    .head(15)
)

club_perf_sub: (118, 9)
club_season_panel: (118, 36)


Unnamed: 0,club,season,matches,total_points,goals_for,goals_against,points_per_game,goal_diff,season_label,press_degree,...,reddit_strength_z,press_to_reddit_strength_ratio,n_transfers,total_spend_eur_m,total_income_eur_m,net_spend_eur_m,operating_expenses,operating_profit,player_trading_profit,revenue
11,Arsenal,2526,38,25,18.0,3.0,0.657895,15.0,25/26,1570,...,0.786753,1.096711,212,952.58,309.45,643.13,544136,-76826,29970,435000
77,Manchester City,2526,38,19,20.0,8.0,0.5,12.0,25/26,1603,...,0.596008,1.604926,342,1227.91,587.95,639.96,778000,-22000,75178,815000
4,AFC Bournemouth,2526,38,18,17.0,14.0,0.473684,3.0,25/26,1508,...,-0.535744,2.260459,162,302.1,78.515,223.585,212000,-70000,21027,65584
70,Liverpool,2526,38,18,18.0,14.0,0.473684,4.0,25/26,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
42,Chelsea,2526,38,17,18.0,11.0,0.447368,7.0,25/26,1607,...,2.719633,0.862743,382,1623.79,791.216,832.574,891400,-148800,154000,586000
84,Manchester United,2526,38,17,17.0,16.0,0.447368,1.0,25/26,1596,...,0.011058,2.803993,195,1167.99,266.18,901.81,667778,-115510,53800,700000
101,Tottenham Hotspur,2526,38,17,17.0,8.0,0.447368,9.0,25/26,1588,...,1.320838,1.617978,162,739.8,312.12,427.68,607000,-63000,24100,444000
18,Aston Villa,2526,38,15,9.0,10.0,0.394737,-1.0,25/26,1541,...,-0.586609,4.119714,157,490.81,186.77,304.04,327500,-107500,120189,236000
30,Brighton & Hove Albion,2526,38,15,17.0,15.0,0.394737,2.0,25/26,1514,...,0.481561,1.086722,305,383.32,259.52,123.8,269000,-50000,24100,257200
23,Brentford,2526,38,13,14.0,16.0,0.342105,-2.0,25/26,1435,...,-0.828219,8.10016,68,88.2,4.6,83.6,117000,25100,5900,172000


In [42]:
print("unique clubs in club_network:", club_network["club"].nunique())
print("rows in club_features:", club_features.shape[0])

print(
    club_features["club"].value_counts()
    .head()
)


unique clubs in club_network: 23
rows in club_features: 23
club
AFC Bournemouth      1
Manchester City      1
West Ham United      1
Watford              1
Tottenham Hotspur    1
Name: count, dtype: int64


In [43]:
# 1) Check that each (club, season) is unique
print(
    club_season_panel[["club", "season"]]
    .drop_duplicates()
    .shape[0],
    "rows after dropping duplicates vs original:",
    club_season_panel.shape[0]
)

# 2) How many seasons per club?
display(
    club_season_panel.groupby("club")["season"].nunique().sort_values(ascending=False).head(10)
)

# 3) Inspect a single club across seasons
display(
    club_season_panel[club_season_panel["club"] == "Liverpool"]
    .sort_values("season")
)


118 rows after dropping duplicates vs original: 118


club
Wolverhampton Wanderers    7
Manchester City            7
Aston Villa                7
West Ham United            7
Brighton & Hove Albion     7
Tottenham Hotspur          7
Chelsea                    7
Everton                    7
Manchester United          7
Arsenal                    7
Name: season, dtype: int64

Unnamed: 0,club,season,matches,total_points,goals_for,goals_against,points_per_game,goal_diff,season_label,press_degree,...,reddit_strength_z,press_to_reddit_strength_ratio,n_transfers,total_spend_eur_m,total_income_eur_m,net_spend_eur_m,operating_expenses,operating_profit,player_trading_profit,revenue
64,Liverpool,1920,38,99,85.0,33.0,2.605263,52.0,19/20,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
65,Liverpool,2021,38,69,68.0,42.0,1.815789,26.0,20/21,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
66,Liverpool,2122,38,92,94.0,26.0,2.421053,68.0,21/22,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
67,Liverpool,2223,38,67,75.0,47.0,1.763158,28.0,22/23,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
68,Liverpool,2324,38,82,86.0,41.0,2.157895,45.0,23/24,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
69,Liverpool,2425,38,84,86.0,41.0,2.210526,45.0,24/25,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000
70,Liverpool,2526,38,18,18.0,14.0,0.473684,4.0,25/26,1615,...,2.528888,0.977025,209,748.1,483.15,264.95,666000,41200,34500,725000


In [None]:
from pathlib import Path

analysis_path = BASE_DIR / "outputs" / "club_analysis_panel_2016_2022.csv"

if analysis_path.exists():
    analysis_path.unlink()

club_season_panel.to_csv(analysis_path, index=False)
print("Saved season panel to:", analysis_path)


Saved season panel to: C:\Users\dshog\Prem NLP Project\outputs\club_analysis_panel_2016_2022.csv


In [45]:
import pandas as pd

analysis_path = BASE_DIR / "outputs" / "club_analysis_panel_2016_2022.csv"
analysis_df = pd.read_csv(analysis_path)

print(analysis_df.shape)
display(analysis_df.head())



(118, 36)


Unnamed: 0,club,season,matches,total_points,goals_for,goals_against,points_per_game,goal_diff,season_label,press_degree,...,reddit_strength_z,press_to_reddit_strength_ratio,n_transfers,total_spend_eur_m,total_income_eur_m,net_spend_eur_m,operating_expenses,operating_profit,player_trading_profit,revenue
0,AFC Bournemouth,1920,38,34,40.0,65.0,0.894737,-25.0,19/20,1508,...,-0.535744,2.260459,162,302.1,78.515,223.585,212000,-70000,21027,65584
1,AFC Bournemouth,2223,38,39,37.0,71.0,1.026316,-34.0,22/23,1508,...,-0.535744,2.260459,162,302.1,78.515,223.585,212000,-70000,21027,65584
2,AFC Bournemouth,2324,38,48,54.0,67.0,1.263158,-13.0,23/24,1508,...,-0.535744,2.260459,162,302.1,78.515,223.585,212000,-70000,21027,65584
3,AFC Bournemouth,2425,38,56,58.0,46.0,1.473684,12.0,24/25,1508,...,-0.535744,2.260459,162,302.1,78.515,223.585,212000,-70000,21027,65584
4,AFC Bournemouth,2526,38,18,17.0,14.0,0.473684,3.0,25/26,1508,...,-0.535744,2.260459,162,302.1,78.515,223.585,212000,-70000,21027,65584
