In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.environ.get('CFB_API_KEY')

In [72]:
import requests
import re

ENDPOINT = 'https://api.collegefootballdata.com'
HEADER = {'Authorization': f'Bearer {API_KEY}'}

def get_game_info(years):
    URL = ENDPOINT + '/games'
    game_info = []
    for year in years:
        PAYLOAD = {'year': year}
        rsp = requests.get(url=URL, params=PAYLOAD, headers=HEADER)
        rsp.raise_for_status()
        game_info.extend(rsp.json())
    return game_info

def parse_games(game_info):
    # Filter out unnecessary information returned by API
    games = []
    missing_scores = 0
    for data in game_info:
        home_team, away_team = data['homeTeam'], data['awayTeam']
        home_score, away_score = data['homePoints'], data['awayPoints']
        if not home_score or not away_score:
            # print(f'Missing scores for {away_team} @ {home_team}')
            missing_scores += 1
            continue
        if home_score > away_score:
            winner, loser = home_team, away_team
            winner_score, loser_score = home_score, away_score
        else:
            winner, loser = away_team, home_team
            winner_score, loser_score = away_score, home_score
        games.append(
            {
                'winner': winner,
                'loser': loser,
                'winner_score': winner_score,
                'loser_score': loser_score,
                'year': data['season']
            }
        )
    print(f'{missing_scores}/{len(games)} games missing scores')
    
    return games

YEARS = [2020, 2021, 2022, 2023, 2024, 2025]
game_info = get_game_info(YEARS)
games = parse_games(game_info)

1318/17332 games missing scores


In [92]:
from time import sleep

def get_teams():
    RECORD_PAYLOAD = {'year': 2025}
    RECORD_URL = ENDPOINT + '/records'
    rsp = requests.get(url=RECORD_URL, params=RECORD_PAYLOAD, headers=HEADER)
    rsp.raise_for_status()
    
    sleep(1)
    
    TEAM_PAYLOAD = {'year': 2025}
    TEAM_URL = ENDPOINT + '/teams'
    rsp2 = requests.get(url=TEAM_URL, params=TEAM_PAYLOAD, headers=HEADER)
    rsp2.raise_for_status()
    
    teams = []
    record_data, team_data = rsp.json(), rsp2.json()
    team_idx = {}
    for idx, team in enumerate(team_data):
        teams.append({
            'id': idx,
            'name': team['school'],
            'mascot': team['mascot'],
            'logo': team['logos'][0] if team['logos'] else None,
        })
        team_idx[team['school']] = idx
    
    for record in record_data:
        idx = team_idx[record['team']]
        teams[idx]['wins'] = record['total']['wins']
        teams[idx]['losses'] = record['total']['losses']

    return teams, team_idx

teams, team_idx = get_teams()
print(teams[:5])

[{'id': 0, 'name': 'Abilene Christian', 'mascot': 'Wildcats', 'logo': 'http://a.espncdn.com/i/teamlogos/ncaa/500/2000.png', 'wins': 9, 'losses': 5}, {'id': 1, 'name': 'Adams State', 'mascot': 'Grizzlies', 'logo': 'http://a.espncdn.com/i/teamlogos/ncaa/500/2001.png', 'wins': 0, 'losses': 11}, {'id': 2, 'name': 'Adrian', 'mascot': 'Bulldogs', 'logo': 'http://a.espncdn.com/i/teamlogos/ncaa/500/2003.png', 'wins': 8, 'losses': 2}, {'id': 3, 'name': 'Air Force', 'mascot': 'Falcons', 'logo': 'http://a.espncdn.com/i/teamlogos/ncaa/500/2005.png', 'wins': 4, 'losses': 8}, {'id': 4, 'name': 'Akron', 'mascot': 'Zips', 'logo': 'http://a.espncdn.com/i/teamlogos/ncaa/500/2006.png', 'wins': 5, 'losses': 7}]


In [93]:
# Cross-reference teams acquired from games and records to make sure they match
unseen_teams = set()
for game in games:
    t1, t2 = game['winner'], game['loser']
    if team_idx.get(t1) is None:
        unseen_teams.add(t1)
    if team_idx.get(t2) is None:
        unseen_teams.add(t2)
        
print(f'{len(unseen_teams)}/{len(teams)} teams unknown')

63/683 teams unknown


In [94]:
# There appears to be a nontrivial amount of teams who played game(s) but are missing from the /teams endpoint
# I assume this is just a gap in the dataset
# Delete any games involving these teams to ensure that teams are synced
filtered_games = []
for game in games:
    if game['winner'] in unseen_teams or game['loser'] in unseen_teams:
        continue
    filtered_games.append(game)
print(f'{len(filtered_games)} games remaining')

17087 games remaining


In [121]:
import networkx

# Allocate the weights in a way where the SSSP prioritizes recency
# (i.e. if a path containing 2025 games exists, it dominates any path containing earlier games)
# Use len(teams) as an upper bound on the diameter of the graph to ensure this
def generate_weight(current_year, max_year):
    return len(teams) * (max_year - current_year) + 1

# Construct directed graph, with teams as nodes
# A -> B iff A defeats B in a game    
graph = networkx.DiGraph()
graph.add_nodes_from([(team['id'], {'label': team['name']}) for team in teams])
edge_info = []
for game in filtered_games:
    winner, loser = game['winner'], game['loser']
    weight = generate_weight(game['year'], YEARS[-1])
    tag = f"{winner} def. {loser}"
    if game['year'] != YEARS[-1]:
        tag += f" ({game['year']})"
    edge_info.append((team_idx[winner], team_idx[loser], {'label': tag, 'weight': weight}))
graph.add_edges_from(edge_info)


In [122]:
from networkx import NetworkXNoPath

def find_path(winner, loser):
    src, dest = team_idx[winner], team_idx[loser]
    path = []
    try:
        nodes = networkx.dijkstra_path(graph, src, dest)
        for i in range(len(nodes)-1):
            winner, loser = nodes[i], nodes[i+1]
            edge = graph.get_edge_data(winner, loser)
            path.append(edge['label'])
    except NetworkXNoPath:
        # TODO: LLM call here
        pass
    return path
        

In [None]:
comps = list(networkx.strongly_connected_components(graph))
for c in comps:
    print(' '.join([teams[idx]['name'] for idx in c]))

Finlandia University
Hilbert College
Maine Maritime
Whittier
Alderson-Broaddus West Virginia Wesleyan
Oklahoma Panhandle St
Abilene Christian Adams State Adrian Air Force Akron Alabama Alabama A&M Alabama State Albany State GA Albion Albright Alcorn State Alfred State Alfred University Allegheny Allen Alma American International Anderson (IN) Anderson (Sc) Angelo State Anna Maria College Apprentice School App State Arizona Arizona State Arkansas Arkansas-Monticello Arkansas-Pine Bluff Arkansas State Arkansas Tech Army Ashland Assumption Auburn Augsburg Augustana (IL) Augustana University (SD) Aurora Austin Austin Peay Averett Baldwin Wallace Ball State Baylor Beloit Bemidji State Benedict College Benedictine University Bentley Berry College Bethany (WV) Bethel (MN) Bethune-Cookman Biddeford Birmingham-Southern Black Hills State Bloomsburg Bluefield State Bluffton Boise State Boston College Bowie State Bowling Green Brevard College Bridgewater State Bridgewater (VA) Brockport Brown Brya

: 

In [125]:
import pickle

networkx.write_gexf(graph, './graph.gexf')
with open('teams.pkl', 'wb') as fp:
    pickle.dump(teams, fp)