In [679]:
from nhlpy import NHLClient
from nba_api.stats.endpoints import LeagueStandings
from pybaseball import standings
import statsapi

import requests
import spacy

import pandas as pd
import numpy as np
import time
import re

In [342]:
# Shared NHL API client; get_nhl_standings reads this module-level name.
client = NHLClient()
# spaCy's small English pipeline. NOTE(review): `nlp` is not referenced in the
# visible part of this notebook - confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [681]:
# Canonical market label -> alternate locality strings that should be folded
# into that label when they appear in a 'city' column.
city_grouping = {
    'Boston': ['New England'],
    'South Florida': ['Miami', 'Florida'],
    'Bay Area': ['San Francisco', 'Oakland', 'San Jose', 'Golden State'],
    'Los Angeles': ['Anaheim', 'California', 'LA'],
    'Dallas': ['Texas'],
    'Denver': ['Colorado'],
    'New York': ['NY Rangers', 'NY Islanders'],
    'Washington': ['Capital'],
    'Kansas City': ['Kansas City-Omaha'],
    'New Orleans': ['New Orleans/Oklahoma City'],
    'Montreal': ['Montréal']
}

# Reverse lookup (alternate locality -> canonical label), in the shape that
# Series.replace expects.
inverted_grouping = {
    alias: market
    for market, aliases in city_grouping.items()
    for alias in aliases
}

In [634]:
def nhl_season_constructor(start, stop):
    """Build NHL season ids for the end years in ``range(start, stop)``.

    Each id is the start year immediately followed by the end year, e.g. an
    end year of 1950 yields ``'19491950'``. ``stop`` is exclusive.

    Args:
        start: First end year to include.
        stop: End year at which to stop (not included).

    Returns:
        List of eight-character season id strings, in ascending order.
    """
    return [f"{end_year - 1}{end_year}" for end_year in range(start, stop)]

In [635]:
# Season ids '19491950' through '20222023'; the stop value (2024) is exclusive.
nhl_seasons = nhl_season_constructor(1950, 2024)

In [636]:
# Display the generated season ids to check their format.
nhl_seasons

['19491950',
 '19501951',
 '19511952',
 '19521953',
 '19531954',
 '19541955',
 '19551956',
 '19561957',
 '19571958',
 '19581959',
 '19591960',
 '19601961',
 '19611962',
 '19621963',
 '19631964',
 '19641965',
 '19651966',
 '19661967',
 '19671968',
 '19681969',
 '19691970',
 '19701971',
 '19711972',
 '19721973',
 '19731974',
 '19741975',
 '19751976',
 '19761977',
 '19771978',
 '19781979',
 '19791980',
 '19801981',
 '19811982',
 '19821983',
 '19831984',
 '19841985',
 '19851986',
 '19861987',
 '19871988',
 '19881989',
 '19891990',
 '19901991',
 '19911992',
 '19921993',
 '19931994',
 '19941995',
 '19951996',
 '19961997',
 '19971998',
 '19981999',
 '19992000',
 '20002001',
 '20012002',
 '20022003',
 '20032004',
 '20042005',
 '20052006',
 '20062007',
 '20072008',
 '20082009',
 '20092010',
 '20102011',
 '20112012',
 '20122013',
 '20132014',
 '20142015',
 '20152016',
 '20162017',
 '20172018',
 '20182019',
 '20192020',
 '20202021',
 '20212022',
 '20222023']

In [637]:
def get_nhl_standings(season):
    """Fetch the standings for one NHL season as a DataFrame.

    Args:
        season: Season id string made of the four-digit start year followed
            by the four-digit end year, e.g. ``'20222023'`` (the format
            produced by ``nhl_season_constructor``).

    Returns:
        DataFrame with columns ``city``, ``name``, ``wins``, ``ties``,
        ``losses``, ``otLosses``, ``gamesPlayed``, ``season`` (the id exactly
        as passed in) and ``season_year`` (the four-digit end year as a
        string).
    """
    standings_resp = client.standings.get_standings(season=season)
    raw = pd.DataFrame(standings_resp['standings'])

    # teamName / placeName hold mappings; the 'default' entry is the value used.
    raw['name'] = raw['teamName'].apply(lambda x: x['default'])
    raw['city'] = raw['placeName'].apply(lambda x: x['default'])

    # .copy() gives an independent frame so the column assignments below never
    # write through to a slice of `raw`.
    df_standings = raw[
        ['city', 'name', 'wins', 'ties', 'losses', 'otLosses', 'gamesPlayed']
    ].copy()

    df_standings['season'] = season
    # The end year is simply the last four characters of the id. Rebuilding it
    # from the last two digits dropped the leading zero for 2000-2009
    # ("20" + str(0) == "200") and put end years up to 1925 in the 2000s.
    df_standings['season_year'] = season[-4:]

    return df_standings

In [638]:
def nhl_combine(start, stop):
    """Collect NHL standings for several seasons and compute a points percentage.

    Args:
        start: First season end year to include.
        stop: Season end year at which to stop (exclusive).

    Returns:
        DataFrame with columns ``city``, ``name``, ``percentage``, ``season``,
        ``season_year`` and ``league`` (always ``'NHL'``), with a fresh
        0..n-1 index. ``percentage`` is
        ``(2 * wins + ties + otLosses) / (2 * gamesPlayed)``. An empty frame
        with those columns is returned when no season falls in the range.
    """
    columns = ['city', 'name', 'percentage', 'season', 'season_year', 'league']

    # '20042005' is skipped explicitly - presumably because that season was
    # not played; confirm before changing.
    frames = [
        get_nhl_standings(season)
        for season in nhl_season_constructor(start, stop)
        if season != '20042005'
    ]
    if not frames:
        # pd.concat rejects an empty list; hand back a typed-empty result.
        return pd.DataFrame(columns=columns)

    # One concat over the collected frames instead of re-concatenating an
    # ever-growing frame on every iteration.
    df_all_standings = pd.concat(frames, ignore_index=True)

    df_all_standings['percentage'] = (
        df_all_standings['wins'] * 2
        + df_all_standings['ties']
        + df_all_standings['otLosses']
    ) / (df_all_standings['gamesPlayed'] * 2)

    df_all_standings = df_all_standings[
        ['city', 'name', 'percentage', 'season', 'season_year']
    ].copy()
    df_all_standings['league'] = 'NHL'

    # Remove the city from the full team name; strip() drops the separator
    # space that removing a leading city leaves behind.
    df_all_standings['name'] = [
        name.replace(city, '').strip()
        for name, city in zip(df_all_standings['name'], df_all_standings['city'])
    ]
    return df_all_standings

In [639]:
def nba_season_constructor(start, stop):
    """Build NBA season labels for the end years in ``range(start, stop)``.

    A label is the four-digit start year, a hyphen, and the last two digits
    of the end year, e.g. an end year of 2023 yields ``'2022-23'``. Each end
    year is printed as it is processed. ``stop`` is exclusive.

    Args:
        start: First end year to include.
        stop: End year at which to stop (not included).

    Returns:
        List of season label strings, in ascending order.
    """
    labels = []
    for end_year in range(start, stop):
        print(end_year)
        labels.append(f"{end_year - 1}-{str(end_year)[-2:]}")
    return labels

In [640]:
# Spot-check the label format for a single season; the helper also prints the
# end year it processes.
nba_season_constructor(2023, 2024)

2023


['2022-23']

In [641]:
def get_nba_standings(season):
    """Fetch the standings for one NBA season as a DataFrame.

    Args:
        season: Season label of the form ``'YYYY-YY'`` (four-digit start year,
            two-digit end year), e.g. ``'2022-23'``.

    Returns:
        DataFrame with columns ``city``, ``name``, ``percentage``, ``season``
        (the label as passed in) and ``season_year`` (the four-digit end year
        as a string).
    """
    # Pause before every request - presumably to stay under the stats API's
    # rate limit; confirm before removing.
    time.sleep(1)
    standings_resp = LeagueStandings(season=season)

    df_standings = (
        standings_resp.get_data_frames()[0]
        .loc[:, ['TeamCity', 'TeamName', 'WinPCT']]
        .rename(columns={'TeamCity': 'city',
                         'TeamName': 'name',
                         'WinPCT': 'percentage'})
    )

    df_standings['season'] = season
    # End year = start year + 1. Rebuilding it from the two-digit suffix
    # dropped the leading zero, so '1999-00' produced "200" and '2004-05'
    # produced "205".
    df_standings['season_year'] = str(int(season[:4]) + 1)

    return df_standings

In [642]:
def nba_combine(start, stop):
    """Collect NBA standings for the season end years in ``range(start, stop)``.

    Args:
        start: First season end year to include.
        stop: Season end year at which to stop (exclusive).

    Returns:
        The per-season frames from ``get_nba_standings`` stacked in season
        order (original row labels kept), plus a ``league`` column set to
        ``'NBA'``. With an empty range the result has only the ``league``
        column and no rows.
    """
    frames = [
        get_nba_standings(season=season)
        for season in nba_season_constructor(start, stop)
    ]

    # Concatenate once rather than growing a frame inside the loop; starting
    # from an empty DataFrame also trips pandas' deprecation warning about
    # empty entries in concat.
    df_all_standings = pd.concat(frames) if frames else pd.DataFrame()

    df_all_standings['league'] = 'NBA'
    return df_all_standings

In [643]:
# Locality strings looked for inside MLB team names. Order matters:
# extract_city returns the first entry that matches, so "Los Angeles" wins
# over "Anaheim" when a name contains both.
mlb_localities = [
    "Arizona",
    "Atlanta",
    "Baltimore",
    "Boston",
    "Brooklyn",
    "California",
    "Chicago",
    "Cincinnati",
    "Cleveland",
    "Colorado",
    "Detroit",
    "Florida",
    "Houston",
    "Kansas City",
    "Los Angeles",
    "Miami",
    "Milwaukee",
    "Minnesota",
    "Montreal",
    "New York",
    "Oakland",
    "Philadelphia",
    "Pittsburgh",
    "San Diego",
    "San Francisco",
    "Seattle",
    "St. Louis",
    "Tampa Bay",
    "Texas",
    "Toronto",
    "Washington",
    "Anaheim"
]

def extract_city(text):
    """Return the first entry of ``mlb_localities`` contained in ``text``.

    Args:
        text: A team name such as ``"St. Louis Cardinals"``.

    Returns:
        The matching locality string, or ``None`` when ``text`` is not a
        string or contains none of the known localities.
    """
    if not isinstance(text, str):
        return None
    # Plain substring test: the localities are literal names, not regular
    # expressions. Passing them to re.search let the "." in "St. Louis" match
    # any character.
    for locality in mlb_localities:
        if locality in text:
            return locality
    return None

In [644]:
def get_mlb_standings(s):
    """Fetch one MLB season's standings and compute each team's win percentage.

    Args:
        s: Season year passed to ``statsapi.standings_data``.

    Returns:
        DataFrame with columns ``team_name``, ``percentage`` (``w / (w + l)``,
        or NaN when a team has no recorded wins or losses), ``season`` (``s``
        as given) and ``season_year`` (``str(s)``).
    """
    data = statsapi.standings_data(season=s)

    records = []
    # `data` is keyed by division; only the per-division 'teams' list is used.
    for division in data.values():
        for team in division['teams']:
            decisions = team['w'] + team['l']
            records.append(
                {
                    'team_name': team['name'],
                    # Guard against 0-0 records, which used to raise
                    # ZeroDivisionError.
                    'percentage': team['w'] / decisions if decisions else float('nan'),
                }
            )

    # Explicit columns keep the schema stable even when no teams come back.
    df = pd.DataFrame(records, columns=['team_name', 'percentage'])
    df['season'] = s
    df['season_year'] = str(s)

    return df

            


In [650]:
def mlb_combine(start, stop=None):
    """Collect MLB standings for the seasons in ``range(start, stop)``.

    Args:
        start: First season year to include.
        stop: Season year at which to stop (exclusive). When omitted (or
            falsy) only ``start`` is fetched.

    Returns:
        DataFrame with columns ``percentage``, ``season``, ``season_year``,
        ``city``, ``name`` and ``league`` (always ``'MLB'``). ``city`` is the
        locality found by ``extract_city`` (``None`` when nothing matches) and
        ``name`` is the team name with that locality removed. An empty frame
        with those columns is returned when the range contains no seasons.
    """
    if not stop:
        stop = start + 1

    frames = []
    for s in range(start, stop):
        print(s)  # progress indicator, one line per season
        frames.append(get_mlb_standings(s))

    if not frames:
        # pd.concat rejects an empty list; hand back a typed-empty result.
        return pd.DataFrame(
            columns=['percentage', 'season', 'season_year', 'city', 'name', 'league']
        )

    # Single concat instead of re-concatenating a growing frame every season.
    df_final = (
        pd.concat(frames)
        .rename(columns={'team_name': 'team'})
        [['team', 'percentage', 'season', 'season_year']]
        .copy()
    )

    df_final['city'] = [extract_city(team) for team in df_final['team']]
    # Teams with no recognised locality keep their full name (str.replace
    # raises on a None pattern); strip() removes the separator space left
    # behind once the city is taken out.
    df_final['name'] = [
        team if city is None else team.replace(city, '').strip()
        for team, city in zip(df_final['team'], df_final['city'])
    ]
    df_final['league'] = 'MLB'
    return df_final.drop('team', axis=1)


In [651]:
def get_nfl_standings(season):
    """Fetch one NFL season's standings from ESPN's standings endpoint.

    Args:
        season: Season year interpolated into the request URL.

    Returns:
        DataFrame with columns ``name``, ``city``, ``percentage`` (the
        ``winPercent`` stat, or ``None`` when an entry has no such stat),
        ``season`` (as given) and ``season_year`` (``str(season)``).

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    url = f"https://site.api.espn.com/apis/v2/sports/football/nfl/standings?season={season}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    if 'standings' in data:
        entries = data['standings']['entries']
    else:
        # The standings are split across data['children']. Gather every
        # child's entries: assigning inside the loop kept only the last
        # child and silently dropped the others.
        entries = []
        for child in data['children']:
            entries.extend(child['standings']['entries'])

    records = []
    for entry in entries:
        team_info = entry['team']

        # Start from None for every team so a missing 'winPercent' stat can
        # neither reuse the previous team's value nor hit an unbound local.
        percentage = None
        for stat in entry['stats']:
            if stat['name'] == 'winPercent':
                percentage = stat['value']

        records.append(
            {
                # Some entries carry no 'name'; 'Football Team' is the
                # fallback label used for them.
                'name': team_info.get('name', 'Football Team'),
                'city': team_info['location'],
                'percentage': percentage,
            }
        )

    df_season = pd.DataFrame(records, columns=['name', 'city', 'percentage'])
    df_season['season'] = season
    df_season['season_year'] = str(season)
    return df_season


In [652]:
def nfl_combine(start, stop=None):
    """Collect NFL standings for the seasons in ``range(start, stop)``.

    Args:
        start: First season year to include.
        stop: Season year at which to stop (exclusive). When omitted only
            ``start`` is fetched.

    Returns:
        The per-season frames from ``get_nfl_standings`` stacked in season
        order (original row labels kept), plus a ``league`` column set to
        ``'NFL'``. With an empty range the result has only the ``league``
        column and no rows.
    """
    if stop is None:
        # range() excludes its stop value, so a single season needs start + 1;
        # defaulting to `start` produced an empty range and fetched nothing.
        stop = start + 1

    frames = []
    for y in range(start, stop):
        print(y)  # progress indicator, one line per season
        frames.append(get_nfl_standings(y))

    # Single concat instead of re-concatenating a growing frame every season.
    df_all_nfl = pd.concat(frames) if frames else pd.DataFrame()

    df_all_nfl['league'] = 'NFL'
    return df_all_nfl

In [686]:
def construct_dataset(start, stop, league='all'):
    """Build the combined standings table and write it to ``all_standings.csv``.

    Args:
        start: Passed unchanged as the first argument of each league's
            ``*_combine`` function.
        stop: Passed unchanged as the second argument of each league's
            ``*_combine`` function.
        league: ``'all'`` (default) or one of ``'MLB'``, ``'NBA'``, ``'NHL'``,
            ``'NFL'`` to build a single league.

    Returns:
        The concatenated DataFrame, with ``city`` values normalised through
        ``inverted_grouping``. The same frame is also written to
        ``all_standings.csv`` in the working directory.

    Raises:
        ValueError: If ``league`` is not one of the accepted values.
    """
    # Insertion order fixes both the processing order and the row order of
    # the result.
    builders = {
        'MLB': mlb_combine,
        'NBA': nba_combine,
        'NHL': nhl_combine,
        'NFL': nfl_combine,
    }
    if league != 'all' and league not in builders:
        # Fail early with a clear message; an unknown league used to surface
        # later as KeyError: 'city' on an empty frame.
        raise ValueError(
            f"league must be 'all' or one of {list(builders)}, got {league!r}"
        )

    frames = []
    for label, build in builders.items():
        if league in ('all', label):
            print(label)  # progress indicator, one line per league
            frames.append(build(start, stop))

    # Concatenate once rather than repeatedly appending to an empty frame.
    df = pd.concat(frames)

    # Fold alternate locality names into their canonical market label.
    df['city'] = df['city'].replace(inverted_grouping)

    df.to_csv('all_standings.csv')
    return df