In [1]:
import pandas as pd
import datetime
import os
import numpy as np
from collections import defaultdict
import sys
import itertools
import json
import requests
import re
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

sys.path.append('/content/drive/MyDrive/Classes/Spring 2022/INFO606: Network Science/Project/src/lookups/')
from error import SportNotAllowedError

In [14]:
class CollectSeasons:
  """Work out which seasons each supported sport has on its reference site.

  For every sport listed in ``__SPORTS__`` the season index page is read and
  reduced to ``{'min_year': int, 'max_year': int, 'season_link': str}``. The
  result is cached on the class (``CollectSeasons.seasons``) so the pages are
  only read while that cache is empty, and it is written to ``json_path`` when
  that file does not exist yet.

  Args:
    json_path: Destination of the season summary. Defaults to the project's
      ``data/seasons.json`` on Google Drive.
  """
  # Class-level cache shared by every instance; None until the first collection.
  seasons = None

  def __init__(self, json_path: str = None):
    if json_path is None:
      json_path = "/content/drive/MyDrive/Classes/Spring 2022/INFO606: Network Science/Project/data/seasons.json"
    self.json_path = json_path
    # Earliest date considered per sport; only the year is used when filtering.
    self.__INIT_DATE__ = {'mlb': datetime.datetime(1876, 4, 22),
                          'nba': datetime.datetime(1946, 11, 1),
                          'ncaaf': datetime.datetime(2000, 8, 26),
                          'nfl': datetime.datetime(1970, 9, 18),
                          'nhl': datetime.datetime(1918, 12, 21),
                          'ncaab': datetime.datetime(1947, 1, 1)}
    # Base URL of the reference site for each sport.
    self.__SPORTS__ = {'nhl': 'https://www.hockey-reference.com/',
              'mlb': 'https://www.baseball-reference.com/',
              'nfl': 'https://www.pro-football-reference.com/',
              'nba': 'https://www.basketball-reference.com/',
              'ncaaf': 'https://www.sports-reference.com/cfb/',
              'ncaab': 'https://www.sports-reference.com/cbb/'}

    # establish seasons and output years (skipped when already cached)
    if CollectSeasons.seasons is None:
      CollectSeasons.seasons = self._collect_seasons()

    # jsonify seasons
    self._create_season_json()

  def _collect_seasons(self) -> dict:
    """Return ``{sport: season summary}`` for every sport in ``__SPORTS__``."""
    return {sport: self._collect_seasons_helper(sport) for sport in tqdm(list(self.__SPORTS__.keys()))}

  def _create_season_link(self, sport: str) -> str:
    """Return the URL of the season index page for ``sport``.

    Raises:
      KeyError: if ``sport`` is not a key of ``__SPORTS__``.
    """
    if sport in ['nhl', 'mlb', 'nba']:
      season_link = self.__SPORTS__[sport] + 'leagues/'
    elif sport in ['nfl', 'ncaaf']:
      season_link = self.__SPORTS__[sport] + 'years/'
    else:
      season_link = self.__SPORTS__[sport] + 'seasons/'
    return season_link

  @staticmethod
  def _strip_summary_suffix(season: str) -> str:
    """Remove a trailing ``' Summary'`` from a season label.

    ``str.rstrip(' Summary')`` would strip any trailing run of the characters
    ``' '``, ``S``, ``u``, ``m``, ``a``, ``r``, ``y`` rather than the suffix, so
    an anchored regex is used instead.
    """
    return re.sub(r' Summary$', '', season)

  def _collect_seasons_helper(self, sport: str) -> dict:
    """Read the season index page for ``sport`` and summarise its year range.

    Returns:
      ``{'min_year': int, 'max_year': int, 'season_link': str}`` restricted to
      years on or after the sport's entry in ``__INIT_DATE__``.

    Raises:
      SportNotAllowedError: if ``sport`` is not supported.
    """
    # Validate first: the link lookup below would otherwise fail with a KeyError
    # before the intended SportNotAllowedError could be raised.
    if sport not in self.__SPORTS__:
      raise SportNotAllowedError(sport)

    # extract link by sport
    season_link = self._create_season_link(sport)

    # collect the response and normalise it so a numeric 'Year' column exists
    if sport in ['nhl', 'nba']:
      response = pd.read_html(season_link)[0]
      response.columns = response.columns.droplevel()
      # 'Season' is split on '-'; the part before the dash is the start year
      response[['Year', 'End Year']] = response['Season'].str.split('-', expand = True)
      response['Year'] = response['Year'].astype(int)

    elif sport in ['nfl', 'ncaaf']:
      response = pd.read_html(season_link)[0]

    elif sport == 'mlb':
      response = pd.read_html(season_link)[1]
      # forward-fill blank 'Year' cells before casting
      response['Year'] = response['Year'].ffill()
      response['Year'] = response['Year'].astype(int)

    elif sport == 'ncaab':
      response = pd.read_html(season_link)[0]
      response['Season'] = response['Season'].apply(self._strip_summary_suffix)
      response[['Year', 'End Year']] = response['Season'].str.split('-', expand = True)
      # drop rows whose 'Year' is the literal header text 'Season'
      response = response[response['Year'] != 'Season'].reset_index(drop = True)
      response['Year'] = response['Year'].astype(int)

    else:
      # a sport listed in __SPORTS__ for which no parser exists
      raise SportNotAllowedError(sport)

    # keep only the seasons on or after the configured starting year
    first_year = self.__INIT_DATE__[sport].year
    response = response[response['Year'] >= first_year].reset_index(drop = True)

    min_year = int(response['Year'].min())
    max_year = int(response['Year'].max())
    return dict(min_year = min_year, max_year = max_year, season_link = season_link)

  def _create_season_json(self) -> None:
    """Write the cached seasons to ``json_path`` unless that file already exists."""
    if not os.path.exists(self.json_path):
      with open(self.json_path, 'w') as file:
        json.dump(CollectSeasons.seasons, file)
    return None

class CollectTeamsBySeason:
  """Collect, per sport and season, the teams and the links to their pages.

  The result has the shape ``{sport: {year: {team name: team url}}}`` and is kept
  on the class (``CollectTeamsBySeason.sports_team_by_season``). When the JSON
  file at ``teambysport_path`` exists it is loaded instead of scraping; note that
  ``json`` stores the integer year keys as strings, so loaded data is keyed by
  ``str`` while freshly collected data is keyed by ``int``.

  Args:
    collectseasons: Object exposing ``seasons`` as
      ``{sport: {'min_year', 'max_year', 'season_link'}}``.
    teambysport_path: Location of the teams JSON file. Defaults to the project's
      ``data/teams.json`` on Google Drive.
  """
  # Class-level store; reset to {sport: None} on every construction.
  sports_team_by_season = None

  # Seconds to wait for a page before giving up instead of hanging forever.
  _REQUEST_TIMEOUT = 30

  def __init__(self, collectseasons, teambysport_path: str = None):
    if teambysport_path is None:
      teambysport_path = "/content/drive/MyDrive/Classes/Spring 2022/INFO606: Network Science/Project/data/teams.json"
    self.teambysport_path = teambysport_path
    self.seasons = collectseasons.seasons
    self.sports = list(self.seasons.keys())
    CollectTeamsBySeason.sports_team_by_season = {sport: None for sport in self.sports}

    # collect sports teams
    if not os.path.exists(self.teambysport_path):
      for sport in self.sports:
        if CollectTeamsBySeason.sports_team_by_season[sport] is None:
          print(f"Working on {sport}...")
          CollectTeamsBySeason.sports_team_by_season[sport] = self._collect_team_by_season(sport)
          # The file is rewritten after every sport.
          # NOTE(review): a run interrupted after the first write leaves a file
          # that the next construction loads as-is, unfinished sports included.
          self._create_team_json()
    else:
      with open(self.teambysport_path, 'r') as file:
        CollectTeamsBySeason.sports_team_by_season = json.load(file)

  def _get_soup(self, url: str):
    """Download ``url`` and return it parsed with ``html.parser``."""
    page = requests.get(url, timeout = self._REQUEST_TIMEOUT)
    return BeautifulSoup(page.content, 'html.parser')

  def _extract_team_links_helper(self, teams, base_link: str):
    """Return ``{team name: absolute url}`` for the team cells inside ``teams``.

    Args:
      teams: A parsed element to search, or None (yields an empty dict).
      base_link: Site root prepended to each relative ``href``; it also decides
        which ``data-stat`` value marks a team cell.
    """
    team_end_links = {}
    if teams is not None:
      if 'pro' in base_link:
        teams = teams.find_all(attrs = {'data-stat': 'team'})
      elif "sports-reference" in base_link:
        teams = teams.find_all(attrs = {'data-stat': 'school_name'})
      else:
        teams = teams.find_all(attrs = {'data-stat': 'team_name'})
      for t in teams:
        t_find = t.find('a')
        # cells without a link carry no team page
        if t_find is None:
          continue
        link = f"{base_link}{t_find['href']}"
        team_end_links[t_find.text] = link
    return team_end_links

  def _collect_from_tables(self, tables, base_link: str) -> dict:
    """Merge the team links found in every element of ``tables``."""
    team_end_links = {}
    for table in tables:
      team_end_links.update(self._extract_team_links_helper(table, base_link = base_link))
    return team_end_links

  def _season_url(self, sport: str, year: int, link: str) -> str:
    """Build the URL of the page listing the teams of ``sport`` in ``year``.

    ``link`` is the season index link; a trailing slash is tolerated so the
    result never contains a doubled slash.

    Raises:
      SportNotAllowedError: if ``sport`` is not supported.
    """
    base = link.rstrip('/')
    if sport == 'mlb':
      return f"{base}/majors/{year}.shtml"
    if sport == 'nhl':
      return f"{base}/NHL_{year+1}.html"
    if sport == 'nfl':
      return f"{base}/{year}"
    if sport in ('ncaaf', 'ncaab'):
      return f"{base}/{year}.html"
    if sport == 'nba':
      # start years before 1949 use the BAA_ page prefix instead of NBA_
      league = 'NBA' if year >= 1949 else 'BAA'
      return f"{base}/{league}_{year+1}.html"
    raise SportNotAllowedError(sport)

  def _standings_locator(self, sport: str, year: int):
    """Return ``(id matcher, site root)`` locating the standings for a pro sport.

    The matcher is passed as ``id=`` to ``find_all``; it is a compiled regex
    except for pre-1949 NBA years, where the exact id string is used.
    """
    if sport == 'mlb':
      return re.compile('teams_standard_batting+'), "https://www.baseball-reference.com"
    if sport == 'nhl':
      return re.compile("standings+"), "https://www.hockey-reference.com"
    if sport == 'nfl':
      return re.compile("AFC|NFC"), "https://www.pro-football-reference.com"
    if sport == 'nba':
      if year > 2014:
        matcher = re.compile("confs_standings_+")
      elif year >= 1949:
        matcher = re.compile("divs_standings_+")
      else:
        matcher = 'divs_standings_'
      return matcher, "https://www.basketball-reference.com"
    raise SportNotAllowedError(sport)

  def _extract_team_links(self, sport:str, year: int, link: str) -> dict:
    """Return ``{team name: team url}`` for one sport and season start year.

    Raises:
      SportNotAllowedError: if ``sport`` is not supported (raised before any
        request is made).
    """
    url = self._season_url(sport, year, link)
    soup = self._get_soup(url)

    # college sports: teams are listed on one page per conference
    if sport in ('ncaaf', 'ncaab'):
      conferences = [c.find('a')['href'] for c in soup.find_all(attrs = {'data-stat': 'conf_name'}) if c.find('a') is not None]
      team_end_links = {}
      for c in conferences:
        conf_soup = self._get_soup(f"https://www.sports-reference.com{c}")
        teams = conf_soup.find(id = 'standings')
        team_end_links.update(self._extract_team_links_helper(teams, base_link = "https://www.sports-reference.com"))
      return team_end_links

    # pro sports: visit EVERY element whose id matches, not just the first one,
    # so that teams in a second matching element (e.g. NFC after AFC) are kept.
    # Duplicates from nested matches collapse because results are keyed by name.
    matcher, base_link = self._standings_locator(sport, year)
    return self._collect_from_tables(soup.find_all(id = matcher), base_link)

  def _collect_team_by_season(self, sport: str) -> dict:
    """Return ``{year: {team name: team url}}`` for every season of ``sport``."""
    min_year = self.seasons[sport]['min_year']
    max_year = self.seasons[sport]['max_year']
    season_years = range(min_year, max_year+1)
    link = self.seasons[sport]['season_link']
    sport_data = {year:self._extract_team_links(sport, year, link) for year in tqdm(season_years)}
    return sport_data

  def _create_team_json(self) -> None:
    """Overwrite the teams JSON file with the current class-level data."""
    with open(self.teambysport_path, 'w') as file:
      json.dump(CollectTeamsBySeason.sports_team_by_season, file)
    return None

In [67]:
class SportSchedules:
  """Ensure the schedules data directory exists; collection is not implemented."""

  def __init__(self):
    self._establish_schedules_dir()

  def _establish_schedules_dir(self) -> None:
    """Create the schedules directory (with parents) when it is not there yet."""
    schedule_path = "/content/drive/MyDrive/Classes/Spring 2022/INFO606: Network Science/Project/data/schedules"
    # nothing to do when the directory is already present
    if os.path.isdir(schedule_path):
      return None
    os.makedirs(schedule_path)
    return None

  def collect_sport_schedules(self, ):
    """Placeholder with no behaviour yet; returns None."""
    return None

In [93]:
class NFLSchedules:
  """Week layout of every NFL season, cached on the class as ``schedule``.

  ``schedule`` maps each season year to
  ``{'regular': [...], 'post': [...], 'name': [...]}`` where 'regular' and 'post'
  hold 1-based week numbers (as strings) and 'name' holds the week labels.
  """
  # Filled on first construction; later constructions reuse it.
  schedule = None

  def __init__(self, collectseasons):
    self.seasons = collectseasons.seasons
    if NFLSchedules.schedule is not None:
      return
    NFLSchedules.schedule = self._collect_and_save_weeks()

  def _collect_and_save_weeks(self) -> dict:
    """Read each season's games page and return the per-year week layout.

    Years run from ``min_year`` up to but not including ``max_year``.
    """
    print("Working on NFL Schedules")
    bounds = self.seasons['nfl']
    years = range(bounds['min_year'], bounds['max_year'])

    per_season = []
    for season in tqdm(years):
      games = pd.read_html(f"https://www.pro-football-reference.com/years/{season}/games.htm")[0]
      games['year'] = season
      # one row per distinct week label, in order of first appearance
      weeks = games[['Week', 'year']].drop_duplicates()
      # rows whose 'Week' is the literal text 'Week' are repeated header rows
      weeks = weeks[weeks['Week'] != 'Week']
      weeks = weeks.dropna()
      weeks = weeks.reset_index(drop = True)
      # the fresh 0-based position becomes the 'index' column
      weeks = weeks.reset_index()
      per_season.append(weeks)

    table = pd.concat(per_season, axis = 0, ignore_index = True)
    # purely numeric labels are regular-season weeks; anything else is postseason
    table['type_week'] = table['Week'].apply(lambda x: 'regular' if x.isnumeric() else 'post')
    table = table.rename({'index': 'week_num'}, axis = 1)
    table['week_num'] += 1

    layout = {}
    for season in years:
      rows = table[table['year'] == season]
      regular_rows = rows['type_week'] == 'regular'
      post_rows = rows['type_week'] == 'post'
      layout[season] = {
        'regular': [str(num) for num in rows.loc[regular_rows, 'week_num'].values],
        'post': [str(num) for num in rows.loc[post_rows, 'week_num'].values],
        'name': [str(label) for label in rows['Week'].values],
      }

    return layout

class MLBSchedules:
  """First and last schedule dates of each MLB season, split by season part.

  Construction only stores the season lookup; call ``_collect_and_save_splits``
  explicitly to fetch the data (it is not run automatically).
  """
  # Reserved class-level cache; nothing in this class assigns it.
  schedule = None

  def __init__(self, collectseasons):
    self.seasons = collectseasons.seasons

  @staticmethod
  def _split_range(headings) -> list:
    """Turn a section's date headings into ``[first date, last date]``.

    Returns an empty list when there are no headings or when either end of the
    section is a "Today" heading, since neither case has a parseable date range.
    """
    if not headings:
      return []
    first, last = headings[0], headings[-1]
    if 'Today' in first or 'Today' in last:
      return []
    return [pd.to_datetime(first), pd.to_datetime(last)]

  def _collect_and_save_splits(self):
    """Fetch every season's schedule page and return its date ranges.

    Returns:
      ``{year: {'regular': [start, end] or [], 'post': [start, end] or []}}``
      for years from ``min_year`` up to but not including ``max_year``.
    """
    print("Working on MLB Schedules...")
    mlb_seasons = self.seasons['mlb']
    year_range = range(mlb_seasons['min_year'], mlb_seasons['max_year'])
    data_split = ['regular', 'post']
    collections = {}
    for year in tqdm(year_range):
      url = f"https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml"
      response = BeautifulSoup(requests.get(url).content, 'html.parser')
      # the last 'section_content' div is dropped, as in the original logic
      splits = response.find_all("div", attrs = {"class": 'section_content'})[:-1]

      if len(splits) == 2:
        # two sections: regular season followed by postseason
        sections = dict(zip(data_split, splits))
      elif splits:
        # any other non-empty count: only the first section is used
        sections = {'regular': splits[0]}
      else:
        # no sections on the page: record empty ranges instead of crashing
        sections = {}

      split_data = {}
      for ds in data_split:
        section = sections.get(ds)
        if section is None:
          split_data[ds] = []
          continue
        all_h3 = [heading.text for heading in section.find_all('h3')]
        split_data[ds] = self._split_range(all_h3)
      collections[year] = split_data

    return collections

class NBASchedules:
  """Empty placeholder; presumably the NBA counterpart of the schedule classes above (not implemented)."""

class NHLSchedules:
  """Empty placeholder; presumably the NHL counterpart of the schedule classes above (not implemented)."""

class NCAAFSchedules:
  """Empty placeholder; presumably the NCAAF counterpart of the schedule classes above (not implemented)."""

class NCAABSchedules:
  """Empty placeholder; presumably the NCAAB counterpart of the schedule classes above (not implemented)."""

In [36]:
# Build the season lookup. The constructor reads the season index pages only when
# the class-level cache is empty, then writes seasons.json if that file is missing.
seasons = CollectSeasons()

In [91]:
# Collect the first/last schedule dates of every MLB season; this issues one
# web request per season, so it is slow.
s = MLBSchedules(seasons)._collect_and_save_splits()

Working on MLB Schedules...


  0%|          | 0/146 [00:00<?, ?it/s]

In [92]:
# Show the collected splits: {year: {'regular': [...], 'post': [...]}}.
# NOTE(review): this prints every season; a small slice would keep the output short.
s

{1876: {'post': [],
  'regular': [Timestamp('1876-04-22 00:00:00'),
   Timestamp('1876-10-21 00:00:00')]},
 1877: {'post': [],
  'regular': [Timestamp('1877-04-30 00:00:00'),
   Timestamp('1877-10-06 00:00:00')]},
 1878: {'post': [],
  'regular': [Timestamp('1878-05-01 00:00:00'),
   Timestamp('1878-09-30 00:00:00')]},
 1879: {'post': [],
  'regular': [Timestamp('1879-05-01 00:00:00'),
   Timestamp('1879-09-30 00:00:00')]},
 1880: {'post': [],
  'regular': [Timestamp('1880-05-01 00:00:00'),
   Timestamp('1880-10-01 00:00:00')]},
 1881: {'post': [],
  'regular': [Timestamp('1881-04-30 00:00:00'),
   Timestamp('1881-09-30 00:00:00')]},
 1882: {'post': [],
  'regular': [Timestamp('1882-05-01 00:00:00'),
   Timestamp('1882-10-02 00:00:00')]},
 1883: {'post': [],
  'regular': [Timestamp('1883-05-01 00:00:00'),
   Timestamp('1883-09-30 00:00:00')]},
 1884: {'post': [],
  'regular': [Timestamp('1884-04-17 00:00:00'),
   Timestamp('1884-10-19 00:00:00')]},
 1885: {'post': [],
  'regular': [Tim

In [54]:
# Install into the running kernel's environment (%pip rather than !pip), pinned to
# the version this notebook was run with, and quiet to avoid progress-bar noise.
%pip install -q sportsipy==0.6.0

Collecting sportsipy
  Downloading sportsipy-0.6.0-py3-none-any.whl (499 kB)
[?25l[K     |▋                               | 10 kB 19.5 MB/s eta 0:00:01[K     |█▎                              | 20 kB 20.5 MB/s eta 0:00:01[K     |██                              | 30 kB 17.1 MB/s eta 0:00:01[K     |██▋                             | 40 kB 12.8 MB/s eta 0:00:01[K     |███▎                            | 51 kB 10.6 MB/s eta 0:00:01[K     |████                            | 61 kB 12.2 MB/s eta 0:00:01[K     |████▋                           | 71 kB 12.3 MB/s eta 0:00:01[K     |█████▎                          | 81 kB 11.1 MB/s eta 0:00:01[K     |██████                          | 92 kB 12.1 MB/s eta 0:00:01[K     |██████▌                         | 102 kB 12.7 MB/s eta 0:00:01[K     |███████▏                        | 112 kB 12.7 MB/s eta 0:00:01[K     |███████▉                        | 122 kB 12.7 MB/s eta 0:00:01[K     |████████▌                       | 133 kB 12.7 MB/s e

In [55]:
from sportsipy.mlb.schedule import Schedule

In [59]:
# Load the 2020 schedule for team abbreviation 'HOU' via sportsipy.
# NOTE(review): this rebinds `s`, which earlier held the MLB split ranges.
s = Schedule('HOU', year = 2020)

In [60]:
# Display the schedule as a DataFrame.
# NOTE(review): this renders the whole frame; `.head()` would keep the output short.
s.dataframe

Unnamed: 0,attendance,boxscore_index,date,datetime,game_number_for_day,day_or_night,game,game_duration,games_behind,innings,...,loser,opponent_abbr,rank,record,result,runs_allowed,runs_scored,save,streak,winner
HOU/HOU202007240,,HOU/HOU202007240,"Friday, Jul 24",2020-07-24,1,Night,1,2:49,0.0,9,...,Gonzales,SEA,1,1-0,Win,2,8,,+,Verlander
HOU/HOU202007250,,HOU/HOU202007250,"Saturday, Jul 25",2020-07-25,1,Day,2,2:53,-1.0,9,...,Walker,SEA,1,2-0,Win,2,7,,++,McCullers
HOU/HOU202007260,,HOU/HOU202007260,"Sunday, Jul 26",2020-07-26,1,Day,3,3:41,0.0,9,...,Devenski,SEA,1,2-1,Loss,7,6,Williams,-,Altavilla
HOU/HOU202007270,,HOU/HOU202007270,"Monday, Jul 27",2020-07-27,1,Night,4,3:04,0.0,9,...,Graveman,SEA,1,3-1,Win,5,8,Osuna,+,Bielak
HOU/HOU202007280,,HOU/HOU202007280,"Tuesday, Jul 28",2020-07-28,1,Night,5,3:19,0.0,9,...,Valdez,LAD,1,3-2,Loss,5,2,Jansen,-,Kolarek
HOU/HOU202007290,,HOU/HOU202007290,"Wednesday, Jul 29",2020-07-29,1,Night,6,4:44,0.0,13,...,Sneed,LAD,1,3-3,Loss,4,2,,--,Santana
ANA/ANA202007310,,ANA/ANA202007310,"Friday, Jul 31",2020-07-31,1,Night,7,4:21,-0.5,9,...,Andriese,LAA,1,4-3,Win,6,9,Scrubb,+,Bielak
ANA/ANA202008010,,ANA/ANA202008010,"Saturday, Aug 1",2020-08-01,1,Day,8,3:58,0.0,10,...,Rodriguez,LAA,1,4-4,Loss,5,4,,-,Buchter
ANA/ANA202008020,,ANA/ANA202008020,"Sunday, Aug 2",2020-08-02,1,Day,9,4:37,0.0,11,...,Barnes,LAA,1,5-4,Win,5,6,,+,Taylor
ARI/ARI202008040,,ARI/ARI202008040,"Tuesday, Aug 4",2020-08-04,1,Night,10,3:09,0.5,9,...,Bumgarner,ARI,2,6-4,Win,2,8,,++,Javier
