In [68]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import datetime
from datetime import datetime as DT
from typing import List

In [74]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [114]:
response = requests.get("https://www.basketball-reference.com/leagues/NBA_2020_games-october-2020.html")
Soup = BeautifulSoup(response.text, 'lxml')
rows = Soup.find_all('tr')
len(rows)
for row in rows: 
    print(row.th.text)
    if row.th.text != "Date":
        date_of_game = DT.strptime(row.th.text, '%a, %b %d, %Y')
        print(date_of_game)

Date
Fri, Oct 2, 2020
2020-10-02 00:00:00
Sun, Oct 4, 2020
2020-10-04 00:00:00
Tue, Oct 6, 2020
2020-10-06 00:00:00
Fri, Oct 9, 2020
2020-10-09 00:00:00
Sun, Oct 11, 2020
2020-10-11 00:00:00


In [75]:
def get_proper_name(teamName: str) -> str:
    issue_teams = ["Los Angeles Clippers", "Los Angeles Lakers", "Oklahoma City Thunder"]
    if teamName not in issue_teams:
        return teamName.rsplit(' ', 1)[0]
    else:
        if teamName == issue_teams[0]:
            return 'LA Clippers'
        elif teamName == issue_teams[1]:
            return 'LA Lakers'
        elif teamName == issue_teams[2]:
            return 'Okla City'

In [76]:
def get_months_in_season(year: int) -> List[str]:
    october_to_june = ["october", "november", "december", "january", "february", "march", "april", "may", "june"]
    november_to_june = ["november", "december", "january", "february", "march", "april", "may", "june"]
    switcher = {
        2020: ["october-2019", "november", "december", "january", "february", "march", "july", "august", "september", "october-2020"],
        2019: october_to_june,
        2018: october_to_june,
        2017: october_to_june,
        2016: october_to_june,
        2015: october_to_june,
        2014: october_to_june,
        2013: october_to_june,
        2012: ["december", "january", "february", "march", "april", "may", "june"],
        2011: october_to_june,
        2010: october_to_june,
        2009: october_to_june,
        2008: october_to_june,
        2007: october_to_june,
        2006: november_to_june,
        2005: november_to_june,
        2004: october_to_june,
        2003: october_to_june,
        2002: october_to_june,
        2001: october_to_june,
        2000: november_to_june,
        1999: ["february", "march", "april", "may", "june"],
        1998: october_to_june,
        1997: november_to_june,
        1996: november_to_june,
        1995: november_to_june,
        1994: november_to_june,
        1993: november_to_june,
        1992: november_to_june,
        1991: november_to_june,
        1990: november_to_june,
    }
    return switcher.get(year, "out of range... range is 1990-2020")

In [77]:
class Game:
    def __init__(self, timestamp, homeTeam, awayTeam, didHomeWin):
        self.day = timestamp
        self.homeTeam = homeTeam
        self.awayTeam = awayTeam
        self.didHomeWin = didHomeWin
        
    def __repr__(self):
        return f"Time: {self.day}, Home Team: {self.homeTeam}, Away Team {self.awayTeam}, Home Team Won: {self.didHomeWin}"

In [142]:
# inclusize of startYear not inclusive on endYear
def scrape_game_data(startYear: int, endYear: int) -> List[Game]:
    start_date = datetime.date(startYear, 1, 2)
    end_date = datetime.date(endYear, 1, 1)
    delta = datetime.timedelta(weeks=52, days=1)

    dates = []
    awayTeams = []
    homeTeams = []
    homeWins = []

    while start_date <= end_date:
        int_year = start_date.year
        urls = [f"https://www.basketball-reference.com/leagues/NBA_{int_year}_games-{month}.html" for month in get_months_in_season(int_year)]
        # print(f"for {int_year} the urls are: \n {urls}")
        for url in tqdm(urls):
            response = requests.get(url)
            Soup = BeautifulSoup(response.text, 'lxml')
            rows = Soup.find_all('tr')
            for row in rows:
                if row.th.text not in ["Date", "Playoffs"]:
                    #print(url)
                    rowElems = row.find_all('td')
                    # rowElems[0] is the time of the game
                    # added 'm' in f string because this is the format of rowElems[0] 7:00p. and date time needs pm or am not p or a
                    # gameDay = DT.strptime(f"{row.th.text} {rowElems[0].text}m", '%a, %b %w, %Y %I:%M%p')
                    
                    # Get text vals and append them to their respective list
                    # date
                    try:
                        date_of_game = DT.strptime(row.th.text, '%a, %b %d, %Y')
                        dates.append(date_of_game)
                    except ValueError as e:
                        print(row.th.text)
                        break
                    # away team
                    away = get_proper_name(rowElems[1].text)
                    awayTeams.append(away)
                    # home team
                    home = get_proper_name(rowElems[3].text)
                    homeTeams.append(home)
                    # Did Home team win?
                    awayPts = rowElems[2].text
                    homePts = rowElems[4].text
                    didHomeWin = int(homePts) > int(awayPts)
                    homeWins.append(didHomeWin)
                    
                    #print(f"Time: {date_of_game}, Home/pts: {home}/{homePts}, Away/pts: {away}/{awayPts}, HomeTeamWon: {didHomeWin}")
                    # newGame = Game(gameDay, home, away, didHomeWin)
        # increment year            
        start_date += delta
        
    ret = {
        'Date': dates,
        'Home': homeTeams,
        'Away': awayTeams,
        'DidHomeWin': homeWins,
    }
    return ret

In [143]:
gamesDict = scrape_game_data(2005, 2021)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [144]:
DF = pd.DataFrame(gamesDict)
DF.head(100)


Unnamed: 0,Date,Home,Away,DidHomeWin
0,2004-11-02,Detroit,Houston,True
1,2004-11-02,Dallas,Sacramento,True
2,2004-11-02,LA Lakers,Denver,True
3,2004-11-03,Cleveland,Indiana,False
4,2004-11-03,Orlando,Milwaukee,True
...,...,...,...,...
95,2004-11-15,Cleveland,Golden State,True
96,2004-11-15,New Jersey,Houston,False
97,2004-11-16,Charlotte,Utah,False
98,2004-11-16,Philadelphia,Seattle,False


In [140]:
DF.count()

Date          21680
Home          21680
Away          21680
DidHomeWin    21680
dtype: int64

In [27]:
start_date = datetime.date(2000, 1, 1)
end_date = datetime.date(2020, 1, 1)
months = ["october", "november", "december", "january", "february", "march", "april", "may", "june"]
# 2012 is from Dec - June
# 2006, 2005, 2000, 1997 is from November - June
# 1999 is from February - June
url = "https://www.basketball-reference.com/leagues/NBA_2020_games-december.html"

response = requests.get(url)
Soup = BeautifulSoup(response.text, 'lxml')
rows = Soup.find_all('tr')
games = []
for row in rows:
    if row.th.text != "Date":
        rowElems = row.find_all('td')
        print(row.th.text, "\n")
        print(type(row.th.text))
        # rowElems[0] is the time of the game
        # added 'm' in f string because this is the format of rowElems[0] 7:00p. and date time needs pm or am not p or a
        # gameDay = DT.strptime(f"{row.th.text} {rowElems[0].text}m", '%a, %b %w, %Y %I:%M%p')
        gameDay = DT.strptime(row.th.text, '%a, %b %d, %Y')

        away = rowElems[1].text
        awayPts = rowElems[2].text
        home = rowElems[3].text
        homePts = rowElems[4].text
        didHomeWin = homePts > awayPts
        newGame = Game(gameDay, home, away, didHomeWin)
        games.append(newGame)




Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Sun, Dec 1, 2019 

<class 'str'>
Mon, Dec 2, 2019 

<class 'str'>
Mon, Dec 2, 2019 

<class 'str'>
Mon, Dec 2, 2019 

<class 'str'>
Mon, Dec 2, 2019 

<class 'str'>
Mon, Dec 2, 2019 

<class 'str'>
Mon, Dec 2, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Tue, Dec 3, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4, 2019 

<class 'str'>
Wed, Dec 4

In [28]:
start_date = datetime.date(2001,1,2)
end_date = datetime.date(2020,1,1)
delta = datetime.timedelta(weeks=52, days=1)

while start_date <= end_date:
    print(start_date.year)
    start_date += delta


2001
2002
2003
2004
2005
2006
2007
2008
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [33]:
for game in games:
    print(get_proper_name(game.homeTeam))

Brooklyn
Minnesota
New York
LA Lakers
Detroit
New Orleans
Orlando
Toronto
LA Clippers
Charlotte
Philadelphia
Atlanta
Memphis
Milwaukee
Sacramento
Cleveland
Washington
New Orleans
Toronto
San Antonio
Denver
LA Clippers
Charlotte
Detroit
Orlando
Atlanta
Boston
Chicago
Okla City
Dallas
Utah
Portland Trail
Washington
New York
Toronto
New Orleans
Charlotte
Cleveland
Detroit
Boston
Chicago
Miami
Okla City
Milwaukee
San Antonio
Portland Trail
Dallas
New York
Philadelphia
Houston
Utah
Brooklyn
Charlotte
Miami
Philadelphia
Washington
Dallas
Portland Trail
LA Lakers
Indiana
Boston
Chicago
Houston
Milwaukee
New Orleans
Phoenix
Utah
Golden State
Charlotte
Miami
Philadelphia
Portland Trail
Cleveland
Indiana
Orlando
Toronto
Brooklyn
Chicago
Minnesota
Phoenix
Milwaukee
Sacramento
Golden State
Boston
San Antonio
Detroit
Denver
Miami
Orlando
Philadelphia
Atlanta
Chicago
Memphis
Utah
Minnesota
Sacramento
Phoenix
Toronto
Chicago
Memphis
Dallas
Milwaukee
Denver
Houston
New Orleans
Indiana
Atlanta
Brooklyn

In [30]:
list1 = ["a", "b", "c"]
list2 = [2, 3, 1]

zipped_lists = zip(list2, list1)
sorted_zipped_lists = sorted(zipped_lists)
print(sorted_zipped_lists)
sorted_list1 = [element for _, element in sorted_zipped_lists]

[(1, 'c'), (2, 'a'), (3, 'b')]
