In [1]:
import mechanicalsoup
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import dateutil.parser
import datetime
import sys
import json
import pandas as pd
import requests
import numpy as np


In [2]:
session = HTMLSession()
browser = mechanicalsoup.StatefulBrowser()
browser.addheaders = [('User-agent', 'Firefox')]


In [3]:
def get_games(date):
    '''
        Get NBA games on a particular day
        The data is being screapped from the espn.com site
        date:   YYYYMMDD format
    '''

    games_dict = {}
    gameIndex = 0

    browser.open("https://www.espn.com/nba/schedule/_/date/"+date) # URL containig the games for the given date 

    # Opening the link
    schedule_link = browser.find_link()
    browser.follow_link(schedule_link)

    page = browser.get_current_page() # get the page source code
    games = page.find(class_="schedule has-team-logos align-left") # select only the tag that contains the games' info
    desc_all = games.find_all('small') # Game description

    # return games_dict
    for game in games.find_all('tr'):
        teams = []

        contents = game.find_all('td')
        for content in contents:
            for td in content.find_all('abbr'):
                teams.append(td.get('title'))

        if contents:
            element_list = str(contents[2]).split('>')
            gameId = str(element_list[1][52:-36]) # Isolates the game ID
            date = str(element_list[0][41:-1]) # Isolates the result
            time = str(dateutil.parser.parse(date))[:-9] # Converts date time from ISO 8601:

            teams.reverse() # Inverts the order to use the Home vs. Away format
            games_dict[gameId] = teams , time # Adds the game to a dictionary using the ID as key

            gameIndex+=1

    return games_dict

In [4]:
def get_results(date):
    '''
        Get the results NBA games on a particular day
        The data is being screapped from the espn.com site
        date:   YYYYMMDD format
    '''

    games_dict = {}
    gameIndex = 0

    browser.open("https://www.espn.com/nba/schedule/_/date/"+date) # URL containig the games for the given date 

    # Opening the link
    schedule_link = browser.find_link()
    browser.follow_link(schedule_link)

    page = browser.get_current_page() # get the page source code
    games = page.find(class_="schedule has-team-logos align-left") # select only the tag that contains the games' info

    for game in games.find_all('tr'):
        teams = []

        contents = game.find_all('td')
        for content in contents:
            for td in content.find_all('abbr'):
                teams.append(td.get('title'))

        if contents:
            element_list = str(contents[2]).split('>')
            gameId = str(element_list[1][28:-37]) # Isolates the game ID
            result = str(element_list[2][:-3]) # Isolates the result

            teams.reverse() # Inverts the order to use the Home vs. Away format
            games_dict[gameId] = teams , result # Adds the game to a dictionary using the ID as key

            gameIndex+=1

    return games_dict

In [5]:
def DictToDf(dic):
    df = pd.DataFrame.from_dict(dic,orient='index')
    
    df.loc[:, 'Home'] = df[0].map(lambda x: x[0])
    df.loc[:, 'Away'] = df[0].map(lambda x: x[1])
    df = df.drop(0,axis=1)
    
    df = df.rename(columns={1: "Result"})
    df.index.name = "Game_ID"
    
    return df

In [6]:
def getAbrev():
    url2 = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

    response = requests.get(
        url=url2,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    results = {}
    for row in soup.findAll('tr'):
        aux = row.findAll('td')
        try:
            results[aux[0].string[:-1]] = aux[1].find('a').text
        except:
            pass
    
    results["NY"] =  results["NYK"]
    results["UTAH"] =  results["UTA"]
    results["NO"] =  results["NOP"]
    results["SA"] =  results["SAS"]
    results["WSH"] =  results["WAS"]
    results["LAC"] = "LA Clippers"
    results["GS"] =  results["GSW"]
    
    
    return results

In [7]:
def formatResult(df):
    Won = []
    Won_Points = []
    Lost = []
    Lost_Points = []
    abrev = getAbrev()

    for r in range(len(df.Result)):
        row=df.Result[r]
        teams = row.split(',')
        name_res = [team.split(' ') for team in teams]

        if name_res[0][0] in abrev and  name_res[1][1] in abrev:
            Won.append(abrev[name_res[0][0]])
            Won_Points.append(name_res[0][1])

            Lost.append(abrev[name_res[1][1]])
            Lost_Points.append(name_res[1][2])

        else: # Days without games or with exibition matches
            if name_res[0][0] != "Postponed":
                print("ERRO: ABREV ERRADA\n")
                try:
                    print(name_res[0][0])
                    print(name_res[1][1])
                    print(df['Home'],df['Away'])
                except:
                    print(df["Date"][0])

    if name_res[0][0] != "Postponed":
        df["Won"] = Won
        df["Won_Points"] = Won_Points
        df["Lost"] = Lost
        df["Lost_Points"] = Lost_Points

        df['Home_Won'] = np.where((df['Home'] == df['Won']), "Yes", "No")
    
    return df

In [8]:
def getByDay(date):
    '''
    date:   DD/MM/YYYY format
    '''

    now = datetime.datetime.now()

    # Changing the date format to use in the URL
    date_split = date.split('/')
    date = [date_split[2],date_split[1],date_split[0]]
    date = ''.join(date)

    #Checking if the given date is in the past
    if now.year > int(date_split[2]) or now.month > int(date_split[1]) or now.day > int(date_split[0]):
        dic = get_results(date)
        df = DictToDf(dic)
        df["Date"] = [end_date]*len(df)

        df = formatResult(df)
        
        df.to_csv('./games/games'+date+'.csv')
        
        return df

    else:
        dic = get_games(date)
        df = DictToDf(dic)
        df["Date"] = [end_date]*len(df)
        df.to_csv('./dates/dates'+date+'.csv')
        
        return df

In [9]:
def main(start_date, end_date):
    dfs = []
    
    for day in pd.period_range(start=start_date, end=end_date, freq='D'):
        try:
            dfs.append(getByDay(str(day.strftime("%d/%m/%Y"))))
        except:
            print(day)
            
    return dfs

In [10]:
start_date = "01/01/2020"
end_date = "14/05/2021"

dfs = main(start_date, end_date)

ERRO: ABREV ERRADA

USA
WORLD
Game_ID
401197814    USA
Name: Home, dtype: object Game_ID
401197814    World
Name: Away, dtype: object
2020-02-14
2020-02-15
ERRO: ABREV ERRADA

LEB
GIA
Game_ID
401197813    Team Giannis
Name: Home, dtype: object Game_ID
401197813    Team LeBron
Name: Away, dtype: object
2020-02-16
2020-02-17
2020-02-18
2020-02-19
2020-03-12
2020-03-13
2020-03-14
2020-03-15
2020-03-16
2020-03-17
2020-03-18
2020-03-19
2020-03-20
2020-03-21
2020-03-22
2020-03-23
2020-03-24
2020-03-25
2020-03-26
2020-03-27
2020-03-28
2020-03-29
2020-03-30
2020-03-31
2020-04-01
2020-04-02
2020-04-03
2020-04-04
2020-04-05
2020-04-06
2020-04-07
2020-04-08
2020-04-09
2020-04-10
2020-04-11
2020-04-12
2020-04-13
2020-04-14
2020-04-15
2020-04-16
2020-04-17
2020-04-18
2020-04-19
2020-04-20
2020-04-21
2020-04-22
2020-04-23
2020-04-24
2020-04-25
2020-04-26
2020-04-27
2020-04-28
2020-04-29
2020-04-30
2020-05-01
2020-05-02
2020-05-03
2020-05-04
2020-05-05
2020-05-06
2020-05-07
2020-05-08
2020-05-09
2020

In [14]:
result = pd.concat(dfs)
result

Unnamed: 0_level_0,Result,Home,Away,Date,Won,Won_Points,Lost,Lost_Points,Home_Won
Game_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
401161146,"ORL 122, WSH 101",Washington Wizards,Orlando Magic,14/05/2021,Orlando Magic,122,Washington Wizards,101,No
401161147,"NY 117, POR 93",New York Knicks,Portland Trail Blazers,14/05/2021,New York Knicks,117,Portland Trail Blazers,93,Yes
401161148,"MIL 106, MIN 104",Milwaukee Bucks,Minnesota Timberwolves,14/05/2021,Milwaukee Bucks,106,Minnesota Timberwolves,104,Yes
401161149,"LAL 117, PHX 107",Los Angeles Lakers,Phoenix Suns,14/05/2021,Los Angeles Lakers,117,Phoenix Suns,107,Yes
401161150,"CHA 109, CLE 106",Cleveland Cavaliers,Charlotte Hornets,14/05/2021,Charlotte Hornets,109,Cleveland Cavaliers,106,No
...,...,...,...,...,...,...,...,...,...
401307869,"UTAH 109, OKC 93",Oklahoma City Thunder,Utah Jazz,14/05/2021,Utah Jazz,109,Oklahoma City Thunder,93,No
401307870,"DAL 114, TOR 110",Dallas Mavericks,Toronto Raptors,14/05/2021,Dallas Mavericks,114,Toronto Raptors,110,Yes
401307871,"HOU 122, LAC 115",Houston Rockets,LA Clippers,14/05/2021,Houston Rockets,122,LA Clippers,115,Yes
401307872,"MEM 107, SAC 106",Memphis Grizzlies,Sacramento Kings,14/05/2021,Memphis Grizzlies,107,Sacramento Kings,106,Yes


In [16]:
result.to_csv('./games/games.csv')