In [1]:
import mechanicalsoup
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import dateutil.parser
import datetime
import sys
import json
import pandas as pd
import requests
import numpy as np


In [2]:
# requests_html session; created here but not referenced by any of the cells visible below.
session = HTMLSession()
# Shared MechanicalSoup browser used by get_games() and get_results() to fetch the ESPN pages.
browser = mechanicalsoup.StatefulBrowser()
# NOTE(review): `addheaders` is the mechanize-style way of setting request headers.
# MechanicalSoup documents `set_user_agent()` / the `user_agent=` constructor argument
# instead, so this assignment presumably has no effect on the requests sent — confirm
# before changing it, since the HTML offsets used by the scrapers depend on the response.
browser.addheaders = [('User-agent', 'Firefox')]


In [3]:
def get_games(date):
    '''
        Get the NBA games listed on a particular day.
        The data is scraped from the espn.com schedule page.

        date:   YYYYMMDD format (string, appended to the schedule URL)

        Returns a dict keyed by game ID. Each value is a tuple
        (teams, time): `teams` is the list of <abbr title="..."> values found
        in the row, reversed (the original author's "Home vs. Away" order),
        and `time` is the start time parsed from the row's markup.
        Returns an empty dict when the schedule table is not found on the page.
    '''

    games_dict = {}

    # URL containing the games for the given date
    browser.open("https://www.espn.com/nba/schedule/_/date/"+date)

    # find_link() is called without a filter, so the first link on the page
    # is the one that gets followed.
    # NOTE(review): confirm this navigation step is intentional.
    schedule_link = browser.find_link()
    browser.follow_link(schedule_link)

    page = browser.get_current_page() # parsed source of the current page
    # Select only the tag that contains the games' info
    games = page.find(class_="schedule has-team-logos align-left")
    if games is None:
        # No schedule table on the page: report "no games" instead of
        # failing with AttributeError on None.
        return games_dict

    for game in games.find_all('tr'):
        contents = game.find_all('td')
        if len(contents) < 3:
            # Header rows have no <td>; shorter rows lack the cell that
            # carries the game link, so there is nothing to extract.
            continue

        teams = [
            abbr.get('title')
            for cell in contents
            for abbr in cell.find_all('abbr')
        ]

        # The fixed offsets below are tied to the exact markup ESPN served
        # when this was written; they break if the markup changes.
        element_list = str(contents[2]).split('>')
        gameId = str(element_list[1][52:-36]) # Isolates the game ID
        iso_date = str(element_list[0][41:-1]) # Isolates the ISO 8601 timestamp
        # Dropping the last 9 characters turns e.g.
        # '2021-04-28 23:00:00+00:00' into '2021-04-28 23:00'.
        time = str(dateutil.parser.parse(iso_date))[:-9]

        teams.reverse() # Inverts the order to use the Home vs. Away format
        games_dict[gameId] = teams , time # Adds the game to a dictionary using the ID as key

    return games_dict

In [4]:
def get_results(date):
    '''
        Get the results of the NBA games played on a particular day.
        The data is scraped from the espn.com schedule page.

        date:   YYYYMMDD format (string, appended to the schedule URL)

        Returns a dict keyed by game ID. Each value is a tuple
        (teams, result): `teams` is the list of <abbr title="..."> values
        found in the row, reversed (the original author's "Home vs. Away"
        order), and `result` is the score text taken from the row's third cell.
        Returns an empty dict when the schedule table is not found on the page.
    '''

    games_dict = {}

    # URL containing the games for the given date
    browser.open("https://www.espn.com/nba/schedule/_/date/"+date)

    # find_link() is called without a filter, so the first link on the page
    # is the one that gets followed.
    # NOTE(review): confirm this navigation step is intentional.
    schedule_link = browser.find_link()
    browser.follow_link(schedule_link)

    page = browser.get_current_page() # parsed source of the current page
    # Select only the tag that contains the games' info
    games = page.find(class_="schedule has-team-logos align-left")
    if games is None:
        # No schedule table on the page: report "no results" instead of
        # failing with AttributeError on None.
        return games_dict

    for game in games.find_all('tr'):
        contents = game.find_all('td')
        if len(contents) < 3:
            # Header rows have no <td>; shorter rows lack the result cell.
            continue

        teams = [
            abbr.get('title')
            for cell in contents
            for abbr in cell.find_all('abbr')
        ]

        # The fixed offsets below are tied to the exact markup ESPN served
        # when this was written; they break if the markup changes.
        element_list = str(contents[2]).split('>')
        gameId = str(element_list[1][28:-37]) # Isolates the game ID
        result = str(element_list[2][:-3]) # Isolates the result

        teams.reverse() # Inverts the order to use the Home vs. Away format
        games_dict[gameId] = teams , result # Adds the game to a dictionary using the ID as key

    return games_dict

In [5]:
def DictToDf(dic):
    '''
        Turn the dict produced by get_games() / get_results() into a DataFrame.

        dic:    {game_id: (teams, info)} where teams[0] is stored as "Home",
                teams[1] as "Away" and `info` (the result text or the start
                time) ends up in the "Result" column.

        Returns a DataFrame indexed by "Game_ID" with the columns
        "Result", "Home" and "Away". An empty dict yields an empty frame
        with those same columns instead of raising KeyError.
    '''
    if not dic:
        # from_dict() on an empty mapping creates no columns at all, so the
        # df[0] lookups below would fail; build the empty frame explicitly.
        empty = pd.DataFrame(columns=["Result", "Home", "Away"])
        empty.index.name = "Game_ID"
        return empty

    # Column 0 holds the team list, column 1 the result / time.
    df = pd.DataFrame.from_dict(dic, orient='index')

    df['Home'] = df[0].map(lambda teams: teams[0])
    df['Away'] = df[0].map(lambda teams: teams[1])
    df = df.drop(0, axis=1)

    df = df.rename(columns={1: "Result"})
    df.index.name = "Game_ID"

    return df

In [6]:
def getAbrev():
    '''
        Build a lookup from NBA team abbreviation to team name.

        The table is scraped from a Wikipedia project page: for every table
        row, the text of the first cell (minus its last character) is the
        key and the text of the first link in the second cell is the value.
        A few extra aliases (NY, UTAH, NO, SA, WSH, LAC) are added on top.

        Returns a dict {abbreviation: team name}.
        Raises requests.HTTPError when the page cannot be fetched and
        KeyError when an expected abbreviation is missing from the page.
    '''
    url2 = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

    # The timeout keeps a stalled connection from hanging the notebook;
    # raise_for_status() stops an error page from being parsed as if it
    # were the abbreviation table.
    response = requests.get(
        url=url2,
        timeout=30,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    results = {}
    for row in soup.find_all('tr'):
        aux = row.find_all('td')
        try:
            # [:-1] drops the last character of the cell text
            # (presumably a trailing newline — TODO confirm).
            results[aux[0].string[:-1]] = aux[1].find('a').text
        except (IndexError, TypeError, AttributeError):
            # Best-effort parsing: skip rows that are not
            # "abbreviation | linked team name" (fewer than two cells,
            # a first cell without plain text, or no link in the second).
            continue

    # Extra spellings mapped onto the Wikipedia abbreviations.
    results["NY"] =  results["NYK"]
    results["UTAH"] =  results["UTA"]
    results["NO"] =  results["NOP"]
    results["SA"] =  results["SAS"]
    results["WSH"] =  results["WAS"]
    results["LAC"] = "LA Clippers"

    return results

In [7]:
def formatResult(df, abrev=None):
    '''
        Split the "Result" column into winner / loser names and scores.

        df:     DataFrame with the columns "Result" (text such as
                "ORL 109, CLE 104", first team listed is stored as the
                winner) and "Home".
        abrev:  optional {abbreviation: team name} mapping; when omitted it
                is fetched with getAbrev().

        The columns "Won", "Won_Points", "Lost", "Lost_Points" and
        "Home_Won" ("Yes"/"No") are added to `df` in place, and the same
        frame is returned. Points are kept as the strings found in the text.

        Raises ValueError when a result cannot be split into two
        "TEAM POINTS" parts or uses an abbreviation missing from `abrev`.
    '''
    if abrev is None:
        abrev = getAbrev()

    Won = []
    Won_Points = []
    Lost = []
    Lost_Points = []

    # Iterate over the values directly: looking rows up with an integer key
    # on a label index is ambiguous (and fails for integer-labelled frames).
    for row in df["Result"]:
        # "ORL 109, CLE 104" -> [["ORL", "109"], ["CLE", "104"]]
        name_res = [side.split() for side in row.split(',')]

        if len(name_res) < 2 or len(name_res[0]) < 2 or len(name_res[1]) < 2:
            raise ValueError("Cannot parse result: {!r}".format(row))

        won_abbr, won_points = name_res[0][0], name_res[0][1]
        lost_abbr, lost_points = name_res[1][0], name_res[1][1]

        if won_abbr not in abrev or lost_abbr not in abrev:
            # Message kept from the original ("some abbreviation is wrong"),
            # now raised as a real exception with the offending row.
            raise ValueError("TEM ALGUMA ABREV ERRADA: {!r}".format(row))

        Won.append(abrev[won_abbr])
        Won_Points.append(won_points)
        Lost.append(abrev[lost_abbr])
        Lost_Points.append(lost_points)

    df["Won"] = Won
    df["Won_Points"] = Won_Points
    df["Lost"] = Lost
    df["Lost_Points"] = Lost_Points

    df['Home_Won'] = np.where((df['Home'] == df['Won']), "Yes", "No")

    return df

In [8]:
def main(date):
    '''
    Scrape one day of NBA games and save it as a CSV file.

    date:   DD/MM/YYYY format

    For a date before today the results are scraped, formatted and written
    to ./games/games<YYYYMMDD>.csv; for today or a later date the scheduled
    games are written to ./dates/dates<YYYYMMDD>.csv.

    Returns the DataFrame that was written.
    Raises ValueError when `date` is not a valid DD/MM/YYYY date.
    '''
    import os  # local import: only needed to create the output folder

    target = datetime.datetime.strptime(date, "%d/%m/%Y").date()

    # Changing the date format to use in the URL (zero-padded YYYYMMDD)
    date = target.strftime("%Y%m%d")

    # Checking if the given date is in the past. Whole dates are compared:
    # comparing year, month and day independently treats e.g. the 15th of
    # next month as "past" whenever today's day-of-month is above 15.
    if target < datetime.date.today():
        dic = get_results(date)
        df = DictToDf(dic)
        df = formatResult(df)
        out_path = './games/games'+date+'.csv'

    else:
        dic = get_games(date)
        df = DictToDf(dic)
        out_path = './dates/dates'+date+'.csv'

    # Create the target folder on first use so a fresh checkout can run.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path)

    return df

In [9]:
# Pick the date to scrape: the first command-line argument when it is a
# DD/MM/YYYY date, otherwise a fixed default. Inside a Jupyter kernel
# sys.argv[1] is typically a kernel option rather than a date, which is
# why the fallback exists.
# Only the argument check is guarded: a scraping or network failure inside
# main() now surfaces instead of being silently replaced by the default
# date's data.
try:
    requested_date = str(sys.argv[1])
    datetime.datetime.strptime(requested_date, "%d/%m/%Y")
except (IndexError, ValueError):
    requested_date = "28/04/2021"

df = main(requested_date)


df

Unnamed: 0,Result,Home,Away,Won,Won_Points,Lost,Lost_Points,Home_Won
401307736,"ORL 109, CLE 104",Cleveland Cavaliers,Orlando Magic,Orlando Magic,109,Cleveland Cavaliers,104,No
401307737,"PHI 127, ATL 83",Philadelphia 76ers,Atlanta Hawks,Philadelphia 76ers,127,Atlanta Hawks,83,Yes
401307738,"WSH 116, LAL 107",Washington Wizards,Los Angeles Lakers,Washington Wizards,116,Los Angeles Lakers,107,Yes
401307739,"BOS 120, CHA 111",Boston Celtics,Charlotte Hornets,Boston Celtics,120,Charlotte Hornets,111,Yes
401307740,"NY 113, CHI 94",New York Knicks,Chicago Bulls,New York Knicks,113,Chicago Bulls,94,Yes
401307741,"MIA 116, SA 111",Miami Heat,San Antonio Spurs,Miami Heat,116,San Antonio Spurs,111,Yes
401307742,"POR 130, MEM 109",Memphis Grizzlies,Portland Trail Blazers,Portland Trail Blazers,130,Memphis Grizzlies,109,No
401307743,"DEN 114, NO 112",Denver Nuggets,New Orleans Pelicans,Denver Nuggets,114,New Orleans Pelicans,112,Yes
401307744,"PHX 109, LAC 101",Phoenix Suns,LA Clippers,Phoenix Suns,109,LA Clippers,101,Yes
401307745,"UTAH 154, SAC 105",Sacramento Kings,Utah Jazz,Utah Jazz,154,Sacramento Kings,105,No
