In [2]:
import datetime
import json
import logging
import os
import sys

import dateutil.parser
import mechanicalsoup
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession


In [3]:
# requests_html session; not referenced by the cells visible below, kept in
# case other cells rely on it.
session = HTMLSession()

# MechanicalSoup takes the User-Agent through the constructor. Assigning
# `browser.addheaders` (a mechanize-style attribute) only creates an unused
# attribute, so the header was never actually sent.
browser = mechanicalsoup.StatefulBrowser(user_agent='Firefox')


In [4]:
def get_games(date):
    '''
        Get the NBA games scheduled on a particular day.
        The data is scraped from the espn.com schedule page using the
        module-level `browser`.

        date:   YYYYMMDD format

        Returns a dict keyed by game ID. Each value is the tuple
        ([home_team, away_team], start_time), where start_time is the
        timestamp found in the game cell, parsed and rendered as text with
        its last nine characters removed.
        Returns an empty dict when the page has no schedule table.
    '''

    games_dict = {}

    browser.open("https://www.espn.com/nba/schedule/_/date/" + date)  # URL containing the games for the given date

    # NOTE(review): find_link() is called without any filter, so this follows
    # whichever link it returns for the page - confirm this hop is intended.
    schedule_link = browser.find_link()
    browser.follow_link(schedule_link)

    page = browser.get_current_page()  # parsed source of the current page
    games = page.find(class_="schedule has-team-logos align-left")  # tag that contains the games' info
    if games is None:
        # No schedule table on the page: report "no games" instead of
        # failing with AttributeError on None.
        return games_dict

    for game in games.find_all('tr'):
        contents = game.find_all('td')
        if len(contents) < 3:
            # Rows without a third cell cannot hold the game link parsed below.
            continue

        # Team names come from the `title` attribute of every <abbr> in the row.
        teams = [abbr.get('title') for content in contents for abbr in content.find_all('abbr')]

        # The slices below are fixed character offsets into the raw markup of
        # the third cell, so they depend on the exact HTML ESPN serves.
        element_list = str(contents[2]).split('>')
        gameId = element_list[1][52:-36]  # Isolates the game ID
        iso_date = element_list[0][41:-1]  # Isolates the ISO 8601 timestamp
        start_time = str(dateutil.parser.parse(iso_date))[:-9]  # Parsed timestamp, trimmed

        teams.reverse()  # Inverts the order to use the Home vs. Away format
        games_dict[gameId] = teams, start_time  # Adds the game to the dictionary using the ID as key

    return games_dict

In [5]:
def get_results(date):
    '''
        Get the results of the NBA games played on a particular day.
        The data is scraped from the espn.com schedule page using the
        module-level `browser`.

        date:   YYYYMMDD format

        Returns a dict keyed by game ID. Each value is the tuple
        ([home_team, away_team], result), where result is the text of the
        result cell (e.g. "HOU 109, LAC 96").
        Returns an empty dict when the page has no schedule table.
    '''

    games_dict = {}

    browser.open("https://www.espn.com/nba/schedule/_/date/" + date)  # URL containing the games for the given date

    # NOTE(review): find_link() is called without any filter, so this follows
    # whichever link it returns for the page - confirm this hop is intended.
    schedule_link = browser.find_link()
    browser.follow_link(schedule_link)

    page = browser.get_current_page()  # parsed source of the current page
    games = page.find(class_="schedule has-team-logos align-left")  # tag that contains the games' info
    if games is None:
        # No schedule table on the page: report "no games" instead of
        # failing with AttributeError on None.
        return games_dict

    for game in games.find_all('tr'):
        contents = game.find_all('td')
        if len(contents) < 3:
            # Rows without a third cell cannot hold the result link parsed below.
            continue

        # Team names come from the `title` attribute of every <abbr> in the row.
        teams = [abbr.get('title') for content in contents for abbr in content.find_all('abbr')]

        # The slices below are fixed character offsets into the raw markup of
        # the third cell, so they depend on the exact HTML ESPN serves.
        element_list = str(contents[2]).split('>')
        gameId = element_list[1][28:-37]  # Isolates the game ID
        result = element_list[2][:-3]  # Isolates the result

        teams.reverse()  # Inverts the order to use the Home vs. Away format
        games_dict[gameId] = teams, result  # Adds the game to the dictionary using the ID as key

    return games_dict

In [6]:
def DictToDf(dic):
    '''
    Turn the scraped games dictionary into a Pandas DataFrame.

    dic: {'gameID': (['Home', 'Away'], 'WIN_ABREV WIN_POINTS, LOS_ABREV LOS_POINTS')}

    The returned frame is indexed by the dictionary keys (index name
    "Game_ID") and has the columns "Result", "Home" and "Away".
    '''

    frame = pd.DataFrame.from_dict(dic, orient='index')

    # Column 0 holds the [home, away] list of each game.
    matchups = frame[0]

    frame = (
        frame
        .drop(columns=0)
        .rename(columns={1: "Result"})
        .assign(
            Home=matchups.map(lambda pair: pair[0]),
            Away=matchups.map(lambda pair: pair[1]),
        )
    )

    return frame.rename_axis(index="Game_ID")

In [7]:
def getAbrev():
    '''
    Scrapes a Wikipedia page to get a dictionary of NBA team abbreviations.

    Returns a dict mapping abbreviation -> team name, extended with the
    shorter abbreviations that appear in the scraped result strings
    ("NY", "UTAH", "NO", "SA", "WSH", "GS") and with "LAC" forced to
    "LA Clippers".

    Raises requests.HTTPError when the page cannot be fetched and KeyError
    when an expected abbreviation is missing from the scraped table.
    '''

    url2 = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url=url2, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    results = {}
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        try:
            # The key is the first cell's text minus its final character
            # (presumably a trailing newline - confirm against the page).
            results[cells[0].string[:-1]] = cells[1].find('a').text
        except (IndexError, TypeError, AttributeError):
            # Rows without two cells, without plain text in the first cell,
            # or without a link in the second cell are not team rows.
            continue

    # Short abbreviation used in the result strings -> abbreviation scraped above.
    short_forms = {
        "NY": "NYK",
        "UTAH": "UTA",
        "NO": "NOP",
        "SA": "SAS",
        "WSH": "WAS",
        "GS": "GSW",
    }
    for short, scraped in short_forms.items():
        results[short] = results[scraped]

    # The team titles scraped from the schedule page spell this team "LA Clippers".
    results["LAC"] = "LA Clippers"

    return results

In [8]:
def _parse_result(result, abrev):
    '''
    Split one result string into its parts.

    result: text such as "HOU 109, LAC 96" (winner first, then loser)
    abrev:  dict mapping team abbreviation -> team name

    Returns (winner_name, winner_points, loser_name, loser_points), or None
    when the text is not a two-team score line with known abbreviations
    (e.g. "Postponed").
    '''
    if not isinstance(result, str):
        return None

    # Splitting each side on whitespace avoids depending on the single space
    # that follows the comma.
    sides = [side.split() for side in result.split(',')]
    if len(sides) < 2 or len(sides[0]) < 2 or len(sides[1]) < 2:
        return None

    won_abbr, won_points = sides[0][0], sides[0][1]
    lost_abbr, lost_points = sides[1][0], sides[1][1]
    if won_abbr not in abrev or lost_abbr not in abrev:
        return None

    return abrev[won_abbr], won_points, abrev[lost_abbr], lost_points


def formatResult(df, abrev=None):
    '''
    Adds the winner/loser columns derived from the "Result" column.

    df:    DataFrame with "Result" and "Home" columns; it is modified in
           place and also returned.
    abrev: optional dict mapping team abbreviation -> team name. When
           omitted it is fetched with getAbrev().

    Adds "Won", "Won_Points", "Lost", "Lost_Points" and "Home_Won"
    ("Yes"/"No"). Points are kept as the text found in the result. Rows
    whose result cannot be parsed (days without games, exhibition matches,
    postponed games) get None in all five columns, so every row keeps its
    own values and an empty frame is handled as well.
    '''

    if abrev is None:
        abrev = getAbrev()

    # Iterate over the values directly: the frame is indexed by game ID, so
    # integer lookups on the column are not reliable.
    parsed = [_parse_result(result, abrev) for result in df["Result"]]

    for position, column in enumerate(("Won", "Won_Points", "Lost", "Lost_Points")):
        df[column] = [None if row is None else row[position] for row in parsed]

    df["Home_Won"] = [
        None if row is None else ("Yes" if home == row[0] else "No")
        for home, row in zip(df["Home"], parsed)
    ]

    return df

In [9]:
def getByDay(date):
    '''
    Creates a DataFrame with the scraped and cleaned data for a specific day,
    writes it to a CSV file and returns it.

    date:   DD/MM/YYYY format

    Days before today are treated as played: results are scraped, formatted
    with formatResult() and saved to ./games/games<YYYYMMDD>.csv.
    Today and later days are treated as scheduled: the games are scraped
    with get_games() and saved to ./dates/dates<YYYYMMDD>.csv; in that case
    the "Result" column holds the start time returned by get_games().

    Raises ValueError when `date` is not a valid calendar date.
    '''

    day, month, year = date.split('/')
    url_date = year + month + day  # YYYYMMDD, the format used in the URL

    # Compare whole dates: testing year, month and day independently marks
    # e.g. 01/12 as "past" on 22/05 of the same year.
    game_day = datetime.date(int(year), int(month), int(day))
    is_past = game_day < datetime.date.today()

    if is_past:
        df = DictToDf(get_results(url_date))
        df["Date"] = url_date
        df = formatResult(df)

        os.makedirs('./games', exist_ok=True)
        df.to_csv('./games/games' + url_date + '.csv')
    else:
        df = DictToDf(get_games(url_date))
        # Stamp the rows with the requested day rather than an unrelated
        # notebook-level variable.
        df["Date"] = url_date

        os.makedirs('./dates', exist_ok=True)
        df.to_csv('./dates/dates' + url_date + '.csv')

    return df

In [10]:
def main(start_date, end_date):
    '''
    Scrapes every day from start_date to end_date (both included).

    start_date, end_date: day-first dates, e.g. "14/05/2019"

    Returns a list with one DataFrame per day that getByDay() handled
    successfully. Days that raise are skipped and reported through the
    `logging` module, so a single bad page does not stop the run.
    '''
    # Parse day-first explicitly so that ambiguous dates such as 03/05/2019
    # mean 3 May, matching the DD/MM/YYYY strings handed to getByDay().
    first_day = pd.to_datetime(start_date, dayfirst=True)
    last_day = pd.to_datetime(end_date, dayfirst=True)

    log = logging.getLogger(__name__)
    dfs = []

    for day in pd.date_range(start=first_day, end=last_day, freq='D'):
        day_label = day.strftime("%d/%m/%Y")
        try:
            dfs.append(getByDay(day_label))
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during the long scraping loop.
            log.warning("Skipping %s: %r", day_label, exc)

    return dfs

In [13]:
# Range of days to scrape, written day-first (DD/MM/YYYY).
start_date = "14/05/2019"
end_date = "22/05/2021"

# One DataFrame per day that getByDay() processed without raising.
dfs = main(start_date, end_date)

In [14]:
# Stack the per-day frames into a single table.
# pd.concat raises ValueError when `dfs` is empty (no day was scraped).
result = pd.concat(dfs)
result

Unnamed: 0_level_0,Result,Home,Away,Date,Won,Won_Points,Lost,Lost_Points,Home_Won
Game_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
401163114,"HOU 109, LAC 96",LA Clippers,Houston Rockets,20191003,Houston Rockets,109,LA Clippers,96,No
401163149,"DET 109, CLE 105",Detroit Pistons,Cleveland Cavaliers,20191011,Detroit Pistons,109,Cleveland Cavaliers,105,Yes
401163150,"IND 105, CHI 87",Indiana Pacers,Chicago Bulls,20191011,Indiana Pacers,105,Chicago Bulls,87,Yes
401163151,"BOS 100, ORL 75",Orlando Magic,Boston Celtics,20191011,Boston Celtics,100,Orlando Magic,75,No
401163152,"PHI 100, CHA 87",Charlotte Hornets,Philadelphia 76ers,20191011,Philadelphia 76ers,100,Charlotte Hornets,87,No
...,...,...,...,...,...,...,...,...,...
401326989,"BOS 118, WSH 100",Boston Celtics,Washington Wizards,20210518,Boston Celtics,118,Washington Wizards,100,Yes
401326990,"MEM 100, SA 96",Memphis Grizzlies,San Antonio Spurs,20210519,Memphis Grizzlies,100,San Antonio Spurs,96,Yes
401326991,"LAL 103, GS 100",Los Angeles Lakers,Golden State Warriors,20210519,Los Angeles Lakers,103,Golden State Warriors,100,Yes
401326993,"WSH 142, IND 115",Washington Wizards,Indiana Pacers,20210520,Washington Wizards,142,Indiana Pacers,115,Yes


In [15]:
# Write the combined table to CSV; to_csv does not create a missing ./games directory.
result.to_csv('./games/games.csv')