# NBA Single Game Predictions - Data Cleaning

## Summary

This notebook will download the necessary datasets from basketball-reference.com

## Implementations

In [2]:
# Python Set Up
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib

import time
import string

In [3]:
def getSoup(url):
    try:
        r = urllib.request.urlopen(url)
    except:
        return None

    return BeautifulSoup(r,"lxml")

**Import Team Game Data**

 download team game logs data from BasketballReference.com

In [3]:
def getTeamAdvanceGamelogs(team):

        soup =  getSoup("https://www.basketball-reference.com/teams/"+ team +"/2018/gamelog-advanced/")
        #print ("https://www.basketball-reference.com/teams/"+ team +"/2018/gamelog/")
        column_names = [th.getText() for th in 
                          soup.findAll('tr')[1].findAll('th')]
        
        # Rename the column names into correct format
        column_names[3] = "Home/Away"
        column_names[6] = "Team Score"
        column_names[7] = "Oponent Score"
        for i in range(8,23):
            column_names[i] = "Team "+ column_names[i] 
            
        

        for i in range(24,28):
            column_names[i] = "Opponent "+ column_names[i] 
            
                     
                     
        data_rows = soup.findAll('tr')[2:]
        game_logs = [[td.getText() for td in data_rows[i].findAll('td')]
                       for i in range(len(data_rows))]
        team_games = pd.DataFrame(game_logs, columns=column_names[1:])
        
        return team_games
        

In [4]:
def getTeamBasicGamelogs(team):

        soup =  getSoup("https://www.basketball-reference.com/teams/"+ team +"/2018/gamelog/")
        #print ("https://www.basketball-reference.com/teams/"+ team +"/2018/gamelog/")
        column_names = [th.getText() for th in 
                          soup.findAll('tr')[1].findAll('th')]
        
        # Rename the column names into correct format
        column_names[3] = "Home/Away"
        column_names[6] = "Team Score"
        column_names[7] = "Oponent Score"
        for i in range(8,23):
            column_names[i] = "Team "+ column_names[i] 
            
        

        for i in range(25,41):
            column_names[i] = "Opponent "+ column_names[i] 
            
                     
                     
        data_rows = soup.findAll('tr')[2:]
        game_logs = [[td.getText() for td in data_rows[i].findAll('td')]
                       for i in range(len(data_rows))]
        team_games = pd.DataFrame(game_logs, columns=column_names[1:])
        
        return team_games
        

In [31]:
def getTeamSplits(team):
    soup =  getSoup("https://www.basketball-reference.com/teams/"+ team +"/2018/splits/")
    column_names = [th.getText() for th in 
                      soup.findAll('tr')[1].findAll('th')]
    for i in range(5,19):
        column_names[i] = "Team "+ column_names[i] 

    for i in range(19,33):
        column_names[i] = "Opponent "+ column_names[i] 
            
    data_rows = soup.findAll('tr')[41:]
    team_splits_data = [[td.getText() for td in data_rows[i].findAll('td')]
                   for i in range(len(data_rows))]
    team_splits = pd.DataFrame(team_splits_data, columns=column_names[1:])
    
    return team_splits
    

In [6]:
def getCleanTeamBasicGamelogs(team):
   
    team_games = getTeamBasicGamelogs(team)

    team_games.dropna(axis=0, how='all', inplace = True)
    team_games = team_games.apply(pd.to_numeric, errors="ignore")
    team_games['Home/Away'] = team_games['Home/Away'].apply(lambda x: "Away" if x =="@" else "Home")

    team_games["Home Indicator"] = np.where(team_games['Home/Away'] == 'Home', 1, 0)
    team_games["Win Indicator"] = np.where(team_games['W/L'] == 'W', 1, 0)
    
    return team_games

In [7]:
def getCleanTeamAdvancedGamelogs(team):
   
    team_games = getTeamAdvanceGamelogs(team)

    team_games.dropna(axis=0, how='all', inplace = True)
    team_games = team_games.apply(pd.to_numeric, errors="ignore")
    team_games['Home/Away'] = team_games['Home/Away'].apply(lambda x: "Away" if x =="@" else "Home")

    team_games["Home Indicator"] = np.where(team_games['Home/Away'] == 'Home', 1, 0)
    team_games["Win Indicator"] = np.where(team_games['W/L'] == 'W', 1, 0)
    
    return team_games

In [8]:
def getAllTeamGamelogs(teams):
    
    team_basic_gamelogs = {}
    team_advanced_gamelogs = {}
    
    team_name = list(teams['Franchise'])
    teams_abrev = list(teams['Acronym'])
    
    team_basic_gamelogs = {team: getCleanTeamBasicGamelogs(team) for team in teams_abrev}
    team_advanced_gamelogs = {team: getCleanTeamAdvancedGamelogs(team) for team in teams_abrev}
    
    return [team_basic_gamelogs, team_advanced_gamelogs]

In [26]:
def getAllTeamSplits(teams):
    
    team_gamelogs = {}
    
    team_name = list(teams['Franchise'])
    teams_abrev = list(teams['Acronym'])
    
    teams_splits = {team: getTeamSplits(team) for team in teams_abrev}
    
    return teams_splits

In [27]:
teams = pd.read_csv('data/teams.csv')

In [13]:
team_basic_gamelogs, team_advanced_gamelogs = getAllTeamGamelogs(teams)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


AttributeError: 'NoneType' object has no attribute 'findAll'

In [32]:
teams_splits = getAllTeamSplits(teams)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [33]:
team_splits ={team: teams_splits[team].assign(Team= team)  
                         for team in teams_splits.keys()}
team_splits = pd.concat(list(team_splits.values()))
team_splits = team_splits.apply(pd.to_numeric, errors="ignore")

In [34]:
team_splits.to_csv('data/Team Splits.csv')

In [15]:
team_basic_gamelogs ={team: team_basic_gamelogs[team].assign(Team= team)  
                         for team in team_basic_gamelogs.keys()}
team_basic_gamelogs = pd.concat(list(team_basic_gamelogs.values()))

In [16]:
team_advanced_gamelogs ={team: team_advanced_gamelogs[team].assign(Team= team)  
                         for team in team_advanced_gamelogs.keys()}
team_advanced_gamelogs = pd.concat(list(team_advanced_gamelogs.values()))

In [17]:
team_basic_gamelogs = team_basic_gamelogs.apply(pd.to_numeric, errors="ignore")
team_advanced_gamelogs = team_advanced_gamelogs.apply(pd.to_numeric, errors="ignore")

In [18]:
team_basic_gamelogs = team_basic_gamelogs[['G', 'Date', 'Team', 'Home/Away', 'Opp', 'W/L', 'Team Score', 'Oponent Score',
                                           'Team FG', 'Team FGA', 'Team FG%', 'Team 3P', 'Team 3PA', 'Team 3P%',
                                           'Team FT', 'Team FTA', 'Team FT%', 'Team ORB', 'Team TRB', 'Team AST',
                                           'Team STL', 'Team BLK', 'Team TOV', 'PF', 'Opponent FG',
                                           'Opponent FGA', 'Opponent FG%', 'Opponent 3P', 'Opponent 3PA',
                                           'Opponent 3P%', 'Opponent FT', 'Opponent FTA', 'Opponent FT%',
                                           'Opponent ORB', 'Opponent TRB', 'Opponent AST', 'Opponent STL',
                                           'Opponent BLK', 'Opponent TOV', 'Opponent PF', 'Home Indicator',
                                           'Win Indicator']]

In [19]:
team_advanced_gamelogs = team_advanced_gamelogs[['Team ORtg', 'Team DRtg', 'Team Pace', 'Team FTr', 'Team 3PAr',
                                               'Team TS%', 'Team TRB%', 'Team AST%', 'Team STL%', 'Team BLK%',
                                                'Team eFG%', 'Team TOV%', 'Team ORB%', 'Team FT/FGA',
                                               'Opponent eFG%', 'Opponent TOV%', 'Opponent DRB%', 'Opponent FT/FGA']]

In [20]:
team_gamelogs = pd.concat([team_basic_gamelogs, team_advanced_gamelogs], axis = 1)

In [21]:
team_gamelogs.to_csv('data/Team Gamelogs.csv')

**Import Current Players Data**

Note: A small portion of this code is partially adapted from "https://github.com/andrewgiessel/basketballcrawler/edit/master/basketballCrawler" (adapted a small portion of functions). I would like to provide my sincere thanks for Andrew to build such a great tool and for us to partially adapt

In [22]:
player_names = []

for letter in string.ascii_lowercase:
    letter_page = getSoup('http://www.basketball-reference.com/players/%s/' % (letter))

    # Only active players have tags strong
    try:
        current_names = letter_page.findAll('strong')
    except:
        pass
    for n in current_names:
        name_data = next(n.children)
        
        try:
            player_names.append((name_data.contents[0], 'http://www.basketball-reference.com' + name_data.attrs['href']))
            
        except Exception as e:
            pass
#    print(player_names)
player_names = dict(player_names)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Get all the players game log urls

In [23]:
players_gamelogs_basic_url = {url: player_names[url][:-5] + '/gamelog/2018' for url in player_names}
players_gamelogs_advanced_url = {url: player_names[url][:-5] + '/gamelog-advanced/2018/' for url in player_names}

In [24]:
def buildDataFrame(text, header):

    if not text:
        return None
    else:
        rows = text[0].findAll('tr')[1:]

        rows = [r for r in rows if len(r.findAll('td')) > 0]
        parsed_table = [[col.getText() for col in row.findAll('td')] for row in rows]
        
        return pd.io.parsers.TextParser(parsed_table, names=header[1:], parse_dates=True).get_chunk()

In [25]:
def buildgameLogs(url, basic_advanced):
    glsoup = getSoup(url)

    reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_'+ basic_advanced})  # id for reg season table

    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    for th in reg_season_table[0].findAll('th'):
        if not th.getText() in header:
            header.append(th.getText())

    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly

    header[5] = u'HomeAway'
    header.insert(7, u'WinLoss')
    regular_season = buildDataFrame(reg_season_table, header).iloc[:,0:29]
    
    return regular_season

In [26]:
def getAllPlayerGameLogs(players_gamelogs_url, basic_advanced):
    players_gamelogs = {}
    for name, url in players_gamelogs_url.items():
        try:
            players_gamelogs[name] = buildgameLogs(url, basic_advanced)

        except:
            pass
    print(basic_advanced + " game logs import completed")    
    return players_gamelogs

In [27]:
players_basic_gamelogs = getAllPlayerGameLogs(players_gamelogs_basic_url, 'basic')
players_advanced_gamelogs = getAllPlayerGameLogs(players_gamelogs_advanced_url, 'advanced')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


basic game logs import completed
advanced game logs import completed


In [28]:
players_basic_gamelogs ={name: players_basic_gamelogs[name].assign(Player= name)  
                         for name in players_basic_gamelogs.keys()}

players_advanced_gamelogs ={name: players_advanced_gamelogs[name].assign(Player= name)  
                         for name in players_advanced_gamelogs.keys()}

In [29]:
players_basic_gamelogs = pd.concat(list(players_basic_gamelogs.values()), axis=0)[['G', 'Date', 'Player', 'Tm', 
                                                        'HomeAway', 'Opp', 'WinLoss', 'GS', 'MP', 'PTS',
                                                        'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 
                                                        'FTA', 'FT%', 'ORB','DRB', 'TRB', 'AST', 'STL', 
                                                        'BLK', 'TOV', 'PF',  '+/-']]

In [30]:
players_advanced_gamelogs = pd.concat(list(players_advanced_gamelogs.values()), axis=0)

In [31]:
players_advanced_gamelogs = players_advanced_gamelogs[['TS%', 'eFG%', 'ORB%', 
                                                                                         'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 
                                                                                         'TOV%','USG%', 'ORtg', 'DRtg', 'GmSc']]

Clean and engineer the data

In [33]:
players_basic_gamelogs = players_basic_gamelogs.apply(pd.to_numeric, errors="ignore")
players_advanced_gamelogs = players_advanced_gamelogs.apply(pd.to_numeric, errors="ignore")

In [34]:
player_game_logs = pd.concat([players_basic_gamelogs.reset_index(), players_advanced_gamelogs.reset_index()], axis = 1)


In [35]:
player_game_logs['NetRtg'] = player_game_logs['ORtg'] - player_game_logs['DRtg']

In [36]:
player_game_logs.head()

Unnamed: 0,index,G,Date,Player,Tm,HomeAway,Opp,WinLoss,GS,MP,...,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,GmSc,NetRtg
0,0,1.0,2017-10-19,Alex Abrines,OKC,,NYK,W (+21),0,24:15,...,7.2,0.0,2.0,0.0,28.6,11.8,45.0,89.0,-1.4,67.0
1,1,2.0,2017-10-21,Alex Abrines,OKC,@,UTA,L (-9),0,29:04,...,8.4,5.8,1.8,0.0,0.0,11.0,123.0,109.0,6.9,116.0
2,2,3.0,2017-10-22,Alex Abrines,OKC,,MIN,L (-2),0,14:20,...,0.0,0.0,3.3,0.0,0.0,5.9,200.0,115.0,2.8,157.5
3,3,4.0,2017-10-25,Alex Abrines,OKC,,IND,W (+18),0,13:26,...,7.9,10.0,3.6,0.0,0.0,14.8,129.0,97.0,3.8,113.0
4,4,5.0,2017-10-27,Alex Abrines,OKC,@,MIN,L (-3),0,8:27,...,0.0,0.0,0.0,0.0,0.0,5.1,0.0,133.0,-1.9,66.5


Save the data for future usages

In [37]:
player_game_logs.to_csv('data/Player Stats.csv')

Find the games that a player actually plays

In [38]:
players_active_games = player_game_logs[pd.notnull(player_game_logs['NetRtg'])]

In [39]:
players_active_games = players_active_games.copy()
players_active_games['HomeAway'] = players_active_games['HomeAway'].apply(lambda x: "Away" if x =="@" else "Home")

In [41]:
players_active_games['WinLoss'] = players_active_games['WinLoss']

In [42]:
players_active_games.to_csv('data/Player Active Games.csv')

Save the data for future usages

**Injuries Report**

In [31]:
soup =  getSoup("https://www.basketball-reference.com/friv/injuries.cgi")
column_names = ['Player', 'Team']
data_rows = soup.findAll('tr')[1:]
injuries = [[td.getText() for td in data_rows[i].findAll('a')]
               for i in range(len(data_rows))]
injuries = pd.DataFrame(injuries, columns=column_names)

In [32]:
injuries.to_csv('data/Player Injuries.csv')

**Game Schedule**

In [57]:
soup =  getSoup("https://www.basketball-reference.com/leagues/NBA_2018_games-february.html")
column_names = ['Date', 'Time', 'Home', 'Home Score', 'Away', 'Away Score', 'Box Score', '', 'Attendence','']
data_rows = soup.findAll('tr')[1:]
schedule = [[td.getText() for td in (data_rows[i].findAll('th'))] 
            + [td.getText() for td in (data_rows[i].findAll('td'))]
               for i in range(len(data_rows))]
schedule = pd.DataFrame(schedule, columns=column_names)[['Date', 'Home', 'Home Score', 'Away', 'Away Score']]

In [58]:
schedule['Date'] = schedule['Date'].apply(lambda x: x[4:])
schedule['Date'] = pd.to_datetime(schedule['Date'], infer_datetime_format=True)

In [60]:
schedule.to_csv('data/Schduled Games.csv')