In [25]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Scraping Data

#### Getting Team Abbreviations (ONLY RUN 1st time)

In [2]:
# Pull the page holding the table of team names and abbreviations from Wikipedia.
url = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations'
r = requests.get(url)
# Stop here with a clear HTTP error rather than parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and so bs4 does not warn about guessing one).
soup = BeautifulSoup(r.content, 'html.parser')

In [3]:
# Keep only the first <table> whose class attribute is 'wikitable sortable'.
table = soup.find(name='table', attrs={'class': 'wikitable sortable'})

In [4]:
# One inner list per <tr>, holding the stripped text of each of its <td> cells.
# A row without any <td> cells contributes an empty list.
team = [
    [cell.get_text().strip() for cell in tr.find_all('td')]
    for tr in table.find_all('tr')
]

In [5]:
# Split the scraped rows into two parallel lists, skipping the first row:
#   team_abrev -> first cell of each row (the team's 3-letter abbreviation)
#   team_names -> second cell of each row (the team's full name)
team_abrev, team_names = [], []
for cells in team[1:]:
    team_abrev.append(cells[0])
    team_names.append(cells[1])

After the first run, the two lists are hard-coded below so that the website does not need to be pinged over and over again.

In [12]:
# Hard-coded copy of the scraped lists. The two lists are parallel:
# team_names[i] is the full name of the team abbreviated team_abrev[i].
team_abrev = [
    'ATL', 'BKN', 'BOS', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
    'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]
team_names = [
    'Atlanta Hawks', 'Brooklyn Nets', 'Boston Celtics', 'Charlotte Hornets',
    'Chicago Bulls', 'Cleveland Cavaliers', 'Dallas Mavericks', 'Denver Nuggets',
    'Detroit Pistons', 'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers',
    'Los Angeles Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies', 'Miami Heat',
    'Milwaukee Bucks', 'Minnesota Timberwolves', 'New Orleans Pelicans',
    'New York Knicks',  # was missing, which shifted every later name by one
    'Oklahoma City Thunder', 'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns',
    'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs',
    'Toronto Raptors',  # was missing as well
    'Utah Jazz', 'Washington Wizards',
]
# The lists are only useful as a pair, so fail loudly if they drift apart.
assert len(team_abrev) == len(team_names), 'team_abrev and team_names must stay parallel'

#### Getting the Data for 1 team for 1 season

In [73]:
# Single team / single season used to try the scrape out.
team_test, year_test = 'WAS', '2000'
# The schedule page URL is filled in with the team abbreviation, then the year.
url_test = 'https://www.basketball-reference.com/teams/{}/{}_games.html'.format(
    team_test,
    year_test,
)

In [75]:
# Download the schedule page for the test team and season.
r = requests.get(url_test)
# Stop here with a clear HTTP error rather than parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and so bs4 does not warn about guessing one).
soup = BeautifulSoup(r.content, 'html.parser')

In [83]:
# First <table> whose class attribute is 'sortable stats_table', then its <tbody>.
table = soup.find(name='table', attrs={'class': 'sortable stats_table'})
table_body = table.find(name='tbody')

In [94]:
# One inner list per <tr> of the table body, holding the raw text of each <td>.
table_info = [
    [cell.get_text() for cell in tr.find_all('td')]
    for tr in table_body.find_all('tr')
]

In [96]:
# Display the first scraped row to see which cells a row contains.
table_info[0]

['Tue, Nov 2, 1999',
 '',
 'Box Score',
 '',
 'Atlanta Hawks',
 'W',
 '',
 '94',
 '87',
 '1',
 '0',
 'W 1',
 '']

`table_info` as of now is just the list of games with the most basic of data. It **does not** include the stats of each game. It includes:
* date
* BLANK
* LINK TO BOX SCORE
* @ if away, BLANK if home
* opponent
* win or loss
* BLANK
* points scored
* opponent points scored
* wins that season
* losses that season
* streak of win or losses
* notes (always blank)

### Getting baseline data for 1 season

#### Teams That Have Changed Their Name Since 2000

* 2002 - Vancouver Grizzlies (VAN) --> Memphis Grizzlies (MEM)
* 2003 - Charlotte Hornets (CHH) --> New Orleans Hornets (NOH)
* 2005 - Addition of the Charlotte Bobcats (CHA)
* 2006 - New Orleans Hornets (NOH) --> New Orleans/Oklahoma City Hornets (NOK)
* 2008 - New Orleans/Oklahoma City Hornets (NOK) --> New Orleans Hornets (NOH)
* 2009 - Seattle SuperSonics (SEA) --> Oklahoma City Thunder (OKC)
* 2013 - New Jersey Nets (NJN) --> Brooklyn Nets (BRK)
* 2014 - New Orleans Hornets (NOH) --> New Orleans Pelicans (NOP)
* 2015 - Charlotte Bobcats (CHA) --> Charlotte Hornets (CHO)

Due to all these changes, the abbreviation list has to change from season to season so that the Basketball Reference URL points at the right team. The URL of a team's 82-game schedule (not including the statistics of each game) is built from the team abbreviation and the year. That schedule will then be used to get the game-by-game statistics.

In [17]:
# Abbreviations used as the starting list, before any of the changes listed above.
team_abrev = [
    'ATL', 'NJN', 'BOS', 'CHH', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'VAN', 'MIA', 'MIL', 'MIN', 'NYK', 'SEA',
    'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]


In [18]:
# Check how many abbreviations are in the list.
len(team_abrev)

29

In [15]:
def change_team_abrev(team_abrev, year):
    """Apply the abbreviation change that takes effect in ``year``, in place.

    The list is modified in place and also returned, so both
    ``change_team_abrev(lst, y)`` and ``lst = change_team_abrev(lst, y)`` work.

    The call is idempotent: applying the same year twice (for example when a
    notebook cell is re-run) neither raises because the old abbreviation is
    already gone nor appends the new abbreviation a second time.

    Parameters
    ----------
    team_abrev : list of str
        Team abbreviations for the season before ``year``.
    year : int
        Season being processed. Years without a change leave the list as is.

    Returns
    -------
    list of str
        The same list object that was passed in.
    """
    # year -> (abbreviation to drop, or None for a pure addition; abbreviation to add)
    changes = {
        2002: ('VAN', 'MEM'),
        2003: ('CHH', 'NOH'),
        2005: (None, 'CHA'),
        2006: ('NOH', 'NOK'),
        2008: ('NOK', 'NOH'),
        2009: ('SEA', 'OKC'),
        2013: ('NJN', 'BRK'),
        2014: ('NOH', 'NOP'),
        2015: ('CHA', 'CHO'),
    }

    change = changes.get(year)
    if change is None:
        return team_abrev

    old, new = change
    # Guard both steps so a repeated call for the same year is a no-op.
    if old is not None and old in team_abrev:
        team_abrev.remove(old)
    if new not in team_abrev:
        team_abrev.append(new)
    return team_abrev

In [22]:
# Schedule rows for every (year, team) combination listed below.
table_info = []
year_test = [2000]
team_test = ['VAN']
# Iterate through each season and each team abbreviation.
for year in year_test:
    for team in team_test:
        url = 'https://www.basketball-reference.com/teams/{}/{}_games.html'.format(team,year)
        r = requests.get(url)
        # Stop with a clear HTTP error if the team/year page does not exist.
        r.raise_for_status()
        # Explicit parser: the result should not depend on which optional
        # parsers are installed.
        soup = BeautifulSoup(r.content, 'html.parser')
        table = soup.find('table',{'class':'sortable stats_table'})
        table_body = table.find('tbody')
        # Append to the single list created above. Resetting it here would
        # discard every previously scraped team/year and keep only the last.
        for row in table_body.find_all('tr'):
            column = row.find_all('td')
            row_value = [x.get_text() for x in column]
            table_info.append(row_value)

In [26]:
# One DataFrame row per scraped table row; columns are positional (0, 1, 2, ...).
df = pd.DataFrame(data=table_info)

In [47]:
# Drop every row that has at least one missing cell.
df = df.dropna(axis=0)

#### Getting Box Score
The box score URL is based on the home team and the date the game was played.