# Web Scraping Basketball Reference for Each Team's Win/Loss Ratio, 2000 to 2018 Seasons

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

The following three cells send the GET request for the URL, save the response to the variable `r`, and pull the column headers and team rows out of the returned HTML.

In [2]:
# Send a GET request for the season summary page and keep the response in `r`.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error status into an exception here instead of
# letting an error page be parsed as if it were the standings.
url = 'https://www.basketball-reference.com/leagues/NBA_2018.html'
r = requests.get(url, timeout=30)
r.raise_for_status()

# Parse the returned HTML with Beautiful Soup so the tables can be searched by tag.
html_doc = r.text
soup = BeautifulSoup(html_doc, 'html.parser')

In [3]:
# Scrape the column headers: take the second <tr> found in the document and read the
# text of its first eight <th> cells with getText().

# Look the header row up once instead of re-searching the document on every iteration.
header_cells = soup.find_all('tr', limit = 2)[1].find_all('th')

# Index explicitly (rather than slicing) so a row with fewer than eight header cells
# still fails loudly with an IndexError.
columns = [header_cells[i].getText() for i in range(8)]

# Overwrite the first header so the team-name column carries a uniform 'Team' label.
columns[0] = 'Team'

In [4]:
# Collect the text of every cell in each table row; each row becomes one list inside
# Teams_Season. Cells whose text contains 'Division' are left out, so a row that holds
# only such a cell contributes an empty list.

Teams_Season = []

# Search the document once (at most the first 35 <tr> tags) and reuse the result
# instead of repeating the search for every row.
table_rows = soup.find_all('tr', limit = 35)
size_data_rows = len(table_rows)

# Rows 0 and 1 are skipped; row 1 is the one the column names are read from.
for data_rows in table_rows[2:]:
    try:
        team = [stat.getText() for stat in data_rows if 'Division' not in stat.getText()]
    except AttributeError:
        # A child node that does not provide getText() makes the row unreadable:
        # skip the whole row, but let any other kind of error surface.
        continue
    Teams_Season.append(team)

In [6]:
# Build the season table from the scraped rows, tag every row with the season year
# (kept as the string '2018'), and move the row index into a regular 'index' column.
# NOTE(review): the frame already has a default index at this point, so the new
# 'index' column just repeats it -- confirm that column is wanted downstream.
df = (
    pd.DataFrame(Teams_Season, columns = columns)
    .assign(Year = '2018')
    .reset_index()
)

In [7]:
# Maps each full team name, as it appears on the scraped page, to a team acronym.
# The scraped data uses full names while the season_stat dataset from Kaggle uses
# acronyms, so this lookup is what lets the two be merged.
#
# The lookup is used with a case-sensitive substring test, so every key must match
# the site's spelling exactly. The site spells the Seattle team 'SuperSonics'; the
# older 'Supersonics' key is kept so existing lookups by that key still work.
#
# NOTE(review): Basketball Reference's own abbreviations for Brooklyn and for the
# current Charlotte Hornets are BRK and CHO -- confirm that 'BKN' and 'CHA' are the
# codes the Kaggle file actually uses before merging.

team_name_acr = {
    'Boston Celtics': 'BOS', 'New York Knicks': 'NYK', 'Philadelphia 76ers': 'PHI',
    'Toronto Raptors': 'TOR', 'New Jersey Nets': 'NJN',
    'Chicago Bulls': 'CHI', 'Indiana Pacers': 'IND', 'Milwaukee Bucks': 'MIL',
    'Detroit Pistons': 'DET', 'Cleveland Cavaliers': 'CLE',
    'Miami Heat': 'MIA', 'Atlanta Hawks': 'ATL', 'Orlando Magic': 'ORL',
    'Washington Wizards': 'WAS', 'Charlotte Bobcats': 'CHA',
    'Oklahoma City Thunder': 'OKC', 'Denver Nuggets': 'DEN', 'Utah Jazz': 'UTA',
    'Portland Trail Blazers': 'POR',
    'Minnesota Timberwolves': 'MIN', 'Los Angeles Lakers': 'LAL',
    'Los Angeles Clippers': 'LAC', 'Phoenix Suns': 'PHO', 'Golden State Warriors': 'GSW',
    'Sacramento Kings': 'SAC', 'San Antonio Spurs': 'SAS', 'Memphis Grizzlies': 'MEM',
    'Dallas Mavericks': 'DAL', 'Houston Rockets': 'HOU',
    'New Orleans Hornets': 'NOH', 'New Orleans Pelicans': 'NOP',
    'Charlotte Hornets': 'CHA', 'Vancouver Grizzlies': 'VAN',
    'Seattle Supersonics': 'SEA', 'Seattle SuperSonics': 'SEA',
    'Brooklyn Nets': 'BKN',
}

In [8]:
# Replace each full team name in df['Team'] with its acronym from team_name_acr so
# this table can be merged with acronym-keyed data later. Matching is by substring,
# so a scraped name with extra characters around it still matches its dictionary key.

for i in df.index:
    team = df.loc[i, 'Team']
    # Rows whose team cell is not text (for example a missing value) cannot be
    # matched; skip them explicitly instead of swallowing every exception.
    if not isinstance(team, str):
        continue
    for key, value in team_name_acr.items():
        if key in team:
            df.loc[i, 'Team'] = value
            # The cell now holds the acronym, which no remaining key can match.
            break

In [9]:
# Export the dataframe to '2018_win_ratio.csv', replacing any existing file of that
# name (the file is overwritten, not appended to). The header row and the dataframe
# index are both written.
#
# The path is handed straight to to_csv so pandas opens the file itself with the
# newline handling its CSV writer needs; a handle from a plain open(..., 'w') can
# produce blank lines between rows on Windows. The file is written as UTF-8.
df.to_csv('2018_win_ratio.csv', header=True)