### ML Project Data Scraping

In [1]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import  pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import string
import time
import re
from functools import reduce
trim = re.compile(r'[^\d.,]+')

### Generate Date Links

In [27]:
years = range(2015, 2019)
months = range(1, 13)
days = range(1, 32)

date_links = list()

for year in years:
    for month in months:
        for day in days:
            if month > 9:
                if day > 9:
                    link = "https://www.basketball-reference.com/boxscores/?month={}&day={}&year={}".format(month, day, year)
                    date_links.append(link)
                else:
                    link = "https://www.basketball-reference.com/boxscores/?month={}&day=0{}&year={}".format(month, day, year)
                    date_links.append(link)
            else:
                if day > 9:
                    link = "https://www.basketball-reference.com/boxscores/?month={}&day={}&year={}".format(month, day, year)
                    date_links.append(link)
                else:
                    link = "https://www.basketball-reference.com/boxscores/?month=0{}&day=0{}&year={}".format(month, day, year)
                    date_links.append(link)

### Generate Game Links

In [28]:
# Change the value of data_scrapped to false if you don't have the data yet
data_scrapped = False

# Determine how much data to scrap:
lenght = len(date_links)
length = 100

if data_scrapped == False:
    all_links = list()

    for i in range(0, 100):
        time.sleep(3)
        scrape_url = date_links[i]
        r = requests.get(scrape_url)
        soup = BeautifulSoup(r.content, "lxml")
    
        links = list()
        for link in soup.find_all('a'):
            links.append(link.get('href'))
        
            # Delete duplicates
            links = list(set(links))

            # Create a Pandas DataFrame
            links_df = pd.DataFrame({'col':links})

            # Drop any missing values
            links_df = links_df.dropna(how='any')
        
            val = date_links[i]
            month = val[54:56]
            year = val[69:73]
            day = val[61:63]
            the_str = year+month+day

            # Drop links that don't contain 'boxscores'
            good_links = links_df['col'].str.contains('/boxscores/{}'.format(the_str))   
            links_df = links_df[good_links]

            # Turn the links into a list
            links = list(links_df['col'])
        
        all_links.append(links)
    
        if i % 100 == 0:
            print(i)
    
    # Convert to a flat list
    all_links = reduce(lambda x,y: x+y, all_links)
    
    # Turn into a complete hyperlink
    for i in range(0, len(all_links)):
        all_links[i] = "https://www.basketball-reference.com" + all_links[i]
        
    # Create a Pandas DataFrame
    all_links_df = pd.DataFrame({'col':all_links})
    
    # Write Game Links to csv
    all_links_df.to_csv("all_links.csv")

0


SSLError: HTTPSConnectionPool(host='www.basketball-reference.com', port=443): Max retries exceeded with url: /boxscores/?month=02&day=01&year=2015 (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),))

### Get Player-Game Data

In [29]:
all_links_df = pd.read_csv("all_links.csv")
game_links = list(all_links_df['col'])

In [30]:
# Define key scrap function
def scrape_element(css_selector):
    return (page.select_one(css_selector)).get_text()

In [140]:
# Change the value of data_scrapped to false if you don't have the data yet
more_data_scrapped = False

player_game_data = pd.DataFrame()

if more_data_scrapped == False:

    for i in range(0, 10):
        time.sleep(5)
        
        # Get web page data
        scrape_url = game_links[i]
        r = requests.get(scrape_url)
        page = BeautifulSoup(r.content, "lxml")
        
        # Get basic and advanced stats
        team1_bas = pd.read_html(game_links[i])[0]
        team1_adv = pd.read_html(game_links[i])[1]
        team2_bas = pd.read_html(game_links[i])[2]
        team2_adv = pd.read_html(game_links[i])[3]
        
        # Remove game totals row
        team1_bas = team1_bas.drop(team1_bas.index[-1])
        team1_adv = team1_adv.drop(team1_adv.index[-1])
        team2_bas = team2_bas.drop(team2_bas.index[-1])
        team2_adv = team2_adv.drop(team2_adv.index[-1])
        
        # Merge basic and advanced stat tables
        team1 = pd.merge(team1_bas, team1_adv, left_index=True, right_index=True)
        team2 = pd.merge(team2_bas, team2_adv, left_index=True, right_index=True)
        
        # Drop ugly second header row
        team1.columns = team1.columns.droplevel(0)
        team2.columns = team2.columns.droplevel(0)
        
        # Change missing values to 0
        team1 = team1.fillna(0)
        team2 = team2.fillna(0)
        
        # Get game-specific data
        team1_name = scrape_element('strong a:nth-of-type(1)')
        team2_name = scrape_element('strong a:nth-of-type(2)')
        #team1_score = scrape_element('.scorebox div:nth-of-type(5)')
        #team2_score = scrape_element('.scorebox div:nth-of-type(11)')
        
        # Add game-specific data to tables
        team1['Team'] = team1_name
        team2['Team'] = team2_name
        team1['Opponent'] = team2_name
        team2['Opponent'] = team1_name
        
        # Remove the 'reserves' row headers
        team1 = team1.drop(team1.index[5])
        team2 = team2.drop(team2.index[5])
        
        # Combine to single table
        frames = [team1, team2]
        game = pd.concat(frames)
        
        # Add game-specific data to table
        date = scrape_element('.scorebox_meta div:nth-of-type(1)')
        location = scrape_element('.scorebox_meta div:nth-of-type(2)')
        game['Date'] = date
        game['Location'] = location
        
        player_game_data = player_game_data.append(game)
   
    # Export as CSV
    #player_game_data.to_csv("player_game_data.csv")

In [141]:
player_game_data

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,STL%,BLK%,TOV%,USG%,ORtg,DRtg,Team,Opponent,Date,Location
0,Arron Afflalo,39:56,8,14,.571,1,3,.333,2,2,...,1.3,1.6,21.2,19.6,104,115,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
1,Ty Lawson,39:52,8,16,.500,1,1,1.000,3,4,...,1.3,0.0,10.1,20.5,119,113,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
2,Kenneth Faried,35:56,7,14,.500,0,0,0,4,4,...,2.8,5.3,6.0,19.3,133,99,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
3,Wilson Chandler,33:35,8,16,.500,2,5,.400,4,4,...,0.0,0.0,10.1,24.4,123,119,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
4,Timofey Mozgov,24:45,1,4,.250,0,0,0,2,2,...,0.0,0.0,0.0,8.2,114,114,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
6,Jusuf Nurkic,21:24,3,9,.333,0,1,.000,4,6,...,0.0,8.9,7.9,24.5,104,104,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
7,Gary Harris,16:32,2,7,.286,1,3,.333,2,2,...,0.0,0.0,0.0,19.7,115,122,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
8,Nate Robinson,14:11,0,7,.000,0,1,.000,0,1,...,0.0,0.0,0.0,21.7,25,117,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
9,J.J. Hickson,13:50,0,5,.000,0,0,0,1,2,...,0.0,0.0,25.4,23.6,30,114,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"
10,Darrell Arthur,Did Not Play,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,Denver Nuggets,Chicago Bulls,"8:00 PM, January 1, 2015","United Center, Chicago, Illinois"


### Bring in Game Data

In [None]:
player_game_data_df = pd.read_csv("player_game_data.csv")