In [2]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import json
from collections import namedtuple
import csv
import glob
from time import strptime

In [3]:
def named_tuple_structure():
    """Build the ``Player`` record type used for scraped player rows.

    Returns:
        A freshly created namedtuple class named ``Player`` whose fields are,
        in order: active, url, name, start, end, pos, height, weight,
        birthdate, college.
    """
    field_names = [
        'active', 'url', 'name', 'start', 'end',
        'pos', 'height', 'weight', 'birthdate', 'college',
    ]
    return namedtuple('Player', field_names)

In [4]:
def _player_link(name_cell):
    """Return ``(active, href)`` for a player's first table cell, or None if it has no link."""
    bold = name_cell.strong
    if bold is not None and bold.a is not None and bold.a.get('href') is not None:
        # Link wrapped in <strong>: recorded with active == 1.
        return 1, bold.a['href']
    anchor = name_cell.a
    if anchor is not None and anchor.get('href') is not None:
        return 0, anchor['href']
    return None


def makePlayersDict(verbose=False, nice=True):
    """Scrape the basketball-reference player index into a dict of ``Player`` records.

    The ``/players/`` page is fetched first; every link found in the first
    paragraph of its ``page_content`` element is then fetched in turn, and each
    row of that page's ``players`` table becomes one ``Player`` tuple.

    Args:
        verbose: print each index page URL before it is requested.
        nice: sleep one second before each index page request.

    Returns:
        dict mapping the text of a row's first cell (stored as ``Player.name``)
        to ``Player(active, url, <text of every cell in the row>)``.
        ``active`` is 1 when the link sits inside a ``<strong>`` tag, else 0.

    Raises:
        TypeError: if a row does not hold exactly the eight cells ``Player`` expects.

    NOTE(review): records are keyed by the name text alone, so two players with
    the same name overwrite each other -- confirm whether that is acceptable.
    """
    url = 'http://www.basketball-reference.com'
    extension = '/players/'
    soup = BeautifulSoup(requests.get(url+extension).text, "html.parser")
    index = soup.find(id='page_content').p
    letter_pages = [url+a['href'] for a in index.find_all('a', href=True)]
    Player = named_tuple_structure()
    players = {}
    for page in letter_pages:
        if verbose:
            print(page)
        if nice:
            time.sleep(1)
        soup = BeautifulSoup(requests.get(page).text, "html.parser")
        table = soup.find(id='players')
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                # Header rows carry no <td> cells.
                continue
            link = _player_link(cells[0])
            if link is None:
                # A data row without a player link cannot be turned into a record.
                continue
            active, href = link
            # Link and cell texts come from the same row, so they cannot drift apart.
            values = [cell.get_text() for cell in cells]
            players[values[0]] = Player(active, url + href, *values)
    return players

In [5]:
def playerDict_to_csv(path = '../data/player_dict.csv'):
    """Scrape all players and dump them to a CSV file.

    The scrape (``makePlayersDict``) runs before the output file is opened.
    The header row is ``Name`` followed by the ``Player`` field names; each
    following row is the dict key followed by that player's record values.

    Args:
        path: destination CSV path.

    NOTE(review): the file is opened in binary mode, which is what the
    Python 2 csv module expects.
    """
    players = makePlayersDict()
    with open(path, 'wb') as out_file:
        record_type = named_tuple_structure()
        writer = csv.writer(out_file)
        header = ['Name']
        header.extend(record_type._fields)
        writer.writerow(header)
        for player_name, record in players.items():
            writer.writerow([player_name] + list(record))

In [6]:
#playerDict_to_csv()

In [7]:
def daily_boxscores(year, verbose=False, nice=True, startmonth = 1):
    """Collect box-score links per day by walking the daily box-score index pages.

    The walk starts at day 1 of ``startmonth`` in ``year``. It first follows the
    first link inside ``page_content`` on each page (presumably "previous day"
    -- TODO confirm) until it reaches a page without a "League Standings"
    heading. It then resumes from the second link of the starting page
    (presumably "next day") and follows second links until it again reaches a
    page without that heading.

    Args:
        year: year used in the starting URL (coerced with ``int``).
        verbose: print every URL that was fetched successfully.
        nice: sleep one second after each page, and before each retry.
        startmonth: month used in the starting URL (coerced with ``int``).

    Returns:
        dict mapping ``'month-day-year'`` (taken from the page's query string)
        to the list of absolute URLs of that page's links whose text is
        ``Final``. Days without such links are left out.
    """
    url = 'http://www.basketball-reference.com'
    extension = ('/boxscores/index.cgi?month=' + str(int(startmonth))
                 + '&day=1&year=' + str(int(year)))
    forward, past = True, True
    results = {}
    while past or forward:
        try:
            soup = BeautifulSoup(requests.get(url+extension).text, "html.parser")
        except requests.exceptions.ConnectionError:
            # requests raises its own ConnectionError class; the bare name is not
            # defined on Python 2 and is an unrelated builtin on Python 3.
            print("%s Failed, trying again" % (url+extension))
            if nice:
                # Back off briefly instead of hammering the server in a tight loop.
                time.sleep(1)
            continue
        if verbose:
            print(url+extension)
        data = soup.find(id='page_content')
        if forward and past:
            # First page only: remember where the forward walk must start later.
            fut_link = data.find_all('a', href=True)[1]['href']
            forward = False
        if forward:
            link = data.find_all('a', href=True)[1]['href']
        if past:
            link = data.a['href']
        date_key = '-'.join(e.split('=')[1] for e in extension.split('&'))
        results[date_key] = [url+s['href']
                             for s in soup.find_all('a', href=True, text='Final')]
        extension = link
        if not soup.find_all('h2', text='League Standings'):
            # No standings heading: this direction is exhausted.
            past = False
            if forward:
                forward = False
            else:
                forward = True
                extension = fut_link
        if nice:
            time.sleep(1)
    return {k : v for k, v in results.items() if v}

In [8]:
def boxscoreDict_to_csv(path = '../data/boxscore_dict_', year = 2015, verbose = False, nice = True, startmonth=1):
    """Scrape one year's box-score links and write them to ``<path><year>_.csv``.

    Args:
        path: filename prefix; the integer year and ``_.csv`` are appended.
        year: passed to ``daily_boxscores`` and used in the filename.
        verbose: print the output filename first, and pass through to the scraper.
        nice: passed through to ``daily_boxscores``.
        startmonth: passed through to ``daily_boxscores``.

    The file has a ``Date, Boxscore Links`` header row, then one row per date:
    the date key followed by all of that date's links.
    """
    suffix = '_.csv'
    if verbose:
        print(path + str(int(year)) + suffix)
    links_by_date = daily_boxscores(year, verbose=verbose, nice=nice, startmonth=startmonth)
    out_name = path + str(int(year)) + suffix
    with open(out_name, 'wb') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['Date', 'Boxscore Links'])
        for date_key, links in links_by_date.items():
            writer.writerow([date_key] + links)

In [9]:
#%%time
#startmonth = 1
#for year in range(2015, 1944, -1):
#    if year == 1999:
#        startmonth = 3
#    print year
#    try:
#        boxscoreDict_to_csv(year = year, startmonth = startmonth)
#    except NameError:
#        print "Manual Stop"
#        break
#    except:
#        print "Failed", year

In [10]:
def compileDicts(path = '../data/boxscore_dict_'):
    """Merge every per-year ``<path>*_.csv`` file into one ``<path>all.csv`` index.

    Each input row is ``date, link, link, ...`` with the date written as
    ``month-day-year``; the first row of every file is skipped as a header.
    One output row is produced per link.

    Args:
        path: filename prefix shared by the per-year inputs and the output.

    Output columns: Link, Date, Season (the text between the last two
    underscores of the input filename), Year, Month, Day (integers parsed from
    Date). Rows are sorted by Year, Month, Day and the index is renumbered
    from 0. The output path is printed before the file is written. With no
    input rows a header-only file is written.
    """
    # Python 2's csv module wants binary-mode files; Python 3's wants text mode.
    read_mode = 'rb' if str is bytes else 'r'
    games = []
    for file_ in glob.glob(path + "*_.csv"):
        season = file_.split('_')[-2]
        with open(file_, read_mode) as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            for row in reader:
                links = row[1:]
                if not links:
                    continue
                month, day, year = [int(part) for part in row[0].split('-')[:3]]
                # extend() keeps accumulation linear; rebuilding the list per row was quadratic.
                games.extend([link, row[0], season, year, month, day] for link in links)
    df = pd.DataFrame(games, columns=['Link', 'Date', 'Season', 'Year', 'Month', 'Day'])
    sort_keys = ['Year', 'Month', 'Day']
    if hasattr(df, 'sort_values'):
        ordered = df.sort_values(by=sort_keys)
    else:
        # pandas < 0.17 only has the since-removed DataFrame.sort.
        ordered = df.sort(columns=sort_keys)
    df_ = ordered.reset_index(drop=True)
    print(path + 'all.csv')
    df_.to_csv(path_or_buf = path+'all.csv')

In [11]:
#%%time
#compileDicts()

In [12]:
# Load the compiled game index (the default output path of compileDicts);
# the file's first column is the saved row index.
df = pd.read_csv('../data/boxscore_dict_all.csv', index_col=0)

In [13]:
def scrapeTable(t, ind):
    """Flatten one HTML stats table into a single list.

    Args:
        t: table element exposing ``thead`` and ``tbody``.
        ind: accepted but not used.

    Returns:
        The text of every ``th`` in the last header row, followed by one list
        of cell texts per body row that has at least one ``td``.

    NOTE(review): the header strings and the row lists share one flat list
    (``[h1, h2, ..., [row1], [row2]]``) -- confirm consumers expect that shape.
    """
    last_header_row = t.thead.find_all('tr')[-1]
    column_names = []
    for header_cell in last_header_row.find_all('th'):
        column_names.append(header_cell.text)

    rows = []
    for table_row in t.tbody.find_all('tr'):
        cells = table_row.find_all('td')
        if cells:
            rows.append([cell.text for cell in cells])
    return column_names + rows

In [14]:
def boxscore_date(url, nice=True, verbose=False):
    """Scrape one box-score page into a flat list of team names and stat tables.

    Args:
        url: absolute URL of the box-score page.
        nice: sleep one second after the request.
        verbose: print the URL after fetching it.

    Returns:
        ``[Home_Team, Away_Team, Home_Basic, Away_Basic, Home_Advanced,
        Away_Advanced, Playoffs]``. The two team codes are the last and first
        three characters of the link text inside the ``background_yellow``
        cell. Each table entry is the ``scrapeTable`` result for the table
        whose id is ``<team>_basic`` / ``<team>_advanced``, or NaN when that
        table is absent. ``Playoffs`` is the text of the first ``<strong>`` in
        the nested tables at the top of ``page_content``, or NaN if that
        structure is missing. When the team cell cannot be found, every entry
        except ``Playoffs`` is NaN.

    Raises:
        ValueError: if a table with an unexpected id is returned.
    """
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    if nice:
        time.sleep(1)
    if verbose:
        print(url)
    data = soup.find(id='page_content')
    try:
        Playoffs = data.table.table.td.strong.text
    except AttributeError:
        Playoffs = np.nan
    try:
        teams = soup.find('td', class_ = 'background_yellow').a.text
    except AttributeError:
        # Without team codes there are no table ids to look up. Return the
        # all-NaN row rather than failing on an unpack of a single NaN.
        return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, Playoffs]
    Away_Team, Home_Team = teams[:3], teams[3:]
    wanted_ids = [team + suffix
                  for team in (Away_Team, Home_Team)
                  for suffix in ('_advanced', '_basic')]
    tables = soup.find_all('table', id=wanted_ids)
    scraped = {}
    for t in tables:
        table_id = t['id']
        if table_id not in wanted_ids:
            raise ValueError("You did something wrong %s" % url)
        scraped[table_id] = scrapeTable(t, table_id.split('_')[-1])
    return [
        Home_Team,
        Away_Team,
        scraped.get(Home_Team + '_basic', np.nan),
        scraped.get(Away_Team + '_basic', np.nan),
        scraped.get(Home_Team + '_advanced', np.nan),
        scraped.get(Away_Team + '_advanced', np.nan),
        Playoffs,
    ]

In [15]:
def addData(df, verbose = False, nice = True):
    """Return a copy of ``df`` extended with scraped box-score columns.

    ``boxscore_date`` is called once for every value in ``df.Link``; its seven
    results become the columns Home_Team, Away_Team, Home_Basic, Away_Basic,
    Home_Advanced, Away_Advanced and Playoffs. The input frame is not modified.

    Args:
        df: frame with a ``Link`` column of box-score URLs.
        verbose: passed through to ``boxscore_date``.
        nice: passed through to ``boxscore_date``.

    NOTE(review): columns are only assigned after every link has been scraped,
    so an exception part-way through discards all results gathered so far.
    """
    enriched = df.copy()

    def scrape(link):
        return boxscore_date(link, verbose=verbose, nice=nice)

    per_game = enriched.Link.map(scrape)
    (home_team, away_team, home_basic, away_basic,
     home_advanced, away_advanced, playoffs) = zip(*per_game)
    enriched['Home_Team'] = home_team
    enriched['Away_Team'] = away_team
    enriched['Home_Basic'] = home_basic
    enriched['Away_Basic'] = away_basic
    enriched['Home_Advanced'] = home_advanced
    enriched['Away_Advanced'] = away_advanced
    enriched['Playoffs'] = playoffs
    return enriched

In [20]:
# Games played before 1970.
df_ = df[df.Year < 1970]
# A much smaller subset of those: October games in a year divisible by 40.
df__ = df_[(df_.Month == 10) & (df_.Year % 40 == 0)]

# Shapes of the full index and of the two subsets.
print df.shape
print df_.shape
print df__.shape

(63157, 6)
(10126, 6)
(18, 6)


In [21]:
%%time
# Scrape box-score details for every row of df_ (one HTTP request per game).
# NOTE(review): the recorded run of this cell ended in a ConnectionError, and
# addData only assigns its columns after all links succeed, so nothing was kept.
outdf = addData(df_)

ConnectionError: HTTPConnectionPool(host='www.basketball-reference.com', port=80): Max retries exceeded with url: /boxscores/195012130BLB.html (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x000000000F929A90>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))

In [None]:
def insertCreativeNameHere(path = '../data/boxscore_dict_', year = 2015):
    """Scrape the basic box-score table of both teams for every game of a year.

    Reads ``<path><year>.csv`` (first row skipped as a header; later rows are
    ``date, link, link, ...``), fetches every link and collects each team's
    ``<team>_basic`` table. Team codes are the first and last three characters
    of the link text inside the page's ``background_yellow`` cell.

    Args:
        path: filename prefix of the per-year link file.
        year: year appended to ``path`` (coerced with ``int``).

    Returns:
        dict mapping each date to ``{team: (link, rows)}``, where ``rows`` holds
        one list of cell texts per table row that has ``td`` cells. The extra
        key ``'headers'`` holds the column names of the first table found, or
        None if no table was found.

    Progress is printed as ``counter / 2460`` whenever ``counter`` (increased by
    2 per game) is a multiple of 123.

    NOTE(review): this reads ``<year>.csv`` while boxscoreDict_to_csv writes
    ``<year>_.csv`` -- confirm which filename is intended.
    """
    # Python 2's csv module wants binary-mode files; Python 3's wants text mode.
    read_mode = 'rb' if str is bytes else 'r'
    # Read the whole link file up front so it is not held open while scraping.
    with open(path+str(int(year))+'.csv', read_mode) as f:
        rows = list(csv.reader(f))
    links_by_date = {row[0]: row[1:] for row in rows[1:] if row}

    counter = 0
    headers = None
    d_ = {}
    for k, vs in links_by_date.items():
        d__ = {}
        for v in vs:
            if not counter % 123:
                print(counter/float(2460))
            soup = BeautifulSoup(requests.get(v).text, "html.parser")
            teams = soup.find('td', class_ = 'background_yellow').a.text
            for team in (teams[:3], teams[3:]):
                for s in soup.find_all('table', id=team + '_basic'):
                    if headers is None:
                        # Take the header from the first table actually seen. Keying
                        # this on counter == 0 left it unset when game one had no table.
                        headers = [h.text for h in s.thead.find_all('tr')[-1].find_all('th')]
                    d__[team] = (v, [[i.text for i in p.find_all('td')]
                                     for p in s.tbody.find_all('tr') if p.find_all('td')])
            counter += 2
        d_[k] = d__
    d_['headers'] = headers
    return d_

In [87]:
# NOTE(review): this is a bare reference and does not call the function; the
# progress values recorded as this cell's output presumably came from an
# earlier version of the cell that called insertCreativeNameHere().
insertCreativeNameHere

0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
