In [1]:
%load_ext autoreload
%autoreload 2

from bs4 import BeautifulSoup
import requests
from bs4 import Comment, Tag
import pandas as pd
import bs4
from dictionaries import *
# Absolute path to a local project directory.
# NOTE(review): no cell visible in this notebook reads PATH, and the value is
# machine-specific -- confirm it is still needed before relying on it.
PATH = '/Users/doguaraci/Desktop/nba'

## Basketball Reference Game-log Scraper

In [2]:
def get_html(link, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL to fetch.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        connection would block the notebook indefinitely. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx response.
    """
    response = requests.get(link, timeout=timeout)
    # Stop on an error page (404, 429, ...) instead of parsing it and failing
    # later with an unrelated error while searching for the table.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')

def get_table_html(soup, comment_index=21):
    """Extract a table that the page carries inside an HTML comment.

    Every comment node in ``soup`` is swapped for a ``<comment>`` tag holding
    the comment's stripped text. The tag at position ``comment_index`` is then
    serialised, its escaped angle brackets are restored, and the resulting
    markup is parsed again so that its first ``<table>`` can be returned.

    ``soup`` is modified in place: its comment nodes are replaced by tags.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page, as returned by ``get_html``.
    comment_index : int, optional
        Zero-based position, among all ``<comment>`` tags in the page, of the
        one that holds the wanted table. The default of 21 is the position
        this notebook was written with.
        NOTE(review): presumably the playoff game log on a player game-log
        page -- the position depends on the page layout, confirm before reuse.

    Returns
    -------
    bs4.element.Tag or None
        The first ``<table>`` found in the selected comment, or ``None`` when
        that comment contains no table.

    Raises
    ------
    IndexError
        If the page has no comment at ``comment_index``.
    """
    for comment in soup.find_all(text=lambda e: isinstance(e, bs4.Comment)):
        tag = bs4.Tag(name="comment")
        tag.string = comment.strip()
        comment.replace_with(tag)

    wrapped_comments = soup.find_all('comment')
    try:
        chosen = wrapped_comments[comment_index]
    except IndexError:
        raise IndexError(
            "comment_index {} is out of range: the page contains {} comments".format(
                comment_index, len(wrapped_comments)))

    # Serialising the tag escapes the markup stored in its string, so the
    # angle brackets have to be restored before the fragment is parsed again.
    text = str(chosen)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')

    result = BeautifulSoup(text, 'html5lib')

    return result.find('table')

def get_match_list(soup):
    """Turn the rows of a stats table into one dictionary per game.

    The first ``<tr>`` is skipped. Every later row is converted to a mapping
    from each ``<td>`` cell's ``data-stat`` attribute to the cell's text. Rows
    that contain no ``<td>`` cells produce nothing.

    Parameters
    ----------
    soup : bs4.element.Tag
        The table element (anything exposing ``find_all('tr')``).

    Returns
    -------
    list of dict
        One ``{data-stat: text}`` dictionary per data row, in document order.
    """
    games = []
    for row in soup.find_all('tr')[1:]:
        row_stats = {cell['data-stat']: cell.text for cell in row.find_all('td')}
        # Append once per row, not once per cell; appending inside the cell
        # loop would add the same dictionary again for every <td> in the row.
        if row_stats:
            games.append(row_stats)
    return games

def create_df(playoffs):
    """Build a de-duplicated game-log DataFrame limited to a fixed column set.

    Parameters
    ----------
    playoffs : list of dict
        Per-game records, e.g. the output of ``get_match_list``.

    Returns
    -------
    pandas.DataFrame
        The distinct records, re-indexed from 0, with exactly the columns
        listed below and in that order. A ``KeyError`` is raised by pandas if
        any of these columns is missing from the records.
    """
    wanted_columns = [
        'age', 'ast', 'blk', 'date_game', 'drb', 'fg', 'fg3', 'fg3_pct', 'fg3a',
        'fg_pct', 'fga', 'ft', 'ft_pct', 'fta', 'game_location', 'game_result',
        'game_score', 'game_season', 'gs', 'mp', 'opp_id', 'orb', 'pf', 'pts',
        'stl', 'team_id', 'tov', 'trb',
    ]
    records = pd.DataFrame(playoffs)
    distinct = records.drop_duplicates()
    # reset_index() also adds the old index as a column; the column selection
    # below leaves it out again.
    renumbered = distinct.reset_index()
    return renumbered[wanted_columns]

In [3]:
def get_playoff_df(link):
    """Scrape the game-log table at ``link`` into a DataFrame.

    Runs the full pipeline: ``get_html`` downloads and parses the page,
    ``get_table_html`` pulls the table out of it, ``get_match_list`` converts
    the rows to dictionaries and ``create_df`` assembles the final frame.
    """
    page = get_html(link)
    table = get_table_html(page)
    return create_df(get_match_list(table))

## Create ranking

In [9]:
def get_choke(df):
    """Flag a single game as a "choke".

    ``df`` is one game (``process_df`` passes a row via
    ``DataFrame.apply(..., axis=1)``). The game is flagged when its
    ``game_score`` is below 10 and its ``difference`` is below 15.

    Returns 1 for a flagged game and 0 otherwise.
    """
    # A game score of 10 or more can never be a choke; `difference` is not
    # looked at in that case.
    if not (df.game_score < 10):
        return 0
    return 1 if df.difference < 15 else 0

def _track_series(opponents, results):
    """Derive the series record and stage of each game from ordered game lists.

    ``opponents`` and ``results`` are parallel lists in playing order. Returns
    three lists of the same length: wins and losses already recorded against
    the current opponent before each game, and each game's stage number.
    """
    wins, losses, stages = [], [], []
    for i, opponent in enumerate(opponents):
        if i == 0:
            # The first game opens the first series.
            wins.append(0)
            losses.append(0)
            stages.append(1.0)
        elif opponent == opponents[i - 1]:
            # Same series: keep the stage and credit the previous game's result.
            previous_won = results[i - 1] == 'W'
            wins.append(wins[-1] + (1 if previous_won else 0))
            losses.append(losses[-1] + (0 if previous_won else 1))
            stages.append(stages[-1])
        else:
            # New opponent: the record restarts and the stage rises by 0.33.
            wins.append(0)
            losses.append(0)
            stages.append(stages[-1] + 0.33)
    return wins, losses, stages


def process_df(df):
    """Annotate a game log with series context and keep selected games.

    ``df`` is modified in place (columns are added / converted) and a filtered
    selection of its rows is returned. Rows are treated as being in playing
    order; they are walked by position, so the index labels do not matter.

    Columns written to ``df``:

    * ``result`` -- first character of ``game_result``.
    * ``difference`` -- ``game_result[3:-1]`` as an int
      (assumes strings shaped like ``'W (+12)'`` -- TODO confirm).
    * ``game_score`` -- converted to float.
    * ``W`` / ``L`` -- games already won / lost against the current opponent
      before this game; both restart at 0 when ``opp_id`` changes.
    * ``stage`` -- 1.0 for the first opponent, plus 0.33 for every change of
      opponent; multiplied by 1.15 for games played at one of the series
      records 0-2, 1-2, 1-3, 2-2, 2-3, 3-2 or 3-3.
    * ``choke`` -- result of ``get_choke`` for the row.
    * ``great`` / ``monster`` -- 1 when ``game_score`` is at least 25 / 30.
    * ``w_game_score`` -- ``game_score * stage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Game log with at least ``game_result``, ``game_score`` and ``opp_id``.

    Returns
    -------
    pandas.DataFrame
        The rows played at one of the series records listed above, plus the
        rows with an (unscaled) stage above 1.5 whose record has at least one
        loss without three wins, or at least one win without three losses.
    """
    df['result'] = df.game_result.apply(lambda x: x[0])
    df['difference'] = df.game_result.apply(lambda x: int(x[3:-1]))
    df['game_score'] = df.game_score.apply(float)

    wins, losses, stages = _track_series(df['opp_id'].tolist(), df['result'].tolist())
    df['W'] = wins
    df['L'] = losses
    # Built as floats from the start: the stage takes fractional values, and
    # writing those into an integer column relies on a silent dtype upcast.
    df['stage'] = stages

    df['choke'] = df.apply(get_choke, axis=1)
    df['great'] = (df.game_score >= 25).astype('int64')
    df['monster'] = (df.game_score >= 30).astype('int64')

    weighted_records = {(0, 2), (1, 2), (1, 3), (2, 2), (2, 3), (3, 2), (3, 3)}
    criteria2 = pd.Series(
        [record in weighted_records for record in zip(wins, losses)],
        index=df.index, dtype=bool)

    late_stage = (df['stage'] > 1.5) & (
        ((df['W'] != 3) & (df['L'] != 0)) | ((df['W'] != 0) & (df['L'] != 3)))
    criteria = criteria2 | late_stage

    df.loc[criteria2, 'stage'] = df.loc[criteria2, 'stage'] * 1.15

    df['w_game_score'] = df.game_score * df.stage

    return df[criteria]

In [10]:
def get_player_results(name):
    """Scrape and summarise one player's selected playoff games.

    For every season listed in ``player2year[name]`` the game log is
    downloaded with ``get_playoff_df`` and reduced with ``process_df``.
    A season that fails to load or process is skipped with a warning.

    Parameters
    ----------
    name : str
        Player name; must be a key of ``player2year`` and ``player2code``.

    Returns
    -------
    tuple (dict, pandas.DataFrame)
        The summary dictionary (keys ``Name``, ``Average``,
        ``Weighted Average``, ``Choke_count``, ``Great_count``,
        ``Monster_count``, ``Win_percent``, ``Total_games``) and the frame of
        all retained games with ``year`` and ``name`` columns added.

    Raises
    ------
    KeyError
        If ``name`` is not in ``player2year``.
    ValueError
        If no season could be loaded for the player.
    """
    import warnings

    player = []
    for year in player2year[name]:
        try:
            # NOTE(review): the "/players/c/" segment is the same for every
            # player -- confirm the site resolves it for codes that do not
            # start with "c".
            link = "https://www.basketball-reference.com/players/c/" + player2code[name] + "/gamelog/" + year
            df = process_df(get_playoff_df(link))
            # assign() returns a new frame, so the filtered slice returned by
            # process_df is not written to.
            df = df.assign(year=year)
        except Exception as exc:
            # Best effort: one bad season must not lose the others, but the
            # skip is reported and Ctrl-C is no longer swallowed.
            warnings.warn("Skipping {} {}: {!r}".format(name, year, exc))
            continue
        player.append(df)

    if not player:
        raise ValueError("No playoff game logs could be loaded for {!r}".format(name))

    result = pd.concat(player).reset_index()
    result['name'] = name

    total_games = result.shape[0]
    games_won = result[result['result'] == 'W'].shape[0]
    # With zero retained games the other averages are already NaN; do the same
    # here so the ratio does not raise ZeroDivisionError.
    win_percent = round(games_won / total_games, 2) if total_games else float('nan')

    summary = {
        'Name': name,
        'Average': round(result.game_score.mean(), 2),
        'Weighted Average': result.w_game_score.sum() / result.stage.sum(),
        'Choke_count': result.choke.sum(),
        'Great_count': result.great.sum(),
        'Monster_count': result.monster.sum(),
        'Win_percent': win_percent,
        'Total_games': total_games,
    }
    return summary, result

## Get results

In [6]:
# Scrape every player listed in player2year and collect, per player, the
# summary dictionary and the per-game frame returned by get_player_results.
allplayer_scores = []
allplayer_games = []
for player in player2year.keys():
    score, games = get_player_results(player)
    allplayer_scores.append(score)
    allplayer_games.append(games)
    print("Done: {}".format(player))

# One summary row per player; reset_index() keeps the previous index as a column.
scores_df = pd.DataFrame(allplayer_scores).reset_index()
# Every player's retained games stacked into a single frame.
games_df = pd.concat(allplayer_games).reset_index()

Done: James Harden
Done: Stephen Curry
Done: Anthony Davis
Done: Kevin Durant
Done: LeBron James
Done: Russell Westbrook
Done: Kawhi Leonard
Done: Chris Paul
Done: Kobe Bryant
Done: Tim Duncan
Done: Dwight Howard
Done: Dwayne Wade
Done: Dirk Nowitzki
Done: Kevin Garnett
Done: Steve Nash
Done: Allen Iverson
Done: Shaquille O'Neal
Done: Tracy McGrady
Done: Jason Kidd
Done: Gary Payton
Done: Karl Malone
Done: David Robinson
Done: Scottie Pippen
Done: John Stockton
Done: Michael Jordan
Done: Charles Barkley
Done: Hakeem Olajuwon
Done: Patrick Ewing
Done: Dominique Wilkins
Done: Isiah Thomas
Done: Magic Johnson
Done: Larry Bird


In [15]:
# Rank the players from highest to lowest 'Weighted Average'.
scores_df = scores_df.sort_values(by='Weighted Average',ascending=False)

In [16]:
# Keep only the players whose Total_games (retained games) exceeds 15.
scores_df = scores_df[scores_df['Total_games'] > 15]

In [17]:
# Save the ranking as scores.csv, relative to the current working directory.
scores_df.to_csv('scores.csv')

In [18]:
# Save the per-game table as games.csv, relative to the current working directory.
games_df.to_csv('games.csv')

In [19]:
# Group the games flagged as chokes (choke == 1) by player name.
a = games_df[games_df['choke']==1].groupby('name')

In [22]:
# Mean game_score of each player's choke games.
a['game_score'].agg('mean')

name
Allen Iverson         6.516667
Charles Barkley       6.350000
Chris Paul            8.000000
David Robinson        9.150000
Dirk Nowitzki         6.141667
Dominique Wilkins     3.350000
Dwayne Wade           8.463636
Dwight Howard         7.771429
Gary Payton           8.475000
Hakeem Olajuwon       6.125000
Isiah Thomas          8.722222
James Harden          4.225000
Jason Kidd            6.775000
John Stockton         7.200000
Karl Malone           9.462500
Kevin Durant          6.750000
Kevin Garnett         9.985714
Kobe Bryant           7.737500
Larry Bird            8.700000
LeBron James          8.372727
Magic Johnson         6.733333
Michael Jordan        9.100000
Patrick Ewing         6.850000
Russell Westbrook     7.066667
Scottie Pippen        8.415789
Shaquille O'Neal      8.086667
Stephen Curry         6.614286
Steve Nash            7.918182
Tim Duncan            8.527778
Tracy McGrady        10.200000
Name: game_score, dtype: float64