In [1]:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd
import uuid
import difflib
import html5lib
pd.options.mode.chained_assignment = None 

In [2]:
player_index = pd.read_csv('../updated_datasets/player_index.csv')
player_index = player_index[player_index.columns[1:]]
player_index

Unnamed: 0,Player,UUID
0,Kevin Garnett,4035d838-5ed7-4507-86ad-0488a9b9c358
1,Shaquille O'Neal,74b99a51-bb65-436f-8bcc-7022af2a490a
2,Alonzo Mourning,8051eb29-ecd7-46b5-b2fa-415777e11e83
3,Juwan Howard,5bdd7fb5-050d-4b0c-befb-5fbf7109397c
4,Hakeem Olajuwon,be9d9c53-d445-4b8c-acc9-ad1c30e8598b
...,...,...
2224,Jordan Schakel,49852ad9-c5e3-4124-bd75-2ad27b5a859f
2225,Jordan Goodwin,70d9ad35-fb19-497a-a358-d77a2a2e89ba
2226,Craig Sword,c29acdbe-9077-44c0-a8b3-f997ddff72a2
2227,Jaime Echenique,727da240-fbce-4643-ba94-f74509e2e746


In [3]:
def awards_data():
    main = soup(requests.get('http://www.espn.com/nba/history/awards').text, 'html.parser')
    links = [a['href'] for a in main.find_all('a', class_='bi', href=True)]
    links.remove('//www.espn.com/nba/history/awards/_/id/34')
    null_players = []
    
    for link in links:
        #grab the header of the website page
        name = str(soup(requests.get(f'http:{link}').text).select('h2')[0]).replace(
            '<h2>', '').replace('</h2>', '').replace('<h2>', '').partition('- ')[-1].split()
        
        #'the' doesn't get iterated through so remove it
        if 'the' in name:
            name.remove('the')
            
        #set variable name to the first two capitalized words of the header
        if len(name) > 1:
            for word in name:
                if word[0] != word[0].upper():
                    name.remove(word)
            var_name = (name[0] + '_' + name[1]).lower().replace('-', '_')
        else:
            var_name = ''.join(name).lower()
        
        #grab the datset
        globals()[var_name] = pd.read_html(requests.get(f'http:{link}').text)[0]
        
        #clean the data
        globals()[var_name].columns = globals()[var_name].iloc[1].str.capitalize()
        globals()[var_name] = globals()[var_name].tail(-2)

        #some tables have blank year columns so filter for that
        globals()[var_name].fillna(method='ffill', inplace=True)

        #more cleaning
        globals()[var_name] = globals()[var_name][globals()[var_name]['Year'].astype('int') >= 2000]
        globals()[var_name] = globals()[var_name][globals()[var_name].columns[:4]]

        #gather uuids
        globals()[var_name] = pd.merge(left=globals()[var_name], right=player_index, how='left', on='Player')
        
        #locate mispelled names
        for row in globals()[var_name][globals()[var_name]['UUID'].isnull()]['Player']:
            null_players.append(row)
            
    null_players = pd.DataFrame(null_players, columns=['Player']).drop_duplicates()

    return null_players

In [4]:
def just_name(row):
    return str(row['Name']).split(', ')[0].split('  ')[-1]

def just_pos(row):
    return str(row['Temp']).split(', ')[-1]

def stats_data():
    null_players = []
    for i in range(22):
        year = (2001 + i)
        var_name = f'stats_{year - 1}'

        table_soup = soup(requests.get(f'http://www.espn.com/nba/statistics/rpm/_/year/{year}/page/1').text, 'html.parser')
        page_count = int(str(table_soup.select('div.page-numbers')).split('of ')[1].split('<')[0])
        table = pd.read_html(str(table_soup.select('table.tablehead')))[0]
        for i in range(2, page_count + 1):
            table_soup = soup(requests.get(f'http://www.espn.com/nba/statistics/rpm/_/year/{year}/page/{i}').text, 'html.parser')
            table = pd.concat([table, pd.read_html(str(table_soup.select('table.tablehead')))[0]])
            
        table.columns = table.iloc[0].str.capitalize()
        table = table.tail(-1)
        table['Temp'] = table['Name']
        table['Name'] = table.apply(just_name, axis=1)
        table['Temp'] = table.apply(just_pos, axis=1)
        table.rename(columns={'Temp': 'Pos', 'Name': 'Player'}, inplace=True)
        cols = table.columns.values
        table = pd.merge(left=table, right=player_index, how='left', on='Player')

        for row in table[table['UUID'].isnull()]['Player']:
            null_players.append(row)
    null_players = pd.DataFrame(null_players, columns=['Player']).drop_duplicates()
    return null_players

In [5]:
def draft_data():
    null_players = []
    for i in range(22):
        year = 2000 + i
        var_name = f'draft_{year}'
        
        globals()[var_name] = pd.read_html(requests.get(f'https://www.basketball-reference.com/draft/NBA_{year}.html').text)[0]
        
        globals()[var_name].columns = globals()[var_name].columns.get_level_values(1)
        globals()[var_name] = globals()[var_name][globals()[var_name].columns[1:4]]
        globals()[var_name].dropna(inplace=True)
        globals()[var_name] = globals()[var_name][globals()[var_name]['Player'] != 'Player']
        globals()[var_name].rename(columns={'Pk':'Pick','Tm':'Team'}, inplace=True)
        globals()[var_name]['Year'] = year
        globals()[var_name] = pd.merge(left=globals()[var_name], right=player_index, how='left', on='Player')
        
        for row in globals()[var_name][globals()[var_name]['UUID'].isnull()]['Player']:
            null_players.append(row)
    null_players = pd.DataFrame(null_players, columns=['Player']).drop_duplicates()

In [6]:
def locate_nulls():
    null_players = awards_data()
    null_players = pd.concat([stats_data(), null_players]).drop_duplicates()
    null_players = pd.concat([draft_data(), null_players]).drop_duplicates().reset_index(drop=True)
    null_players = null_players[null_players['Player'] != 'NAME'] 
    return null_players

In [7]:
null_players = locate_nulls()

In [8]:
null_players

Unnamed: 0,Player
0,Darrel Armstrong
2,Scott Burrell
3,Peja Stojakovic
4,P.J. Brown
5,Jaren Jackson
...,...
170,Duane Washington Jr.
171,Chaundee Brown Jr.
172,Joshua Primo
173,Jeff Dowtin Jr.


In [9]:
def nearest_name(players):
    nearest_name = []
    original_name = []
    for name in players['Player']:
        close_matches = difflib.get_close_matches(name, player_index['Player'], 1)
        for x in close_matches:
                nearest_name.append(x)
                original_name.append(name)
    nearest_name = pd.DataFrame(nearest_name, columns=['New_name'])
    nearest_name['Original_name'] = original_name
    return nearest_name

In [10]:
null_players = nearest_name(null_players)

In [11]:
print(null_players.to_markdown())

|     | New_name               | Original_name            |
|----:|:-----------------------|:-------------------------|
|   0 | Darrell Armstrong      | Darrel Armstrong         |
|   1 | Scottie Barnes         | Scott Burrell            |
|   2 | Predrag Stojakovic     | Peja Stojakovic          |
|   3 | PJ Brown               | P.J. Brown               |
|   4 | Jaren Jackson Jr       | Jaren Jackson            |
|   5 | Quinndary Weatherspoon | Ray Weathers             |
|   6 | AJ Guyton              | A.J. Guyton              |
|   7 | Stanislav Medvedenko   | Slava Medvedenko         |
|   8 | Chuck Person           | Chucky Brown             |
|   9 | Hidayet Turkoglu       | Hedo Turkoglu            |
|  10 | David Wingate          | David Vanterpool         |
|  11 | Jabari Smith Sr        | Jabari Smith             |
|  12 | Radoslav Nesterovic    | Rasho Nesterovic         |
|  13 | Darren Collison        | Sean Colson              |
|  14 | Don MacLean            | Don Mac

In [12]:
null_players.head()

Unnamed: 0,New_name,Original_name
0,Darrell Armstrong,Darrel Armstrong
1,Scottie Barnes,Scott Burrell
2,Predrag Stojakovic,Peja Stojakovic
3,PJ Brown,P.J. Brown
4,Jaren Jackson Jr,Jaren Jackson


In [13]:
null_players.drop([1, 5, 8, 10, 13, 17, 18, 19, 20, 21, 23, 31, 34, 37, 38, 39, 40, 41, 42, 45, 48, 50, 52, 57, 59, 64, 65, 72, 74, 75, 78, 81, 90, 136, 138, 140, 145], axis=0, inplace=True).reset_index(drop=True)
null_players

Unnamed: 0,New_name,Original_name
0,Darrell Armstrong,Darrel Armstrong
2,Predrag Stojakovic,Peja Stojakovic
3,PJ Brown,P.J. Brown
4,Jaren Jackson Jr,Jaren Jackson
6,AJ Guyton,A.J. Guyton
...,...,...
159,Duane Washington Jr,Duane Washington Jr.
160,Chaundee Brown,Chaundee Brown Jr.
161,Josh Primo,Joshua Primo
162,Jeff Dowtin,Jeff Dowtin Jr.


In [30]:
null_players.reset_index(drop=True)
null_players.loc[null_players[null_players['Original_name'] == 'Sasha Pavlovic'].index, 'New_name'] = 'Aleksandar Pavlovic'
null_players.loc[null_players[null_players['Original_name'] == 'Mo Williams'].index, 'New_name'] = 'Maurice Williams'
null_players.loc[null_players[null_players['Original_name'] == 'Ish Smith'].index, 'New_name'] = 'Ishmael Smith'
null_players.loc[null_players[null_players['Original_name'] == 'Brandon Boston Jr.'].index, 'New_name'] = 'BJ Boston'

In [31]:
null_players[null_players['Original_name'] == 'Brandon Boston Jr.']

Unnamed: 0,New_name,Original_name
155,BJ Boston,Brandon Boston Jr.


In [32]:
null_players.to_csv('../updated_datasets/null_players.csv')