In [1]:
import difflib
import time
import uuid
from pathlib import Path

import html5lib
import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup as soup
from sqlalchemy import create_engine
pd.options.mode.chained_assignment = None 

In [2]:
# Team name -> abbreviation lookup, taken from the last table on the page.
# Column 2 is discarded and the first row is skipped (presumably a header
# row -- TODO confirm against the page); abbreviations are upper-cased.
team_abb = (
    pd.read_html(requests.get('http://www.shrpsports.com/nba/explain.htm').text)[-1]
    .drop(columns=[2])
    .rename(columns={0: 'Team', 1: 'Abb'})
    .iloc[1:]
    .reset_index(drop=True)
    .assign(Abb=lambda frame: frame['Abb'].str.upper())
)
team_abb

Unnamed: 0,Team,Abb
0,Atlanta,ATL
1,Anderson,AND
2,Bal Bullets,BLB
3,Baltimore,BAL
4,Boston,BOS
...,...,...
68,Vancouver,VAN
69,Was Capitols,WSC
70,Was Bullets,WSB
71,Washington,WSH


In [3]:
def generate_index(table):
    """Build a player index that assigns one random UUID per distinct name.

    Parameters
    ----------
    table : list-like
        Player names; duplicates are allowed and share a single UUID.

    Returns
    -------
    pandas.DataFrame
        One row per input element with columns ``'player'`` and ``'uuid'``.
        The ``'uuid'`` column is always present, even for empty input, and
        holds ``uuid.UUID`` objects (missing names keep a missing uuid).
    """
    player_index = pd.DataFrame(table, columns=['player'])
    # Draw one uuid4 per distinct non-null name up front, then map it onto the
    # column in a single pass instead of re-scanning the frame for every name.
    ids = {name: uuid.uuid4() for name in player_index['player'].dropna().unique()}
    # astype(object) keeps the dtype stable when there is nothing to map.
    player_index['uuid'] = player_index['player'].map(ids).astype(object)
    return player_index

In [4]:
def perc_score(row):
    """Turn a row's salary share into a score rounded to 3 decimals.

    ``row['perc']`` is read as a fraction. When ``round(perc * 10)`` is 0 the
    score is the share expressed in percent (``perc * 100``); otherwise that
    percentage is additionally multiplied by ``perc * 10``.

    Note that the two branches do not meet: a share just below the switch
    point can score higher than one just above it.
    """
    share = row['perc']
    score = share * 100
    if round(share * 10) != 0:
        score = score * share * 10
    return round(score, 3)

In [5]:
# First and last season start years to process.
start_year, end_year = 1990, 2021

In [6]:
# Scrape per-team salary tables from hoopshype.com for every season from
# start_year through end_year (inclusive), derive each player's share of the
# team total, then attach a uuid per player and write one CSV per season.
#
# Notebook-level names produced here and read by later cells:
#   table_list   - names of the per-season tables ('team_salaries_<year>')
#   temp_list    - per-season tables before the uuid merge
#   df_list      - per-season tables after the uuid merge
#   player_index - DataFrame returned by generate_index
#   team_salaries_<year> - one global per season (assigned through globals())
df_list = []
temp_list = []
table_list = []
player_names = []      # distinct player names in first-seen order
seen_players = set()   # constant-time membership test for player_names
output_dir = Path('../updated_datasets/salaries_data/team_salaries')

for i in range(start_year, end_year + 1):
    # define our variables
    year = f'{i}-{i + 1}/'
    var_name = f'team_salaries_{i}'
    teams = []

    # scrape all of the website links to the per-team data
    response = requests.get(f'https://hoopshype.com/salaries/{year}', timeout=30)
    response.raise_for_status()
    main = soup(response.text, 'html.parser')
    links = main.find_all('tbody')[0].find_all('a', href=True)

    # iterate through each link to grab website + team name
    for link in links:
        # Read the URL and label from the parsed tag instead of splitting its
        # string form, which depended on attribute order and whitespace layout.
        website = link['href']
        team = link.get_text(strip=True)
        table = pd.read_html(website)[0]

        # clean the dataset: keep the first two columns (player + salary)
        table.columns = table.columns.droplevel(0)
        table = table[table.columns[:2]].copy()
        # The salary column is labelled like '1990/91' for the '1990-1991/' season.
        table = table.rename(columns={f'{year[:4]}/{year[7:9]}': 'salary', 'Player': 'player'})
        table['salary'] = table['salary'].replace(r'[\$,]', '', regex=True).astype(float)
        table['year'] = year[:4]
        table = table.dropna()

        # calculate each player's salary relative to the team total
        # Take the total straight from the 'Totals' row rather than parsing the
        # printed Series, which breaks whenever pandas display options change.
        is_total = table['player'] == 'Totals'
        if not is_total.any():
            raise ValueError(f'no "Totals" row found for {team} ({year[:-1]})')
        total = table.loc[is_total, 'salary'].iloc[-1]
        # Drop the summary row by label, not by assuming it is the last row.
        table = table.loc[~is_total].copy()
        table['perc'] = table['salary'] / total

        # determine relative percent-score
        table['perc_score'] = table.apply(perc_score, axis=1)

        # add the player's team and merge with abb. table to grab the team abbreviation
        table['team'] = team
        table = pd.merge(
            left=table, right=team_abb, how='left', left_on='team', right_on='Team'
        ).drop(columns=['team', 'Team'])
        table = table.rename(columns={'Abb': 'team'})

        # add players to the index (first occurrence wins, order preserved)
        for name in table['player']:
            if name not in seen_players:
                seen_players.add(name)
                player_names.append(name)

        # NOTE(review): kept for backward compatibility -- exposes the latest
        # table of each team as a global named after the team; it is
        # overwritten every season and nothing visible here reads it.
        globals()[team] = table
        # add to keep track of all of our teams
        teams.append(table)

    # concat our teams list to get one final table for the year
    globals()[var_name] = pd.concat(teams)
    print(f'{var_name} cleaned!')
    table_list.append(var_name)
    temp_list.append(globals()[var_name])

player_index = generate_index(player_names)

# Make sure the target directory exists before writing the per-season CSVs.
output_dir.mkdir(parents=True, exist_ok=True)

for table_name, table in zip(table_list, temp_list):
    # 'player' is the only column shared by both frames, so this is the same
    # join the implicit merge performed.
    finalized = table.merge(player_index, on='player')
    globals()[table_name] = finalized
    df_list.append(finalized)
    finalized.to_csv(output_dir / f'{table_name}.csv')
    print(f'{table_name} is finalized!')

team_salaries_1990 cleaned!
team_salaries_1991 cleaned!
team_salaries_1992 cleaned!
team_salaries_1993 cleaned!
team_salaries_1994 cleaned!
team_salaries_1995 cleaned!
team_salaries_1996 cleaned!
team_salaries_1997 cleaned!
team_salaries_1998 cleaned!
team_salaries_1999 cleaned!
team_salaries_2000 cleaned!
team_salaries_2001 cleaned!
team_salaries_2002 cleaned!
team_salaries_2003 cleaned!
team_salaries_2004 cleaned!
team_salaries_2005 cleaned!
team_salaries_2006 cleaned!
team_salaries_2007 cleaned!
team_salaries_2008 cleaned!
team_salaries_2009 cleaned!
team_salaries_2010 cleaned!
team_salaries_2011 cleaned!
team_salaries_2012 cleaned!
team_salaries_2013 cleaned!
team_salaries_2014 cleaned!
team_salaries_2015 cleaned!
team_salaries_2016 cleaned!
team_salaries_2017 cleaned!
team_salaries_2018 cleaned!
team_salaries_2019 cleaned!
team_salaries_2020 cleaned!
team_salaries_2021 cleaned!
team_salaries_1990 is finalized!
team_salaries_1991 is finalized!
team_salaries_1992 is finalized!
team_

In [7]:
# Show the names of the per-season tables collected by the scraping cell.
table_list

['team_salaries_1990',
 'team_salaries_1991',
 'team_salaries_1992',
 'team_salaries_1993',
 'team_salaries_1994',
 'team_salaries_1995',
 'team_salaries_1996',
 'team_salaries_1997',
 'team_salaries_1998',
 'team_salaries_1999',
 'team_salaries_2000',
 'team_salaries_2001',
 'team_salaries_2002',
 'team_salaries_2003',
 'team_salaries_2004',
 'team_salaries_2005',
 'team_salaries_2006',
 'team_salaries_2007',
 'team_salaries_2008',
 'team_salaries_2009',
 'team_salaries_2010',
 'team_salaries_2011',
 'team_salaries_2012',
 'team_salaries_2013',
 'team_salaries_2014',
 'team_salaries_2015',
 'team_salaries_2016',
 'team_salaries_2017',
 'team_salaries_2018',
 'team_salaries_2019',
 'team_salaries_2020',
 'team_salaries_2021']

In [8]:
# Inspect the player -> uuid index returned by generate_index.
player_index

Unnamed: 0,player,uuid
0,Hot Rod Williams,ca94dd3e-c3dc-4682-8c88-0ed654c9d074
1,Danny Ferry,4d2819b0-30c9-40f3-af49-4b950b0fad78
2,Mark Price,1a806ca8-ea9c-4690-8fe4-0b7d13c80d36
3,Brad Daugherty,ae75e1f9-765b-4cde-afbc-b5e9f18e0a26
4,Larry Nance,97054146-250e-4988-aff6-40ce7e6f6c4d
...,...,...
2819,Vit Krejci,9b06ee3b-d2b2-4e58-9689-a11051808fde
2820,Olivier Sarr,d09f39f5-e68c-4f3d-ba3c-1e7946491fcd
2821,Lindy Waters III,a2d20bf5-1dab-4cfc-80e6-e0c848d31e9e
2822,Rob Edwards,f388acbe-7951-43e5-8778-dab1019c1843


In [9]:
# Inspect the finalized 1990 table; this name only exists because the
# scraping cell assigns it dynamically through globals().
team_salaries_1990

Unnamed: 0,player,salary,year,perc,perc_score,team,uuid
0,Hot Rod Williams,3785000.0,1990,0.262792,69.060,CLE,ca94dd3e-c3dc-4682-8c88-0ed654c9d074
1,Danny Ferry,2640000.0,1990,0.183295,33.597,CLE,4d2819b0-30c9-40f3-af49-4b950b0fad78
2,Mark Price,1400000.0,1990,0.097202,9.448,CLE,1a806ca8-ea9c-4690-8fe4-0b7d13c80d36
3,Brad Daugherty,1320000.0,1990,0.091648,8.399,CLE,ae75e1f9-765b-4cde-afbc-b5e9f18e0a26
4,Larry Nance,1260000.0,1990,0.087482,7.653,CLE,97054146-250e-4988-aff6-40ce7e6f6c4d
...,...,...,...,...,...,...,...
348,Mark Acres,437000.0,1990,0.058019,3.366,ORL,ebcb97a8-f344-48c5-9ffa-5b2fd49749d5
349,Jeff Turner,410000.0,1990,0.054434,2.963,ORL,b24f885f-af14-4c5f-acd7-0bc52606918b
350,Scott Skiles,408000.0,1990,0.054169,2.934,ORL,1190be29-8300-464b-8873-4e37ce0bb04f
351,Morlon Wiley,347000.0,1990,0.046070,4.607,ORL,ebc8baa5-2045-4894-9f78-1cc9ef2d5d35


In [10]:
# Inspect the finalized 2004 table; this name only exists because the
# scraping cell assigns it dynamically through globals().
team_salaries_2004

Unnamed: 0,player,salary,year,perc,perc_score,team,uuid
0,Allan Houston,17531250.0,2004,0.170924,29.215,NY,465e59ac-6809-4cb1-a46a-c54e1193b673
1,Anfernee Hardaway,14625000.0,2004,0.142589,20.332,NY,16eee61d-7636-4b5a-972e-b8aeb9b5d54b
2,Stephon Marbury,14625000.0,2004,0.142589,20.332,NY,49401b77-f61b-4893-84a9-9fcb6c460515
3,Tim Thomas,12900000.0,2004,0.125771,15.818,NY,b5a9cb99-e6f7-44ce-9471-fa6afd69ad85
4,Shandon Anderson,7300000.0,2004,0.071173,5.066,NY,40966e4d-7253-436a-a7f9-3fb958f92711
...,...,...,...,...,...,...,...
478,Tamar Slay,695046.0,2004,0.029054,2.905,CHA,a7cb57c0-84e0-40c1-b388-bcaeb54524ff
479,Jason Kapono,620046.0,2004,0.025919,2.592,CHA,9676a674-d0ff-4ddd-b9be-ee46c1d069b3
480,Keith Bogans,620046.0,2004,0.025919,2.592,CHA,8c954282-64f2-4b29-be3a-f613f4999a3f
481,Theron Smith,620046.0,2004,0.025919,2.592,CHA,53f6ab37-0bb5-4052-954b-587fd9e6d7b2


In [17]:
def nearest_name(tables, index=None):
    """Suggest the closest indexed name for every player that has no uuid.

    Parameters
    ----------
    tables : iterable of pandas.DataFrame
        Frames with at least ``'player'`` and ``'uuid'`` columns.
    index : pandas.DataFrame, optional
        Frame whose ``'player'`` column holds the candidate names. Defaults
        to the notebook-level ``player_index``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'new_name'`` (best match found by difflib) and
        ``'original_name'`` (the unmatched name). Names with no sufficiently
        close candidate are omitted.
    """
    if index is None:
        # Same notebook-level lookup the function has always relied on.
        index = player_index
    candidates = list(index['player'])

    matches = []
    for table in tables:
        # Look up the name of the row being visited; the loop previously bound
        # an unused variable and read a stale notebook global instead.
        for name in table.loc[table['uuid'].isnull(), 'player']:
            for match in difflib.get_close_matches(name, candidates, 1):
                matches.append((match, name))
    return pd.DataFrame(matches, columns=['new_name', 'original_name'])

In [18]:
# Collect closest-name suggestions for rows across all finalized season
# tables whose uuid is missing, then display them.
null_players = nearest_name(df_list)
null_players

Unnamed: 0,new_name,original_name


In [None]:
# Print the suggestions as Markdown-formatted text (plain text, not a rich table).
print(null_players.to_markdown())

In [19]:
# Persist the player -> uuid index next to the other updated datasets.
player_index.to_csv('../updated_datasets/player_index.csv')