In [1]:
import requests
from bs4 import BeautifulSoup as soup
import uuid
import difflib
import html5lib
import time
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
pd.options.mode.chained_assignment = None 

In [2]:
team_abb = pd.read_html(requests.get('http://www.shrpsports.com/nba/explain.htm').text)[-1].drop(columns = [2])
team_abb.rename(columns = {0:'Team', 1:'Abb'}, inplace=True)
team_abb = team_abb[1:].reset_index(drop=True)
team_abb['Abb'] = team_abb['Abb'].str.upper() 
team_abb

Unnamed: 0,Team,Abb
0,Atlanta,ATL
1,Anderson,AND
2,Bal Bullets,BLB
3,Baltimore,BAL
4,Boston,BOS
...,...,...
68,Vancouver,VAN
69,Was Capitols,WSC
70,Was Bullets,WSB
71,Washington,WSH


In [3]:
def generate_index(table):
    player_index = pd.DataFrame(table, columns = ['player'])
    for name in player_index['player'].unique():
        player_index.loc[player_index['player'] == name, 'uuid'] = uuid.uuid4()
    return player_index

In [4]:
def perc_score(row):
    if round(row['perc'] * 10) == 0:
        outcome = (row['perc'] * 100)
    else:
        outcome = (row['perc'] * 100) * row['perc'] * 10
    return round(outcome, 3)

In [5]:
start_year = 1990
end_year = 2021

In [9]:
df_list = []
player_index = []
table_list = []
for i in range(start_year, end_year + 1):
    #define our variables 
    year = f'{i}-{i + 1}/'
    var_name = f'team_salaries_{i}'
    teams = []

    #scrap all of the website links to the per-team data
    main = soup(requests.get(f'https://hoopshype.com/salaries/{year}').text, 'html.parser')
    links = [a for a in main.find_all('tbody')[0].find_all('a', href=True)]

    #iterate through each link to grab website + team name
    for link in links:
        website = str(link).split('"')[1]
        team = str(link).split('"')[-1].replace('\t', '').replace('</a>', '').replace('>\n', '')
        table = pd.read_html(website)[0]

        #clean the dataset
        table.columns = table.columns.droplevel(0)
        table = table[table.columns[:2]]
        table.rename(columns={f'{year[:4]}/{year[7:9]}':'salary', 'Player':'player'}, inplace=True)
        table[table.columns[1:]] = table[table.columns[1:]].replace(
            '[\$,]', '', regex=True).astype(float)
        table['year'] = year[:4]
        table.dropna(inplace=True)

        #calculate each players salary relative to the team total
        total = pd.to_numeric(str(table[table['player'] == 'Totals']['salary']).split('  ')[-1].split('\n')[0])
        table = table.head(-1)
        table['perc'] = table['salary'] / total

        #determine relative percent-score
        table['perc_score'] = table.apply(perc_score, axis=1)

        #add the player's team and merge with abb. table to grab the team abbreviation
        table['team'] = team
        table = pd.merge(left=table, right=team_abb, how='left', left_on='team', right_on='Team').drop(columns=['team', 'Team'])
        table.rename(columns={'Abb':'team'}, inplace=True)

        #add players to the index
        for name in table['player']:
            if name in player_index:
                continue
            else:
                player_index.append(name)

        globals()[team] = table
        #add to keep track of all of our teams
        teams.append(globals()[team])

    #iterate through and concat our teams list to get one final table for the year
    globals()[var_name] = pd.concat(teams)
    print(f'{var_name} worked!')
    table_list.append(var_name)

player_index = generate_index(player_index)
    
for index, table in enumerate(df_list):
    table_name = table_list[index]
    globals()[table_name] = table.merge(player_index)

df_list.append(globals()[var_name])
globals()[table_name].to_csv(f'../updated_datasets/salaries_data/team_salaries/{table_name}.csv')

[               player     salary  year      perc  perc_score team
0    Hot Rod Williams  3785000.0  1990  0.262792      69.060  CLE
1         Danny Ferry  2640000.0  1990  0.183295      33.597  CLE
2          Mark Price  1400000.0  1990  0.097202       9.448  CLE
3      Brad Daugherty  1320000.0  1990  0.091648       8.399  CLE
4         Larry Nance  1260000.0  1990  0.087482       7.653  CLE
5          Craig Ehlo   925000.0  1990  0.064223       4.125  CLE
6        Chucky Brown   630000.0  1990  0.043741       4.374  CLE
7          Steve Kerr   548000.0  1990  0.038048       3.805  CLE
8    Derrick Chievous   525000.0  1990  0.036451       3.645  CLE
9     Winston Bennett   525000.0  1990  0.036451       3.645  CLE
10        John Morton   350000.0  1990  0.024300       2.430  CLE
11        Milos Babic   200000.0  1990  0.013886       1.389  CLE
12      Gerald Paddio   120000.0  1990  0.008332       0.833  CLE
13  Darnell Valentine   100000.0  1990  0.006943       0.694  CLE
14       

ValueError: No objects to concatenate

In [None]:
table_list

In [None]:
player_index

In [None]:
team_salaries_1990

In [None]:
team_salaries_2004

In [7]:
def nearest_name(tables):
    nearest_name = []
    original_name = []
    for table in tables:
        for row in player_index['UUID']:
            if pd.isnull(idd):
                for name in table['player']:
                    close_matches = difflib.get_close_matches(name, player_index['player'], 1)
                    for x in close_matches:
                            nearest_name.append(x)
                            original_name.append(name)
    nearest_name = pd.DataFrame(nearest_name, columns=['new_name'])
    nearest_name['original_name'] = original_name
    return nearest_name

In [None]:
null_players = nearest_name(df_list)
null_players

In [None]:
print(null_players.to_markdown())