In [1]:
import pandas
import re

In [2]:
# importing player statistics data
statistics_df = pandas.read_csv("data/player_roster.csv")
statistics_df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Alex Abrines\abrinal01,SG,24,OKC,75,8,15.1,1.5,3.9,...,0.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7
1,2,Quincy Acy\acyqu01,PF,27,BRK,70,8,19.4,1.9,5.2,...,0.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9
2,3,Steven Adams\adamsst01,C,24,OKC,76,76,32.7,5.9,9.4,...,0.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9
3,4,Bam Adebayo\adebaba01,C,20,MIA,69,19,19.8,2.5,4.9,...,0.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9
4,5,Arron Afflalo\afflaar01,SG,32,ORL,53,3,12.9,1.2,3.1,...,0.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4


In [3]:
# defining relevant groups of columns in statistics_df

unchanged_statistic_columns = ['Pos',
 'Age']

aggregate_statistic_columns = [
    'G',
    'GS'
]

average_statistic_columns = [
    'MP',
    'FG',
    'FGA',
    '3P',
    '3PA',
    '2P',
    '2PA',
    'FT',
    'FTA',
    'ORB',
    'DRB',
    'AST',
    'STL',
    'BLK',
    'TOV',
    'PF',
    'PTS'
]

computed_statistic_column_dict = {
    'FG%': ['FG', 'FGA'],
    '3P%': ['3P', '3PA'],
    '2P%': ['2P', '2PA'],
    'FT%': ['FT', 'FTA']
}

In [4]:
# importing player contract data

contracts_df = pandas.read_csv("data/contracts.csv")
contracts_df.head()

Unnamed: 0,Rk,Player,Tm,2018-19,2019-20,2020-21,2021-22,2022-23,2023-24,Signed Using,Guaranteed
0,1,Stephen Curry\curryst01,GSW,"$37,457,154","$40,231,758","$43,006,362","$45,780,966",,,Bird Rights,"$166,476,240"
1,2,Chris Paul\paulch01,HOU,"$35,654,150","$38,506,482","$41,358,814","$44,211,146",,,,"$159,730,592"
2,3,Russell Westbrook\westbru01,OKC,"$35,654,150","$38,178,000","$41,006,000","$43,848,000","$46,662,000",,Bird Rights,"$158,686,150"
3,4,LeBron James\jamesle01,LAL,"$35,654,150","$37,436,858","$39,219,565","$41,002,273",,,,"$113,310,573"
4,5,Blake Griffin\griffbl01,DET,"$32,088,932","$34,234,964","$36,595,996","$38,957,028",,,Bird Rights,"$102,919,892"


In [5]:
# defining relevant group of columns in contracts_df

aggregate_contract_columns = [
    '2018-19',
    '2019-20',
    '2020-21',
    '2021-22',
    '2022-23',
    '2023-24',
    'Guaranteed'
]

In [6]:
"""
Handles basic data cleaning for contracts_df, 
i.e. data reformatting, proper type conversion, etc.
"""
def clean_contracts_df(contracts_df):
    for col in ["2018-19", "2019-20", "2020-21", "2021-22", "2022-23", "2023-24", "Guaranteed"]:
        contracts_df[col] = contracts_df[col].apply(lambda x: 0 if type(x) == float else int(re.sub(r'\$|,', '', x.strip())))
        
    contracts_df["Signed Using"] = contracts_df["Signed Using"].apply(lambda x: '' if type(x) == float else x)

In [7]:
"""
Adds all player statistics to player_dict (for the player 
corresponding to the inputted player_dict.)
"""
def add_statistics(player_dict, stats, unchanged_statistic_columns, \
                   aggregate_statistic_columns, average_statistic_columns, \
                   computed_statistic_column_dict):
    for col in unchanged_statistic_columns:
        player_dict[col] = stats.iloc[0,:][col]
    
    for col in aggregate_statistic_columns:
        player_dict[col] = 0
        for i in range(len(stats)):
            player_dict[col] += stats.iloc[i,:][col]
    
    for col in average_statistic_columns:
        player_dict[col] = 0
        num_games_played = 0
        for i in range(len(stats)):
            current_row = stats.iloc[i,:]
            player_dict[col] += current_row[col] * current_row['G']
            num_games_played += current_row['G']
        player_dict[col] = round(player_dict[col] / num_games_played, 2)
    
    for col in computed_statistic_column_dict.keys():
        if player_dict[computed_statistic_column_dict[col][1]] == 0:
            player_dict[col] = 0
        else:
            player_dict[col] = round(player_dict[computed_statistic_column_dict[col][0]] / \
                                 player_dict[computed_statistic_column_dict[col][1]], 2)
    
    player_dict["Player"] = stats.iloc[0,:]["Player"].split("\\")[0].strip()
    player_dict["TRB"] = player_dict["ORB"] + player_dict["DRB"]
    
    if player_dict["FGA"] == 0:
        player_dict["eFG%"] = 0.0
    else:
        player_dict["eFG%"] = round((player_dict["FG"] + 0.5 * player_dict["3P"]) / player_dict["FGA"], 2)

In [8]:
"""
Adds all player contract details to player_dict (for the player 
corresponding to the inputted player_dict.)
"""
def add_contract_details(player_dict, contracts, aggregate_contract_columns):
    for col in aggregate_contract_columns:
        player_dict[col] = 0
        for i in range(len(contracts)):
            player_dict[col] += contracts.iloc[i,:][col]
    
    player_dict["Signed Using"] = "/".join(list(contracts["Signed Using"]))

In [9]:
"""
Returns list of dictionaries, with each dictionary containing 
all consolidated statistical and contractual data for a given player.
"""
def combine_player_info(statistics_df, contracts_df, unchanged_statistic_columns, \
                        aggregate_statistic_columns, average_statistic_columns, \
                        computed_statistic_column_dict, aggregate_contract_columns, player_set):
    result = []
    
    for player in player_set:
        player_stats = statistics_df[statistics_df["Player"] == player]
        player_contract = contracts_df[contracts_df["Player"] == player]
        
        if len(player_stats) == 0 or len(player_contract) == 0:
            continue
        
        player_dict = dict()
        add_statistics(player_dict, player_stats, unchanged_statistic_columns, \
                   aggregate_statistic_columns, average_statistic_columns, computed_statistic_column_dict)
        add_contract_details(player_dict, player_contract, aggregate_contract_columns)
        
        result.append(player_dict)

    return result

In [10]:
# handles basic data preprocessing for contracts_df
clean_contracts_df(contracts_df)

# creates set of players with available statistical information
player_set = set(statistics_df["Player"])

In [11]:
# obtaining all consolidated player information
player_info = combine_player_info(statistics_df, contracts_df, unchanged_statistic_columns, \
                        aggregate_statistic_columns, average_statistic_columns, \
                        computed_statistic_column_dict, aggregate_contract_columns, player_set)

In [12]:
# converting consolidated player information into DataFrame
players_df = pandas.DataFrame(player_info)

In [13]:
players_df.head()

Unnamed: 0,2018-19,2019-20,2020-21,2021-22,2022-23,2023-24,2P,2P%,2PA,3P,...,ORB,PF,PTS,Player,Pos,STL,Signed Using,TOV,TRB,eFG%
0,30431854,37800000,40824000,43848000,46872000,0,5.4,0.53,10.1,3.7,...,0.6,2.3,30.4,James Harden,SG,1.8,Bird Rights,4.4,5.4,0.54
1,6041520,7059480,8930242,0,0,0,4.3,0.46,9.3,0.7,...,1.2,2.8,13.1,Josh Jackson,SF,1.0,1st Round Pick,1.9,4.5,0.45
2,1544951,2305057,0,0,0,0,0.7,0.47,1.5,0.0,...,0.3,0.9,1.7,Damian Jones,C,0.1,1st Round pick,0.3,0.9,0.47
3,13045455,11954546,10863637,0,0,0,1.6,0.52,3.1,2.5,...,0.3,2.2,11.5,Joe Ingles,SF,1.1,Bird Rights,1.9,4.2,0.61
4,1740000,2033160,3665787,0,0,0,0.9,0.47,1.9,0.0,...,0.6,1.4,2.3,Caleb Swanigan,PF,0.2,1st Round Pick,0.7,2.0,0.41


In [14]:
# exporting consolidated player data
players_df.to_csv("/Users/Genghis/Desktop/ORIE_4741/nba-contract-prediction/data/consolidated_players_dataset.csv", index=False)