# Imports

In [1]:
import pandas as pd

# Show every column when a dataframe is displayed (no column-count limit)
pd.options.display.max_columns = None

# Configuration

In [2]:
# Season labels to process, newest first; each label is used to build the
# data/<season>/ input and output file paths
seasons = [
    '2019-20',
    '2018-19',
    '2017-18',
    '2016-17',
    '2015-16',
]

# Functions

In [3]:
def process_data(season: str) -> pd.DataFrame:
    """Build and save the per-player, per-game dataset for one season.

    Reads the four boxscore CSVs under ``data/{season}/`` (traditional and
    advanced, at player and team level), attaches to every player row the
    stats of the player's own team (``team_`` prefix) and of the opposing
    team (``opp_`` prefix), and writes the result to
    ``data/{season}/processed_{season}.csv``.

    Args:
        season: Season label used in the directory and file names,
            e.g. ``"2019-20"``.

    Returns:
        The processed dataframe (the same frame that is written to disk).
    """
    data_dir = f"data/{season}"

    # =============================================================================================
    # Player boxscores
    # =============================================================================================
    traditional_player_df = pd.read_csv(f"{data_dir}/traditional_player_{season}.csv")
    advanced_player_df = pd.read_csv(f"{data_dir}/advanced_player_{season}.csv")

    # Remove columns that the traditional player boxscore already provides,
    # so the merge below does not produce _x/_y duplicates
    advanced_player_df = advanced_player_df.drop(columns=[
        "teamId",
        "teamCity",
        "teamName",
        "teamTricode",
        "teamSlug",
        "firstName",
        "familyName",
        "nameI",
        "playerSlug",
        "position",
        "comment",
        "jerseyNum",
        "minutes",
        "gameDate",
    ])

    # Merge traditional and advanced player boxscores
    player_df = traditional_player_df.merge(
        advanced_player_df, on=['personId', 'gameId'], how='left'
    )

    # =============================================================================================
    # Team boxscores
    # =============================================================================================
    traditional_team_df = pd.read_csv(f"{data_dir}/traditional_team_{season}.csv")
    advanced_team_df = pd.read_csv(f"{data_dir}/advanced_team_{season}.csv")

    # Remove columns that the traditional team boxscore already provides
    advanced_team_df = advanced_team_df.drop(columns=[
        "teamCity",
        "teamName",
        "teamTricode",
        "teamSlug",
        "minutes",
        "gameDate",
    ])

    # Merge traditional and advanced team boxscores
    team_df = traditional_team_df.merge(
        advanced_team_df, on=['teamId', 'gameId'], how='left'
    )

    # =============================================================================================
    # Add things to team boxscore
    # =============================================================================================
    # Flag the first row of each game as the home team.
    # NOTE(review): assumes the source CSV lists the home team first within
    # each game -- confirm against the data source.
    team_df['home'] = team_df.groupby('gameId').cumcount() == 0

    # Add won column: 1 for the team whose points equal the game's maximum
    game_high_score = team_df.groupby('gameId')['points'].transform('max')
    team_df['won'] = (team_df['points'] == game_high_score).astype(int)

    # Rename minutes to gameMin
    team_df = team_df.rename(columns={'minutes': 'gameMin'})

    # Set opponent teamId by reversing the team ids within each game.
    # NOTE(review): this relies on every game having exactly two team rows.
    team_df['opp_teamId'] = team_df.groupby('gameId')['teamId'].transform(
        lambda ids: ids.iloc[::-1].values
    )

    # =============================================================================================
    # Combine team team stats to player stats
    # =============================================================================================
    # Descriptive team columns are dropped here; game-level facts
    # (gameMin, won, home) keep their unprefixed names.
    team_team_df = (
        team_df
        .drop(columns=["teamCity", "teamName", "teamTricode", "teamSlug", "gameDate"])
        .add_prefix('team_')
        .rename(columns={'team_gameMin': 'gameMin', 'team_won': 'won', 'team_home': 'home'})
    )

    player_df = player_df.merge(
        team_team_df,
        left_on=['teamId', 'gameId'],
        right_on=['team_teamId', 'team_gameId'],
        how='left',
    )
    player_df = player_df.drop(columns=['team_teamId', 'team_gameId', 'team_opp_teamId'])

    # =============================================================================================
    # Combine opponent team stats to player stats
    # =============================================================================================
    opp_team_df = (
        team_df
        .drop(columns=['gameMin', 'home', 'won', 'gameDate'])
        .add_prefix('opp_')
    )

    # A team row whose opp_teamId equals the player's teamId is the row of the
    # player's opponent, hence the join on the doubly-prefixed opp_opp_teamId.
    player_df = player_df.merge(
        opp_team_df,
        left_on=['teamId', 'gameId'],
        right_on=['opp_opp_teamId', 'opp_gameId'],
        how='left',
    )
    player_df = player_df.drop(columns=['opp_opp_teamId', 'opp_gameId'])

    # =============================================================================================
    # Save dataframe to CSV
    # =============================================================================================
    player_df.to_csv(f"{data_dir}/processed_{season}.csv", index=False)

    return player_df

# Main

In [4]:
# Process each configured season in turn; every call writes
# data/<season>/processed_<season>.csv as a side effect.
for season in seasons:
    process_data(season)