Darren Lund

In [1]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt

In [4]:
# Load DATA
# Root directory that is walked below to find the cleaned per-player csv files
path = './DATA/'

# Split function (for 2-pt, 3-pt, and ft)
def split(ratio):
    '''
    Separate 'made-attempted' style strings into two parallel integer lists.

    Inputs :
        ratio (iterable of str) - Entries of the form 'a-b' (for example '3-7')

    Outputs :
        made (list) - The integer before the '-' of every entry
        attempt (list) - The integer after the '-' of every entry
    '''
    # Break every entry on the dash first
    pieces = []
    for entry in ratio:
        pieces.append(entry.split('-'))
    # Leading numbers
    made = []
    for pair in pieces:
        made.append(int(pair[0]))
    # Trailing numbers
    attempt = []
    for pair in pieces:
        attempt.append(int(pair[1]))
    return made, attempt

# Different types of shots
shot_types = ['2Pt','3Pt','FT']

# Walk through player files; every directory is processed as one team
for dir_path, dir_name, file_names in os.walk(path):
    # Player tables found in this directory, keyed by the file name minus its
    # last six characters
    players = {}
    for name in file_names:
        # Only cleaned data (names ending in 'csv'), never the Scouting report
        if name.endswith('csv') and not name.startswith('Scouting'):
            players[name[:-6]] = pd.read_csv(os.path.join(dir_path, name))

    # Per-game team attempts for each shot type
    team_values = {}
    # Per-player attempts, cached so each column is only parsed once
    attempts = {}

    # First pass : shooting percentages and the team attempt totals
    for player, stats in players.items():
        for shot_type in shot_types:
            # Split the number of made and attempted
            made, attempt = split(stats[shot_type].values)
            attempts[player, shot_type] = attempt
            if shot_type in team_values:
                team_total = team_values[shot_type]
                # The per-game sums below only make sense when every player
                # has one row per team game, so fail loudly otherwise
                if len(attempt) != len(team_total):
                    raise ValueError(
                        "Player '{}' in '{}' has {} games for '{}' but the team has {}".format(
                            player, dir_path, len(attempt), shot_type, len(team_total)))
                team_values[shot_type] = [t + a for t, a in zip(team_total, attempt)]
            else:
                # Copy so the running total never shares a list with a player
                team_values[shot_type] = list(attempt)
            # Shooting percentage, 0 when no shots were attempted
            stats[shot_type + ' %'] = [m / a if a != 0 else 0
                                       for m, a in zip(made, attempt)]

    # Second pass : needs the finished team totals
    for player, stats in players.items():
        # Share of the team's attempts taken by this player (0 when the team
        # attempted none in that game)
        for shot_type in shot_types:
            stats[shot_type + ' %Att'] = [
                a / t if t != 0 else 0
                for a, t in zip(attempts[player, shot_type], team_values[shot_type])
            ]

        # Approximate points prevented : 2 per block and steal, minus 2 per turnover
        stats['Pnts-Prev'] = 2 * (stats['Blk'] + stats['Stl'] - stats['TO'])

        # Point margin : the first character of 'Result' is 'W' or 'L', and the
        # 'a-b' score starts at the fourth character
        results = stats['Result']
        outcomes = [entry[0] for entry in results]
        score_1, score_2 = split([entry[3:] for entry in results])
        # Positive for a win, negative otherwise
        stats['Marg'] = [abs(a - b) if outcome == 'W' else -abs(a - b)
                         for outcome, a, b in zip(outcomes, score_1, score_2)]

        # Refilter to remove extra indices
        players[player] = stats.filter(['Date','Opponent','Result','OTs','Site',
                                        'Conference','MP','ORtg','%Ps','Pts','2Pt',
                                        '3Pt','FT','OR','DR','A','TO','Blk','Stl','PF',
                                        '2Pt %','3Pt %','FT %','2Pt %Att','3Pt %Att',
                                        'FT %Att','Pnts-Prev','Marg'])
        # Save new file
        # NOTE: the output name has no '.csv' extension, so the loader above
        # (which only accepts names ending in 'csv') skips it on a re-run
        players[player].to_csv(os.path.join(dir_path, player) + '_adj')

The columns I added to the individual player data are the percentage of the team's shot attempts that each player took (for 2s, 3s, and free throws), an approximate points-prevented variable (calculated by multiplying the number of blocks and the number of steals by 2, then subtracting 2 for each turnover), and a margin variable indicating how many more (fewer if negative) points the team had than its opponent at the end of the game.  The reason for the first three is so that I can look at approximately how many shots of each kind each individual player contributed per game, in order to better understand their offensive contribution to the game as a whole.  Likewise, points prevented is meant to better capture the defensive contribution each player made.  The margin variable makes it easier to compare the overall result of the game (win or lose by x points) with each player's individual contributions.

I had to redo a bit of my data cleaning because I realized that I had dropped games in which a player didn't play at all, which threw off my numbers a bit.  To remedy this, I went back and replaced every "Did not play" entry with a stat of either $0$ or $0-0$.  While I'm not yet sure exactly how I want to handle games that a player didn't participate in, I feel this is the best way to store the information for now.  It will be simple enough to change later if I decide that it needs to be a different value.