In [1]:
import pandas as pd
from scipy.stats import zscore
import numpy as np

# combining all datasets from NBA Data
raw_reg_season_data = pd.concat([pd.read_csv("NBA-Data-2010-2024/regular_season_box_scores_2010_2024_part_1.csv"), pd.read_csv("NBA-Data-2010-2024/regular_season_box_scores_2010_2024_part_2.csv"), pd.read_csv("NBA-Data-2010-2024/regular_season_box_scores_2010_2024_part_3.csv")])

# view first 5 entries
# already in player numeric order

i = 5000

raw_reg_season_data.shape
raw_reg_season_data.iloc[i:i+5, 0:15]


# Most important stats to predict in a parlay:
# matchup, personId, minutes, fieldGoalsMade, fieldGoalsAttempted, threePointersMade, threePointersAttempted,
# freeThrowsMade, freeThrowsAttempted, reboundsOffensive, reboundsDefensive, assists, steals, blocks, turnovers


Unnamed: 0,season_year,game_date,gameId,matchup,teamId,teamCity,teamName,teamTricode,teamSlug,personId,personName,position,comment,jerseyNum,minutes
5000,2015-16,2016-03-01,21500895,ATL @ GSW,1610612737,Atlanta,Hawks,ATL,hawks,200757,Thabo Sefolosha,,,,19:24
5001,2014-15,2014-12-03,21400267,ATL @ MIA,1610612737,Atlanta,Hawks,ATL,hawks,200757,Thabo Sefolosha,,,,17:29
5002,2016-17,2016-12-13,21600369,ATL vs. ORL,1610612737,Atlanta,Hawks,ATL,hawks,200757,Thabo Sefolosha,F,,,29:57
5003,2015-16,2015-12-02,21500272,ATL vs. TOR,1610612737,Atlanta,Hawks,ATL,hawks,200757,Thabo Sefolosha,F,,,33:19
5004,2014-15,2014-12-22,21400413,ATL @ DAL,1610612737,Atlanta,Hawks,ATL,hawks,200757,Thabo Sefolosha,,,,16:41


In [2]:
# Get only the columns I want

reg_season_data = raw_reg_season_data[["personId", "matchup", "fieldGoalsMade", "fieldGoalsAttempted", "threePointersMade", "threePointersAttempted", "freeThrowsMade", "freeThrowsAttempted", "reboundsOffensive", "reboundsDefensive", "assists", "steals", "blocks", "turnovers"]]
reg_season_data.iloc[5000:5005]


Unnamed: 0,personId,matchup,fieldGoalsMade,fieldGoalsAttempted,threePointersMade,threePointersAttempted,freeThrowsMade,freeThrowsAttempted,reboundsOffensive,reboundsDefensive,assists,steals,blocks,turnovers
5000,200757,ATL @ GSW,1,1,0,0,0,0,0,3,1,0,0,1
5001,200757,ATL @ MIA,3,4,1,1,0,0,0,2,2,0,0,0
5002,200757,ATL vs. ORL,6,11,1,3,4,4,1,4,3,2,1,1
5003,200757,ATL vs. TOR,6,12,1,5,0,0,1,4,1,2,2,0
5004,200757,ATL @ DAL,3,3,1,1,0,0,1,1,2,2,1,0


In [3]:
reg_season_data["matchup"], uniques = pd.factorize(reg_season_data["matchup"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reg_season_data["matchup"], uniques = pd.factorize(reg_season_data["matchup"])


In [4]:
reg_season_data.loc[0]

Unnamed: 0,personId,matchup,fieldGoalsMade,fieldGoalsAttempted,threePointersMade,threePointersAttempted,freeThrowsMade,freeThrowsAttempted,reboundsOffensive,reboundsDefensive,assists,steals,blocks,turnovers
0,693,0,0,0,0,0,0,0,0,0,0,0,0,0
0,2449,725,6,13,0,0,2,2,6,8,1,1,0,4
0,2555,1401,0,0,0,0,0,0,0,0,1,0,0,0


In [5]:
#
# Get all numerical data
raw_numerical_data = reg_season_data.iloc[:, 2:]

# Apply z score to each column of numerical data
numerical_data = raw_numerical_data.apply(zscore)

In [6]:
# Combine player ids and matchup with numerical data
reg_season_data = pd.concat([reg_season_data.iloc[:, :2], numerical_data], axis=1)

In [7]:
# Numeric values now with z-score applied
reg_season_data.iloc[5000:5005]

Unnamed: 0,personId,matchup,fieldGoalsMade,fieldGoalsAttempted,threePointersMade,threePointersAttempted,freeThrowsMade,freeThrowsAttempted,reboundsOffensive,reboundsDefensive,assists,steals,blocks,turnovers
5000,200757,115,-0.665503,-0.946941,-0.61191,-0.79952,-0.629859,-0.671663,-0.628267,0.151534,-0.340683,-0.649926,-0.485579,-0.048329
5001,200757,77,-0.028425,-0.450487,0.178003,-0.429729,-0.629859,-0.671663,-0.628267,-0.213881,0.06716,-0.649926,-0.485579,-0.784855
5002,200757,102,0.927193,0.707907,0.178003,0.309854,1.209682,0.831378,0.133759,0.516949,0.475002,1.522437,0.782447,-0.048329
5003,200757,72,0.927193,0.873392,0.178003,1.049437,-0.629859,-0.671663,0.133759,0.516949,-0.340683,1.522437,2.050473,-0.784855
5004,200757,108,-0.028425,-0.615971,0.178003,-0.429729,-0.629859,-0.671663,0.133759,-0.579296,0.06716,1.522437,0.782447,-0.784855


In [8]:
# Will be needed for calculating scores from z-score -> numerical values

reg_season_means = raw_numerical_data.mean()
reg_season_std_devs = raw_numerical_data.std()

In [9]:
# Going from z-score to points to find real values

reg_season_data.iloc[5002]["fieldGoalsMade"] * reg_season_std_devs["fieldGoalsMade"] + reg_season_means["fieldGoalsMade"]

np.float64(6.000003428650093)

In [11]:
reg_season_data[0:200000].to_csv("processed_output_part1.csv", index=False)
reg_season_data[200000:].to_csv("processed_output_part1.csv", index=False)