# Final Project: Data processing

## Initial Data Intake

In [1]:
# Mount Google Drive at /content/drive (Colab-only import) so files stored
# under that mount point can be read like local paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os

# Path to the NBA data folder on the mounted Drive; no data is read here,
# this is only the location handed to the loader.
DATA_PATH = "/content/drive/MyDrive/DATASCI 207/Final Project/NBA data"

In [33]:
# Function to loop through directories to read in files
def load_data(path_to_data):
    '''Load every raw_scores.txt under a data directory into one DataFrame.

    Each sub-directory of ``path_to_data`` is read in sorted name order so
    the row order of the result is reproducible between runs. Entries that
    are not directories, and hidden entries whose name starts with ".",
    are ignored so stray files or notebook checkpoint folders do not break
    the load.

    Parameters:
    path_to_data (str): Directory whose sub-directories each contain a
        comma-separated "raw_scores.txt". The first line of every file is
        skipped and the remaining lines are read without a header.

    Returns:
    nba_df: A pandas dataframe with the rows of all files stacked (index
        reset to 0..n-1) and the 28 column names assigned below.

    Raises:
    ValueError: If no readable sub-directory is found in path_to_data.
    FileNotFoundError: If a (non-hidden) sub-directory has no raw_scores.txt.
    '''
    # Column names applied to the concatenated data, in file column order
    column_names = ["game_date", "game_sequence","game_id","team_id","team_abbr",
                    "team_city","game_result","pts_q1","pts_q2","pts_q3",
                    "pts_q4","pts_ot1","pts_ot2","pts_ot3","pts_ot4","pts_ot5",
                    "pts_ot6","pts_ot7","pts_ot8","pts_ot9","pts_ot10","pts_tot",
                    "fg_pct","ft_pct","fg3_pct","ast","reb","tov"]

    # os.listdir returns entries in arbitrary order; sort for determinism
    entries = sorted(os.listdir(path_to_data))

    # Read one DataFrame per data sub-directory
    dfs = []
    for entry in entries:
        entry_path = os.path.join(path_to_data, entry)

        # Skip stray files and hidden folders (e.g. .DS_Store, .ipynb_checkpoints)
        if entry.startswith(".") or not os.path.isdir(entry_path):
            continue

        df = pd.read_csv(os.path.join(entry_path, "raw_scores.txt"),
                         delimiter=',', skiprows=[0], header=None)
        dfs.append(df)

    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if not dfs:
        raise ValueError(f"No data sub-directories found in {path_to_data!r}")

    # Concatenate all DataFrames into a single DataFrame
    nba_df = pd.concat(dfs, ignore_index=True)

    # Name columns
    nba_df.columns = column_names

    return nba_df

In [34]:
# Read all raw score files into one long frame and, in the same step, number
# the rows inside each game_id (0 for the first row seen, 1 for the next, ...)
nba_df = load_data(DATA_PATH).assign(
    team_ind=lambda scores: scores.groupby('game_id').cumcount()
)

In [35]:
# Preview the first five rows of the long-format frame to check that the
# column names line up with the values
nba_df.head()

Unnamed: 0,game_date,game_sequence,game_id,team_id,team_abbr,team_city,game_result,pts_q1,pts_q2,pts_q3,...,pts_ot9,pts_ot10,pts_tot,fg_pct,ft_pct,fg3_pct,ast,reb,tov,team_ind
0,2016-10-25,1,21600001,1610612752,NYK,New York,0-1,18,27,19,...,0.0,0.0,88,0.368,0.75,0.333,17,42,18,0
1,2016-10-25,1,21600001,1610612739,CLE,Cleveland,1-0,28,20,34,...,0.0,0.0,117,0.479,0.737,0.371,31,51,14,1
2,2016-10-25,2,21600002,1610612762,UTA,Utah,0-1,26,20,37,...,0.0,0.0,104,0.488,1.0,0.333,19,31,11,0
3,2016-10-25,2,21600002,1610612757,POR,Portland,1-0,26,28,23,...,0.0,0.0,113,0.52,1.0,0.684,22,34,12,1
4,2016-10-25,3,21600003,1610612759,SAS,San Antonio,1-0,31,33,33,...,0.0,0.0,129,0.48,0.885,0.5,25,55,13,0


In [36]:
# Size of the frame as (rows, columns)
frame_shape = nba_df.shape
print("Shape:", frame_shape)

# Number of missing values in each column
missing_per_column = nba_df.isna().sum()
print(missing_per_column)

Shape: (17226, 29)
game_date        0
game_sequence    0
game_id          0
team_id          0
team_abbr        0
team_city        0
game_result      0
pts_q1           0
pts_q2           0
pts_q3           0
pts_q4           0
pts_ot1          0
pts_ot2          0
pts_ot3          0
pts_ot4          0
pts_ot5          2
pts_ot6          2
pts_ot7          2
pts_ot8          2
pts_ot9          2
pts_ot10         2
pts_tot          0
fg_pct           0
ft_pct           0
fg3_pct          0
ast              0
reb              0
tov              0
team_ind         0
dtype: int64


## Pre-processing

In [37]:
# Columns spread across the two team slots; game_id is left out because it
# becomes the row key of the wide frame
varlist = [
    "game_date", "game_sequence",
    "team_id", "team_abbr", "team_city",
    "game_result",
    "pts_q1", "pts_q2", "pts_q3", "pts_q4",
    "pts_ot1", "pts_ot2", "pts_ot3", "pts_ot4", "pts_ot5",
    "pts_ot6", "pts_ot7", "pts_ot8", "pts_ot9", "pts_ot10",
    "pts_tot",
    "fg_pct", "ft_pct", "fg3_pct",
    "ast", "reb", "tov",
]

# Reshape to one row per game: each variable gets one column per team_ind
# value, and game_id is moved back from the index into a regular column.
# NOTE(review): pivoting a list of mixed-dtype value columns may produce
# object-dtype columns — confirm dtypes before using them numerically.
nba_df_wide = (
    nba_df
    .pivot(index="game_id", columns="team_ind", values=varlist)
    .reset_index()
)

# Look at the first rows of the wide frame
nba_df_wide.head()

Unnamed: 0_level_0,game_id,game_date,game_date,game_sequence,game_sequence,team_id,team_id,team_abbr,team_abbr,team_city,...,ft_pct,ft_pct,fg3_pct,fg3_pct,ast,ast,reb,reb,tov,tov
team_ind,Unnamed: 1_level_1,0,1,0,1,0,1,0,1,0,...,0,1,0,1,0,1,0,1,0,1
0,21200001,2012-10-30,2012-10-30,1,1,1610612764,1610612739,WAS,CLE,Washington,...,0.6,0.682,0.25,0.35,26,22,39,54,12,20
1,21200002,2012-10-30,2012-10-30,2,2,1610612738,1610612748,BOS,MIA,Boston,...,0.821,0.813,0.462,0.5,24,25,41,36,15,8
2,21200003,2012-10-30,2012-10-30,3,3,1610612742,1610612747,DAL,LAL,Dallas,...,0.778,0.387,0.333,0.231,22,24,40,46,11,14
3,21200004,2012-10-31,2012-10-31,1,1,1610612754,1610612761,IND,TOR,Indiana,...,0.688,0.842,0.455,0.353,22,18,46,42,18,10
4,21200005,2012-10-31,2012-10-31,2,2,1610612743,1610612755,DEN,PHI,Denver,...,0.455,0.81,0.222,0.28,19,18,54,47,20,15
