# Final Project: Data Processing

## Initial Data Intake

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored
# on the drive can be read through the regular file system
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os

# Directory on the mounted Google Drive that holds the NBA data;
# this cell only defines the path, the files are read later by load_data()
DATA_PATH = "/content/drive/MyDrive/DATASCI 207/Final Project/NBA data"

In [None]:
# Function to loop through directories to read in files
def load_data(path_to_data):
    '''Load and combine the raw score and Vegas files found under a data directory.

    Every sub-directory of ``path_to_data`` must contain a ``raw_scores.txt``
    and a ``vegas.txt`` file. Sub-directories are visited in sorted order so
    the row order of the results is reproducible, and entries that are not
    directories (stray files) are ignored.

    Parameters:
    path_to_data (str): Path to the directory that holds the data sub-directories

    Returns:
    tuple: ``(nba_df, vegas_df)``, two pandas DataFrames.
        ``nba_df`` is the concatenation of every ``raw_scores.txt`` with the
        column names assigned in this function; ``vegas_df`` is the
        concatenation of every ``vegas.txt`` with the column names taken
        from the header row of those files.

    Raises:
    ValueError: If ``path_to_data`` contains no sub-directories, or if the
        raw score files do not have exactly as many columns as names below.
    '''
    nba_columns = ["game_date", "game_sequence", "game_id", "team_id", "team_abbr",
                   "team_city", "game_result", "pts_q1", "pts_q2", "pts_q3",
                   "pts_q4", "pts_ot1", "pts_ot2", "pts_ot3", "pts_ot4", "pts_ot5",
                   "pts_ot6", "pts_ot7", "pts_ot8", "pts_ot9", "pts_ot10", "pts_tot",
                   "fg_pct", "ft_pct", "fg3_pct", "ast", "reb", "tov"]

    # os.listdir returns entries in arbitrary order and may include plain
    # files; keep directories only and sort them for a deterministic result
    data_dirs = sorted(
        entry for entry in os.listdir(path_to_data)
        if os.path.isdir(os.path.join(path_to_data, entry))
    )
    if not data_dirs:
        raise ValueError(
            "No data sub-directories found in {!r}".format(path_to_data))

    nba_dfs = []
    vegas_dfs = []
    for data_dir in data_dirs:
        folder = os.path.join(path_to_data, data_dir)

        # The first line of raw_scores.txt is skipped and the remaining
        # lines are read without a header; names are assigned after concat
        nba_dfs.append(pd.read_csv(os.path.join(folder, "raw_scores.txt"),
                                   delimiter=',', skiprows=[0], header=None))

        # vegas.txt keeps its own header row as column names
        vegas_dfs.append(pd.read_csv(os.path.join(folder, "vegas.txt"),
                                     delimiter=','))

    # Concatenate all DataFrames into a single DataFrame per source
    nba_df = pd.concat(nba_dfs, ignore_index=True)
    vegas_df = pd.concat(vegas_dfs, ignore_index=True)

    # Name columns (raises ValueError on a column-count mismatch)
    nba_df.columns = nba_columns

    return nba_df, vegas_df

In [None]:
# Load data: nba_df holds the combined raw_scores.txt files,
# vegas_df the combined vegas.txt files
nba_df, vegas_df = load_data(DATA_PATH)

In [None]:
# Examine data: preview the first rows of the raw scores frame
nba_df.head()

Unnamed: 0,game_date,game_sequence,game_id,team_id,team_abbr,team_city,game_result,pts_q1,pts_q2,pts_q3,...,pts_ot9,pts_ot10,pts_tot,fg_pct,ft_pct,fg3_pct,ast,reb,tov,team_ind
0,2016-10-25,1,21600001,1610612752,NYK,New York,0-1,18,27,19,...,0.0,0.0,88,0.368,0.75,0.333,17,42,18,0
1,2016-10-25,1,21600001,1610612739,CLE,Cleveland,1-0,28,20,34,...,0.0,0.0,117,0.479,0.737,0.371,31,51,14,1
2,2016-10-25,2,21600002,1610612762,UTA,Utah,0-1,26,20,37,...,0.0,0.0,104,0.488,1.0,0.333,19,31,11,0
3,2016-10-25,2,21600002,1610612757,POR,Portland,1-0,26,28,23,...,0.0,0.0,113,0.52,1.0,0.684,22,34,12,1
4,2016-10-25,3,21600003,1610612759,SAS,San Antonio,1-0,31,33,33,...,0.0,0.0,129,0.48,0.885,0.5,25,55,13,0


## Pre-processing

In [None]:
# Some OT columns have no info
unused_ot_cols = ['pts_ot5', 'pts_ot6', 'pts_ot7', 'pts_ot8', 'pts_ot9', 'pts_ot10']

# Only look at columns that are still present so the cell can be re-run
# after the drop below without raising a KeyError
present_ot_cols = [col for col in unused_ot_cols if col in nba_df.columns]

# describe() is not the last expression of the cell, so it has to be
# displayed explicitly or its summary would never be shown
if present_ot_cols:
    display(nba_df[present_ot_cols].describe())

# Drop OT columns
nba_df = nba_df.drop(columns=present_ot_cols)

In [None]:
# Report the (rows, columns) size of the raw scores frame
frame_shape = nba_df.shape
print("Shape:", frame_shape)

# Count the missing values in every column
missing_per_column = nba_df.isna().sum()
print(missing_per_column)

Shape: (17226, 29)
game_date        0
game_sequence    0
game_id          0
team_id          0
team_abbr        0
team_city        0
game_result      0
pts_q1           0
pts_q2           0
pts_q3           0
pts_q4           0
pts_ot1          0
pts_ot2          0
pts_ot3          0
pts_ot4          0
pts_ot5          2
pts_ot6          2
pts_ot7          2
pts_ot8          2
pts_ot9          2
pts_ot10         2
pts_tot          0
fg_pct           0
ft_pct           0
fg3_pct          0
ast              0
reb              0
tov              0
team_ind         0
dtype: int64


## Exploratory Data Analysis (EDA)

In [None]:
# Histogram of pts_tot with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(nba_df['pts_tot'], kde=True, ax=ax)
ax.set(
    title='Distribution of Total Points Scored',
    xlabel='Total Points',
    ylabel='Frequency',
)
plt.show()

In [None]:
# One box per regulation quarter, comparing the spread of points scored
quarter_points = nba_df[['pts_q1', 'pts_q2', 'pts_q3', 'pts_q4']]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=quarter_points, ax=ax)
ax.set(
    title='Boxplot of Points Scored by Quarter',
    xlabel='Quarter',
    ylabel='Points',
)
plt.show()

In [None]:
# One box per team abbreviation showing the spread of pts_tot
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(x='team_abbr', y='pts_tot', data=nba_df, ax=ax)
ax.set(
    title='Team-wise Total Points Comparison',
    xlabel='Team',
    ylabel='Total Points',
)
# Turn the team labels vertical so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Pairwise correlations between the float64/int64 columns, drawn as an
# annotated heatmap
numeric_cols = nba_df.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = nba_df.loc[:, numeric_cols].corr()

fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Sum pts_tot per team and keep the ten largest totals
points_by_team = nba_df.groupby('team_abbr')['pts_tot'].sum()
top_teams = points_by_team.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=top_teams.index, y=top_teams.values, ax=ax)
ax.set(
    title='Top 10 Scoring Teams',
    xlabel='Team',
    ylabel='Total Points',
)
plt.show()

In [None]:
# Boxplot of points for winning and losing teams
# No visible cell creates a 'win_loss' column, so it is derived here when it
# is missing: the row with the highest pts_tot within a game is labelled
# 'Win', every other row of that game 'Loss'.
# NOTE(review): assumes (game_date, game_id) identifies a single game with
# one row per team - confirm against the raw data.
if 'win_loss' in nba_df.columns:
    outcome_df = nba_df
else:
    game_high = nba_df.groupby(['game_date', 'game_id'])['pts_tot'].transform('max')
    # assign() returns a copy, so nba_df itself is left unchanged
    outcome_df = nba_df.assign(
        win_loss=np.where(nba_df['pts_tot'] == game_high, 'Win', 'Loss')
    )

plt.figure(figsize=(12, 6))
sns.boxplot(x='win_loss', y='pts_tot', data=outcome_df)
plt.title('Points Distribution for Winning and Losing Teams')
plt.xlabel('Win/Loss')
plt.ylabel('Total Points')
plt.show()

In [None]:
# Relationship between assists (x) and total points (y), one dot per row
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='ast', y='pts_tot', data=nba_df, ax=ax)
ax.set(
    title='Assists vs. Total Points',
    xlabel='Assists',
    ylabel='Total Points',
)
plt.show()

In [None]:
# Relationship between rebounds (x) and total points (y), one dot per row
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='reb', y='pts_tot', data=nba_df, ax=ax)
ax.set(
    title='Rebounds vs. Total Points',
    xlabel='Rebounds',
    ylabel='Total Points',
)
plt.show()