# Here I find the most most important features for team success using EDA

# 1. Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nba_api.stats.endpoints import leaguegamefinder, teamdashboardbygeneralsplits, playercareerstats
from nba_api.stats.endpoints import leagueleaders
from nba_api.stats.static import teams, players
import seaborn as sns
import re

# 1. Loading in team season statistics, renaming teams, and only keeping numeric columns

In [4]:
team_stats = pd.read_csv('../data/team_stats_cleaned.csv')
team_stats.head(10)
team_stats.columns

Index(['group_set', 'group_value', 'season_year', 'gp', 'w', 'l', 'w_pct',
       'min', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta',
       'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'tov', 'stl', 'blk', 'blka',
       'pf', 'pfd', 'pts', 'plus_minus', 'gp_rank', 'w_rank', 'l_rank',
       'w_pct_rank', 'min_rank', 'fgm_rank', 'fga_rank', 'fg_pct_rank',
       'fg3m_rank', 'fg3a_rank', 'fg3_pct_rank', 'ftm_rank', 'fta_rank',
       'ft_pct_rank', 'oreb_rank', 'dreb_rank', 'reb_rank', 'ast_rank',
       'tov_rank', 'stl_rank', 'blk_rank', 'blka_rank', 'pf_rank', 'pfd_rank',
       'pts_rank', 'plus_minus_rank', 'team_name'],
      dtype='object')

In [5]:
columns_to_keep = ['season_year', 'fgm', 'fga','fg3_pct', 'ftm', 'fta','ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'tov', 'stl', 'blk', 'team_name']
team_stats = team_stats[columns_to_keep]
team_stats['fg%'] = team_stats['fgm']/team_stats['fga']
team_stats.head(10)

Unnamed: 0,season_year,fgm,fga,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,tov,stl,blk,team_name,fg%
0,2024-25,1626,3498,0.354,707,901,0.785,456,1259,1715,1137,609.0,391,203,Atlanta Hawks,0.464837
1,2024-25,1548,3358,0.366,615,776,0.793,406,1257,1663,945,440.0,285,208,Boston Celtics,0.460989
2,2024-25,1673,3314,0.405,598,772,0.775,363,1252,1615,1087,481.0,324,171,Cleveland Cavaliers,0.504828
3,2024-25,1507,3422,0.336,611,797,0.767,476,1145,1621,938,569.0,364,206,New Orleans Pelicans,0.440386
4,2024-25,1598,3411,0.372,554,696,0.796,345,1321,1666,1084,560.0,280,173,Chicago Bulls,0.468484
5,2024-25,1613,3359,0.375,692,892,0.776,426,1290,1716,954,545.0,308,224,Dallas Mavericks,0.480202
6,2024-25,1623,3256,0.382,661,868,0.762,392,1234,1626,1130,508.0,313,169,Denver Nuggets,0.498464
7,2024-25,1523,3393,0.365,526,741,0.71,471,1252,1723,1063,536.0,329,202,Golden State Warriors,0.448865
8,2024-25,1542,3474,0.329,658,864,0.762,535,1270,1805,813,495.0,322,194,Houston Rockets,0.443869
9,2024-25,1471,3173,0.363,626,803,0.78,397,1233,1630,895,610.0,359,166,Los Angeles Clippers,0.463599


In [6]:
team_stats = team_stats.sort_values('team_name')
team_stats.head(10)

Unnamed: 0,season_year,fgm,fga,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,tov,stl,blk,team_name,fg%
0,2024-25,1626,3498,0.354,707,901,0.785,456,1259,1715,1137,609.0,391,203,Atlanta Hawks,0.464837
1,2024-25,1548,3358,0.366,615,776,0.793,406,1257,1663,945,440.0,285,208,Boston Celtics,0.460989
14,2024-25,1405,3135,0.366,628,776,0.809,344,1121,1465,931,576.0,271,135,Brooklyn Nets,0.448166
29,2024-25,1337,3148,0.341,538,697,0.772,454,1164,1618,819,561.0,281,173,Charlotte Hornets,0.424714
4,2024-25,1598,3411,0.372,554,696,0.796,345,1321,1666,1084,560.0,280,173,Chicago Bulls,0.468484
2,2024-25,1673,3314,0.405,598,772,0.775,363,1252,1615,1087,481.0,324,171,Cleveland Cavaliers,0.504828
5,2024-25,1613,3359,0.375,692,892,0.776,426,1290,1716,954,545.0,308,224,Dallas Mavericks,0.480202
6,2024-25,1623,3256,0.382,661,868,0.762,392,1234,1626,1130,508.0,313,169,Denver Nuggets,0.498464
28,2024-25,1578,3380,0.362,579,756,0.766,423,1276,1699,977,602.0,279,195,Detroit Pistons,0.466864
7,2024-25,1523,3393,0.365,526,741,0.71,471,1252,1723,1063,536.0,329,202,Golden State Warriors,0.448865


In [7]:
nba_teams = [
    'ATL', 'BOS', 'BRO', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 
    'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS'
]
nba_teams.sort()
nba_teams

team_stats['Team'] = nba_teams
team_stats


Unnamed: 0,season_year,fgm,fga,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,tov,stl,blk,team_name,fg%,Team
0,2024-25,1626,3498,0.354,707,901,0.785,456,1259,1715,1137,609.0,391,203,Atlanta Hawks,0.464837,ATL
1,2024-25,1548,3358,0.366,615,776,0.793,406,1257,1663,945,440.0,285,208,Boston Celtics,0.460989,BOS
14,2024-25,1405,3135,0.366,628,776,0.809,344,1121,1465,931,576.0,271,135,Brooklyn Nets,0.448166,BRO
29,2024-25,1337,3148,0.341,538,697,0.772,454,1164,1618,819,561.0,281,173,Charlotte Hornets,0.424714,CHA
4,2024-25,1598,3411,0.372,554,696,0.796,345,1321,1666,1084,560.0,280,173,Chicago Bulls,0.468484,CHI
2,2024-25,1673,3314,0.405,598,772,0.775,363,1252,1615,1087,481.0,324,171,Cleveland Cavaliers,0.504828,CLE
5,2024-25,1613,3359,0.375,692,892,0.776,426,1290,1716,954,545.0,308,224,Dallas Mavericks,0.480202,DAL
6,2024-25,1623,3256,0.382,661,868,0.762,392,1234,1626,1130,508.0,313,169,Denver Nuggets,0.498464,DEN
28,2024-25,1578,3380,0.362,579,756,0.766,423,1276,1699,977,602.0,279,195,Detroit Pistons,0.466864,DET
7,2024-25,1523,3393,0.365,526,741,0.71,471,1252,1723,1063,536.0,329,202,Golden State Warriors,0.448865,GSW


# 2. Data cleaning and pre-processing

In [9]:
def clean_and_extract_team_id(team_name):
    cleaned_name = re.sub(r'\s\(\d+\)', '', team_name)
    team_id = cleaned_name[:3].upper()
    return team_id

relabeled_dict = {
    "Team": "Team1",
    "Team.1": "Team2",
    "W": "Team1wins",
    "W.1": "Team2wins"
}
playoff_results = pd.read_csv('../data/playoff_results.csv') 
playoff_results = playoff_results.rename(columns=relabeled_dict)
playoff_results['Team1'] = playoff_results['Team1'].apply(clean_and_extract_team_id)
playoff_results['Team2'] = playoff_results['Team2'].apply(clean_and_extract_team_id) 
playoff_results.replace('CHA', 'CHO', inplace=True)


team1_wins = playoff_results[['Yr', 'Team1', 'Team1wins']].rename(columns={'Team1': 'Team', 'Team1wins': 'wins'})
team2_wins = playoff_results[['Yr', 'Team2', 'Team2wins']].rename(columns={'Team2': 'Team', 'Team2wins': 'wins'})

combined_wins = pd.concat([team1_wins, team2_wins])

team_totals = combined_wins.groupby(['Yr', 'Team'], as_index=False)['wins'].sum()

team_totals_2024 = team_totals[team_totals['Yr'] == 2024]
print(team_totals_2024)

       Yr Team  wins
125  2024  BOS    16
126  2024  CLE     5
127  2024  DAL    13
128  2024  DEN     7
129  2024  IND     8
130  2024  LOS     3
131  2024  MIA     1
132  2024  MIL     2
133  2024  MIN     9
134  2024  NEW     7
135  2024  OKL     6
136  2024  ORL     3
137  2024  PHI     2
138  2024  PHO     0


In [18]:
team_stats = team_stats.rename(columns = {"season_year": "Yr", "team_name":"Team"})
team_stats.head()

Unnamed: 0,Yr,fgm,fga,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,tov,stl,blk,Team,fg%,Team.1
0,2024-25,1626,3498,0.354,707,901,0.785,456,1259,1715,1137,609.0,391,203,Atlanta Hawks,0.464837,ATL
1,2024-25,1548,3358,0.366,615,776,0.793,406,1257,1663,945,440.0,285,208,Boston Celtics,0.460989,BOS
14,2024-25,1405,3135,0.366,628,776,0.809,344,1121,1465,931,576.0,271,135,Brooklyn Nets,0.448166,BRO
29,2024-25,1337,3148,0.341,538,697,0.772,454,1164,1618,819,561.0,281,173,Charlotte Hornets,0.424714,CHA
4,2024-25,1598,3411,0.372,554,696,0.796,345,1321,1666,1084,560.0,280,173,Chicago Bulls,0.468484,CHI


In [23]:
print(team_stats.columns)
print(team_totals_2024.columns)


Index(['Yr', 'fgm', 'fga', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb', 'dreb',
       'reb', 'ast', 'tov', 'stl', 'blk', 'fg%'],
      dtype='object')
Index(['Yr', 'Team', 'wins'], dtype='object')


In [20]:
team_stats = team_stats.drop(columns=['Team'])

In [None]:
ranked_data = merged_data.copy()
numeric_cols = ['fgm', 'fga', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb', 'dreb', 'reb', 
                'ast', 'tov', 'stl', 'blk', 'fg%'] 

for col in numeric_cols:
    ranked_data[f'{col}_rank'] = ranked_data[col].rank(ascending=False)  

ranked_data = ranked_data[['Yr', 'Team', 'wins'] + [f'{col}_rank' for col in numeric_cols]]
ranked_data.head()

Unnamed: 0,Yr,Team,wins,fgm_rank,fga_rank,fg3_pct_rank,ftm_rank,fta_rank,ft_pct_rank,oreb_rank,dreb_rank,reb_rank,ast_rank,tov_rank,stl_rank,blk_rank,fg%_rank
1,2024,BOS,16.0,5.0,2.0,7.5,7.0,7.0,1.0,3.0,2.0,2.0,5.0,10.0,9.0,3.0,6.0
5,2024,CLE,5.0,1.0,5.0,1.0,8.0,8.0,8.0,6.0,3.0,5.0,2.0,9.0,3.0,7.0,1.0
6,2024,DAL,13.0,4.0,1.0,5.0,1.0,1.0,7.0,1.5,1.0,1.0,4.0,3.0,6.0,2.0,4.0
7,2024,DEN,7.0,3.0,6.0,3.0,3.5,3.0,9.0,4.0,6.5,3.0,1.0,5.0,5.0,8.0,2.0
11,2024,IND,8.0,2.0,3.0,6.0,6.0,6.0,5.0,9.0,6.5,7.0,3.0,4.0,7.0,4.0,3.0


# 3. Quick histogram plotting using matplotlib to see what's going on

In [None]:
for col in numeric_cols:
    plt.figure(figsize=(8, 6))
    plt.scatter(ranked_data[f'{col}_rank'], ranked_data['wins'], alpha=0.7, edgecolor='k')
    plt.title(f'Wins vs {col.capitalize()} Rank', fontsize=14)
    plt.xlabel(f'{col.capitalize()} Rank (Lower is Better)', fontsize=12)
    plt.ylabel('Wins', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()


# 4. Finding correlation between each feature and # of wins 

In [None]:
correlation_results = []

for col in numeric_cols:
    rank_col = f"{col}_rank"  
    if rank_col in ranked_data.columns:  
        correlation = abs(ranked_data[rank_col].corr(ranked_data['wins']))
        correlation_results.append({'Metric': rank_col, 'Correlation with Playoff Wins': correlation})
    else:
        correlation_results.append({'Metric': rank_col, 'Correlation with Wins': 'N/A'})

correlation_table = pd.DataFrame(correlation_results)

correlation_table_sorted = correlation_table.sort_values('Correlation with Playoff Wins', ascending = False)
print(correlation_table_sorted)

          Metric  Correlation with Playoff Wins
1       fga_rank                       0.775605
8       reb_rank                       0.760971
7      dreb_rank                       0.715582
6      oreb_rank                       0.543108
12      blk_rank                       0.519509
11      stl_rank                       0.497558
0       fgm_rank                       0.409754
9       ast_rank                       0.387803
5    ft_pct_rank                       0.365851
13      fg%_rank                       0.314632
2   fg3_pct_rank                       0.062384
3       ftm_rank                       0.047705
10      tov_rank                       0.021951
4       fta_rank                       0.014634


# Result: Most important features --> fga_rank, reb_rank, dreb_rank, oreb_rank, and blk_rank