## Data Source

In [1]:
url = "https://www.basketball-reference.com/leagues/NBA_2022_totals.html"

## Loading Libraries

In [2]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x) # Settings to display floating numbers

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px



## Data Context

RK - Rank   
Player - Nome do jogador    
Pos - Posição do jogador    
Age - Idade do jogador  
Tm - Time do jogador    
G - Numero de jogos que o jogador participou    
GS - Numero de jogos em que o jogador iniciou a partida  
MP - Minutos jogados    
FG - Arremessos convertidos  
FGA - Tentativas de arremesso   
FG% - Percentual de arremessos convertidos   
3P - Arremessos de 3 pontos convertidos  
3PA - Arremessos de 3 pontos  
3PA% - Percentual de arremessos de 3 pontos convertidas     
2P - Arremessos de 2 pontos     
2PA - Tentativas de arremessos de 2 pontos      
2PA% - Percentual de arremessos de 2 pontos convertidas     
eFG% - Effective Field Goal Percentage - Essa métrica leva em consideração que arremessos de 3 pontos possuem um valor maior que 2 pontos   
FT - Arremessos livres convertidos      
FTA - Arremessos livres     
FT% - Percentual de arremessos livres convertidos   
ORB - Rebotes ofensivos 
DRB - Rebotes defensivos    
TRB - Rebotes totais    
AST - Assistencias  
STL - Roubos de bola    
BLK - Bloqueios / Tocos     
TOV - Perdas de bola (Turn over)        
PF - Faltas pessoais    
PTS - Pontos    

## Perguntas de Negócio

1 - Quais jogadores fizeram mais pontos na temporada    
2 - Considerando o top 5% pontuadores da liga, quais são os que possuem maior aproveitamento nos arremessos de quadra       
3 - Quais jogadores tem maior aproveitamento nos arremessos de 3 pontos     
4 - Quais jogadores fizeram mais pontos por posição     
5 - QUal jogador tem mais tempo em quadra       
6 - Quais as maiores médias de pontos por jogo          
7 - Quais as maiores médias por jogo de atributos defensivos (ORB, TRB, STL, BLK)       
8 - Quais as maiores médias por jogo de turn overs          
9 - Quais jogadores cometem mais faltas     
10 - Quais os melhores arremessadores de bolas de 3 pontos até 20 anos de idade     
11 - Quais times tem melhor aproveitamento em arremessos de quadra      
12 - Quais times tem o pior aproveitamento de lance livre

## Reading Data

In [3]:
df = pd.read_csv('nba_2021_2022.csv')
df

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional
0,1,Precious Achiuwa,C,22,TOR,73,28,1725,265,603,...,146,327,473,82,37,41,84,151,664,achiupr01
1,2,Steven Adams,C,28,MEM,76,75,1999,210,384,...,349,411,760,256,65,60,115,153,528,adamsst01
2,3,Bam Adebayo,C,24,MIA,56,56,1825,406,729,...,137,427,564,190,80,44,148,171,1068,adebaba01
3,4,Santi Aldama,PF,21,MEM,32,0,360,53,132,...,33,54,87,21,6,10,16,36,132,aldamsa01
4,5,LaMarcus Aldridge,C,36,BRK,47,12,1050,252,458,...,73,185,258,42,14,47,44,78,607,aldrila01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,601,Thaddeus Young,PF,33,TOR,26,0,475,67,144,...,40,75,115,45,31,11,22,43,164,youngth01
808,602,Trae Young,PG,23,ATL,76,76,2652,711,1544,...,50,234,284,737,72,7,303,128,2155,youngtr01
809,603,Omer Yurtseven,C,23,MIA,56,12,706,130,247,...,85,209,294,49,17,20,41,84,299,yurtsom01
810,604,Cody Zeller,C,29,POR,27,0,355,51,90,...,50,75,125,22,8,6,19,56,140,zelleco01


In [4]:
# Data Dimensions

df.shape

(812, 31)

In [5]:
# Descriptive Statistics

df.describe()

Unnamed: 0,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,812.0,812.0,812.0,812.0,812.0,812.0,812.0,797.0,812.0,812.0,...,715.0,812.0,812.0,812.0,812.0,812.0,812.0,812.0,812.0,812.0
mean,304.5,26.05,36.7,16.67,825.19,137.73,299.63,0.43,42.35,120.04,...,0.75,35.45,116.43,151.88,84.71,26.07,16.08,44.83,67.45,375.46
std,172.64,4.06,25.9,23.82,775.79,154.74,330.9,0.14,55.2,148.72,...,0.16,46.02,128.74,168.18,113.39,27.23,21.77,52.96,61.62,426.82
min,1.0,19.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,158.75,23.0,12.0,0.0,121.0,13.75,35.0,0.39,2.0,7.0,...,0.67,4.0,15.0,20.75,7.0,3.0,1.0,5.0,12.0,36.5
50%,300.5,25.0,36.5,4.0,577.5,81.0,182.5,0.44,17.0,54.0,...,0.77,20.0,74.0,100.0,41.0,17.0,9.0,28.0,52.0,214.0
75%,459.0,29.0,61.0,25.0,1414.5,216.25,466.5,0.5,63.25,186.25,...,0.85,46.0,176.5,226.0,113.25,42.0,22.0,65.0,110.0,592.5
max,605.0,41.0,82.0,82.0,2854.0,774.0,1564.0,1.0,285.0,750.0,...,1.0,349.0,813.0,1019.0,737.0,138.0,177.0,303.0,286.0,2155.0


In [6]:
# Verificando o

print(len(df['Tm'].unique()))
df['Tm'].unique()


31


array(['TOR', 'MEM', 'MIA', 'BRK', 'TOT', 'NOP', 'UTA', 'MIL', 'CLE',
       'IND', 'LAL', 'ORL', 'NYK', 'HOU', 'WAS', 'PHO', 'SAC', 'DET',
       'CHO', 'CHI', 'ATL', 'DEN', 'PHI', 'SAS', 'LAC', 'OKC', 'MIN',
       'DAL', 'GSW', 'POR', 'BOS'], dtype=object)

In [7]:
# Eliminando registros com os totais de jogadores que passaram por mais de 1 time ('TOT') na temporada, pois será contabilizado como dado duplicado

df1 = df[(df['Tm'] != 'TOT')]

## Dataset para ser utilizado para análise dos jogadores

In [8]:
# Dataframe agrupado por jogador ( Dados excluídos: Time, posição )

df_player = df1.groupby('Player').sum().reset_index()
df_player[df_player['Player'] == 'James Harden']

Unnamed: 0,Player,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
258,James Harden,436,64,65,65,2419,407,992,0.82,148,...,1.76,55,445,500,667,82,36,284,153,1432


In [9]:
# Como foram feitas as somas dos dados dos jogadores para unificá-los em um único registro, será necessário recalcular os valores percentuais.
# Esses valores foram somados nos casos de jogadores que passaram por mais de um time.
# São eles:FG%, 2PA%, 3PA%, eFG% e FT%

df_player['FG%'] = df_player['FG'] / df_player['FGA'] 
df_player['2P%'] = df_player['2P'] / df_player['2PA']
df_player['3P%'] = df_player['3P'] / df_player['3PA']
df_player['FT%'] = df_player['FT'] / df_player['FTA']
df_player['eFG%'] = (df_player['FG'] + (0.5 * df_player['3P'])) / df_player['FGA']

df_player[df_player['Player'] == 'James Harden']

Unnamed: 0,Player,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
258,James Harden,436,64,65,65,2419,407,992,0.41,148,...,0.88,55,445,500,667,82,36,284,153,1432


## Criação de features de estatísticas por jogo

In [10]:
# Rebotes por jogo
df_player['RPG'] = df_player['TRB'] / df_player['G']

# Bloqueios por jogo
df_player['BLKPG'] = df_player['BLK'] / df_player['G']

# Roubos por jogo
df_player['SPG'] = df_player['STL'] / df_player['G']

# Pontos por jogo
df_player['PPG'] = df_player['PTS'] / df_player['G']

# Turn overs por jogo
df_player['TOVPG'] = df_player['TOV'] / df_player['G']

# Faltas pessoais por jogo
df_player['PFPG'] = df_player['PF'] / df_player['G']



## Dataset para ser utilizado nas análises dos times

In [11]:
df_teams = df1.groupby('Tm').sum().reset_index()
df_teams.drop(columns=(['Rk', 'Age', 'G', 'GS', 'MP']), inplace=True)


# Renomar a coluna 'PF' (Personal Fouls) para 'TF' (Total Fouls), já que, ao se fazer a soma das faltas dos jogadores, temos o total de faltas pela equipe.
df_teams.rename(columns={'PF':'TF'}, inplace=True)
df_teams.head()

Unnamed: 0,Tm,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,TF,PTS
0,ATL,3401,7241,10.41,1056,2821,7.17,2345,4420,11.63,...,16.52,823,2783,3606,2017,587,348,924,1534,9343
1,BOS,3341,7167,11.46,1085,3044,7.98,2256,4123,12.64,...,16.61,863,2915,3778,2036,591,478,1070,1521,9164
2,BRK,3442,7251,10.43,940,2602,6.83,2502,4649,11.59,...,15.01,844,2796,3640,2071,582,448,1086,1670,9258
3,CHI,3422,7127,10.51,872,2364,5.59,2550,4763,11.72,...,14.16,711,2760,3471,1958,585,336,994,1540,9152
4,CHO,3508,7497,8.41,1143,3130,4.86,2365,4367,9.3,...,12.55,888,2767,3655,2302,707,402,1042,1629,9457


## Assim como no dataset de jogadores, foram feitas as somas dos dados dos times para unificá-los em um único registro. Também será necessário recalcular os valores percentuais.
## São eles:FG%, 2PA%, 3PA%, eFG% e FT%


In [12]:
df_teams['FG%'] = df_teams['FG'] / df_teams['FGA'] 
df_teams['2P%'] = df_teams['2P'] / df_teams['2PA']
df_teams['3P%'] = df_teams['3P'] / df_teams['3PA']
df_teams['FT%'] = df_teams['FT'] / df_teams['FTA']
df_teams['eFG%'] = (df_teams['FG'] + (0.5 * df_teams['3P'])) / df_teams['FGA']
df_teams.head()


Unnamed: 0,Tm,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,TF,PTS
0,ATL,3401,7241,0.47,1056,2821,0.37,2345,4420,0.53,...,0.81,823,2783,3606,2017,587,348,924,1534,9343
1,BOS,3341,7167,0.47,1085,3044,0.36,2256,4123,0.55,...,0.82,863,2915,3778,2036,591,478,1070,1521,9164
2,BRK,3442,7251,0.47,940,2602,0.36,2502,4649,0.54,...,0.81,844,2796,3640,2071,582,448,1086,1670,9258
3,CHI,3422,7127,0.48,872,2364,0.37,2550,4763,0.54,...,0.81,711,2760,3471,1958,585,336,994,1540,9152
4,CHO,3508,7497,0.47,1143,3130,0.37,2365,4367,0.54,...,0.74,888,2767,3655,2302,707,402,1042,1629,9457


In [13]:
#df_teams['RPG'] = df_teams['TRB'] / df_teams['G']
#df_teams['BLKPG'] = df_teams['BLK'] / df_teams['G']
#df_teams['SPG'] = df_teams['STL'] / df_teams['G']
#df_teams['PPG'] = df_teams['PTS'] / df_teams['G']
#df_teams['TOVPG'] = df_teams['TOV'] / df_teams['G']
#df_teams['PFPG'] = df_teams['PF'] / df_teams['G']


## 1 - Quais jogadores fizeram mais pontos na temporada


In [14]:
top10_pts = df_player.sort_values('PTS', ascending=False).reset_index()[['Player', 'PTS']].head(10)
top10_pts

Unnamed: 0,Player,PTS
0,Trae Young,2155
1,DeMar DeRozan,2118
2,Joel Embiid,2079
3,Jayson Tatum,2046
4,Nikola Jokić,2004
5,Giannis Antetokounmpo,2002
6,Luka Dončić,1847
7,Devin Booker,1822
8,Karl-Anthony Towns,1818
9,Donovan Mitchell,1733


In [15]:
fig = px.bar(top10_pts, x = 'Player', y = 'PTS', text='PTS', title='More points in season 2021-2022')
fig.update_traces(textposition = 'outside', cliponaxis = False)
fig.show()

## 2 - Considerando o top 5% pontuadores da liga, quais são os que possuem maior aproveitamento nos arremessos de quadra

In [16]:
df_player.loc[df_player['PTS'] > df_player['PTS'].quantile(q=0.95), ['Player', 'FG', 'FGA', 'FG%', '2P', '2PA', '3P', '3PA', 'eFG%', 'FT', 'FTA', 'FT%', 'PTS']].sort_values('eFG%', ascending=False).reset_index(drop = True).head(10)

Unnamed: 0,Player,FG,FGA,FG%,2P,2PA,3P,3PA,eFG%,FT,FTA,FT%,PTS
0,Nikola Jokić,764,1311,0.58,667,1023,97,288,0.62,379,468,0.81,2004
1,Karl-Anthony Towns,642,1214,0.53,492,848,150,366,0.59,384,467,0.82,1818
2,LeBron James,640,1221,0.52,479,773,161,448,0.59,254,336,0.76,1695
3,Giannis Antetokounmpo,689,1245,0.55,618,1003,71,242,0.58,553,766,0.72,2002
4,Kevin Durant,578,1115,0.52,463,815,115,300,0.57,372,409,0.91,1643
5,Desmond Bane,508,1101,0.46,280,578,228,523,0.56,140,155,0.9,1384
6,Zach LaVine,565,1186,0.48,380,711,185,475,0.55,320,375,0.85,1635
7,Miles Bridges,596,1214,0.49,442,749,154,465,0.55,267,333,0.8,1613
8,Stephen Curry,535,1224,0.44,250,474,285,750,0.55,275,298,0.92,1630
9,Jordan Poole,474,1058,0.45,263,478,211,580,0.55,246,266,0.92,1405


In [17]:
fg_percent = df_player.loc[df_player['PTS'] > df_player['PTS'].quantile(q=0.95), ['Player', 'FG', 'FGA', 'FG%', '2P', '2PA', '3P', '3PA', 'eFG%', 'FT', 'FTA', 'FT%', 'PTS']].sort_values('FG%', ascending=False).reset_index(drop = True).head(10)

fig = px.bar(fg_percent, x = 'Player', y = 'FG%', text_auto= True)
fig.show()

## 3 - Quais jogadores tem maior aproveitamento nos arremessos de 3 pontos     


In [18]:
df_player[['Player', '3P', '3PA', '3P%']].sort_values('3P', ascending=False).reset_index(drop=True).head()

Unnamed: 0,Player,3P,3PA,3P%
0,Stephen Curry,285,750,0.38
1,Buddy Hield,262,716,0.37
2,Fred VanVleet,242,642,0.38
3,Evan Fournier,241,619,0.39
4,Malik Beasley,240,637,0.38


## 4 - Quais jogadores fizeram mais pontos por posição     


In [19]:
for i in df1['Pos'].unique():
    df_aux = df1[df1['Pos'] == i]
    df_aux.sort_values('PTS', ascending=False, inplace=True)
    print(f' Most points in position {i}: ')
    print(df_aux.loc[df_aux['PTS'] == df_aux['PTS'].max(), ['Player', 'PTS'] ])


 Most points in position C: 
          Player   PTS
206  Joel Embiid  2079
 Most points in position PF: 
            Player   PTS
167  DeMar DeRozan  2118
 Most points in position SG: 
          Player   PTS
70  Devin Booker  1822
 Most points in position PG: 
         Player   PTS
808  Trae Young  2155
 Most points in position SF: 
           Player   PTS
708  Jayson Tatum  2046




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

In [20]:
point_guard = df1[(df1['Pos'] == 'PG')]
point_guard[point_guard['PTS'] == point_guard['PTS'].max()][['Player', 'Pos', 'PTS']]

Unnamed: 0,Player,Pos,PTS
808,Trae Young,PG,2155


In [21]:
shooting_guard = df1[(df1['Pos'] == 'SG')]
shooting_guard[shooting_guard['PTS'] == shooting_guard['PTS'].max()][['Player', 'Pos', 'PTS']]

Unnamed: 0,Player,Pos,PTS
70,Devin Booker,SG,1822


In [22]:
small_foward = df1[(df1['Pos'] == 'SF')]
small_foward[small_foward['PTS'] == small_foward['PTS'].max()][['Player', 'Pos', 'PTS']]

Unnamed: 0,Player,Pos,PTS
708,Jayson Tatum,SF,2046


In [23]:
power_foward = df1[(df1['Pos'] == 'PF')]
power_foward[power_foward['PTS'] == power_foward['PTS'].max()][['Player', 'Pos', 'PTS']]

Unnamed: 0,Player,Pos,PTS
167,DeMar DeRozan,PF,2118


In [24]:
center = df1[(df1['Pos'] == 'C')]
center[center['PTS'] == center['PTS'].max()][['Player', 'Pos', 'PTS']]

Unnamed: 0,Player,Pos,PTS
206,Joel Embiid,C,2079


## 5 - Qual jogador tem mais tempo em quadra       


In [25]:
df_player.loc[df_player['MP'] == df_player['MP'].max(), ['Player', 'MP']]

Unnamed: 0,Player,MP
419,Mikal Bridges,2854


## 6 - Quais as maiores médias de pontos por jogo          


In [26]:
df_player[['Player', 'PPG']].sort_values('PPG', ascending=False).reset_index(drop=True).head()

Unnamed: 0,Player,PPG
0,Joel Embiid,30.57
1,LeBron James,30.27
2,Giannis Antetokounmpo,29.88
3,Kevin Durant,29.87
4,Luka Dončić,28.42


## 7 - Quais as maiores médias por jogo de atributos defensivos ( TRB, STL, BLK)       


In [27]:
df_defense = df_player[(df_player['RPG'] == df_player['RPG'].max()) | \
                        (df_player['BLKPG'] == df_player['BLKPG'].max()) | \
                        (df_player['SPG'] == df_player['SPG'].max())][['Player', 'RPG', 'BLKPG', 'SPG']]
                        


df_defense

print(f' Best stealer = {df_defense.iloc[0,0]}, with {np.round(df_defense.iloc[0,3], 2)} steals per game')
print(f' Best blocker = {df_defense.iloc[1,0]}, with {np.round(df_defense.iloc[1,2], 2)} blocks per game')
print(f' Best rebounder = {df_defense.iloc[2,0]}, with {np.round(df_defense.iloc[2,1], 2)} rebounds per game')

 Best stealer = Derrick Walton, with 2.33 steals per game
 Best blocker = Myles Turner, with 2.81 blocks per game
 Best rebounder = Rudy Gobert, with 14.67 rebounds per game


## 8 - Quais as maiores médias de turn overs por jogo          


In [28]:
df_player[['Player', 'TOVPG']].sort_values('TOVPG', ascending=False).reset_index()

Unnamed: 0,index,Player,TOVPG
0,383,Luka Dončić,4.49
1,258,James Harden,4.37
2,469,Paul George,4.06
3,553,Trae Young,3.99
4,449,Nikola Jokić,3.80
...,...,...,...
600,131,DeJon Jarreau,0.00
601,121,David Johnson,0.00
602,557,Tre Scott,0.00
603,112,Daniel Oturu,0.00


As maiores médias de turnovers pertencem a jogadores mais habilidosos. Uma explicação possível é que jogadores desse nível tendem a ficar mais tempo com a posse da bola e mais jogadas passam por eles, aumentando as chances de um erro.

## 9 - Quais jogadores cometem mais faltas     


In [29]:
df_player[['Player',  'PF']].sort_values('PF', ascending=False).head()


Unnamed: 0,Player,PF
245,Jae'Sean Tate,286
263,Jaren Jackson Jr.,272
331,Karl-Anthony Towns,267
299,Jonas Valančiūnas,247
220,Herbert Jones,238


In [30]:
df_player[['Player',  'PFPG']].sort_values('PFPG', ascending=False).head()

Unnamed: 0,Player,PFPG
601,Zavier Simpson,3.75
245,Jae'Sean Tate,3.67
331,Karl-Anthony Towns,3.61
263,Jaren Jackson Jr.,3.49
327,Jusuf Nurkić,3.38


## 10 - Quais os melhores arremessadores de bolas de 3 pontos até 20 anos de idade

In [31]:
df_player[(df_player['Age'] < 21) & (df_player['PTS'] > 500) ][['Player', '3P', '3PA', '3P%']].sort_values('3P%', ascending=False)

Unnamed: 0,Player,3P,3PA,3P%
367,LaMelo Ball,220,565,0.39
556,Tre Mann,98,272,0.36
24,Anthony Edwards,215,602,0.36
188,Franz Wagner,96,271,0.35
251,Jalen Green,157,458,0.34
300,Jonathan Kuminga,50,149,0.34
230,Isaiah Stewart,15,46,0.33
62,Cade Cunningham,114,363,0.31
603,Ziaire Williams,76,242,0.31
510,Scottie Barnes,58,193,0.3


## 11 - Quais times tem melhor aproveitamento em arremessos de quadra


In [32]:
df_teams[['Tm', 'FG', 'FG%', 'eFG%']].sort_values('eFG%', ascending=False).reset_index(drop=True)

Unnamed: 0,Tm,FG,FG%,eFG%
0,DEN,3416,0.48,0.56
1,UTA,3327,0.47,0.56
2,GSW,3323,0.47,0.55
3,PHO,3581,0.48,0.55
4,MIA,3246,0.47,0.55
5,MIL,3429,0.47,0.55
6,CHO,3508,0.47,0.54
7,ATL,3401,0.47,0.54
8,BOS,3341,0.47,0.54
9,CHI,3422,0.48,0.54


## 12 - Quais times tem o pior aproveitamento de lance livre


In [33]:
df_teams[['Tm', 'FT', 'FTA', 'FT%']].sort_values('FT%').reset_index(drop=True)

Unnamed: 0,Tm,FT,FTA,FT%
0,HOU,1434,2010,0.71
1,LAL,1380,1884,0.73
2,MEM,1393,1898,0.73
3,CHO,1298,1753,0.74
4,NYK,1473,1980,0.74
5,SAS,1262,1673,0.75
6,OKC,1233,1632,0.76
7,TOR,1327,1749,0.76
8,POR,1346,1771,0.76
9,CLE,1376,1810,0.76
