In [1]:
import pandas as pd
from sql_functions import create_db_connection, execute_query, read_query
import plotly.express as px 
import plotly.graph_objects as go

#connect to database
pw = "*****"
connection = create_db_connection("localhost", "root", pw, "usports")

MySQL Database connection successful


### Individual 3 point attempts
Find the number of players each season who played in at least 1 game and played at least 5 minutes per game

Find the number of players each season who took >= X 3 point attempts per game (and who played in at least 1 game and played at least 5 minutes per game). Normalize this value by the number of players who played in at least 1 game and played at least 5 minutes per game so that results aren't skewed by increasing number of playeres in league.

Visualize as multiple line plots on a single axes


In [2]:
#count number of players who took greater than X threes each season
for num_threes in range(0,9):
    #count the number of players in each season who meet criteria above and also took at least num_threes 3pt shots per game
    if num_threes == 0:
        q = """
        SELECT season, count(name)
        FROM players
        NATURAL JOIN player_shooting
        NATURAL JOIN player_info
        WHERE games_played > 0 AND minutes_pgame >= 5 AND fga3 >= 0 AND fga3 < 1
        GROUP BY season
        ORDER BY season;
        """
        result_threes = read_query(connection, q, None)
    else:
        q = """
            SELECT season, count(name)
            FROM players
            NATURAL JOIN player_shooting
            NATURAL JOIN player_info
            WHERE games_played > 0 AND minutes_pgame >= 5 AND fga3 >= %s
            GROUP BY season
            ORDER BY season;
            """
        result_threes = read_query(connection, q, [num_threes])
        
    new_df = pd.DataFrame(result_threes, columns=['Season', num_threes])
    if num_threes == 0:
       df = new_df
    if num_threes > 0:
       df = df.join(new_df.set_index('Season'), on='Season', how='left')
    
#count number of players who played in at least 1 game and averaged at least 5 minutes per game
q2 = """
    SELECT season, count(name)
    FROM players
    NATURAL JOIN player_info
    WHERE games_played > 0 AND minutes_pgame >= 5
    GROUP BY season
    ORDER BY season;
    """

result_players = read_query(connection, q2, None)
df_numPlayers = pd.DataFrame(result_players, columns=['Season', 'Count'])

three_count_norm = df[[0,1,2,3,4,5,6,7,8]].transpose()/df_numPlayers['Count'].values
three_count_norm.columns = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2022, 2023]

In [3]:

plot = go.Figure(data=[
    go.Line(
    name = '0 <= X < 1',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[0]
    ),
    go.Line(
    name = '1 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[1]
    ),
    go.Line(
    name = '2 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[2]
    ),
    go.Line(
    name = '3 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[3]
    ),
    go.Line(
    name = '4 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[4]
    ),
    go.Line(
    name = '5 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[5]
    ),
    go.Line(
    name = '6 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[6]
    ),
    go.Line(
    name = '7 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[7]
    ),
    go.Line(
    name = '8 <= X',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[8]
    )],
    layout=go.Layout(
        title="Rate of players with at least X 3FGA per game",
        xaxis_title="Season",
        template="plotly_dark",
        width=800,
        height=600             
                 )
)

plot.update_layout(title_x=0.5)

plot.show()

#save Figure as jpg image
plot.write_image("images/ind_3fga_by_season.jpeg")

Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.



Findings from above:
- number of players taking less than 1 3pt attempt per game as decreased steadily since 2014
- number of players taking >= 1, 2, or 3 3 pt attempts per game has increased since 2014/2015 but plateaued/dipped the last two seasons
- number of players taking >= 4 or >=5 3's per game increased between 2015 and 2019, then plateaued/decreased before increasing again in 2023 season. Likely plateaued because more players are taking threes so they are spread out more between players, then in 2023 les players taking at least 1 or 2 3's per game and more are taking at least 4 or 5 per game

#### Do same thing but with exclusive ranges


In [5]:

#count number of players who took greater than X threes each season
for num_threes in range(0,10):
    #count the number of players in each season who meet criteria above and also took at least num_threes 3pt shots per game
    if num_threes == 0:
        q = """
        SELECT season, count(name)
        FROM players
        NATURAL JOIN player_shooting
        NATURAL JOIN player_info
        WHERE games_played > 0 AND minutes_pgame >= 5 AND fga3 = 0
        GROUP BY season
        ORDER BY season;
        """
        result_threes = read_query(connection, q, None)
    
    else:
        q = """
            SELECT season, count(name)
            FROM players
            NATURAL JOIN player_shooting
            NATURAL JOIN player_info
            WHERE games_played > 0 AND minutes_pgame >= 5 AND fga3 > %s AND fga3 <= %s
            GROUP BY season
            ORDER BY season;
            """
        
        result_threes = read_query(connection, q, [num_threes, num_threes+1])
        
    new_df = pd.DataFrame(result_threes, columns=['Season', num_threes])
    if num_threes == 0:
       df = new_df
    if num_threes > 0:
       df = df.join(new_df.set_index('Season'), on='Season', how='left')
       
    
#count number of players who played in at least 1 game and averaged at least 5 minutes per game
q2 = """
    SELECT season, count(name)
    FROM players
    NATURAL JOIN player_info
    WHERE games_played > 0 AND minutes_pgame >= 5
    GROUP BY season
    ORDER BY season;
    """

result_players = read_query(connection, q2, None)
df_numPlayers = pd.DataFrame(result_players, columns=['Season', 'Count'])

three_count_norm = df[[0,1,2,3,4,5,6,7,8,9]].transpose()/df_numPlayers['Count'].values
three_count_norm.columns = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2022, 2023]

plot = go.Figure(data=[
    go.Line(
    name = 'X = 0',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[0]
    ),
    go.Line(
    name = '0 < X <= 1',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[1]
    ),
    go.Line(
    name = '1 < X <= 2',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[2]
    ),
    go.Line(
    name = '2 < X <= 3',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[3]
    ),
    go.Line(
    name = '3 < X <= 4',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[4]
    ),
    go.Line(
    name = '4 < X <= 5',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[5]
    ),
    go.Line(
    name = '5 < X <= 6',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[6]
    ),
    go.Line(
    name = '6 < X <= 7',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[7]
    ),
    go.Line(
    name = '7 < X <= 8',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[8]
    ),
    go.Line(
    name = '8 < X <= 9',
    x = three_count_norm.columns.values,
    y = three_count_norm.iloc[9]
    )],
    layout=go.Layout(
        title="Rate of players with at between X and Y 3FGA per game",
        xaxis_title="Season",
        template="plotly_dark",
        width=800,
        height=600             
                 ))
plot.show()
#save Figure as jpg image
plot.write_image("images/ind_3fga_by_season_excRanges.jpeg")



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




Findings from above:
- number of players taking zero 3's per game has decreased almost every year since 2014 (rate of ~0.07 compared to 0.2 in 2014) and was the lowest it's ever been (for what we have data for)
- number of players taking between 0 and 1 has varied but increased in each of the last two seasons
- number of players taking between 1 and 2 has been higher in 6 of the last 7 seasons compared to from 2010-2015
- number of players taking between 3 and 4 has increased in each of the last 3 seasons and in 2023 was the highest its ever been (for what we have data for) - a rate of ~0.075 compared to ~0.033
- number of players taking between 1 and 3 threes decreased a lot in 2023. This correlates with an increase in players shooting between 4 and 5 per game and increase in players shooting between 0 and 1 per game 
- number of players shooting between 8 and 9 per game doesn't seem to have changed much, but the rate of players in 2023 is approx 0.019 compared to 0.004 in 2020 (almost 5x as common)

### Individual 3 point field goal percentage
- Find the number of players who shot 3 point attempts with an accuracy between X% and Y% who played in at least 1 game and averaged at least 5 minutes per game
- Normalize by the number of players who played in at least 1 game and averaged at least 5 minutes per game


In [6]:
pcts = [0, 15, 20, 25, 30, 35, 40, 45, 50]

for idx in range(0,len(pcts)-1):
    lower_bound = pcts[idx]
    upper_bound = pcts[idx+1]
    
    q = """
        SELECT season, count(player_id)
        FROM player_shooting
        NATURAL JOIN player_info
        WHERE fg3_percent >= %s AND fg3_percent < %s AND games_played > 0 AND minutes_pgame >= 5
        GROUP BY season
        ORDER BY season
        """
    result = read_query(connection, q,[lower_bound, upper_bound])
    
    new_df = pd.DataFrame(result, columns=['Season', lower_bound])
    if lower_bound == 0:
       df = new_df
    if lower_bound != 0:
       df = df.join(new_df.set_index('Season'), on='Season', how='left')
     
     
#count number of players who played in at least 1 game and averaged at least 5 minutes per game
q2 = """
    SELECT season, count(player_id)
    FROM player_info
    WHERE games_played > 0 AND minutes_pgame >= 5
    GROUP BY season
    ORDER BY season;
    """

result_players = read_query(connection, q2, None)
df_numPlayers = pd.DataFrame(result_players, columns=['Season', 'Count'])

three_pct_norm = df[[0, 15, 20, 25, 30, 35, 40, 45]].transpose()/df_numPlayers['Count'].values
three_pct_norm.columns = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2022, 2023]

In [7]:
plot = go.Figure(data=[
    go.Line(
    name = '0-15%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[0]
    ),
    go.Line(
    name = '15-20%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[1]
    ),
    go.Line(
    name = '20-25%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[2]
    ),
    go.Line(
    name = '25-30%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[3]
    ),
    go.Line(
    name = '30-35%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[4]
    ),
    go.Line(
    name = '35-40%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[5]
    ),
    go.Line(
    name = '40-45%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[6]
    ),
    go.Line(
    name = '45-50%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[7]
    )],
    layout=go.Layout(
        title="Rate of Players Shooting 3FGA Between X% and Y%",
        xaxis_title="Season",
        template="plotly_dark",
        width=800,
        height=600            
                 )
    )
plot.update_layout(title_x=0.5)

plot.show()

#save Figure as jpg image
plot.write_image("images/ind_3fg_pct_by_season.jpeg")


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




#### Key Findings from above:
- number of players shooting between 0% and 15 % has decreased 2014 and in 2023 was loweset it's ever been (for what we have data for) - rate of ~0.27 compared to 0.4 in 2014
- number of players shooting between 25% and 30% has slightly increased since 2015
- more players are shooting more 3's but overall, across the league the 3pt% hasn't changed much/no consistent trends here - the main takeaway is that there are much fewer "very poor" 3 point shooters, even though there may not be significantly more "very good" 3 point shooters, more players are shooting in a range where teams may respect their shot more

#### Now same thing but only looking at players who take at least 2 3FGA per game

In [8]:
pcts = [0, 15, 20, 25, 30, 35, 40, 45, 50]

for idx in range(0,len(pcts)-1):
    lower_bound = pcts[idx]
    upper_bound = pcts[idx+1]
    
    q = """
        SELECT season, count(player_id)
        FROM player_shooting
        NATURAL JOIN player_info
        WHERE fg3_percent >= %s AND fg3_percent < %s AND fga3 >= 2 AND games_played > 0 AND minutes_pgame >= 5
        GROUP BY season
        ORDER BY season
        """
    result = read_query(connection, q,[lower_bound, upper_bound])
    
    new_df = pd.DataFrame(result, columns=['Season', lower_bound])

    if lower_bound == 0:
       df = new_df
    if lower_bound != 0:
       df = df.join(new_df.set_index('Season'), on='Season', how='outer')
     
     
#count number of players who took at least 2 3 point attempts per game and played in at least 1 game and averaged at least 5 minutes per game
q2 = """
    SELECT season, count(player_id)
    FROM player_info
    NATURAL JOIN player_shooting
    WHERE fga3 >= 2 AND games_played > 0 AND minutes_pgame >= 5
    GROUP BY season
    ORDER BY season;
    """

result_players = read_query(connection, q2, None)
df_numPlayers = pd.DataFrame(result_players, columns=['Season', 'Count'])



In [9]:
three_pct_norm = df[[0, 15, 20, 25, 30, 35, 40, 45]].transpose()/df_numPlayers['Count'].values
three_pct_norm.columns = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2022, 2023]

plot = go.Figure(data=[
    go.Line(
    name = '0-15%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[0]
    ),
    go.Line(
    name = '15-20%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[1]
    ),
    go.Line(
    name = '20-25%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[2]
    ),
    go.Line(
    name = '25-30%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[3]
    ),
    go.Line(
    name = '30-35%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[4]
    ),
    go.Line(
    name = '35-40%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[5]
    ),
    go.Line(
    name = '40-45%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[6]
    ),
    go.Line(
    name = '45-50%',
    x = three_pct_norm.columns.values,
    y = three_pct_norm.iloc[7]
    )],
    layout=go.Layout(
        title="Rate of Players Who Shot At least 2 3FGA Between X% and Y%",
        xaxis_title="Season",
        template="plotly_dark",
        width=800,
        height=600            
                 )
    )
plot.update_layout(title_x=0.5)

plot.show()

#save Figure as jpg image
plot.write_image("images/ind_3fg_pct_by_season_>=2attempts.jpeg")


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




Takeaways from above:
- the rate of "excellent" shooters (40-45%) has decreased over the years
- the rate of "decent" shooters (25-30%) has increased
- even though more threes are being taken, the percentage at which they're made doesn't show consistent trends

### Top individual 3 point %'s Each season
- select the top 25 3 point field goal percentages for each season
- create scatter plot with:
    - x-axis as the season
    - y-axis as 3FG%
    - size of data points represents # of 3FG attempts, bigger = more attempts per game (for stronger visualization, squared it)
    - color of data points represents the rank (1-25) of the 3FG% for that year

In [10]:
q = """
    SELECT season, fga3, fg3_percent, season_rank
    FROM (  SELECT player_shooting.*,
                row_number() OVER (PARTITION BY season ORDER BY fg3_percent DESC) AS season_rank
            FROM player_shooting 
            NATURAL JOIN player_info
            WHERE games_played > 10 AND fgm3 >= 1
          ) player_shooting
    WHERE season_rank >= 1 AND season_rank <= 25
"""


result = read_query(connection, q, None)
df = pd.DataFrame(result, columns=['Season', '3fga', '3fg%', 'season rank'])


fig = px.scatter(df, 'Season', '3fg%', size=df['3fga']**2, color='season rank')
fig.update_layout(template="plotly_dark",
                  width=900,
                  height=600,
                  title='Top 25 3 point shooting percentages by season',
                  xaxis_title='Season',
                  yaxis_title='3FG%',
                  title_x=0.5)
fig.show()

#save Figure as jpg image
fig.write_image("images/ind_top25_3fg_pct_by_season.jpeg")


Key take aways from above:
- 2010 - 2012 contain the highest percentages, but mainly achieved on fewer attempts per game (smaller dot sizes)
- 2014 - poor shooting season, top 10 vary between ~47% and ~38%, only season to have those shooting below 35% make the top 25
- comparing 2022 and 2023 to 2012, more of the top shooters are taking more attempts
- number of players shooting > 40% from three decreased between 2014 and 2020 (with the exception of 2016) but increased in 2022


In [11]:
fig = px.violin(df, x="Season", y="3fg%", box=True, points="all",
          hover_data=df.columns)
fig.update_layout(template="plotly_dark",
                  #width=900,
                  #height=600,
                  title='Top 25 3 point shooting percentages by season',
                  xaxis_title='Season',
                  yaxis_title='3FG%',
                  title_x=0.5)
fig.show()

### Individual Scoring Averages
- look at what percentage of players have averaged between X and Y points in a season (having played >10 games)
- each player/season record/instance is a separate instance 

In [12]:
pts = [0, 5, 10, 15, 20, 25, 30]
df = pd.DataFrame()

for idx in range(0,len(pts)-1):
    lower_bound = pts[idx]
    upper_bound = pts[idx+1]
    
    q = """
        SELECT count(player_id)
        FROM player_shooting
        NATURAL JOIN player_info
        WHERE ppg >= %s AND ppg < %s AND games_played > 10 
        """
    result = read_query(connection, q,[lower_bound, upper_bound])
    
    df[lower_bound] = result[0]
     
#count total number of players since 2010 who played in at least 10 games
q2 = """
    SELECT count(player_id)
    FROM player_info
    WHERE games_played > 10
    ;
    """

num_players_total = read_query(connection, q2, None)

labels = ['0-4.9','5-9.9','10-14.9','15-19.9', '20-24.9', '25-29.9']
plot = px.pie(df, values=df.iloc[0], names=labels, title="Individual PPG Averages Since 2010")
plot.update_traces(textinfo='percent+value')
plot.update_layout(width=500, template="plotly_dark", title_x=0.5, legend=dict(title="PPG"))
plot.show()

#save Figure as jpg image
plot.write_image("images/ppg_ave_piechart.jpeg")

print(num_players_total)

[(6166,)]


In [None]:
num_players_total

