# NBA Data Analysis and Visualization

This notebook is dedicated to gathering and visualizing NBA player data. We will leverage various techniques to scrape, process, and analyze data, culminating in informative visualizations that help understand shooting performance and trends in the NBA.

# Table of Contents

1. [Data Gathering](#data-gathering)
    - [NBA Scraper from Ubiratan Filho](#nba-scraper-from-ubiratan-filho)
    - [Collect Player Data](#collect-player-data)
2. [Interactive Shot Chart](#interactive-shot-chart)
3. [Miscellaneous Data Visualization](#miscellaneous-data-visualization)
    - [Shot Distribution Heat Map](#shot-distribution-heat-map)
    - [Shooting Percentage by Shot Zone](#shooting-percentage-by-shot-zone)
    - [Shooting Percentage Over Time](#shooting-percentage-over-time)
    - [Shot Attempts and Percentages by Period](#shot-attempts-and-percentages-by-period)
    - [Scatter Plot of Shot Distance vs. Shooting Percentage](#scatter-plot-of-shot-distance-vs-shooting-percentage)
    - [Shot Outcome by Action Type](#shot-outcome-by-action-type)
    - [Shot Selection Diversity by Shot Type](#shot-selection-diversity-by-shot-type)
4. [Correlation Matrix](#correlation-matrix)

In [None]:
!pip install nba_api plotly ipywidgets

In [None]:
import os
import shutil
import sqlite3
from typing import Optional

import ipywidgets as widgets
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from IPython.display import display
from nba_api.stats.endpoints import playercareerstats, shotchartdetail
from nba_api.stats.static import players, teams

## Data Gathering

### NBA Scraper from Ubiratan Filho

We use the `NbaScraper` class, created by Ubiratan Filho, to collect player data. It wraps the `nba_api` package so that player lookups, career statistics, shot-chart data, and headshot downloads can each be done with a single call.

In [None]:
class NbaScraper:
    """Static helpers that fetch NBA data through ``nba_api`` and download headshots.

    Every method is a ``@staticmethod``; the class is only a namespace and holds
    no state.
    """

    @staticmethod
    def get_json_from_name(name: str, is_player=True) -> dict:
        """Return the static record whose ``full_name`` equals ``name``.

        Args:
            name: Full name to match exactly (the comparison is case-sensitive).
            is_player: When true, search ``players.get_players()``; otherwise
                search ``teams.get_teams()``.

        Returns:
            The first matching record (a dict that includes at least ``id`` and
            ``full_name``).

        Raises:
            IndexError: If no record has that ``full_name``. The exception type
                is the one the original list-indexing lookup raised, so existing
                callers keep working; only the message is new.
        """
        records = players.get_players() if is_player else teams.get_teams()
        for record in records:
            if record['full_name'] == name:
                return record

        kind = 'player' if is_player else 'team'
        raise IndexError(f"No {kind} with full_name {name!r} was found")

    @staticmethod
    def get_player_career(player_id: int) -> pd.DataFrame:
        """Return the first data frame of ``PlayerCareerStats`` for ``player_id``.

        Args:
            player_id: NBA player id, e.g. the ``id`` field of a record returned
                by :meth:`get_json_from_name`.

        Returns:
            The first frame from ``get_data_frames()`` (presumably the per-season
            totals -- confirm against the ``nba_api`` endpoint documentation).
        """
        career = playercareerstats.PlayerCareerStats(player_id=player_id)
        return career.get_data_frames()[0]

    @staticmethod
    def get_shot_data(id: int, team_ids: list, seasons: list) -> pd.DataFrame:
        """Download a player's field-goal attempts for every season/team pair.

        One ``ShotChartDetail`` request is issued per ``(season, team)``
        combination and the first frame of each response is kept.

        Args:
            id: NBA player id.
            team_ids: Team ids to query.
            seasons: Season strings passed through as ``season_nullable``
                (the notebook uses the form ``'2020-21'``).

        Returns:
            All downloaded rows in a single frame with a fresh ``0..n-1`` index,
            or an empty frame when there is nothing to query.
        """
        # Collect the frames first and concatenate once: this avoids re-copying
        # the accumulated frame on every iteration and avoids concatenating onto
        # an empty placeholder frame.
        frames = []
        for season in seasons:
            for team in team_ids:
                shot_data = shotchartdetail.ShotChartDetail(
                    team_id=team,
                    player_id=id,
                    context_measure_simple='FGA',
                    season_nullable=season
                )
                frames.append(shot_data.get_data_frames()[0])

        if not frames:
            return pd.DataFrame()

        # ignore_index gives every shot a unique row label; without it each
        # response's 0..k index would repeat in the combined frame.
        return pd.concat(frames, ignore_index=True)

    @staticmethod
    def get_all_ids(only_active=True) -> list:
        """Return the ids of all players known to ``nba_api``.

        Args:
            only_active: When true, keep only records whose ``is_active`` flag
                is truthy.
        """
        nba_players = players.get_players()
        if only_active:
            return [player['id'] for player in nba_players if player['is_active']]
        return [player['id'] for player in nba_players]

    @staticmethod
    def get_player_headshot(id: int) -> Optional[str]:
        """Download one player's headshot to ``headshots/<id>.png``.

        Args:
            id: NBA player id used to build the image URL.

        Returns:
            The relative path of the saved file, or ``None`` when the server did
            not answer with HTTP 200 (nothing is written in that case).
        """
        url = f'https://ak-static.cms.nba.com/wp-content/uploads/headshots/nba/latest/260x190/{id}.png'
        output_path = f'headshots/{id}.png'

        # The timeout keeps one stalled download from hanging a bulk run; the
        # `with` block releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code != 200:
                return None

            # Create the target directory on demand; the original failed with
            # FileNotFoundError unless `headshots/` already existed.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with open(output_path, 'wb') as f:
                # Have urllib3 undo any transfer encoding (e.g. gzip) while streaming.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)

        return output_path

    @staticmethod
    def get_all_nba_headshots(only_active=False) -> None:
        """Download the headshot of every player (or only the active ones).

        Args:
            only_active: Forwarded to :meth:`get_all_ids`. Note the default here
                is ``False``, i.e. all players, which means one HTTP request
                per known player.
        """
        ids = NbaScraper.get_all_ids(only_active=only_active)
        for id in ids:
            NbaScraper.get_player_headshot(id)

### Collect Player Data

In [None]:
# Look up Jayson Tatum's record and keep his player ID
tatum = NbaScraper.get_json_from_name("Jayson Tatum")
tatum_id = tatum['id']

# Seasons to download, plus the team ID(s) of the Boston Celtics.
# `teams` comes from the imports cell at the top of the notebook, so it is not
# re-imported here.
seasons = ['2020-21', '2021-22', '2022-23']
team_ids = [team['id'] for team in teams.get_teams() if team['full_name'] == 'Boston Celtics']

# Download Jayson Tatum's shot data (one API request per season/team pair)
shot_data = NbaScraper.get_shot_data(tatum_id, team_ids, seasons)

# Rich display renders the preview as a table instead of wrapped plain text
display(shot_data.head())

## Interactive Shot Chart

In [None]:
# Shared style: let long descriptions such as 'Shot Made/Missed:' render in full
# instead of being truncated to the default label width.
WIDGET_STYLE = {'description_width': 'initial'}


def _column_dropdown(column, description):
    """Build a dropdown offering 'All' plus every distinct value of ``shot_data[column]``."""
    return widgets.Dropdown(
        options=['All'] + shot_data[column].unique().tolist(),
        value='All',
        description=description,
        style=WIDGET_STYLE,
    )


def _slider_upper_bound(column, floor):
    """Return the largest value in ``shot_data[column]``, but never less than ``floor``.

    ``floor`` is also used when the column has no values to take a maximum of.
    """
    observed = shot_data[column].max()
    return float(floor) if pd.isna(observed) else float(max(floor, observed))


# Dropdowns for the categorical filters
action_type_dropdown = _column_dropdown('ACTION_TYPE', 'Action Type:')
shot_type_dropdown = _column_dropdown('SHOT_TYPE', 'Shot Type:')
shot_zone_basic_dropdown = _column_dropdown('SHOT_ZONE_BASIC', 'Shot Zone Basic:')
shot_zone_area_dropdown = _column_dropdown('SHOT_ZONE_AREA', 'Shot Zone Area:')

shot_made_dropdown = widgets.Dropdown(
    options=['All', 'Made', 'Missed'],
    value='All',
    description='Shot Made/Missed:',
    style=WIDGET_STYLE,
)

# Periods: always offer the four quarters, plus any further periods (overtime)
# that actually occur in the data so those shots can be selected too.
period_options = sorted({1, 2, 3, 4} | set(shot_data['PERIOD'].dropna().unique().tolist()))
period_dropdown = widgets.Dropdown(
    options=['All'] + period_options,
    value='All',
    description='Period:',
    style=WIDGET_STYLE,
)

# Minutes remaining. NOTE(review): MINUTES_REMAINING looks like the minutes left
# in the current period (not in the whole game), hence the 12-minute floor --
# confirm. The bound is widened to the data's maximum either way, so no shot is
# excluded by the default range.
max_minutes_remaining = _slider_upper_bound('MINUTES_REMAINING', 12)
minutes_remaining_slider = widgets.FloatRangeSlider(
    value=[0, max_minutes_remaining],
    min=0,
    max=max_minutes_remaining,
    step=1,
    description='Minutes Remaining:',
    continuous_update=False,
    style=WIDGET_STYLE,
)

# Shot distance. A fixed 0-30 range silently hid every attempt longer than 30,
# with no way to widen the slider; extend the range to the longest shot present.
max_shot_distance = _slider_upper_bound('SHOT_DISTANCE', 30)
shot_distance_slider = widgets.FloatRangeSlider(
    value=[0, max_shot_distance],
    min=0,
    max=max_shot_distance,
    step=1,
    description='Shot Distance:',
    continuous_update=False,
    style=WIDGET_STYLE,
)

In [None]:
# Redraw the shot chart for the current widget selections
def update_shot_chart(shot_made, period, minutes_remaining, shot_distance, action_type, shot_type, shot_zone_basic, shot_zone_area):
    """Filter ``shot_data`` by the given selections and show the resulting shot chart.

    Args:
        shot_made: 'Made' or 'Missed' to keep only those shots; any other value
            (normally 'All') keeps both.
        period: A PERIOD value to keep, or 'All'.
        minutes_remaining: ``(low, high)`` inclusive bounds on MINUTES_REMAINING.
        shot_distance: ``(low, high)`` inclusive bounds on SHOT_DISTANCE.
        action_type: An ACTION_TYPE value to keep, or 'All'.
        shot_type: A SHOT_TYPE value to keep, or 'All'.
        shot_zone_basic: A SHOT_ZONE_BASIC value to keep, or 'All'.
        shot_zone_area: A SHOT_ZONE_AREA value to keep, or 'All'.

    The global ``shot_data`` frame is copied, never modified. Nothing is
    returned; the figure is displayed via ``fig.show()``.
    """
    shots = shot_data.copy()

    # Derive a readable outcome label (used for filtering and colouring) when absent
    if 'SHOT_OUTCOME' not in shots.columns:
        shots['SHOT_OUTCOME'] = shots['SHOT_MADE_FLAG'].map({1: 'Made', 0: 'Missed'})

    # Outcome filter: only the two concrete outcomes narrow the data
    if shot_made in ('Made', 'Missed'):
        shots = shots[shots['SHOT_OUTCOME'] == shot_made]

    # Period filter
    if period != 'All':
        shots = shots[shots['PERIOD'] == period]

    # Range filters; both bounds are inclusive
    fewest_minutes, most_minutes = minutes_remaining
    shots = shots[shots['MINUTES_REMAINING'].between(fewest_minutes, most_minutes)]

    nearest, farthest = shot_distance
    shots = shots[shots['SHOT_DISTANCE'].between(nearest, farthest)]

    # Categorical filters: 'All' means "do not restrict this column"
    categorical_choices = (
        ('ACTION_TYPE', action_type),
        ('SHOT_TYPE', shot_type),
        ('SHOT_ZONE_BASIC', shot_zone_basic),
        ('SHOT_ZONE_AREA', shot_zone_area),
    )
    for column, choice in categorical_choices:
        if choice != 'All':
            shots = shots[shots[column] == choice]

    # Field-goal percentage of what is left (SHOT_MADE_FLAG is summed as made shots)
    attempts = len(shots)
    made = shots['SHOT_MADE_FLAG'].sum()
    if attempts > 0:
        shot_percentage = made / attempts * 100
    else:
        shot_percentage = 0

    if shots.empty:
        chart_title = 'Shot Chart (No Data)'
    else:
        chart_title = f"{shots['PLAYER_NAME'].iloc[0]} Shot Chart (Percentage: {shot_percentage:.2f}%)"

    # One marker per shot, coloured by outcome
    fig = px.scatter(
        shots,
        x='LOC_X',
        y='LOC_Y',
        color='SHOT_OUTCOME',
        title=chart_title,
        color_discrete_map={'Made': 'green', 'Missed': 'red'}
    )

    # Court picture drawn underneath the markers, anchored bottom-centre at (0, 0)
    court_image = dict(
        source='https://upload.wikimedia.org/wikipedia/commons/e/ea/Basketball_court_dimensions.svg',
        xref='x',
        yref='y',
        x=0,
        y=0,
        sizex=500,
        sizey=470,
        xanchor='center',
        yanchor='bottom',
        opacity=1,
        layer='below'
    )
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)

    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',  # transparent, so the court image shows through
        xaxis=dict(range=[-250, 250], **hidden_axis),
        yaxis=dict(range=[-50, 470], **hidden_axis),
        images=[court_image]
    )

    fig.show()

# Bind each update_shot_chart argument to the widget that supplies its value;
# changing any widget re-runs the function with the current selections.
out = widgets.interactive(
    update_shot_chart,
    **{
        'shot_made': shot_made_dropdown,
        'period': period_dropdown,
        'minutes_remaining': minutes_remaining_slider,
        'shot_distance': shot_distance_slider,
        'action_type': action_type_dropdown,
        'shot_type': shot_type_dropdown,
        'shot_zone_basic': shot_zone_basic_dropdown,
        'shot_zone_area': shot_zone_area_dropdown,
    }
)

# Render the controls together with the chart they drive
display(out)

## Miscellaneous Data Visualization

### Shot Distribution Heat Map

In [None]:
# Bin shot locations into a 50 x 50 grid, weighting each bin by SHOT_MADE_FLAG.
# NOTE(review): with Plotly's default aggregation for `z` this should total the
# made shots per bin rather than count attempts -- confirm that is the intent.
heatmap_fig = px.density_heatmap(
    shot_data,
    x='LOC_X',
    y='LOC_Y',
    z='SHOT_MADE_FLAG',
    nbinsx=50,
    nbinsy=50,
    color_continuous_scale='Viridis',
    title='Shot Distribution Heat Map',
)

# Human-readable axis titles
heatmap_fig.update_xaxes(title_text='X Coordinate')
heatmap_fig.update_yaxes(title_text='Y Coordinate')

heatmap_fig.show()

### Shooting Percentage by Shot Zone

In [None]:
# Attempts and makes per basic shot zone, then the make rate as a percentage
shot_zone_stats = (
    shot_data
    .groupby('SHOT_ZONE_BASIC')['SHOT_MADE_FLAG']
    .agg(total_shots='count', shots_made='sum')
    .reset_index()
)
shot_zone_stats['shooting_percentage'] = (
    shot_zone_stats['shots_made'].div(shot_zone_stats['total_shots']).mul(100)
)

# One bar per shot zone
bar_fig = px.bar(
    shot_zone_stats,
    x='SHOT_ZONE_BASIC',
    y='shooting_percentage',
    labels={'shooting_percentage': 'Shooting Percentage (%)'},
    title='Shooting Percentage by Shot Zone',
)

bar_fig.show()

### Shooting Percentage Over Time

In [None]:
# Parse GAME_DATE on a derived frame instead of overwriting the column in
# `shot_data`: the shared frame stays exactly as downloaded, so this cell can be
# re-run (or skipped) without changing what any other cell sees.
shots_by_date = shot_data.assign(GAME_DATE=pd.to_datetime(shot_data['GAME_DATE']))

# Attempts and makes per game date, then the make rate as a percentage
time_series_stats = shots_by_date.groupby('GAME_DATE').agg(
    total_shots=('SHOT_MADE_FLAG', 'count'),
    shots_made=('SHOT_MADE_FLAG', 'sum')
).reset_index()
time_series_stats['shooting_percentage'] = (time_series_stats['shots_made'] / time_series_stats['total_shots']) * 100

# Line chart of the per-date shooting percentage
line_fig = px.line(
    time_series_stats,
    x='GAME_DATE',
    y='shooting_percentage',
    title='Shooting Percentage Over Time',
    labels={'shooting_percentage': 'Shooting Percentage (%)'},
)

line_fig.show()

### Shot Attempts and Percentages by Period

In [None]:
# Attempts, makes and make rate for each period (requires `shot_data` with a
# 'PERIOD' column)
quarter_stats = shot_data.groupby('PERIOD').agg(
    total_attempts=('SHOT_MADE_FLAG', 'count'),
    shots_made=('SHOT_MADE_FLAG', 'sum'),
).reset_index()

quarter_stats['shot_percentage'] = (quarter_stats['shots_made'] / quarter_stats['total_attempts']) * 100

fig_quarter = go.Figure()

# Attempts: plotted against the primary (left) y-axis
fig_quarter.add_trace(go.Bar(
    x=quarter_stats['PERIOD'],
    y=quarter_stats['total_attempts'],
    name='Total Attempts',
    marker_color='blue',
    text=quarter_stats['total_attempts'],  # label each bar with its count
    textposition='auto',
    width=0.4,
    offsetgroup='attempts',
))

# Percentage: plotted against its own secondary (right) y-axis, so the real
# values are drawn instead of values multiplied by an arbitrary factor to make
# them visible next to the attempt counts. The distinct offsetgroup values keep
# the two bars side by side even though they sit on different axes.
fig_quarter.add_trace(go.Bar(
    x=quarter_stats['PERIOD'],
    y=quarter_stats['shot_percentage'],
    name='Shot Percentage',
    marker_color='orange',
    text=[f"{p:.1f}%" for p in quarter_stats['shot_percentage']],  # label each bar with its percentage
    textposition='auto',
    width=0.4,
    yaxis='y2',
    offsetgroup='percentage',
))

# Axis titles use `title=dict(text=..., font=...)`; the older `titlefont`
# attribute is deprecated and rejected by newer Plotly releases.
fig_quarter.update_layout(
    title='Shot Attempts and Percentages by Period',
    xaxis=dict(
        title=dict(text='Period'),
        dtick=1,  # one tick per period
    ),
    yaxis=dict(
        title=dict(text='Total Attempts', font=dict(color='blue')),
        tickfont=dict(color='blue'),
    ),
    yaxis2=dict(
        title=dict(text='Shot Percentage (%)', font=dict(color='orange')),
        tickfont=dict(color='orange'),
        overlaying='y',
        side='right',
        showgrid=False,
        range=[0, 100],
        anchor='x',  # keep the secondary axis attached to the shared x-axis
    ),
    barmode='group',
)

fig_quarter.show()

### Scatter Plot of Shot Distance vs. Shooting Percentage

In [None]:
# Makes and attempts at each recorded shot distance
shot_distance_stats = (
    shot_data
    .groupby('SHOT_DISTANCE')['SHOT_MADE_FLAG']
    .agg(shots_made='sum', total_attempts='count')
    .reset_index()
)

# Make rate per distance, as a percentage
shot_distance_stats['shot_percentage'] = (
    shot_distance_stats['shots_made'].div(shot_distance_stats['total_attempts']).mul(100)
)

# One point per distance
fig_scatter = px.scatter(
    shot_distance_stats,
    x='SHOT_DISTANCE',
    y='shot_percentage',
    labels={'SHOT_DISTANCE': 'Shot Distance', 'shot_percentage': 'Shooting Percentage'},
    title='Scatter Plot of Shot Distance vs. Shooting Percentage',
)

# Larger markers so individual points are easy to see
fig_scatter.update_traces(marker_size=10)
fig_scatter.show()


### Shot Outcome by Action Type

In [None]:
# Attempts, makes and make rate (as a percentage) for each action type
action_type_stats = (
    shot_data
    .groupby('ACTION_TYPE')['SHOT_MADE_FLAG']
    .agg(total_attempts='count', shots_made='sum')
    .reset_index()
    .assign(shot_percentage=lambda stats: stats['shots_made'] / stats['total_attempts'] * 100)
)

# One bar per action type
fig_action_type = px.bar(
    action_type_stats,
    x='ACTION_TYPE',
    y='shot_percentage',
    labels={'shot_percentage': 'Shooting Percentage'},
    title='Shot Outcome by Action Type',
)

fig_action_type.show()

### Shot Selection Diversity by Shot Type

In [None]:
# Attempts, makes and make rate (as a percentage) for each shot type
shot_type_stats = (
    shot_data
    .groupby('SHOT_TYPE')['SHOT_MADE_FLAG']
    .agg(total_attempts='count', shots_made='sum')
    .reset_index()
    .assign(shot_percentage=lambda stats: stats['shots_made'] / stats['total_attempts'] * 100)
)

# Pie chart of how the attempts split across shot types; slice size is the
# attempt count (shot_percentage is computed above but not plotted here)
fig_shot_diversity = px.pie(
    shot_type_stats,
    values='total_attempts',
    names='SHOT_TYPE',
    title='Shot Selection Diversity by Shot Type',
)

fig_shot_diversity.show()

## Correlation Matrix

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One-hot encode the categorical shot descriptors so they can take part in the
# correlation alongside the numeric columns
shot_data_encoded = pd.get_dummies(shot_data, columns=['ACTION_TYPE', 'SHOT_TYPE', 'SHOT_ZONE_AREA'])

# Indicator columns created by the encoding, recognised by their name prefix
encoded_columns = shot_data_encoded.filter(
    regex=r'^(?:ACTION_TYPE_|SHOT_TYPE_|SHOT_ZONE_AREA_)'
).columns

# Numeric shot attributes (including the LOC_X / LOC_Y court coordinates),
# followed by every indicator column
correlation_columns = [
    'SHOT_MADE_FLAG',
    'SHOT_DISTANCE',
    'PERIOD',
    'MINUTES_REMAINING',
    'SECONDS_REMAINING',
    'LOC_X',
    'LOC_Y',
] + encoded_columns.tolist()

# Pairwise correlations between the selected columns
correlation_matrix = shot_data_encoded.loc[:, correlation_columns].corr()

# Large canvas so the many column labels stay legible
plt.figure(figsize=(18, 16))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=False,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
plt.title('Correlation Matrix', fontsize=20)
plt.xticks(rotation=45, ha='right')  # slanted x labels to avoid overlap
plt.yticks(rotation=0)  # horizontal y labels
plt.tight_layout()  # fit the labels inside the figure
plt.show()