In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import requests
import time

def _aggregate_playtype(df, playtype):
    """Collapse the rows of one play-type table into a single row per player.

    Rows are grouped by ``PLAYER_NAME``/``PLAYER_ID``. ``POSS_PCT`` and
    ``EFG_PCT`` are averaged using ``POSS`` as weights and ``PTS`` is summed.
    The three output columns are prefixed with ``playtype``.

    A player whose ``POSS`` values sum to zero gets NaN for both averages
    instead of raising, so one degenerate row cannot discard the whole table.
    """
    keys = ['PLAYER_NAME', 'PLAYER_ID']
    poss_col = f'{playtype}_POSS_PCT'
    efg_col = f'{playtype}_EFG_PCT'
    pts_col = f'{playtype}_PTS'
    rate_cols = [poss_col, efg_col]

    renamed = df.rename(columns={
        'POSS_PCT': poss_col,
        'EFG_PCT': efg_col,
        'PTS': pts_col
    })

    def _weighted_means(rows):
        weights = rows['POSS'].to_numpy()
        if weights.sum() == 0:
            # np.average raises ZeroDivisionError when the weights sum to zero
            return pd.Series([np.nan, np.nan], index=rate_cols)
        means = np.average(rows[rate_cols].to_numpy(), weights=weights, axis=0)
        return pd.Series(means, index=rate_cols)

    grouped = renamed.groupby(keys)
    # Selecting the columns explicitly keeps the grouping keys out of the
    # frames handed to apply (pandas deprecated passing them implicitly).
    averages = grouped[rate_cols + ['POSS']].apply(_weighted_means).reset_index(drop=False)
    points = grouped[pts_col].sum().reset_index(drop=False)
    return pd.merge(averages, points, on=keys)


def scrape_nba_synergy(seasons, output_path='nba_synergy_raw_data.csv',
                       request_delay=1.0, timeout=30):
    """
    Scrape NBA Synergy data for specified seasons

    For every season and every offensive play type, one request is sent to
    the stats.nba.com ``synergyplaytypes`` endpoint. Each response is reduced
    to one row per player, the play types are outer-merged per season
    (missing values filled with 0), and all seasons are concatenated.

    Parameters
    ----------
    seasons : iterable of str
        Season labels inserted into the ``SeasonYear`` query parameter,
        e.g. ``"2014-15"``.
    output_path : str or path-like or None, default 'nba_synergy_raw_data.csv'
        Where the combined table is written as CSV. ``None`` skips writing.
    request_delay : float, default 1.0
        Seconds to sleep after every request, successful or not.
    timeout : float, default 30
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per player and season, with ``<PlayType>_POSS_PCT``,
        ``<PlayType>_EFG_PCT`` and ``<PlayType>_PTS`` columns plus ``SEASON``.
        If a play type failed for only some seasons, its columns are NaN for
        those seasons after concatenation.

    Raises
    ------
    ValueError
        If no data could be retrieved for any season.
    """
    headers = {
        'Host': 'stats.nba.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'x-nba-stats-origin': 'stats',
        'x-nba-stats-token': 'true',
        'Connection': 'keep-alive',
        'Referer': 'https://stats.nba.com/',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    }

    playtypes = [
        'Isolation', 'PRBallHandler', 'PRRollMan', 'Spotup', 'Postup',
        'Transition', 'Handoff', 'Cut', 'OffScreen', 'Misc'
    ]

    all_seasons_data = []

    for season in seasons:
        print(f"Processing season {season}...")
        synergy_data = []

        for playtype in playtypes:
            url = f'https://stats.nba.com/stats/synergyplaytypes?LeagueID=00&PerMode=Totals&PlayType={playtype}&PlayerOrTeam=P&SeasonType=Regular+Season&SeasonYear={season}&TypeGrouping=offensive'

            try:
                # Without a timeout a stalled connection would block forever
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()

                result_set = response.json()['resultSets'][0]
                raw = pd.DataFrame.from_records(
                    result_set['rowSet'], columns=result_set['headers']
                )

                # Weighted averages and summing points
                synergy_data.append(_aggregate_playtype(raw, playtype))

            except Exception as e:
                # Best effort: report the failure and move on to the next play type
                print(f"Error processing {playtype} for {season}: {str(e)}")

            finally:
                # Rate limiting applies after failures too, so a rejected
                # request is not immediately followed by another one
                if request_delay > 0:
                    time.sleep(request_delay)

        if synergy_data:
            # Merge all synergy playtype data
            season_df = synergy_data[0]
            for df in synergy_data[1:]:
                season_df = pd.merge(season_df, df, on=['PLAYER_NAME', 'PLAYER_ID'], how='outer')

            season_df = season_df.fillna(0)
            season_df['SEASON'] = season
            all_seasons_data.append(season_df)

    if not all_seasons_data:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
        raise ValueError("No synergy data could be retrieved for the requested seasons")

    # Combine all seasons
    final_df = pd.concat(all_seasons_data).reset_index(drop=True)

    # Save raw data
    if output_path is not None:
        final_df.to_csv(output_path, index=False)
    return final_df

# Generate season strings from 2014-15 to 2024-25
# Each label pairs a start year with the last two digits of the following year.
seasons = ["%d-%02d" % (start_year, (start_year + 1) % 100)
           for start_year in range(2014, 2025)]
scrape_nba_synergy(seasons)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_season_data(df, season, n_clusters=13, output_dir='cluster'):
    """
    Analyze NBA data for a specific season using PCA and KMeans clustering

    The rows of ``df`` whose ``SEASON`` equals ``season`` are standardised,
    projected with PCA (``n_components=0.99``) and clustered with KMeans
    (``random_state=42``, ``n_init=10``). Three files are written to
    ``output_dir``: a scatter plot of the first two components, the per-player
    table with ``PCA_*`` and ``Cluster`` columns, and the per-cluster feature
    means.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``PLAYER_NAME``, ``SEASON`` and numeric feature columns.
    season : str
        Value of ``SEASON`` to analyse.
    n_clusters : int, default 13
        Number of KMeans clusters.
    output_dir : str, default 'cluster'
        Directory for the generated files; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        Copy of the season's rows with ``PCA_<i>`` and ``Cluster`` columns added.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``season``.
    """
    import os  # local import so this function does not rely on a file-level one

    # Filter for season
    season_df = df[df['SEASON'] == season].copy()
    if season_df.empty:
        raise ValueError(f"No rows found for season {season!r}")

    # Feature columns: numeric only, and never the identifier columns.
    # PLAYER_ID is numeric but is a label, not a playing-style measurement.
    non_feature_columns = {'PLAYER_NAME', 'PLAYER_ID', 'SEASON'}
    numeric_columns = season_df.select_dtypes(include='number').columns
    features = [col for col in numeric_columns if col not in non_feature_columns]

    os.makedirs(output_dir, exist_ok=True)

    # Prepare data for PCA
    X = StandardScaler().fit_transform(season_df[features].values)

    # Perform PCA
    pca = PCA(n_components=0.99)
    principal_components = pca.fit_transform(X)

    # Add PCA components to dataframe
    for i in range(principal_components.shape[1]):
        season_df[f'PCA_{i+1}'] = principal_components[:, i]

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    season_df['Cluster'] = kmeans.fit_predict(principal_components)

    # Coordinates for the 2-D plot; fall back to a flat second axis if PCA
    # kept a single component so the indexing below cannot fail.
    first_pc = principal_components[:, 0]
    if principal_components.shape[1] > 1:
        second_pc = principal_components[:, 1]
    else:
        second_pc = np.zeros(len(first_pc))

    # Create visualization
    fig, ax = plt.subplots(figsize=(12, 8))
    scatter = ax.scatter(first_pc, second_pc, c=season_df['Cluster'], cmap='tab20')
    ax.set_title(f'NBA Player Clustering {season}')
    ax.set_xlabel('First Principal Component')
    ax.set_ylabel('Second Principal Component')

    # Add player names as annotations
    for idx, player in enumerate(season_df['PLAYER_NAME']):
        ax.annotate(player, (first_pc[idx], second_pc[idx]),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

    fig.colorbar(scatter, ax=ax)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, f'nba_clusters_{season}.png'),
                dpi=300, bbox_inches='tight')
    # Close explicitly: this function is called once per season in a loop
    plt.close(fig)

    # Save results
    season_df.to_csv(os.path.join(output_dir, f'nba_analysis_{season}.csv'), index=False)

    # Save cluster summary
    cluster_summary = season_df.groupby('Cluster')[features].mean()
    cluster_summary.to_csv(os.path.join(output_dir, f'cluster_summary_{season}.csv'))

    return season_df


# Load the combined play-type table from the CSV file.
df = pd.read_csv('nba_synergy_raw_data.csv')

# Run the clustering analysis once for each distinct season label.
for season in pd.unique(df['SEASON']):
    print(f"Analyzing season {season}...")
    analyze_season_data(df, season)

print("Analysis complete!")

Processing season 2014-15...
Processing season 2015-16...
Processing season 2016-17...
Processing season 2017-18...
Processing season 2018-19...
Processing season 2019-20...
Processing season 2020-21...
Processing season 2021-22...
Processing season 2022-23...
Processing season 2023-24...
Processing season 2024-25...


Unnamed: 0,PLAYER_NAME,PLAYER_ID,Isolation_POSS_PCT,Isolation_EFG_PCT,Isolation_PTS,PRBallHandler_POSS_PCT,PRBallHandler_EFG_PCT,PRBallHandler_PTS,PRRollMan_POSS_PCT,PRRollMan_EFG_PCT,...,Cut_POSS_PCT,Cut_EFG_PCT,Cut_PTS,OffScreen_POSS_PCT,OffScreen_EFG_PCT,OffScreen_PTS,Misc_POSS_PCT,Misc_EFG_PCT,Misc_PTS,SEASON
0,Aaron Brooks,201166,0.114,0.320,81.0,0.459,0.501,436.0,0.000,0.000,...,0.011,0.444,9.0,0.023,0.60,28.0,0.047,0.238,26.0,2014-15
1,Aaron Gordon,203932,0.074,0.471,16.0,0.063,0.625,13.0,0.092,0.455,...,0.070,0.615,28.0,0.000,0.00,0.0,0.044,0.600,8.0,2014-15
2,Adreian Payne,203940,0.094,0.400,18.0,0.000,0.000,0.0,0.158,0.486,...,0.104,0.458,27.0,0.000,0.00,0.0,0.079,0.250,6.0,2014-15
3,Al Horford,201143,0.055,0.491,59.0,0.000,0.000,0.0,0.246,0.490,...,0.150,0.647,219.0,0.026,0.63,36.0,0.047,0.417,22.0,2014-15
4,Al Jefferson,2744,0.020,0.261,13.0,0.000,0.000,0.0,0.153,0.500,...,0.101,0.510,114.0,0.000,0.00,0.0,0.027,0.800,28.0,2014-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4625,Pelle Larsson,1641796,0.000,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.118,0.600,17.0,0.000,0.00,0.0,0.000,0.000,0.0,2024-25
4626,Thomas Bryant,1628418,0.000,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.000,0.000,0.0,0.000,0.00,0.0,0.000,0.000,0.0,2024-25
4627,Torrey Craig,1628470,0.000,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.000,0.000,0.0,0.000,0.00,0.0,0.000,0.000,0.0,2024-25
4628,Jericho Sims,1630579,0.000,0.000,0.0,0.000,0.000,0.0,0.000,0.000,...,0.347,0.571,26.0,0.000,0.00,0.0,0.208,0.000,3.0,2024-25


In [2]:
#!/usr/bin/env python



Analyzing season 2014-15...
Analyzing season 2015-16...
Analyzing season 2016-17...
Analyzing season 2017-18...
Analyzing season 2018-19...
Analyzing season 2019-20...
Analyzing season 2020-21...
Analyzing season 2021-22...
Analyzing season 2022-23...
Analyzing season 2023-24...
Analyzing season 2024-25...
Analysis complete!
