In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.cluster.hierarchy as shc
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from kneed import KneeLocator

def create_hierarchical_dendrogram(data, year):
    """
    Create a hierarchical clustering dendrogram for a specific year
    
    Parameters:
    -----------
    data : pandas.DataFrame
        Processed NBA player data
    year : int
        Year of the analysis
    """
    plt.figure(figsize=(10, 65))
    plt.title(f'{year} NBA Hierarchical Clustering Dendrogram')
    
    # Select features for clustering
    features = [x for x in data.columns if x not in ['PLAYER_NAME', 'POSITION', 'SEASON', 'Hierarchical_Cluster', 'KMeans_Cluster']]
    
    # Prepare and standardize features
    x = data[features].values
    x_scaled = StandardScaler().fit_transform(x)
    
    # Create dendrogram
    dend = shc.dendrogram(shc.linkage(x_scaled, method='ward'), 
                           labels=list(data.PLAYER_NAME), 
                           orientation='left')
    
    plt.yticks(fontsize=8)
    plt.xlabel('Height')
    plt.tight_layout()
    plt.savefig(f"nba_hierarchical_clustering_dendrogram_{year}.png", format='png', dpi=300)
    plt.close()

def analyze_clustering_metrics(data, year):
    """
    Create visualizations for clustering metrics
    
    Parameters:
    -----------
    data : pandas.DataFrame
        Processed NBA player data
    year : int
        Year of the analysis
    """
    # Select features for clustering
    features = [x for x in data.columns if x not in ['PLAYER_NAME', 'POSITION', 'SEASON', 'Hierarchical_Cluster', 'KMeans_Cluster']]
    
    # Prepare and standardize features
    x = data[features].values
    x_scaled = StandardScaler().fit_transform(x)
    
    # PCA for dimensionality reduction
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(x_scaled)
    
    # Elbow Method and Silhouette Score Analysis
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    
    inertia = []
    silhouette_scores = []
    k_range = range(2, 15)
    
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(principal_components)
        inertia.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(principal_components, kmeans.labels_))
    
    # Elbow Method Plot
    plt.figure(figsize=(10, 5))
    knee_locator = KneeLocator(k_range, inertia, curve="convex", direction="decreasing")
    plt.plot(k_range, inertia, 'bo-')
    plt.title(f'{year} Elbow Method for Optimal k')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.axvline(knee_locator.knee, color='r', linestyle='--')
    plt.tight_layout()
    plt.savefig(f'nba_kmeans_elbow_{year}.png')
    plt.close()
    
    # Silhouette Scores Plot
    plt.figure(figsize=(10, 5))
    plt.plot(k_range, silhouette_scores, 'go-')
    plt.title(f'{year} Silhouette Scores')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.axvline(k_range[np.argmax(silhouette_scores)], color='r', linestyle='--')
    plt.tight_layout()
    plt.savefig(f'nba_silhouette_scores_{year}.png')
    plt.close()

def visualize_clusters(data, year):
    """
    Create 2D visualization of player clusters
    
    Parameters:
    -----------
    data : pandas.DataFrame
        Processed NBA player data
    year : int
        Year of the analysis
    """
    # Select features for clustering
    features = [x for x in data.columns if x not in ['PLAYER_NAME', 'POSITION', 'SEASON', 'Hierarchical_Cluster', 'KMeans_Cluster']]
    
    # Prepare and standardize features
    x = data[features].values
    x_scaled = StandardScaler().fit_transform(x)
    
    # PCA for dimensionality reduction
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(x_scaled)
    
    # Create scatter plot of clusters
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(principal_components[:, 0], 
                          principal_components[:, 1], 
                          c=data['KMeans_Cluster'], 
                          cmap='viridis')
    plt.title(f'{year} NBA Player Clusters')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.colorbar(scatter, label='Cluster')
    plt.tight_layout()
    plt.savefig(f'nba_player_clusters_{year}.png')
    plt.close()

def main():
    # Process and visualize data from 2014 to 2025
    for year in range(2014, 2026):
        try:
            # Load processed data
            data = pd.read_csv(f'cluster/nba_analysis_{year}.csv')
            
            print(f"Visualizing data for year {year}")
            
            # Create visualizations
            create_hierarchical_dendrogram(data, year)
            analyze_clustering_metrics(data, year)
            visualize_clusters(data, year)
            
        except FileNotFoundError:
            print(f"No visualization data found for year {year}")
        except Exception as e:
            print(f"Error visualizing year {year}: {e}")

if __name__ == "__main__":
    main()

No visualization data found for year 2014
No visualization data found for year 2015
No visualization data found for year 2016
No visualization data found for year 2017
No visualization data found for year 2018
No visualization data found for year 2019
No visualization data found for year 2020
No visualization data found for year 2021
No visualization data found for year 2022
No visualization data found for year 2023
No visualization data found for year 2024
Visualizing data for year 2025




Error visualizing year 2025: 'KMeans_Cluster'


<Figure size 1200x800 with 0 Axes>