In [None]:
%pip install nba_api seaborn

import pandas as pd
import time
from nba_api.stats.endpoints import leaguegamefinder, boxscoretraditionalv3, teamgamelog
from nba_api.stats.static import teams
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Look up the Indiana Pacers entry (abbreviation 'IND') in nba_api's static team list.
nba_teams = teams.get_teams()
pacers = [team for team in nba_teams if team['abbreviation'] == 'IND'][0]
pacers_id = pacers['id']

# Game logs for the 2024-25 season.
gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=pacers_id, season_nullable='2024-25')
games_df = gamefinder.get_data_frames()[0]

# Keep only the columns used below. The explicit .copy() makes this an
# independent frame, so the two column assignments that follow do not write
# into a slice of the raw API frame (which triggers SettingWithCopyWarning).
# NOTE(review): `games_df` is rebound later in this notebook by the
# `collect_pacers_data(...)` call, so this frame is only live until then.
games_df = games_df[['GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PTS', 'PLUS_MINUS']].copy()

# Convert WL (Win/Loss) to a binary target: 1 for 'W', 0 for anything else.
games_df['Target'] = games_df['WL'].apply(lambda x: 1 if x == 'W' else 0)

# Home games are the rows whose MATCHUP string contains 'vs.'.
games_df['Home_Game'] = games_df['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)

print(f"Total Games Fetched: {len(games_df)}")
games_df.head()

In [None]:
def get_pacers_games(season='2024-25', pacers_id=1610612754):
    """Fetch the team's regular-season and playoff game logs for one season.

    Args:
        season: Season string forwarded to ``TeamGameLog`` (e.g. ``'2024-25'``).
        pacers_id: Team id forwarded to ``TeamGameLog``.

    Returns:
        One DataFrame holding the regular-season rows followed by the playoff
        rows, re-indexed from 0. If neither request returns rows, the (empty)
        regular-season frame is returned.
    """
    print(f"Fetching Pacers games for {season}...")

    season_frames = []
    for request_number, season_type in enumerate(('Regular Season', 'Playoffs')):
        if request_number:
            time.sleep(1)  # Rate limiting between consecutive API requests
        gamelog = teamgamelog.TeamGameLog(
            team_id=pacers_id,
            season=season,
            season_type_all_star=season_type
        )
        season_frames.append(gamelog.get_data_frames()[0])

    # Leave empty logs (e.g. a season with no playoff games) out of the
    # concat: pandas has deprecated how empty entries influence the result
    # dtypes, so including them warns today and may change dtypes later.
    populated = [frame for frame in season_frames if not frame.empty]
    if populated:
        all_games = pd.concat(populated, ignore_index=True)
    else:
        all_games = season_frames[0]

    print(f"Retrieved {len(all_games)} games")
    return all_games


def get_game_boxscore(game_id):
    """Request the traditional (V3) box score for one game.

    Sleeps 0.6 s before the request as rate limiting. Returns the first
    DataFrame of the response, or ``None`` (after printing the error) when
    anything in the request raises.
    """
    try:
        time.sleep(0.6)  # Rate limiting to avoid API throttling
        response = boxscoretraditionalv3.BoxScoreTraditionalV3(game_id=game_id)
        return response.get_data_frames()[0]
    except Exception as err:
        print(f"Error fetching game {game_id}: {err}")
    return None


def collect_pacers_data(season='2024-25'):
    """Collect the season's game log plus per-player box scores for every game.

    Args:
        season: Season string forwarded to ``get_pacers_games``.

    Returns:
        ``(games_df, all_player_stats)``: the game log, and one row per
        player per game with ``GAME_ID``, ``GAME_DATE``, ``MATCHUP`` and ``WL``
        copied from the game log. ``all_player_stats`` is an empty DataFrame
        when no box score could be fetched.
    """
    games_df = get_pacers_games(season)

    all_player_data = []
    failed_game_ids = []
    columns_reported = False

    for _, game in games_df.iterrows():
        game_id = game['Game_ID']

        boxscore = get_game_boxscore(game_id)
        if boxscore is None:
            # get_game_boxscore already printed the error; just keep count.
            failed_game_ids.append(game_id)
            continue

        # Debug: show the schema once, from the first box score that actually
        # arrived (not necessarily the first game in the log).
        if not columns_reported:
            print(f"\nAvailable columns: {list(boxscore.columns)}")
            columns_reported = True

        # Add game metadata to each player row
        boxscore['GAME_ID'] = game_id
        boxscore['GAME_DATE'] = game['GAME_DATE']
        boxscore['MATCHUP'] = game['MATCHUP']
        boxscore['WL'] = game['WL']
        all_player_data.append(boxscore)

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when there were no games or every fetch failed.
    if all_player_data:
        all_player_stats = pd.concat(all_player_data, ignore_index=True)
    else:
        all_player_stats = pd.DataFrame()

    print("\nData collection complete!")
    print(f"Total games: {len(games_df)}")
    print(f"Total player-game records: {len(all_player_stats)}")
    if failed_game_ids:
        print(f"Games skipped after fetch errors: {len(failed_game_ids)}")

    return games_df, all_player_stats
# Run the collection for 2024-25 and print both resulting frames.
# Note: this rebinds `games_df`, replacing the LeagueGameFinder frame built earlier.
games_df, all_player_stats = collect_pacers_data(season='2024-25')
print(games_df, all_player_stats, sep='\n')

In [None]:
# Identifier and stat column names used as reference when grouping
# player archetypes (game / team / player keys first, then box-score stats).
player_attributes = [
    # identifiers
    "gameId",
    "teamTricode",
    "personId",
    # playing time and shooting
    "minutes",
    "fieldGoalsPercentage",
    "threePointersPercentage",
    # counting stats
    "reboundsTotal",
    "assists",
    "steals",
    "blocks",
    "points",
]

In [None]:
# Converts a box-score "minutes" value to a float number of minutes.
def minutes_to_float(min_str):
    """Convert a minutes value to float minutes, returning 0.0 when unparseable.

    Accepted inputs:
        * missing values (``None`` / NaN)       -> ``0.0``
        * ``int`` / ``float``                   -> the same value as ``float``
        * ``"MM:SS"`` strings                   -> ``MM + SS / 60``
        * ISO-8601-style durations such as ``"PT24M"``, ``"PT24M30.00S"`` or
          ``"PT45.00S"``                        -> minutes plus seconds / 60
        * plain numeric strings (``"24"``)      -> ``24.0``

    Args:
        min_str: The raw minutes value.

    Returns:
        Minutes as a float; ``0.0`` for anything that cannot be parsed.
    """
    try:
        if pd.isna(min_str):
            return 0.0
        # Already a number
        if isinstance(min_str, (int, float)):
            return float(min_str)

        text = str(min_str).strip()

        # "MM:SS" string format
        if ':' in text:
            parts = text.split(':')
            return float(parts[0]) + float(parts[1]) / 60

        # Duration format: optional "PT" prefix, "<minutes>M", optional
        # "<seconds>S". Previously only "PT24M" parsed; a value carrying
        # seconds raised inside float() and was silently turned into 0.0.
        body = text[2:] if text.upper().startswith('PT') else text
        minutes_part, marker, seconds_part = body.partition('M')
        if not marker and body.upper().endswith('S'):
            # Seconds-only duration, e.g. "PT45.00S"
            minutes_part, seconds_part = '', body
        seconds_part = seconds_part.rstrip('Ss')

        total = float(minutes_part) if minutes_part else 0.0
        if seconds_part:
            total += float(seconds_part) / 60
        return total
    except (TypeError, ValueError, IndexError):
        # Unparseable input counts as "did not play". Only conversion errors
        # are caught, so KeyboardInterrupt and the like still propagate.
        return 0.0

## Clustering the Data

### Data Preparation

In [None]:
# Work on a copy so the raw player-game frame stays untouched.
df_clustering = all_player_stats.copy()

# Raw stat columns of interest (kept for reference; not used again in this cell).
numeric_cols = [
    'minutes', 'points', 'reboundsTotal', 'assists', 'steals', 'blocks',
    'turnovers', 'threePointersMade', 'fieldGoalsAttempted', 'freeThrowsAttempted'
]

df_clustering['minutes_float'] = df_clustering['minutes'].apply(minutes_to_float)

# Filter: only cluster players with significant playing time to avoid noise (> 10 minutes).
# The .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
# The filter also guarantees a non-zero divisor for the per-36 calculation.
df_clustering = df_clustering[df_clustering['minutes_float'] > 10].copy()

# Normalize the stats per 36 minutes
stats_to_normalize = ['points', 'reboundsTotal', 'assists', 'steals', 'blocks', 'turnovers', 'threePointersMade', 'fieldGoalsAttempted']
for stat in stats_to_normalize:
    # Convert column to numeric; values that cannot be converted become 0
    df_clustering[stat] = pd.to_numeric(df_clustering[stat], errors='coerce').fillna(0)
    # Per-36 rate: (stat / minutes played) * 36
    df_clustering[f'{stat}_Per36'] = (df_clustering[stat] / df_clustering['minutes_float']) * 36

# Select features for clustering: every per-36 column created above
features = [col for col in df_clustering.columns if '_Per36' in col]
X = df_clustering[features].fillna(0)

### K-Means Clustering

In [None]:
# Standardize the features so large-valued stats (like points) don't dominate
# small-valued ones (like steals) in the distance computation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

k = 5
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_clustering['Cluster'] = kmeans.fit_predict(X_scaled)

# Per-cluster averages in the original per-36 units, for human inspection.
cluster_means = df_clustering.groupby('Cluster')[features].mean()

print("Cluster Centers (Average Per 36 Stats):")
print(cluster_means)

# Centroids in standardized space: each value is how far the cluster sits
# above or below the overall average for that stat, in standard deviations.
# Labelling from these (instead of the raw per-36 means) picks the stat the
# cluster stands out in relative to the others, rather than whichever stat
# simply has the largest raw magnitude.
scaled_centers = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

cluster_labels = {}
for i in range(k):
    # Find the stat this cluster excels at relative to the others
    top_stat = scaled_centers.loc[i].idxmax().replace('_Per36', '')
    cluster_labels[i] = f"Archetype {i} ({top_stat} focused)"

df_clustering['Archetype_Label'] = df_clustering['Cluster'].map(cluster_labels)

# PCA scatter plot: project the standardized features onto two components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plot_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
# plot_df has a fresh 0..n-1 index, while df_clustering keeps the index labels
# that survived the minutes filter. Assigning the Series directly would align
# on those labels and attach wrong or missing clusters to points, so copy the
# values positionally (row order matches X_scaled).
plot_df['Cluster'] = df_clustering['Cluster'].to_numpy()
plot_df['Label'] = df_clustering['Archetype_Label'].to_numpy()

plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='Label',
    data=plot_df,
    palette='viridis',
    s=100,
    alpha=0.7,
    edgecolor='k'
)

plt.title('Player Archetypes Clustering (K-Means Results)', fontsize=16)
plt.xlabel('Principal Component 1 (Variance in Stats)', fontsize=12)
plt.ylabel('Principal Component 2', fontsize=12)
plt.legend(title='Archetype', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# For every cluster, list the three player-game rows with the most minutes.
for i in range(k):
    print(f"\n--- {cluster_labels[i]} Example Players ---")
    in_cluster = df_clustering['Cluster'] == i
    examples = df_clustering[in_cluster].sort_values('minutes_float', ascending=False).head(3)
    player_rows = zip(
        examples['firstName'],
        examples['familyName'],
        examples['points'],
        examples['reboundsTotal'],
    )
    for first_name, family_name, pts, reb in player_rows:
        print(f"{first_name} {family_name} (Pts: {pts}, Reb: {reb})")

## Feature Engineering for Prediction

### Train/Test Data



In [None]:
pacers_id = 1610612754

# Flag each player-game row: True for the Pacers, False for the opponent.
df_clustering['Is_Pacers'] = df_clustering['teamId'].eq(pacers_id)

# Count rows per (game, side, cluster), then pivot clusters into columns.
lineup_counts = df_clustering.groupby(['GAME_ID', 'Is_Pacers', 'Cluster']).size()
game_lineups = lineup_counts.unstack(fill_value=0)

# One block of archetype counts per side, with distinguishing column prefixes.
pacers_stats = game_lineups.xs(True, level='Is_Pacers').add_prefix('Pacers_Arch_')
opponent_stats = game_lineups.xs(False, level='Is_Pacers').add_prefix('Opp_Arch_')

# One row per game: join both sides' counts (missing counts become 0), attach
# the win/loss target (W -> 1, L -> 0), and drop rows left with missing values.
unique_games = df_clustering['GAME_ID'].unique()
game_results = df_clustering.groupby('GAME_ID')['WL'].first()
model_data = (
    pd.DataFrame(index=unique_games)
    .join(pacers_stats)
    .join(opponent_stats)
    .fillna(0)
    .assign(Target=game_results.map({'W': 1, 'L': 0}))
    .dropna()
)

print("\n--- Game-Level Feature Vector (First 5 Rows) ---")
print(model_data.head())

# Features (X) are the archetype-count columns; the target (y) is the win flag.
feature_cols = [name for name in model_data.columns if 'Arch_' in name]
X_dat = model_data[feature_cols]
y_dat = model_data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X_dat,
    y_dat,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

### K-NN

In [None]:
# Fit a 5-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Evaluation: compute the metrics first, then print them.
knn_accuracy = accuracy_score(y_test, y_pred)
knn_confusion = confusion_matrix(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)

print("\n--- KNN Classification Results ---")
print(f"Accuracy: {knn_accuracy:.2f}")
print("\nConfusion Matrix:", knn_confusion, sep="\n")
print("\nClassification Report:", knn_report, sep="\n")

### Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

# Evaluation (banner spelling corrected; accuracy rounded to two decimals to
# match the other model cells in this notebook)
print("\n--- Naive Bayes Classification Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Fit a logistic-regression classifier on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluation. Uses the same layout as the KNN / Naive Bayes cells: the stray
# comma in the accuracy line is removed, and the matrix is printed on its own
# so its first row is not offset by print()'s separator space.
print("\n--- Logistic Regression Classification Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))