In [2]:
import pandas as pd, numpy as np, json, joblib, torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Source table of player statistics; the path is resolved against the
# notebook's current working directory.
csv_path = "NBA_players.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

In [4]:
# --------------------------
# Season window & pool of 100 players
# --------------------------
# Seasons are kept when their starting year lies in [start_year, end_year];
# both bounds are inclusive, so 2010..2015 covers six season starts.
start_year, end_year = 2010, 2015

df = (
    # 'season' is split on '-' and the leading part is parsed as the start year.
    df.assign(season_start=df['season'].map(lambda label: int(label.split('-')[0])))
      .loc[lambda frame: frame['season_start'].between(start_year, end_year)]
      # One row per player: the first matching row in file order is the one kept.
      .drop_duplicates(subset=['player_name'], keep='first')
)

if len(df) < 100:
    raise ValueError("Not enough players in the chosen window.")

# Fixed random_state so the same 100 rows are drawn from the same input.
df = df.sample(n=100, random_state=42)

In [5]:
# Columns fed to the model, in the order the scaler and network will see them.
features = [
    'pts', 'reb', 'ast', 'oreb_pct', 'dreb_pct',
    'usg_pct', 'ts_pct', 'ast_pct', 'gp', 'net_rating',
]

# Binary target: 1 when pts + reb + ast is at or above the pool median, else 0.
# NOTE(review): pts, reb and ast are also in `features`, so the label is a
# direct function of three inputs — confirm this is intended.
df['total_contrib'] = df['pts'].add(df['reb']).add(df['ast'])
median_contrib = df['total_contrib'].median()
df['performance'] = df['total_contrib'].ge(median_contrib).astype(int)

X = df[features].to_numpy()
y = df['performance'].to_numpy()

# Standardise each feature column and persist the fitted scaler to disk.
# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/test split performed later, so test rows influence the scaling.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [6]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feature matrices become float32 tensors, label vectors become int64 (torch.long)
# tensors; everything is moved to `device` up front.
X_train_t, X_test_t = (
    torch.tensor(matrix, dtype=torch.float32).to(device)
    for matrix in (X_train, X_test)
)
y_train_t, y_test_t = (
    torch.tensor(labels, dtype=torch.long).to(device)
    for labels in (y_train, y_test)
)

train_ds = TensorDataset(X_train_t, y_train_t)
test_ds = TensorDataset(X_test_t, y_test_t)

# Only the training batches are reshuffled between epochs.
train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=64, shuffle=False)

In [8]:
class PlayerMLP(nn.Module):
    """Fully connected classifier: (Linear -> ReLU -> Dropout) per hidden layer, then a linear head.

    With the default arguments the layer stack is identical to the original
    hard-coded 64 -> 32 -> 2 network, so `state_dict` keys (`net.0`, `net.3`,
    `net.6`) and previously saved weights remain compatible.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Widths of the hidden layers, in order. Defaults to (64, 32).
        num_classes: Number of output logits. Defaults to 2.
        dropout: Dropout probability applied after each hidden activation.
            Defaults to 0.2.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), num_classes=2, dropout=0.2):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        # Final layer emits raw logits; no softmax is applied inside the module.
        layers.append(nn.Linear(in_features, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits of shape (batch, num_classes) for input `x`."""
        return self.net(x)

# Seed PyTorch's global RNG before any weights are created so that weight
# initialisation, DataLoader shuffling and dropout masks are repeatable from
# run to run (42 matches the random_state used for sampling and splitting).
# Bit-exact repeatability on GPU may additionally depend on backend settings.
torch.manual_seed(42)

# The input width is taken from the training matrix, so it follows `features`.
model = PlayerMLP(input_dim=X_train.shape[1]).to(device)

# CrossEntropyLoss consumes the raw logits together with integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [9]:
def _train_one_epoch(net, loader, loss_fn, opt):
    """Run one optimisation pass over `loader` and return the mean loss per sample."""
    net.train()  # enable dropout for the training pass
    weighted_loss_sum = 0
    for inputs, targets in loader:
        opt.zero_grad()
        batch_loss = loss_fn(net(inputs), targets)
        batch_loss.backward()
        opt.step()
        # Weight each batch's mean loss by its size so a short final batch
        # does not skew the epoch average.
        weighted_loss_sum += batch_loss.item() * inputs.size(0)
    return weighted_loss_sum / len(loader.dataset)


num_epochs = 30
loss_history = []  # one mean training loss per completed epoch
for epoch in range(num_epochs):
    epoch_loss = _train_one_epoch(model, train_loader, criterion, optimizer)
    loss_history.append(epoch_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/30, Loss: 0.7217
Epoch 2/30, Loss: 0.6897
Epoch 3/30, Loss: 0.6630
Epoch 4/30, Loss: 0.6450
Epoch 5/30, Loss: 0.6183
Epoch 6/30, Loss: 0.6000
Epoch 7/30, Loss: 0.5686
Epoch 8/30, Loss: 0.5643
Epoch 9/30, Loss: 0.5346
Epoch 10/30, Loss: 0.5119
Epoch 11/30, Loss: 0.5041
Epoch 12/30, Loss: 0.4617
Epoch 13/30, Loss: 0.4460
Epoch 14/30, Loss: 0.4445
Epoch 15/30, Loss: 0.4059
Epoch 16/30, Loss: 0.3898
Epoch 17/30, Loss: 0.3756
Epoch 18/30, Loss: 0.3490
Epoch 19/30, Loss: 0.3398
Epoch 20/30, Loss: 0.3240
Epoch 21/30, Loss: 0.2896
Epoch 22/30, Loss: 0.3121
Epoch 23/30, Loss: 0.2811
Epoch 24/30, Loss: 0.2687
Epoch 25/30, Loss: 0.2489
Epoch 26/30, Loss: 0.2218
Epoch 27/30, Loss: 0.2403
Epoch 28/30, Loss: 0.2234
Epoch 29/30, Loss: 0.2210
Epoch 30/30, Loss: 0.1903


In [10]:
# Persist only the learned parameters (state_dict), not the module object.
torch.save(model.state_dict(), "model_state_dict.pt")

# Class names stored by list position: index 0 is "Low", index 1 is "High".
with open("label_names.json", "w") as f:
    f.write(json.dumps(["Low", "High"]))

In [11]:
# Accuracy on the held-out split: eval mode switches dropout off and
# no_grad skips gradient tracking while predicting.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        predicted = torch.argmax(model(batch_x), dim=1)  # class with the highest logit
        correct += int(predicted.eq(batch_y).sum())
        total += len(batch_y)
print(f"Test Accuracy: {correct/total:.4f}")

Test Accuracy: 0.9500


In [12]:
# Score every player in the pool with the trained network and store the
# softmax probability of class index 1 as `prob_high`.
# eval() is set here explicitly so dropout is disabled even when this cell is
# run without the evaluation cell having been executed first; otherwise the
# probabilities would be perturbed by random dropout masks.
model.eval()
with torch.no_grad():
    all_inputs = torch.tensor(X, dtype=torch.float32).to(device)
    # Rows of X are in the same order as the rows of df, so the array is assigned positionally.
    df['prob_high'] = torch.softmax(model(all_inputs), dim=1)[:, 1].cpu().numpy()

assigned_players = set()  # player names already placed in a role; mutated by pick_unique
roles = {}                # role label -> selected player row


def pick_unique(df_sorted):
    """Return the best-ranked row whose player has not been assigned yet.

    The chosen player's name is added to the module-level `assigned_players`
    set so later calls skip that player.

    Args:
        df_sorted: DataFrame with a 'player_name' column, ordered best-first.

    Returns:
        The first row (as a Series) whose 'player_name' is not in
        `assigned_players`. If every player in `df_sorted` is already
        assigned, the top row is returned as a best-effort fallback and a
        warning is issued, because that result is NOT unique.

    Raises:
        ValueError: If `df_sorted` has no rows.
    """
    import warnings

    if len(df_sorted) == 0:
        raise ValueError("pick_unique() received an empty candidate table.")

    # Vectorised filter instead of a Python-level row loop.
    available = df_sorted[~df_sorted['player_name'].isin(assigned_players)]
    if available.empty:
        warnings.warn(
            "All candidates are already assigned; returning the top-ranked "
            "player again, so the selection is not unique."
        )
        return df_sorted.iloc[0]  # fallback

    chosen = available.iloc[0]
    assigned_players.add(chosen['player_name'])
    return chosen

# Heuristic role definitions. Each role gets its own best-first ranking of the
# pool; the rankings are then consumed in the order PG, SG, SF, PF, C, and
# pick_unique skips anyone already taken by an earlier role.
combined_stats = df[['pts','reb','ast']].sum(axis=1)
# SF ranking: players whose combined pts+reb+ast is closest to the pool mean come first.
closest_to_mean = (combined_stats - combined_stats.mean()).abs().argsort()

role_rankings = [
    ('PG', df.sort_values("ast_pct", ascending=False)),
    ('SG', df.sort_values(["ts_pct","usg_pct"], ascending=[False,False])),
    ('SF', df.iloc[closest_to_mean]),
    ('PF', df.sort_values(["reb","oreb_pct"], ascending=[False,False])),
    ('C',  df.sort_values(["reb","dreb_pct"], ascending=[False,False])),
]
for role_name, ranking in role_rankings:
    roles[role_name] = pick_unique(ranking)

# prob_high is reported next to each pick; it is not used to choose the players.
print("\nOptimal Team (Unique Players):")
for role, player in roles.items():
    print(f"{role}: {player['player_name']} (P={player['prob_high']:.2f})")



Optimal Team (Unique Players):
PG: Scott Machado (P=0.05)
SG: Tyson Chandler (P=0.99)
SF: Brandon Rush (P=0.72)
PF: Kevin Love (P=1.00)
C: Blake Griffin (P=1.00)
