In [17]:
import pandas as pd

df1 = pd.read_csv("/Users/ryansung/allstarsvm.csv")

In [5]:
# List of 2022 NBA All-Star players
all_star_players = [
    'Giannis Antetokounmpo', 'LeBron James', 'Lauri Markkanen', 'Luka Doncic', 'Donovan Mitchell', 'Joel Embiid',
    'Ja Morant', 'Kyrie Irving', 'Jayson Tatum', 'Nikola Jokic', 'Bam Adebayo', 'Jaylen Brown', 
    'Shai Gilgeous-Alexander', 'Anthony Edwards', 'DeMar DeRozan', "De'Aaron Fox", 'Jrue Holiday', 
    'Paul George', 'Damian Lillard', 'Tyrese Haliburton', 'Domantas Sabonis', 'Jaren Jackson Jr.', 'Pascal Siakam',
    'Julius Randle', 'Kevin Durant', 'Stephen Curry', 'Zion Williamson'
]

# Creating the ALLSTAR column based on the condition
df1['ALLSTAR'] = df1['PLAYER'].apply(lambda x: 1 if x in all_star_players else 0)
df1.head(50)  # Display the dataframe to verify the result


Unnamed: 0,PLAYER,SEASON,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,ALLSTAR
0,Luka Doncic,2022,DAL,24,50,28,22,36.5,33.3,11.2,...,8.1,3.7,1.5,0.5,2.7,58.5,27,10,2.4,1
1,Joel Embiid,2022,PHI,29,45,30,15,34.9,33.1,11.0,...,4.1,3.5,1.2,1.5,3.2,55.8,29,1,6.7,1
2,Giannis Antetokounmpo,2022,MIL,28,47,35,12,33.1,31.8,11.3,...,5.4,4.0,0.8,0.8,3.4,55.2,35,3,5.1,1
3,Damian Lillard,2022,POR,32,46,23,23,36.2,31.4,9.5,...,7.3,3.2,0.8,0.3,1.9,47.7,13,1,2.4,1
4,Shai Gilgeous-Alexander,2022,OKC,24,53,25,28,35.4,30.8,10.2,...,5.7,2.9,1.6,1.1,2.7,50.2,3,0,2.7,1
5,Jayson Tatum,2022,BOS,25,55,40,15,37.3,30.6,9.9,...,4.5,2.9,1.0,0.8,2.2,50.3,22,1,7.3,1
6,LeBron James,2022,LAL,38,45,22,23,36.3,30.0,11.5,...,7.0,3.2,1.0,0.6,1.6,52.0,16,1,2.8,1
7,Kevin Durant,2022,PHX,34,39,26,13,36.0,29.7,10.5,...,5.3,3.5,0.8,1.5,2.4,49.1,5,1,4.8,1
8,Stephen Curry,2022,GSW,35,38,20,18,34.5,29.4,9.8,...,6.4,3.2,1.0,0.4,2.2,47.6,11,1,4.3,1
9,Ja Morant,2022,MEM,23,48,31,17,32.7,27.3,9.7,...,8.3,3.5,1.0,0.3,1.8,47.4,17,5,4.9,1


In [9]:
df1.to_excel('/Users/ryansung/allstarsvm.xlsx', index=False)


In [18]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer



# Select the independent variables for the model
feature_columns = [
    'AGE', 'GP', 'W', 'L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%',
    'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF',
    'FP', 'DD2', 'TD3', '+/-'
]
X = df1[feature_columns]
y = df1['ALLSTAR']

# Remove rows with NaN values in the target variable
clean_indices = y.dropna().index
X_clean = X.loc[clean_indices]
y_clean = y.loc[clean_indices]

# Impute missing values in the features with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_clean_imputed = imputer.fit_transform(X_clean)

# Scale the features
scaler = StandardScaler()
X_clean_scaled = scaler.fit_transform(X_clean_imputed)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean_scaled, y_clean, test_size=0.2, random_state=42)

# Initialize the SVM classifier with an RBF kernel
svm_clf = SVC(kernel='rbf', gamma='scale')

# Fit the model on the training dataset
svm_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy on the test set:", accuracy)
print(classification_rep)


Accuracy on the test set: 0.9636363636363636
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       209
           1       0.71      0.45      0.56        11

    accuracy                           0.96       220
   macro avg       0.84      0.72      0.77       220
weighted avg       0.96      0.96      0.96       220



In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.1, 1, 10, 100],  # Kernel coefficient
    'kernel': ['rbf']  # We're focusing on the RBF kernel
}

# Initialize the SVM classifier
svm_clf = SVC()

# Initialize the GridSearchCV object
grid_search = GridSearchCV(svm_clf, param_grid, cv=5, scoring='balanced_accuracy', verbose=2)

# Perform grid search with the balanced training set
grid_search.fit(X_train_smote, y_train_smote)

# Get the best estimator
best_svm_clf = grid_search.best_estimator_

# Predict on the original, unsampled test set
y_pred_best = best_svm_clf.predict(X_test)

# Evaluate the best model
best_accuracy = accuracy_score(y_test, y_pred_best)
best_classification_rep = classification_report(y_test, y_pred_best)

print("Best model accuracy on the test set:", best_accuracy)
print(best_classification_rep)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.

[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.1s
[CV] END ........................C=100, gamma=10, kernel=rbf; total time=   0.1s
[CV] END ........................C=100, gamma=10, kernel=rbf; total time=   0.1s
[CV] END ........................C=100, gamma=10, kernel=rbf; total time=   0.1s
[CV] END ........................C=100, gamma=10, kernel=rbf; total time=   0.1s
[CV] END ........................C=100, gamma=10, kernel=rbf; total time=   0.1s
[CV] END .......................C=100, gamma=100, kernel=rbf; total time=   0.1s
[CV] END .......................C=100, gamma=100, kernel=rbf; total time=   0.1s
[CV] END .......................C=100, gamma=100, kernel=rbf; total time=   0.1s
[CV] END ...................

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, recall_score
from imblearn.pipeline import Pipeline

# Since we are more interested in increasing the recall for the minority class, we'll create a custom scorer.
# This scorer will focus on the recall for the All-Star class (which is labeled as '1').
recall_scorer = make_scorer(recall_score, pos_label=1)

# Define the parameter distribution for RandomizedSearchCV
param_distributions = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 'auto', 0.1, 1, 10, 100]
}

# Create an imblearn pipeline that first applies SMOTE and then fits an SVM.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True))
])

# Initialize RandomizedSearchCV with the pipeline and custom scorer.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,  # Number of parameter settings that are sampled, increase if computational resources allow
    scoring=recall_scorer,
    cv=5,
    verbose=2,
    random_state=42
)

# Perform the Randomized Search with the original, unsampled training data
random_search.fit(X_train, y_train)

# Get the best estimator
best_pipeline = random_search.best_estimator_

# Predict on the original, unsampled test set
y_pred_best = best_pipeline.predict(X_test)

# Evaluate the best model
best_recall = recall_score(y_test, y_pred_best, pos_label=1)
best_accuracy = accuracy_score(y_test, y_pred_best)
best_classification_rep = classification_report(y_test, y_pred_best)

print(best_classification_rep)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.1s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.1s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.1s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.1s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.1s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.3s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.3s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.3s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.3s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.3s
[CV] END .......................svc__C=0.1, svc__gamma=scale; total time=   0.0s
[CV] END .......................svc__C=0.1, svc_

In [17]:
# Adjusting the pipeline to use sklearn's Pipeline and handling class imbalance with class_weight parameter

from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Define the pipeline steps
pipeline_steps = [
    ('scaler', StandardScaler()),  # Feature scaling step
    ('svc', SVC(kernel='rbf', class_weight='balanced', probability=True))  # SVM with balanced class weights
]

# Create the pipeline
pipeline = Pipeline(pipeline_steps)

# Define the parameter distribution for RandomizedSearchCV
param_distributions = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 'auto', 0.1, 1, 10, 100]
}

# Custom scorer that focuses on the recall for the minority class
recall_scorer = make_scorer(recall_score, pos_label=1)

# Initialize RandomizedSearchCV with the pipeline and custom scorer
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring=recall_scorer,
    cv=5,
    verbose=2,
    random_state=42
)

# Perform the Randomized Search with the scaled training data
random_search.fit(X_train, y_train)

# Get the best estimator
best_pipeline = random_search.best_estimator_

# Predict on the original, unsampled test set
y_pred_best = best_pipeline.predict(X_test)

# Evaluate the best model
best_recall = recall_score(y_test, y_pred_best, pos_label=1)
best_accuracy = accuracy_score(y_test, y_pred_best)
best_classification_rep = classification_report(y_test, y_pred_best)

print(best_classification_rep)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.0s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.0s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.0s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.0s
[CV] END ...........................svc__C=1, svc__gamma=0.1; total time=   0.0s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.1s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.1s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.1s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.1s
[CV] END ...........................svc__C=10, svc__gamma=10; total time=   0.1s
[CV] END .......................svc__C=0.1, svc__gamma=scale; total time=   0.0s
[CV] END .......................svc__C=0.1, svc_

In [18]:
df_2023 = pd.read_csv('/Users/ryansung/allstarsvm2023.csv')

In [33]:
# Assuming df_2023 is your DataFrame for the 2023 players
# Preprocess the 2023 data (feature selection and scaling)
X_2023 = df_2023[feature_columns]  # Make sure you have the same columns as your training data
X_2023_scaled = scaler.transform(X_2023)  # Use the same scaler as for the training data

# Predict the 2023 All-Stars
y_2023_pred = best_pipeline.predict(X_2023_scaled)

# Add the predictions to the DataFrame for analysis
df_2023['ALLSTAR_Pred'] = y_2023_pred

# Display the players predicted to be All-Stars
predicted_allstars = df_2023[df_2023['ALLSTAR_Pred'] == 1]

# Show the results
predicted_allstars





Unnamed: 0,PLAYER,SEASON,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,ALLSTAR_Pred
0,Luka Doncic,2023,DAL,24,10,8,2,35.8,32.6,11.2,...,8.5,4.2,1.3,0.5,1.9,56.6,6,2,7.5,1
1,Joel Embiid,2023,PHI,29,9,8,1,33.7,32.4,11.0,...,5.7,3.8,0.7,2.1,2.4,59.5,7,0,7.8,1
2,Devin Booker,2023,PHX,27,2,1,1,36.1,31.5,11.5,...,10.5,5.5,0.5,0.0,4.0,52.3,1,0,1.5,1
3,De'Aaron Fox,2023,SAC,25,3,2,1,35.5,31.3,11.3,...,6.0,2.7,1.3,0.7,4.0,48.9,0,0,10.0,1
4,Stephen Curry,2023,GSW,35,11,6,5,32.5,30.7,9.5,...,3.9,3.7,1.0,0.2,1.5,42.0,0,0,0.0,1
5,Kevin Durant,2023,PHX,35,10,4,6,36.7,30.0,10.1,...,4.6,4.1,0.9,1.2,1.6,47.6,2,0,2.9,1
6,Nikola Jokic,2023,DEN,28,10,8,2,34.6,29.8,12.0,...,8.3,3.4,1.0,0.8,2.2,60.7,10,4,11.7,1
7,Donovan Mitchell,2023,CLE,27,8,4,4,35.9,29.5,10.6,...,5.5,2.6,2.3,0.6,2.4,49.9,0,0,3.6,1
8,Giannis Antetokounmpo,2023,MIL,28,10,6,4,32.5,29.5,11.0,...,3.6,4.3,0.9,1.3,2.9,48.7,6,0,2.5,1
9,Shai Gilgeous-Alexander,2023,OKC,25,9,6,3,35.6,29.4,11.2,...,6.0,2.3,2.1,0.8,2.2,53.2,2,0,5.4,1


In [34]:
df = predicted_allstars

In [45]:
# Filtering the DataFrame based on the specified conditions
filtered_df = df[(df['GP'] > 8) & (df['+/-'] > 0) & (df['W'] >= df['L'])]
filtered_df

Unnamed: 0,PLAYER,SEASON,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,ALLSTAR_Pred
0,Luka Doncic,2023,DAL,24,10,8,2,35.8,32.6,11.2,...,8.5,4.2,1.3,0.5,1.9,56.6,6,2,7.5,1
1,Joel Embiid,2023,PHI,29,9,8,1,33.7,32.4,11.0,...,5.7,3.8,0.7,2.1,2.4,59.5,7,0,7.8,1
6,Nikola Jokic,2023,DEN,28,10,8,2,34.6,29.8,12.0,...,8.3,3.4,1.0,0.8,2.2,60.7,10,4,11.7,1
8,Giannis Antetokounmpo,2023,MIL,28,10,6,4,32.5,29.5,11.0,...,3.6,4.3,0.9,1.3,2.9,48.7,6,0,2.5,1
9,Shai Gilgeous-Alexander,2023,OKC,25,9,6,3,35.6,29.4,11.2,...,6.0,2.3,2.1,0.8,2.2,53.2,2,0,5.4,1
10,Tyrese Maxey,2023,PHI,23,9,8,1,37.9,28.6,10.3,...,7.2,1.1,1.0,1.0,1.9,50.8,4,0,11.1,1
11,Anthony Edwards,2023,MIN,22,9,7,2,35.9,28.4,10.4,...,5.4,3.7,1.3,0.3,1.8,45.4,1,0,13.4,1
12,Jayson Tatum,2023,BOS,25,10,8,2,36.2,28.4,10.2,...,3.9,2.7,1.2,0.2,2.3,46.4,5,0,17.8,1
16,Trae Young,2023,ATL,25,9,5,4,35.7,24.4,6.7,...,10.2,4.2,1.6,0.0,1.4,43.7,6,0,1.1,1
20,Tyrese Haliburton,2023,IND,23,9,6,3,33.2,23.8,8.3,...,12.2,2.3,1.0,0.7,1.1,49.4,8,0,6.9,1
