In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [2]:
# Load dataset
data = pd.read_excel('Dataset.xlsx', engine='openpyxl')
data.head()

Unnamed: 0,Team,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,GSW,GSW vs. PHX,10/24/2023,L,240,104,36,101,35.6,10,...,78.6,18,31,49,19,11,6,11,23,-4
1,PHX,PHX @ GSW,10/24/2023,W,240,108,42,95,44.2,11,...,76.5,17,43,60,23,5,7,19,22,4
2,LAL,LAL @ DEN,10/24/2023,L,240,107,41,90,45.6,10,...,75.0,13,31,44,23,5,4,12,18,-12
3,DEN,DEN vs. LAL,10/24/2023,W,240,119,48,91,52.7,14,...,75.0,9,33,42,29,9,6,12,15,12
4,MEM,MEM vs. NOP,10/25/2023,L,240,104,38,91,41.8,12,...,80.0,8,29,37,23,8,7,13,19,-7


Manipulate Data:

In [3]:
# Convert game date to datetime
data['Game Date'] = pd.to_datetime(data['Game Date'], format='%m/%d/%Y')

# Extract home and guest teams
data['Home Team'] = data['Match Up'].apply(lambda x: x.split(' vs. ')[0] if 'vs.' in x else x.split(' @ ')[1])
data['Guest Team'] = data['Match Up'].apply(lambda x: x.split(' vs. ')[1] if 'vs.' in x else x.split(' @ ')[0])

# Add the correct Label column: 1 if home team won, 0 otherwise
data['Label'] = data.apply(
    lambda row: 1 if ((row['Team'] == row['Home Team']) and (row['W/L'] == 'W')) 
                    or ((row['Team'] == row['Guest Team']) and (row['W/L'] == 'L'))
                else 0,
    axis=1
)
data.head()

Unnamed: 0,Team,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,REB,AST,STL,BLK,TOV,PF,+/-,Home Team,Guest Team,Label
0,GSW,GSW vs. PHX,2023-10-24,L,240,104,36,101,35.6,10,...,49,19,11,6,11,23,-4,GSW,PHX,0
1,PHX,PHX @ GSW,2023-10-24,W,240,108,42,95,44.2,11,...,60,23,5,7,19,22,4,GSW,PHX,0
2,LAL,LAL @ DEN,2023-10-24,L,240,107,41,90,45.6,10,...,44,23,5,4,12,18,-12,DEN,LAL,1
3,DEN,DEN vs. LAL,2023-10-24,W,240,119,48,91,52.7,14,...,42,29,9,6,12,15,12,DEN,LAL,1
4,MEM,MEM vs. NOP,2023-10-25,L,240,104,38,91,41.8,12,...,37,23,8,7,13,19,-7,MEM,NOP,0


Calculating bias for later adding Home Advatange feature:

In [4]:
# Create a binary feature for home games
data['Is_Home'] = data['Match Up'].str.contains(' vs. ').astype(int)

# Prepare features and labels
X = data[['Is_Home']]  # Home/Away indicator
y = (data['W/L'] == 'W').astype(int)  # Convert W/L to binary outcome

# Fit a linear regression model
model = LinearRegression()
model.fit(X, y)

# The coefficient of 'Is_Home' represents the home advantage bias
bias = model.coef_[0]
print(f"Calculated Bias (Regression): {bias}")
new_bias = 1/21 * bias

# Remove the 'Is_Home' column
data = data.drop(columns=['Is_Home'])

Calculated Bias (Regression): 0.08617886178861783


In [None]:
# dataset with new feature: stability score
# Initialize the new dataset
new_dataset = []

# Iterate over each match-up
for _, row in data.iterrows():
    game_date = row['Game Date']
    home_team = row['Home Team']
    guest_team = row['Guest Team']
    
    # Filter games before the current game date for both teams
    home_team_games = data[(data['Team'] == home_team) & (data['Game Date'] < game_date)]
    guest_team_games = data[(data['Team'] == guest_team) & (data['Game Date'] < game_date)]
    
    # Calculate W/L rates (win rates) for both teams
    home_team_wins = (home_team_games['W/L'] == 'W').sum()
    home_team_total = len(home_team_games)
    guest_team_wins = (guest_team_games['W/L'] == 'W').sum()
    guest_team_total = len(guest_team_games)
    
    home_win_rate = home_team_wins / home_team_total if home_team_total > 0 else 0
    guest_win_rate = guest_team_wins / guest_team_total if guest_team_total > 0 else 0
    
    # Calculate W/L difference
    wl_difference = home_win_rate - guest_win_rate
    
    # Calculate Stability using mean and variance
    if not home_team_games.empty:
        home_mean_performance = home_team_games.iloc[:, 4:].mean(numeric_only=True)
        home_variance_performance = home_team_games.iloc[:, 4:].var(numeric_only=True)
        home_stability = (home_mean_performance / (home_variance_performance + 1e-6)).mean()
        home_stability = min(max(home_stability, -100), 100)
    else:
        home_stability = 0

    if not guest_team_games.empty:
        guest_mean_performance = guest_team_games.iloc[:, 4:].mean(numeric_only=True)
        guest_variance_performance = guest_team_games.iloc[:, 4:].var(numeric_only=True)
        guest_stability = (guest_mean_performance / (guest_variance_performance + 1e-6)).mean()
        guest_stability = min(max(guest_stability, -100), 100)  # Limit stability within [-100, 100]
    else:
        guest_stability = 0

    # Stability difference
    stability = home_stability - guest_stability
    
    # Prepare the new row
    new_row = {
        'Game Date': game_date,
        'Home Team': home_team,
        'Guest Team': guest_team,
        'Label': int(row['Label']),  # Ensure Label is directly copied and kept as integer
        'W/L Difference': wl_difference,
        'Stability': stability
    }
    
    # Add statistics differences (like in the original approach)
    home_team_stats = home_team_games.iloc[:, 4:].mean(numeric_only=True)
    guest_team_stats = guest_team_games.iloc[:, 4:].mean(numeric_only=True)
    
    if not home_team_stats.empty and not guest_team_stats.empty:
        stats_diff = home_team_stats - guest_team_stats
        # Update new_row but ensure no conflict with 'Label'
        new_row.update({k: v for k, v in stats_diff.to_dict().items() if k != 'Label'})
    
    new_dataset.append(new_row)
# Convert to DataFrame
new_dataset_df_1 = pd.DataFrame(new_dataset)

# Save to file for inspection
output_file = 'output_dataset/Dataset_With_Stability.xlsx'
new_dataset_df_1.to_excel(output_file, index=False)

output_file

In [5]:
def standardize_match(row):
    teams = sorted([row['Home Team'], row['Guest Team']])
    standardized_row = {
        'Standard Home Team': teams[0],
        'Standard Guest Team': teams[1],
        'Game Date': row['Game Date'],
        'W/L': 'W' if (row['Home Team'] == teams[0] and row['W/L'] == 'W') or
                       (row['Guest Team'] == teams[0] and row['W/L'] == 'L') else 'L'
    }
    return standardized_row

In [6]:
# Initialize the new dataset
new_dataset = []

# Iterate over each match-up
for _, row in data.iterrows():
    game_date = row['Game Date']
    home_team = row['Home Team']
    guest_team = row['Guest Team']
    
    # Filter games before the current game date for both teams
    home_team_games = data[(data['Team'] == home_team) & (data['Game Date'] < game_date)]
    guest_team_games = data[(data['Team'] == guest_team) & (data['Game Date'] < game_date)]

    # Calculate home team's historical home win rate
    home_team_home_games = home_team_games[home_team_games['Match Up'].str.contains('vs.')]
    home_team_home_wins = (home_team_home_games['W/L'] == 'W').sum()
    home_team_home_total = len(home_team_home_games)
    home_team_home_win_rate = home_team_home_wins / home_team_home_total if home_team_home_total > 0 else 0

    # Calculate guest team's historical away win rate
    guest_team_away_games = guest_team_games[guest_team_games['Match Up'].str.contains('@')]
    guest_team_away_wins = (guest_team_away_games['W/L'] == 'W').sum()
    guest_team_away_total = len(guest_team_away_games)
    guest_team_away_win_rate = guest_team_away_wins / guest_team_away_total if guest_team_away_total > 0 else 0

    home_away_win_rate_diff = home_team_home_win_rate - guest_team_away_win_rate

    # Filter previous competitions between home and guest teams
    previous_matches = data[
        ((data['Home Team'] == home_team) & (data['Guest Team'] == guest_team)) |
        ((data['Home Team'] == guest_team) & (data['Guest Team'] == home_team))
    ]
    previous_matches = previous_matches[previous_matches['Game Date'] < game_date]

    standardized_matches = previous_matches.apply(standardize_match, axis=1, result_type='expand')

    standardized_matches = standardized_matches.drop_duplicates(subset=['Game Date', 'Standard Home Team', 'Standard Guest Team'])

    # Calculate Previous Competitions feature (only direct matches between home and guest teams)
    previous_competitions_score = 0
    for _, match in standardized_matches.iterrows():
        if match['Standard Home Team'] == home_team and match['W/L'] == 'W':
            previous_competitions_score += 10
        elif match['Standard Home Team'] == home_team and match['W/L'] == 'L':
            previous_competitions_score -= 10
    
    # Calculate W/L rates (win rates) for both teams
    home_team_wins = (home_team_games['W/L'] == 'W').sum()
    home_team_total = len(home_team_games)
    guest_team_wins = (guest_team_games['W/L'] == 'W').sum()
    guest_team_total = len(guest_team_games)
    
    home_win_rate = home_team_wins / home_team_total if home_team_total > 0 else 0
    guest_win_rate = guest_team_wins / guest_team_total if guest_team_total > 0 else 0
    
    # Calculate W/L difference
    wl_difference = home_win_rate - guest_win_rate
    
    # Calculate Stability using mean and variance
    if not home_team_games.empty:
        home_mean_performance = home_team_games.iloc[:, 4:].mean(numeric_only=True)
        home_variance_performance = home_team_games.iloc[:, 4:].var(numeric_only=True)
        home_stability = (home_mean_performance / (home_variance_performance + 1e-6)).mean()
        home_stability = min(max(home_stability, -100), 100)
    else:
        home_stability = 0

    if not guest_team_games.empty:
        guest_mean_performance = guest_team_games.iloc[:, 4:].mean(numeric_only=True)
        guest_variance_performance = guest_team_games.iloc[:, 4:].var(numeric_only=True)
        guest_stability = (guest_mean_performance / (guest_variance_performance + 1e-6)).mean()
        guest_stability = min(max(guest_stability, -100), 100)  # Limit stability within [-100, 100]
    else:
        guest_stability = 0

    # Stability difference
    stability = home_stability - guest_stability
    
    # Prepare the new row
    new_row = {
        'Game Date': game_date,
        'Home Team': home_team,
        'Guest Team': guest_team,
        'Label': int(row['Label']),  # Ensure Label is directly copied and kept as integer
        'W/L Difference': wl_difference,
        'Stability': stability,
        'Previous Competitions': previous_competitions_score,  # Add new feature
        'Home-Away Win Rate Difference': home_away_win_rate_diff
    }
    
    # Add statistics differences (like in the original approach)
    home_team_stats = home_team_games.iloc[:, 4:].mean(numeric_only=True)
    guest_team_stats = guest_team_games.iloc[:, 4:].mean(numeric_only=True)
    
    if not home_team_stats.empty and not guest_team_stats.empty:
        stats_diff = home_team_stats - guest_team_stats
        # Update new_row but ensure no conflict with 'Label'
        new_row.update({k: v for k, v in stats_diff.to_dict().items() if k != 'Label'})
    
    new_dataset.append(new_row)

# Convert to DataFrame
new_dataset_df_1 = pd.DataFrame(new_dataset)

# Save to file for inspection
output_file = 'output_dataset/Dataset_with_Stability_and_Previous_Competitions_Corrected.xlsx'
new_dataset_df_1.to_excel(output_file, index=False)

output_file

'output_dataset/Dataset_with_Stability_and_Previous_Competitions_Corrected.xlsx'

In [7]:
# Remove rows with NaN values and drop duplicate rows
dataset_df_1 = new_dataset_df_1.dropna().drop_duplicates()

# Save to file for inspection
output_file = 'output_dataset/cleaned_final_dataset_with_features.xlsx'
dataset_df_1.to_excel(output_file, index=False)
output_file

'output_dataset/cleaned_final_dataset_with_features.xlsx'

In [8]:
X = dataset_df_1.drop(columns=['Label', 'Game Date', 'Home Team', 'Guest Team']) # Features: all columns except 'Label'
y = dataset_df_1['Label']  # Labels: the 'Label' column
X = X[300:]
y = y[300:]
X.head()

Unnamed: 0,W/L Difference,Stability,Previous Competitions,Home-Away Win Rate Difference,MIN,PTS,FGM,FGA,FG%,3PM,...,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
663,0.090909,98.479197,0,0.133333,-1.136364,3.227273,-0.318182,-0.727273,-0.059091,-1.045455,...,6.590909,0.409091,-1.454545,-1.045455,0.090909,0.636364,0.545455,0.636364,2.681818,4.227273
668,-0.25,-98.72691,10,0.10989,4.166667,-5.0,-3.666667,0.541667,-4.370833,0.541667,...,0.625,0.958333,-2.375,-1.416667,-6.5,1.166667,-0.75,-0.666667,0.75,-7.25
669,0.090909,-0.281936,0,0.0,1.136364,-0.454545,-0.272727,-4.454545,2.072727,-2.409091,...,1.727273,-2.454545,-0.909091,-3.363636,-1.772727,0.181818,3.409091,-0.772727,-2.727273,1.727273
670,0.196687,-0.199775,0,0.363636,1.293996,6.763975,1.024845,1.635611,0.198137,4.488613,...,-0.21118,-0.449275,2.567288,2.118012,-0.05176,-1.515528,0.939959,-0.708075,-1.451346,8.258799
671,-0.073593,0.256473,10,0.181818,-1.244589,-3.186147,-0.367965,-2.19697,0.801948,-3.190476,...,-0.127706,-0.04329,0.181818,0.138528,-2.71645,1.580087,1.274892,-0.437229,-0.588745,3.757576


Using data standardization and L1 regularization to complete Feature Selection.

In [9]:
# Step 1: Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)  # Standardize X to have zero mean and unit variance

# Step 2: Fit Logistic Regression with L1 regularization for binary model
lasso_log_reg = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)  # L1 penalty for feature selection
lasso_log_reg.fit(X_standardized, y)

# Step 3: Identify selected features
selected_features = X.columns[(lasso_log_reg.coef_ != 0).flatten()]  # Keep only features with non-zero coefficients
X_selected = X[selected_features]  # Subset original X with selected features

print(f"Selected features: {list(selected_features)}")
print(X_selected.head())

Selected features: ['Stability', 'Previous Competitions', 'Home-Away Win Rate Difference', 'MIN', 'FGA', 'FG%', '3PA', '3P%', 'FTM', 'FTA', 'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-']
     Stability  Previous Competitions  Home-Away Win Rate Difference  \
663  98.479197                      0                       0.133333   
668 -98.726910                     10                       0.109890   
669  -0.281936                      0                       0.000000   
670  -0.199775                      0                       0.363636   
671   0.256473                     10                       0.181818   

          MIN       FGA       FG%        3PA       3P%       FTM       FTA  \
663 -1.136364 -0.727273 -0.059091  -2.818182 -0.559091  4.909091  6.590909   
668  4.166667  0.541667 -4.370833   1.541667  0.212500  1.791667  0.625000   
669  1.136364 -4.454545  2.072727  -8.045455  1.759091  2.500000  1.727273   
670  1.293996  1.635611  0.198137  10.296066  2.087992  0.225673 -0.211180

Training & Testing using 5-fold cv (including Random Forest, Logistic Regression, Decision Tree, AdaBoost, and QDA):

In [10]:
# Ensure data is shuffled before splitting
X_shuffled, y_shuffled = shuffle(X_selected, y, random_state=0)

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=0),
    'Logistic Regression': LogisticRegression(random_state=0, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'AdaBoost': AdaBoostClassifier(random_state=0),
    'QDA': QuadraticDiscriminantAnalysis()
}

# Initialize K-Fold with shuffle
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Store accuracies for each model
model_accuracies = {model_name: [] for model_name in models}

for train_index, test_index in kf.split(X_shuffled):
    # Split the data into training and testing sets
    X_train, X_test = X_shuffled.iloc[train_index], X_shuffled.iloc[test_index]
    y_train, y_test = y_shuffled.iloc[train_index], y_shuffled.iloc[test_index]
    
    for model_name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = model.predict(X_test)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        model_accuracies[model_name].append(accuracy)

# Print results
for model_name, accuracies in model_accuracies.items():
    print(f"{model_name} Accuracy for each fold: {accuracies}")
    print(f"{model_name} Mean accuracy: {sum(accuracies) / len(accuracies):.2f}")

Random Forest Accuracy for each fold: [0.6166666666666667, 0.6222222222222222, 0.6089385474860335, 0.6201117318435754, 0.6703910614525139]
Random Forest Mean accuracy: 0.63
Logistic Regression Accuracy for each fold: [0.6, 0.6611111111111111, 0.6536312849162011, 0.6089385474860335, 0.7206703910614525]
Logistic Regression Mean accuracy: 0.65
Decision Tree Accuracy for each fold: [0.5944444444444444, 0.5888888888888889, 0.5418994413407822, 0.5921787709497207, 0.5977653631284916]
Decision Tree Mean accuracy: 0.58
AdaBoost Accuracy for each fold: [0.6222222222222222, 0.6444444444444445, 0.5977653631284916, 0.5810055865921788, 0.6927374301675978]
AdaBoost Mean accuracy: 0.63
QDA Accuracy for each fold: [0.5944444444444444, 0.6, 0.6033519553072626, 0.6256983240223464, 0.659217877094972]
QDA Mean accuracy: 0.62


In [11]:
# Step 1: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=8)

X_train = X_train[300:]
y_train = y_train[300:]

# Step 2: Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Build the neural network model
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),  # First hidden layer
    Dropout(0.3),  # Dropout for regularization
    Dense(32, activation='relu'),  # Second hidden layer
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Step 4: Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Step 6: Evaluate the model on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)  # Convert probabilities to binary predictions
accuracy = accuracy_score(y_test, y_pred)

print(f"Neural Network Test Accuracy: {accuracy:.2f}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78