<a href="https://colab.research.google.com/github/dylan-govender/COMP721-Machine-Learning-Project/blob/main/Game_Outcome_Prediction_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **A. Installations**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# files read later in this notebook are addressed through this mount point.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install xgboost



# **B. Imports**

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report

# **1. Load the NBA Data**

In [None]:
# All raw tables live in one Drive folder. Keeping the location in a single
# constant lets the notebook be pointed at another copy by editing one line.
DATA_DIR = '/content/drive/MyDrive/COMP721/Project/databasebasketball'


def _read_table(filename, **read_csv_kwargs):
    """Read one table from DATA_DIR into a DataFrame.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(f'{DATA_DIR}/{filename}', **read_csv_kwargs)


# Player Statistics
players = _read_table('players.txt')
player_regular_season = _read_table('player_regular_season.txt')
player_regular_season_career = _read_table('player_regular_season_career.txt')
player_playoffs = _read_table('player_playoffs.txt')
player_playoffs_career = _read_table('player_playoffs_career.txt')
player_allstar = _read_table('player_allstar.txt')

# Team Statistics
teams = _read_table('teams.txt')
team_season = _read_table('team_season.txt')

# Draft data is decoded as ISO-8859-1, and rows the parser cannot handle are
# skipped silently (on_bad_lines='skip') instead of raising.
draft = _read_table('draft.txt', encoding='ISO-8859-1', on_bad_lines='skip')

# Coach Statistics
coaches_season = _read_table('coaches_season.txt')
coaches_career = _read_table('coaches_career.txt')


# **2. Dataset Preprocessing**

## **2.1. Coach Career Data**

In [None]:
# Career win ratio = wins / (wins + losses), computed separately for the
# regular season and for the playoffs and stored as new columns.
season_games = coaches_career['season_win'] + coaches_career['season_loss']
coaches_career['season_win_ratio'] = coaches_career['season_win'].div(season_games)

playoff_games = coaches_career['playoff_win'] + coaches_career['playoff_loss']
coaches_career['playoff_win_ratio'] = coaches_career['playoff_win'].div(playoff_games)


## **2.2. Coach Season Data**

In [None]:
# Win ratio for every row of the coach-season table: wins / (wins + losses).
games_coached = coaches_season['season_win'] + coaches_season['season_loss']
coaches_season['win_ratio'] = coaches_season['season_win'].div(games_coached)

# Collapse to one row per (team, year) by taking the unweighted mean of each
# column over all rows that share that team and year.
coaching_aggregations = {
    column: 'mean' for column in ('win_ratio', 'playoff_win', 'playoff_loss')
}
team_coaching_stats = (
    coaches_season
    .groupby(['team', 'year'])
    .agg(coaching_aggregations)
    .reset_index()
)


## **2.3. Draft Data**

In [None]:
# A selection of 0 is a placeholder for "undrafted", not a real pick number,
# so it is turned into a missing value and thereby excluded from the mean.
# np.nan is used instead of None: depending on the pandas version, passing
# None as the replacement either triggers a forward-fill or converts the
# column to object dtype, and neither gives a clean numeric mean.
draft['selection'] = draft['selection'].replace(0, np.nan)

# Mean draft selection per team, pooled over every draft row for that team
# (the grouping key is the team alone, not team and year).
draft_positions = draft.groupby('team').agg({
    'selection': 'mean'
}).reset_index()


## **2.4. Player All-Star Data**

In [None]:
# Number of all-star rows for each (year, ilkid) pair.
# NOTE(review): the grouping key is `ilkid` (presumably the player id - TODO
# confirm), so this counts per player and year, not per team.
allstar_counts = (
    player_allstar
    .groupby(['year', 'ilkid'])
    .size()
    .reset_index(name='allstar_appearances')
)


## **2.5. Player Playoff and Regular Season Statistics**

In [None]:
# Statistic columns averaged per team. One shared list drives both the
# regular-season and the playoff table so the two stay column-for-column
# aligned.
player_stat_columns = [
    'pts', 'gp', 'oreb', 'dreb', 'reb', 'asts', 'stl', 'blk',
    'turnover', 'pf', 'fga', 'fgm', 'fta', 'ftm', 'tpa', 'tpm',
]
player_mean_spec = {column: 'mean' for column in player_stat_columns}

# One row per team: the mean of every row in the player table that carries
# that team value.
# NOTE(review): the grouping key is the team alone, so all years are pooled and
# each team gets a single set of averages (the saved full_data.head() output
# shows identical player columns for ATL 1968-1970).
player_regular_avg = (
    player_regular_season
    .groupby('team')
    .agg(player_mean_spec)
    .reset_index()
)

player_playoff_avg = (
    player_playoffs
    .groupby('team')
    .agg(player_mean_spec)
    .reset_index()
)


## **2.6. Team Regular Season Statistics**

In [None]:
# Fraction of games won for each row of the team-season table.
team_season['win_loss_ratio'] = team_season['won'] / (team_season['won'] + team_season['lost'])

# The o_* and d_* column families (presumably the team's own and its
# opponents' totals - TODO confirm) use the same suffixes, so the column list
# is generated instead of being spelled out twice. Order matters: it fixes the
# column order of team_stats.
box_score_suffixes = [
    'pts', 'fgm', 'fga', 'ftm', 'fta', 'oreb', 'dreb', 'reb',
    'asts', 'pf', 'stl', 'to', 'blk', '3pm', '3pa',
]
team_stat_columns = (
    ['win_loss_ratio']
    + [f'o_{suffix}' for suffix in box_score_suffixes]
    + [f'd_{suffix}' for suffix in box_score_suffixes]
)

# One row per (team, year) holding the mean of each selected column.
# (A previous three-column slice assigned to team_stats was dead code: it was
# overwritten by this aggregation on the very next statement.)
team_stats = (
    team_season
    .groupby(['team', 'year'])
    .agg({column: 'mean' for column in team_stat_columns})
    .reset_index()
)


# **3. Model Implementation**

## **3.1. Create Match-Up Data**

In [None]:
# Start from the per-(team, year) team statistics and left-join the other
# feature tables, so every team-season row is kept even when a lookup table has
# no match (the unmatched columns become NaN).
# Coaching stats are matched on (team, year). Draft and player averages are
# keyed by team only, so their values repeat across all seasons of a team.
full_data = (
    team_stats
    .merge(team_coaching_stats, how='left', on=['team', 'year'])
    .merge(draft_positions, how='left', on='team')
    .merge(player_regular_avg, how='left', on='team')
)




In [None]:
# Preview the first rows of the merged feature table.
full_data.head()

Unnamed: 0,team,year,win_loss_ratio,o_pts,o_fgm,o_fga,o_ftm,o_fta,o_oreb,o_dreb,...,stl,blk,turnover,pf,fga,fgm,fta,ftm,tpa,tpm
0,ANA,1967,0.320513,8704.0,3172.0,7606.0,2141.0,2916.0,0.0,0.0,...,0.0,0.0,79.210526,114.842105,406.263158,170.052632,153.421053,112.473684,36.578947,11.157895
1,AND,1949,0.578125,5589.0,1943.0,6254.0,1703.0,2343.0,0.0,0.0,...,0.0,0.0,0.0,112.875,390.875,121.4375,146.4375,106.4375,0.0,0.0
2,ATL,1968,0.585366,9123.0,3605.0,7844.0,1913.0,2785.0,0.0,0.0,...,38.331624,23.023932,61.225641,122.747009,442.929915,204.271795,146.697436,110.410256,32.095726,10.617094
3,ATL,1969,0.585366,9646.0,3817.0,7907.0,2012.0,2669.0,0.0,0.0,...,38.331624,23.023932,61.225641,122.747009,442.929915,204.271795,146.697436,110.410256,32.095726,10.617094
4,ATL,1970,0.439024,9348.0,3614.0,7779.0,2120.0,2975.0,0.0,0.0,...,38.331624,23.023932,61.225641,122.747009,442.929915,204.271795,146.697436,110.410256,32.095726,10.617094


In [None]:
# (rows, columns) of the merged feature table.
print(full_data.shape)

(1187, 53)


In [None]:
# Every column name of the merged feature table, in column order.
print(list(full_data.columns))

['team', 'year', 'win_loss_ratio', 'o_pts', 'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_oreb', 'o_dreb', 'o_reb', 'o_asts', 'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_3pm', 'o_3pa', 'd_pts', 'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk', 'd_3pm', 'd_3pa', 'win_ratio', 'playoff_win', 'playoff_loss', 'selection', 'pts', 'gp', 'oreb', 'dreb', 'reb', 'asts', 'stl', 'blk', 'turnover', 'pf', 'fga', 'fgm', 'fta', 'ftm', 'tpa', 'tpm']


## **3.2. Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Regression target.
y = full_data['o_pts']

# Columns left out of the feature matrix. What remains is the o_* family
# (without o_pts), the d_* family and the coaching win_ratio.
excluded_columns = [
    # the target itself
    'o_pts',
    # identifiers
    'team', 'year',
    # outcome / draft / playoff columns
    'win_loss_ratio', 'selection', 'playoff_win', 'playoff_loss',
    # per-team player averages
    'gp', 'pts', 'oreb', 'dreb', 'reb', 'asts', 'stl', 'blk', 'turnover', 'pf',
    'fga', 'fgm', 'fta', 'ftm', 'tpa', 'tpm',
]
X = full_data.drop(columns=excluded_columns)

# NOTE(review): in the saved results Linear Regression reaches R-squared 1.000
# with MAE 0.016, presumably because o_pts can be reconstructed arithmetically
# from o_fgm, o_ftm and o_3pm - TODO confirm and consider dropping those
# columns if the goal is a genuine prediction task.

# Split BEFORE imputing. Fitting the imputer on the full matrix would let
# test-set values influence the fill-in means used for the training rows.
# The row partition depends only on the row count and the seed, so it is the
# same partition as splitting an already-imputed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the column means from the training rows only, then apply them to both
# splits. (Alternative not used here: drop rows containing NaN instead of
# imputing, and subset y to the surviving index.)
imputer = SimpleImputer(strategy='mean')  # or 'median' / 'most_frequent'
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full feature matrix filled with the train-derived means; kept so that any
# later cell that still refers to X_imputed continues to work.
X_imputed = imputer.transform(X)


# **4. Model Evaluation**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import xgboost as xgb

# Candidate regressors, keyed by the display name used in the printed report
# (the evaluation loop also selects the scaled inputs by these exact names).
# Every estimator that accepts a seed gets random_state=42 so that a re-run
# reproduces the same scores. That now includes the decision tree, whose split
# selection is otherwise not guaranteed to be deterministic.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost Regressor': xgb.XGBRegressor(n_estimators=100, random_state=42),
    'Support Vector Regressor': SVR(kernel='rbf', C=1.0, epsilon=0.2),
}

# Standardise the features for the scale-sensitive models. The scaler learns
# its statistics from the training split only, and the test split is
# transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split, score it on the test split and print a report.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object is trained after this call.
    X_train, X_test
        Feature matrices for fitting and for prediction.
    y_train, y_test
        Targets matching ``X_train`` and ``X_test``.
    model_name : str
        Label printed as the header of the report.

    Returns
    -------
    dict
        The computed metrics under the keys ``'mae'``, ``'mse'``, ``'rmse'``,
        ``'r2'`` and ``'evs'``, so callers can collect results for comparison
        instead of reading them off the printed output.
    """
    # Train the model, then predict on the held-out data.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics.
    mse = mean_squared_error(y_test, y_pred)
    metrics = {
        'mae': mean_absolute_error(y_test, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_test, y_pred),
        'evs': explained_variance_score(y_test, y_pred),
    }

    # Print the results.
    print(f"{model_name} Performance:")
    print(f"Mean Absolute Error: {metrics['mae']:.3f}")
    print(f"Mean Squared Error: {metrics['mse']:.3f}")
    print(f"Root Mean Squared Error: {metrics['rmse']:.3f}")
    print(f"R-squared: {metrics['r2']:.3f}")
    print(f"Explained Variance Score: {metrics['evs']:.3f}")
    print("-" * 30)

    return metrics

# Models listed here are fitted on the standardised features; every other
# model is fitted on the unscaled features.
scaled_input_models = ('Linear Regression', 'Support Vector Regressor')

# Fit and report each candidate in the order it appears in `models`.
for model_name, model in models.items():
    if model_name in scaled_input_models:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test
    evaluate_model(model, train_features, test_features, y_train, y_test, model_name)

Linear Regression Performance:
Mean Absolute Error: 0.016
Mean Squared Error: 0.017
Root Mean Squared Error: 0.130
R-squared: 1.000
Explained Variance Score: 1.000
------------------------------
Decision Tree Performance:
Mean Absolute Error: 124.903
Mean Squared Error: 24497.374
Root Mean Squared Error: 156.516
R-squared: 0.984
Explained Variance Score: 0.984
------------------------------
Random Forest Performance:
Mean Absolute Error: 68.991
Mean Squared Error: 9098.551
Root Mean Squared Error: 95.386
R-squared: 0.994
Explained Variance Score: 0.994
------------------------------
Gradient Boosting Performance:
Mean Absolute Error: 49.503
Mean Squared Error: 4272.620
Root Mean Squared Error: 65.365
R-squared: 0.997
Explained Variance Score: 0.997
------------------------------
XGBoost Regressor Performance:
Mean Absolute Error: 56.918
Mean Squared Error: 5960.682
Root Mean Squared Error: 77.205
R-squared: 0.996
Explained Variance Score: 0.996
------------------------------
Support Ve