In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pygam import LinearGAM, s
from sklearn.model_selection import train_test_split

In [2]:
df_2024 = pd.read_csv(
    "/Users/danishmak/Documents/Data Engineering/Final_Project_DE/archive/cbb23.csv"
)

In [3]:
df_train = pd.read_csv(
    "/Users/danishmak/Documents/Data Engineering/Final_Project_DE/archive/cbb22.csv"
)

In [4]:
df_train.columns

Index(['TEAM', 'CONF', 'G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O',
       'EFGD_D', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D',
       '3P_O', '3P_D', 'ADJ_T', 'WAB', 'POSTSEASON', 'SEED'],
      dtype='object')

In [5]:
df_train["G"]

0      32
1      40
2      34
3      37
4      39
       ..
353    25
354    32
355    25
356    29
357    27
Name: G, Length: 358, dtype: int64

In [6]:
df_train = df_train.replace(np.nan, 0)
df_2024 = df_2024.replace(np.nan, 0)

# Extract relevant features and target variable from the 2023 data
X_train = df_train[
    [
        "G",
        "ADJOE",
        "ADJDE",
        "BARTHAG",
        "TOR",
        "TORD",
        "ORB",
        "DRB",
        "FTR",
        "FTRD",
        "2P_O",
        "2P_D",
        "3P_O",
        "3P_D",
        "ADJ_T",
        "WAB",
        "SEED",
    ]
]
y_train = df_train["W"]

# Build and fit the GAM model on 2023 data
gam = LinearGAM(
    s(0)
    + s(1)
    + s(2)
    + s(3)
    + s(4)
    + s(5)
    + s(6)
    + s(7)
    + s(8)
    + s(9)
    + s(10)
    + s(11)
    + s(12)
    + s(13)
    + s(14)
)
gam.fit(X_train, y_train)

# Prepare 2024 data and predict team wins
X_2024 = df_2024[
    [
        "G",
        "ADJOE",
        "ADJDE",
        "BARTHAG",
        "TOR",
        "TORD",
        "ORB",
        "DRB",
        "FTR",
        "FTRD",
        "2P_O",
        "2P_D",
        "3P_O",
        "3P_D",
        "ADJ_T",
        "WAB",
        "SEED",
    ]
]
teams_2024 = df_2024["TEAM"]  # Ensure TEAM column is present for ranking

# Predict on the 2024 data
y_pred_2024 = gam.predict(X_2024)

# Combine predictions with team names
results_2024 = pd.DataFrame({"TEAM": teams_2024, "Predicted_Wins": y_pred_2024})

# Sort by predicted wins in descending order
ranked_results_2023 = results_2024.sort_values(
    by="Predicted_Wins", ascending=False
).reset_index(drop=True)

# Print ranked teams with predicted wins for 2024
print(ranked_results_2023)

             TEAM  Predicted_Wins
0            UCLA       31.169957
1         Alabama       31.098605
2         Houston       30.886264
3     Connecticut       30.203810
4             UAB       29.828762
..            ...             ...
358  LIU Brooklyn        3.664749
359     Green Bay        3.330791
360  Delaware St.        2.816891
361   Florida A&M        1.865045
362      Hartford        0.619251

[363 rows x 2 columns]


In [7]:
ranked_results_2023

Unnamed: 0,TEAM,Predicted_Wins
0,UCLA,31.169957
1,Alabama,31.098605
2,Houston,30.886264
3,Connecticut,30.203810
4,UAB,29.828762
...,...,...
358,LIU Brooklyn,3.664749
359,Green Bay,3.330791
360,Delaware St.,2.816891
361,Florida A&M,1.865045


In [9]:
import pandas as pd
import numpy as np
from pygam import LinearGAM, s

# Team stats for Duke and Kentucky
team_1_stats = {"NetRtg": 27.77, "ORtg": 119.9, "DRtg": 92.1, "ADJ_T": 70}  # Duke
team_2_stats = {"NetRtg": 17.42, "ORtg": 115.7, "DRtg": 98.3, "ADJ_T": 68}  # Kentucky

# Compute differential statistics for model input
NetRtg_diff = team_1_stats["NetRtg"] - team_2_stats["NetRtg"]
ORtg_diff = team_1_stats["ORtg"] - team_2_stats["ORtg"]
DRtg_diff = team_1_stats["DRtg"] - team_2_stats["DRtg"]

# Prepare the input with 17 features, filling only the relevant columns
X_game = pd.DataFrame(
    np.zeros((1, 17)),
    columns=[
        "G",
        "ADJOE",
        "ADJDE",
        "BARTHAG",
        "TOR",
        "TORD",
        "ORB",
        "DRB",
        "FTR",
        "FTRD",
        "2P_O",
        "2P_D",
        "3P_O",
        "3P_D",
        "ADJ_T",
        "WAB",
        "SEED",
    ],
)

# Assign differential values to the selected columns
X_game["BARTHAG"] = NetRtg_diff
X_game["ADJOE"] = ORtg_diff
X_game["ADJDE"] = DRtg_diff

# Predict outcome and get confidence intervals
game_outcome_pred = gam.predict(X_game)
confidence_interval = gam.prediction_intervals(X_game, width=0.95)

# Forecast points for each team using ORtg and pace (approximated by ADJ_T)
# Estimated possessions for the game
avg_pace = (team_1_stats["ADJ_T"] + team_2_stats["ADJ_T"]) / 2

# Points scored estimates (Points = (ORtg * possessions) / 100)
team_1_points = (team_1_stats["ORtg"] * avg_pace) / 100
team_2_points = (team_2_stats["ORtg"] * avg_pace) / 100

# Display results
print(
    f"Predicted Winner: {'Team 1 (Duke)' if game_outcome_pred[0] > 0.5 else 'Team 2 (Kentucky)'}"
)
# print(f"Confidence Interval for Prediction: {confidence_interval[0]}")
print(f"Estimated Points: Duke {team_1_points:.2f} - Kentucky {team_2_points:.2f}")

Predicted Winner: Team 1 (Duke)
Confidence Interval for Prediction: [-301.29790414 1169.75581132]
Estimated Points: Duke 82.73 - Kentucky 79.83
