In [1]:
import pandas as pd
import numpy as np
import re
import warnings

warnings.filterwarnings("ignore")

# Game results; the raw result text lives in the `Game` column and is parsed
# by the next cell.
games = pd.read_csv("predict/games_2024_2025.csv")

# Ratings table: discard rows with no team name, then discard rows whose `Rk`
# cell is the literal string "Rk" (presumably header lines repeated inside the
# scraped file -- verify against the CSV).
teams = (
    pd.read_csv("predict/kenpom_2024_2025.csv")
    .dropna(subset="Team")
    .loc[lambda ratings: ratings['Rk'] != "Rk"]
)

In [2]:
# Result strings have the shape "<team> <score>, [<number> ]<team> <score>".
# The optional `(?:\d+\s)?` skips a numeric prefix in front of the second team
# name (presumably a poll ranking -- confirm against the data).
pattern = r"(\D+?)\s(\d+),\s(?:\d+\s)?(\D+?)\s(\d+)"

# Keys of the Series returned by `extract_teams_scores`, in output order.
_SCORE_FIELDS = ('Home Team', 'Home Score', 'Away Team', 'Away Score')


def extract_teams_scores(text):
    """Split one result string into two team names and two integer scores.

    The first team/score pair found is stored under the ``Home`` keys and the
    second under the ``Away`` keys. NOTE(review): whether the first-listed team
    really is the home team depends on how the source file is written --
    confirm.

    Parameters
    ----------
    text : str or any
        A single cell of the ``Game`` column. Non-string values (for example
        the NaN pandas produces for an empty CSV cell) are treated as
        "no match" instead of raising ``TypeError`` inside ``re.search``.

    Returns
    -------
    pandas.Series
        Indexed by 'Home Team', 'Home Score', 'Away Team', 'Away Score'.
        All four values are ``None`` when the text does not match `pattern`.
    """
    match = re.search(pattern, text) if isinstance(text, str) else None
    if match is None:
        return pd.Series(dict.fromkeys(_SCORE_FIELDS))

    return pd.Series({
        'Home Team': match.group(1).strip(),
        'Home Score': int(match.group(2)),
        'Away Team': match.group(3).strip(),
        'Away Score': int(match.group(4))
    })

# Parse every entry of the 'Game' column into team/score columns.
games[['Home Team', 'Home Score', 'Away Team', 'Away Score']] = games['Game'].apply(extract_teams_scores)

# Re-expose the parsed fields under snake_case names and add the combined score.
games = games.assign(
    home_score=games['Home Score'],
    home_team=games['Home Team'],
    away_team=games['Away Team'],
    away_score=games['Away Score'],
)
games['total_score'] = games['away_score'] + games['home_score']

# Only the date and the snake_case fields are carried forward.
games = games[["date", "home_score", "home_team", "away_score", "away_team", "total_score"]]

In [3]:
teams['date'] = pd.to_datetime(teams['date'])

# Order each team's rows chronologically so the row-based rolling windows
# below always look backwards in time.
teams = teams.sort_values(by=['Team', 'date']).reset_index(drop=True)

# Define columns for rolling calculations
numeric_columns = ['Rk', 'AdjEM', 'AdjO', 'AdjO_Rk', 'AdjD', 'AdjD_Rk', 'AdjT', 'AdjT_Rk']

# Define rolling windows (counted in rows per team, not calendar days)
rolling_windows = [1, 3, 5, 10, 20]

# Convert once, up front: the conversion does not depend on the window, so
# repeating it inside the window loop only redid the same work five times.
for col in numeric_columns:
    teams[col] = pd.to_numeric(teams[col])

# Create rolling features: per-team mean over the last `window` rows, current
# row included; min_periods=1 lets short histories still produce a value.
for window in rolling_windows:
    for col in numeric_columns:
        teams[f'{col}_rolling_{window}'] = (
            teams.groupby('Team')[col]
            .transform(lambda x: x.rolling(window, min_periods=1).mean())
        )

# Date whose ratings rows feed the predictions.
# NOTE(review): hard-coded, and the export cell writes a file named
# predictions-2024-11-20.csv -- confirm the three-day gap is intentional.
today = "2024-11-17"

today_teams = teams[teams['date'] == today]

In [4]:
from datetime import timedelta
# Both frames are joined on `date` further down, so make sure the column is
# datetime-typed on each side.
games = games.assign(date=pd.to_datetime(games['date']))
teams = teams.assign(date=pd.to_datetime(teams['date']))
# Function to merge games with rankings using date and team names with date tolerance
def merge_with_tolerance(games_df, rankings_df, days_tolerance=2):
    """Attach home- and away-team ranking rows to every game.

    Each game is first joined to the ranking row with the same date and team.
    For a side that finds no ranking on the game date, the ranking row of the
    same team from 1, 2, ... ``days_tolerance`` days earlier is used instead;
    the nearest earlier day wins.

    The previous fallback looked up *other games* played on the earlier dates
    and pushed them back with ``DataFrame.update``. That aligns on the
    positional index and on unsuffixed column names, so it never filled
    ``Rk_home`` / ``Rk_away`` and could overwrite the game fields of unrelated
    rows. This version only ever writes ranking columns of the unmatched row.

    Parameters
    ----------
    games_df : pandas.DataFrame
        Needs datetime column ``date`` plus ``home_team`` and ``away_team``.
    rankings_df : pandas.DataFrame
        Needs datetime column ``date`` plus ``Team`` and ``Rk``. Every column
        except ``date`` is copied onto the game with a ``_home`` / ``_away``
        suffix.
    days_tolerance : int, default 2
        How many days back to search when the game date has no ranking row.
        ``0`` disables the fallback.

    Returns
    -------
    pandas.DataFrame
        One row per game (given unique ``(date, Team)`` keys in
        ``rankings_df``), game columns unchanged, ranking columns suffixed
        ``_home`` and ``_away``. Sides with no ranking inside the tolerance
        window keep NaN.
    """
    # Exact-date matches. The first merge adds the home side's ranking columns
    # unsuffixed; the second merge then suffixes the overlap as _home / _away.
    merged_df = games_df.merge(rankings_df, how='left', left_on=['date', 'home_team'], right_on=['date', 'Team'])
    merged_df = merged_df.merge(rankings_df, how='left', left_on=['date', 'away_team'], right_on=['date', 'Team'], suffixes=('_home', '_away'))

    # One ranking row per (date, Team) so a fallback hit can never fan out.
    lookup = rankings_df.drop_duplicates(subset=['date', 'Team'], keep='last')
    ranking_cols = [col for col in rankings_df.columns if col != 'date']

    for side in ('home', 'away'):
        team_col = f'{side}_team'
        for days_back in range(1, days_tolerance + 1):
            # A side counts as unmatched while its rank is still missing.
            unmatched = merged_df[f'Rk_{side}'].isna() & merged_df[team_col].notna()
            if not unmatched.any():
                break

            # Shift only the lookup key; the game's own `date` is left alone.
            wanted = merged_df.loc[unmatched, ['date', team_col]].copy()
            wanted['date'] = wanted['date'] - timedelta(days=days_back)
            wanted['_game_row'] = wanted.index

            found = wanted.merge(
                lookup, how='inner',
                left_on=['date', team_col], right_on=['date', 'Team'],
            ).set_index('_game_row')
            if found.empty:
                continue

            # Write the recovered ranking values into this side's columns of
            # exactly the rows that were looked up.
            for col in ranking_cols:
                target = f'{col}_{side}'
                if target in merged_df.columns:
                    merged_df.loc[found.index, target] = found[col]

    return merged_df

# Join each game to its home/away ranking rows (default look-back of 2 days).
merged_df = merge_with_tolerance(games_df=games, rankings_df=teams)

# Print the joined frame so the result of the join can be inspected.
print(merged_df)

          date  home_score         home_team  away_score  \
0   2024-11-04       101.0           Gonzaga        63.0   
1   2024-11-04        80.0          Ohio St.        72.0   
2   2024-11-04        64.0               UCF        61.0   
3   2024-11-04        83.0           Memphis        75.0   
4   2024-11-04        85.0       Santa Clara        78.0   
..         ...         ...               ...         ...   
911 2024-11-19        82.0  Northern Arizona        47.0   
912 2024-11-19       101.0             Idaho        58.0   
913 2024-11-19        95.0       USC Upstate        63.0   
914 2024-11-19       121.0  Northwestern St.        49.0   
915 2024-11-19        98.0        Mercyhurst        45.0   

                away_team  total_score  Rk_home         Team_home Conf_home  \
0                  Baylor        164.0      9.0           Gonzaga       WCC   
1                   Texas        152.0     30.0          Ohio St.       B10   
2               Texas A&M        125.0    

In [5]:
data = merged_df
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by="date", ascending=True)

# Per-game inputs, from each side's point of view:
#   *_opp_score_rank = opponent's rank minus the side's own points
#   *_sos            = opponent's rank for that single game
data["home_opp_score_rank"] = data["Rk_away"] - data['home_score']
data['away_opp_score_rank'] = data['Rk_home'] - data['away_score']
data['home_sos'] = data['Rk_away']
data['away_sos'] = data['Rk_home']


# Reshape to one row per (team, game): every game contributes a home-view row
# and an away-view row with identical column names.
home_df = data[['date', 'home_team', 'home_score', 'away_score', 'home_sos', "home_opp_score_rank"]].rename(columns={
    'home_team': 'team', 'home_score': 'score', 'away_score': 'opponent_score', 'home_sos': 'sos', "home_opp_score_rank": "opp_score_rank"
})
away_df = data[['date', 'away_team', 'away_score', 'home_score','away_sos', "away_opp_score_rank"]].rename(columns={
    'away_team': 'team', 'away_score': 'score', 'home_score': 'opponent_score','away_sos': 'sos', "away_opp_score_rank": "opp_score_rank"
})
# ignore_index: both halves carry the same index labels; a fresh unique index
# keeps the column assignments below from depending on duplicate labels.
games = pd.concat([home_df, away_df], ignore_index=True)
games.sort_values(by=['team', 'date'], inplace=True)

# Define the rolling windows (number of previous games)
rolling_windows = [1, 3, 7]


def _prior_rolling_mean(frame, column, window):
    """Per-team mean of `column` over up to `window` previous games.

    The shift (exclude the current game) and the rolling mean both run inside
    each team's group. `groupby(...).shift()` on its own returns an ordinary,
    ungrouped Series, so chaining `.rolling()` after it slid the window across
    team boundaries and mixed the previous team's games into the average.
    """
    return frame.groupby('team')[column].transform(
        lambda values: values.shift().rolling(window=window, min_periods=1).mean()
    )


# Calculate rolling averages for each window and store as separate columns.
# NOTE(review): if the saved models were trained on features built with the
# old cross-team rolling, re-validate or retrain them against this version.
for window in rolling_windows:
    games[f'rolling_avg_score_{window}'] = _prior_rolling_mean(games, 'score', window)
    games[f'rolling_avg_score_allowed_{window}'] = _prior_rolling_mean(games, 'opponent_score', window)
    games[f'rolling_sos_{window}'] = _prior_rolling_mean(games, 'sos', window)
    games[f'rolling_opp_score_rank_{window}'] = _prior_rolling_mean(games, 'opp_score_rank', window)


# Whole days between a team's consecutive games (NaN for its first game).
games['days_since_last_game'] = games.groupby('team')['date'].diff().dt.days

# Keep only necessary columns
columns_to_keep = ['date', 'team', 'days_since_last_game', 'sos'] + \
                  [f'rolling_avg_score_{w}' for w in rolling_windows] + \
                  [f'rolling_sos_{w}' for w in rolling_windows] + \
                  [f'rolling_avg_score_allowed_{w}' for w in rolling_windows] + \
                  [f'rolling_opp_score_rank_{w}' for w in rolling_windows]
games = games[columns_to_keep]

In [6]:
# Collapse the feature table to a single row per team: the one with the
# latest date.
games = (
    games
    .sort_values(by="date", ascending=True)
    .drop_duplicates(subset=["team"], keep="last")
)

In [7]:
# Matchups to predict; the following cells join on its `home` and `away`
# team-name columns.
daily_games = pd.read_csv(filepath_or_buffer="predict/daily-games.csv")

In [8]:
# Ratings join (inner, the pandas default): matchups whose home or away team
# has no row in `today_teams` drop out here.
# Home side first -- its rating columns keep their bare names.
pred_df = daily_games.merge(today_teams, left_on="home", right_on="Team", suffixes=("", "_home"))

# Away side second -- columns that clash with the home side's get `_away`.
pred_df = pred_df.merge(today_teams, left_on="away", right_on="Team", suffixes=("", "_away"))

# Form join (left, so every matchup survives): home team's latest row from
# `games`, again under bare column names ...
pred_df_2 = daily_games.merge(games, left_on="home", right_on="team", how="left", suffixes=("", "_home"))

# ... then the away team's latest row, with clashing columns suffixed `_away`.
pred_df_2 = pred_df_2.merge(games, left_on="away", right_on="team", how="left", suffixes=("", "_away"))

In [9]:
# Put the ratings features and the form features for each matchup side by
# side, keyed on the (home, away) pair.
preds = pred_df.merge(pred_df_2, on=["home", "away"])

In [10]:
# Bare-named columns hold the home team's values (the away team's clashing
# columns were suffixed `_away` during the joins). Copy them to explicit
# names: ratings as `<col>_home`, form features as `home_<col>`.
home_cols = ['Rk', 'Conf', 'AdjEM', 'AdjO', 'AdjO_Rk', 'AdjD', 'AdjD_Rk', 'AdjT', 'AdjT_Rk', 'Rk_rolling_1',
       'AdjEM_rolling_1', 'AdjO_rolling_1', 'AdjO_Rk_rolling_1',
       'AdjD_rolling_1', 'AdjD_Rk_rolling_1', 'AdjT_rolling_1',
       'AdjT_Rk_rolling_1', 'Rk_rolling_3', 'AdjEM_rolling_3',
       'AdjO_rolling_3', 'AdjO_Rk_rolling_3', 'AdjD_rolling_3',
       'AdjD_Rk_rolling_3', 'AdjT_rolling_3', 'AdjT_Rk_rolling_3',
       'Rk_rolling_5', 'AdjEM_rolling_5', 'AdjO_rolling_5',
       'AdjO_Rk_rolling_5', 'AdjD_rolling_5', 'AdjD_Rk_rolling_5',
       'AdjT_rolling_5', 'AdjT_Rk_rolling_5', 'Rk_rolling_10',
       'AdjEM_rolling_10', 'AdjO_rolling_10', 'AdjO_Rk_rolling_10',
       'AdjD_rolling_10', 'AdjD_Rk_rolling_10', 'AdjT_rolling_10',
       'AdjT_Rk_rolling_10', 'Rk_rolling_20', 'AdjEM_rolling_20',
       'AdjO_rolling_20', 'AdjO_Rk_rolling_20', 'AdjD_rolling_20',
       'AdjD_Rk_rolling_20', 'AdjT_rolling_20', 'AdjT_Rk_rolling_20']
home_cols_roll = ['days_since_last_game', 'rolling_avg_score_1',
       'rolling_avg_score_3', 'rolling_avg_score_7', 'rolling_sos_1',
       'rolling_sos_3', 'rolling_sos_7', 'rolling_avg_score_allowed_1',
       'rolling_avg_score_allowed_3', 'rolling_avg_score_allowed_7',
       'rolling_opp_score_rank_1', 'rolling_opp_score_rank_3',
       'rolling_opp_score_rank_7', 'sos']
for col in home_cols:
    preds[f'{col}_home'] = preds[col]
for col in home_cols_roll:
    preds[f'home_{col}'] = preds[col]

# Away form features arrive as `<col>_away`; re-expose them as `away_<col>`
# so both sides follow the same prefix convention.
away_cols_roll = ['days_since_last_game_away', 'rolling_avg_score_1_away',
       'rolling_avg_score_3_away', 'rolling_avg_score_7_away',
       'rolling_sos_1_away', 'rolling_sos_3_away', 'rolling_sos_7_away',
       'rolling_avg_score_allowed_1_away',
       'rolling_avg_score_allowed_3_away',
       'rolling_avg_score_allowed_7_away',
       'rolling_opp_score_rank_1_away', 'rolling_opp_score_rank_3_away',
       'rolling_opp_score_rank_7_away','sos_away']
for col in away_cols_roll:
    # Cut the literal "_away" suffix. `str.strip('_away')` treats its argument
    # as a set of characters ("_", "a", "w", "y") and trims them from BOTH
    # ends, which only happened to give the right answer for these names.
    base_name = col[:-len('_away')] if col.endswith('_away') else col
    preds[f"away_{base_name}"] = preds[col]

In [11]:
def _rating_columns(side):
    """Rating column names for one side ('home' or 'away'), in output order.

    Current values come first, then the rolling means grouped by window
    (1, 3, 5, 10, 20) with the statistics in a fixed order inside each window.
    """
    stats = ['Rk', 'AdjEM', 'AdjO', 'AdjO_Rk', 'AdjD', 'AdjD_Rk', 'AdjT', 'AdjT_Rk']
    # `Conf` sits right after `Rk` among the current values and has no
    # rolling counterpart.
    current = [f'{name}_{side}' for name in ['Rk', 'Conf'] + stats[1:]]
    rolled = [f'{name}_rolling_{window}_{side}' for window in (1, 3, 5, 10, 20) for name in stats]
    return current + rolled


def _form_columns(side):
    """Recent-form column names for one side ('home' or 'away'), in output order."""
    names = [f'{side}_days_since_last_game']
    for family in ('rolling_avg_score', 'rolling_sos',
                   'rolling_avg_score_allowed', 'rolling_opp_score_rank'):
        names.extend(f'{side}_{family}_{window}' for window in (1, 3, 7))
    return names


# Keep the matchup identifiers plus the explicitly side-labelled features:
# home ratings, away ratings, the two single-game sos values, home form,
# away form.
preds = preds[
    ['home', 'away']
    + _rating_columns('home')
    + _rating_columns('away')
    + ['home_sos', 'away_sos']
    + _form_columns('home')
    + _form_columns('away')
]

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
import pandas as pd
import joblib
import os

categorical_columns = ['Conf_home', 'Conf_away']
target_columns = ['home_score', 'away_score', 'total', 'margin']
model_dir = "lgb_models"
# NOTE(review): neither threshold dict is read anywhere in the visible cells.
high_outlier_thresholds = {'home_score': 91, 'away_score': 78, 'total': 167}
low_outlier_thresholds = {'home_score': 56, 'away_score': 52, 'total': 118}

# Integer-encode the conference labels.
# NOTE(review): the encoder is fitted on today's rows only, so the integer
# assigned to a conference depends on which conferences play today. If the
# models were trained with a different label->integer mapping these codes do
# not line up -- confirm, and prefer loading the training-time encoder.
for cat_col in categorical_columns:
    le = LabelEncoder()
    preds[cat_col] = le.fit_transform(preds[cat_col])

# Mean-impute on the numeric feature frame. Calling `preds.mean()` on the full
# frame includes the string columns `home` / `away`, which raises TypeError on
# pandas >= 2.0 (non-numeric columns are no longer dropped silently).
features_for_outlier_detection = preds.drop(columns=["home","away"])
features_for_outlier_detection = features_for_outlier_detection.fillna(
    features_for_outlier_detection.mean(numeric_only=True)
)

# Apply Isolation Forest for outlier detection and add outlier score as a feature
# NOTE(review): the forest is fitted on today's slate only, so the score is
# relative to today's games -- confirm this matches how the training feature
# was produced.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(features_for_outlier_detection)
preds['outlier_score'] = iso_forest.decision_function(features_for_outlier_detection)

# Columns handed to every model, in this exact order. Built once here rather
# than re-spelled inside the loop for each target.
model_features = ['Rk_home','Conf_home', 'AdjEM_home', 'AdjO_home',
       'AdjO_Rk_home', 'AdjD_home', 'AdjD_Rk_home', 'AdjT_home',
       'AdjT_Rk_home', 'Rk_rolling_1_home',
       'AdjEM_rolling_1_home', 'AdjO_rolling_1_home',
       'AdjO_Rk_rolling_1_home', 'AdjD_rolling_1_home',
       'AdjD_Rk_rolling_1_home', 'AdjT_rolling_1_home',
       'AdjT_Rk_rolling_1_home', 'Rk_rolling_3_home',
       'AdjEM_rolling_3_home', 'AdjO_rolling_3_home',
       'AdjO_Rk_rolling_3_home', 'AdjD_rolling_3_home',
       'AdjD_Rk_rolling_3_home', 'AdjT_rolling_3_home',
       'AdjT_Rk_rolling_3_home', 'Rk_rolling_5_home',
       'AdjEM_rolling_5_home', 'AdjO_rolling_5_home',
       'AdjO_Rk_rolling_5_home', 'AdjD_rolling_5_home',
       'AdjD_Rk_rolling_5_home', 'AdjT_rolling_5_home',
       'AdjT_Rk_rolling_5_home', 'Rk_rolling_10_home',
       'AdjEM_rolling_10_home', 'AdjO_rolling_10_home',
       'AdjO_Rk_rolling_10_home', 'AdjD_rolling_10_home',
       'AdjD_Rk_rolling_10_home', 'AdjT_rolling_10_home',
       'AdjT_Rk_rolling_10_home', 'Rk_rolling_20_home',
       'AdjEM_rolling_20_home', 'AdjO_rolling_20_home',
       'AdjO_Rk_rolling_20_home', 'AdjD_rolling_20_home',
       'AdjD_Rk_rolling_20_home', 'AdjT_rolling_20_home',
       'AdjT_Rk_rolling_20_home', 'Rk_away', 'Conf_away',
       'AdjEM_away', 'AdjO_away', 'AdjO_Rk_away', 'AdjD_away',
       'AdjD_Rk_away', 'AdjT_away', 'AdjT_Rk_away',
       'Rk_rolling_1_away', 'AdjEM_rolling_1_away', 'AdjO_rolling_1_away',
       'AdjO_Rk_rolling_1_away', 'AdjD_rolling_1_away',
       'AdjD_Rk_rolling_1_away', 'AdjT_rolling_1_away',
       'AdjT_Rk_rolling_1_away', 'Rk_rolling_3_away',
       'AdjEM_rolling_3_away', 'AdjO_rolling_3_away',
       'AdjO_Rk_rolling_3_away', 'AdjD_rolling_3_away',
       'AdjD_Rk_rolling_3_away', 'AdjT_rolling_3_away',
       'AdjT_Rk_rolling_3_away', 'Rk_rolling_5_away',
       'AdjEM_rolling_5_away', 'AdjO_rolling_5_away',
       'AdjO_Rk_rolling_5_away', 'AdjD_rolling_5_away',
       'AdjD_Rk_rolling_5_away', 'AdjT_rolling_5_away',
       'AdjT_Rk_rolling_5_away', 'Rk_rolling_10_away',
       'AdjEM_rolling_10_away', 'AdjO_rolling_10_away',
       'AdjO_Rk_rolling_10_away', 'AdjD_rolling_10_away',
       'AdjD_Rk_rolling_10_away', 'AdjT_rolling_10_away',
       'AdjT_Rk_rolling_10_away', 'Rk_rolling_20_away',
       'AdjEM_rolling_20_away', 'AdjO_rolling_20_away',
       'AdjO_Rk_rolling_20_away', 'AdjD_rolling_20_away',
       'AdjD_Rk_rolling_20_away', 'AdjT_rolling_20_away',
       'AdjT_Rk_rolling_20_away',  'home_sos',
       'away_sos', 'home_days_since_last_game',
       'home_rolling_avg_score_1', 'home_rolling_avg_score_3',
       'home_rolling_avg_score_7', 'home_rolling_sos_1',
       'home_rolling_sos_3', 'home_rolling_sos_7',
       'home_rolling_avg_score_allowed_1',
       'home_rolling_avg_score_allowed_3',
       'home_rolling_avg_score_allowed_7',
       'home_rolling_opp_score_rank_1', 'home_rolling_opp_score_rank_3',
       'home_rolling_opp_score_rank_7', 'away_days_since_last_game',
       'away_rolling_avg_score_1', 'away_rolling_avg_score_3',
       'away_rolling_avg_score_7', 'away_rolling_sos_1',
       'away_rolling_sos_3', 'away_rolling_sos_7',
       'away_rolling_avg_score_allowed_1',
       'away_rolling_avg_score_allowed_3',
       'away_rolling_avg_score_allowed_7',
       'away_rolling_opp_score_rank_1', 'away_rolling_opp_score_rank_3',
       'away_rolling_opp_score_rank_7', 'outlier_score']

# One saved model per target; each adds a `<target>_prediction` column.
# joblib files are pickle-based -- only load model files from a trusted source.
for target_column in target_columns:
    model_path = os.path.join(model_dir, f"{target_column}_best_model.joblib")
    model = joblib.load(model_path)

    preds[f"{target_column}_prediction"] = model.predict(preds[model_features])

In [13]:
# Write the features and predictions for every matchup to CSV.
# NOTE(review): absolute, user-specific path with the date baked into the file
# name -- this only runs on that machine and must be edited by hand each day.
preds.to_csv(path_or_buf="/Users/nickdimmitt/Desktop/lumber/ncaab/predictions-2024-11-20.csv")