In [2]:
import pandas as pd
import numpy as np
import os

# Date of the prediction and odds files to load (YYYY-MM-DD).
DATE = "2025-11-09"
# Same date with the dashes removed, as used in the S3 predictions prefix.
CLEAN_DATE = "".join(DATE.split("-"))

# Per-game model predictions for DATE.
daily_preds = pd.read_csv(
    f"s3://collegebasketballinsiders/predictions/{CLEAN_DATE}/preds.csv"
)

# Spread and total odds files for DATE, read from the local odds directory.
spreads_df = pd.read_csv(
    f"../barttorvik/data/odds/{DATE}/spreads_final.csv"
)
totals_df = pd.read_csv(
    f"../barttorvik/data/odds/{DATE}/totals_final.csv"
)

# Mapping table (first CSV column becomes the index); later cells read its
# "odds", "team_id" and "espn" columns.
map_df = pd.read_csv(
    "s3://collegebasketballinsiders/general/map.csv",
    index_col=0,
)

In [3]:
import numpy as np
import pandas as pd

def _nz(x):
    """Shorthand for ``pd.notna``: true where ``x`` is not a missing value."""
    return pd.notna(x)

def reconcile_game_all_preds(
    row,
    # Weights (set any to 0.0 to ignore that signal)
    w_full_score=1.0,        # pred_home_score, pred_away_score
    w_half_score=1.0,        # pred_home_1h, pred_away_1h, pred_home_2h, pred_away_2h
    w_totals=1.0,            # pred_total, pred_1h_total, pred_2h_total
    w_margin=1.0,            # pred_margin / pred_home_margin / pred_away_margin
    w_margin_1h=0.7,         # pred_1h_margin
    w_margin_2h=0.7,         # pred_2h_margin
    w_half_prior=0.05,       # soft prior: H1≈0.5*S_h, A1≈0.5*S_a (stability when halves missing)
    use_both_game_margins=True,  # if pred_home_margin & pred_away_margin exist, use both at half weight
    clip_nonneg=True,        # non-negative post-solve
):
    """
    Reconcile one game's predictions into a single consistent set of scores.

    Solves a weighted least-squares problem for x = [S_h, S_a, H1, A1]^T
    (home score, away score, home 1H score, away 1H score). Every non-missing
    prediction in ``row`` whose weight is positive contributes one linear
    observation a·x ≈ y. Second-half quantities are expressed as full game
    minus first half, so the returned halves always add up to the full scores.

    Parameters
    ----------
    row : object with ``.get(key)`` (e.g. a pandas Series or a dict)
        May hold any subset of: pred_home_score, pred_away_score,
        pred_home_1h, pred_away_1h, pred_home_2h, pred_away_2h,
        pred_total, pred_1h_total, pred_2h_total,
        pred_margin, pred_home_margin, pred_away_margin,
        pred_1h_margin, pred_2h_margin. Missing keys and NaN/None are skipped.
    w_full_score, w_half_score, w_totals, w_margin, w_margin_1h, w_margin_2h : float
        Weights of the corresponding signal groups; a weight <= 0 disables the group.
    w_half_prior : float
        Weight of the soft priors H1 ≈ 0.5*S_h and A1 ≈ 0.5*S_a.
    use_both_game_margins : bool
        If True, pred_home_margin and the negated pred_away_margin are each
        added at half of ``w_margin`` (in addition to pred_margin). If False,
        they are only used as a fallback when pred_margin is missing
        (home side first, then away side).
    clip_nonneg : bool
        If True, clamp scores to be >= 0 and first-half scores into
        [0, full score] after solving.

    Returns
    -------
    pandas.Series
        Keys: cons_home_score, cons_away_score, cons_margin, cons_home_1h,
        cons_home_2h, cons_away_1h, cons_away_2h, cons_total, cons_1h_total,
        cons_2h_total. All values are NaN when the row supplies no usable
        prediction; the half priors alone are never solved, because they
        would yield an all-zero "prediction".
    """
    out_keys = [
        'cons_home_score', 'cons_away_score', 'cons_margin',
        'cons_home_1h', 'cons_home_2h', 'cons_away_1h', 'cons_away_2h',
        'cons_total', 'cons_1h_total', 'cons_2h_total',
    ]

    # One entry per observation: coefficients on x, target value, weight.
    A_rows, y_vals, w_vals = [], [], []

    def add(coeffs, value, weight, sign=1.0):
        # Record one observation; skipped (returns False) when the value is
        # missing or the signal's weight is switched off.
        if pd.notna(value) and weight > 0:
            A_rows.append(coeffs)
            y_vals.append(sign * value)
            w_vals.append(weight)
            return True
        return False

    # --- Full scores: S_h, S_a
    add([1.0, 0.0, 0.0, 0.0], row.get('pred_home_score'), w_full_score)
    add([0.0, 1.0, 0.0, 0.0], row.get('pred_away_score'), w_full_score)

    # --- 1H scores: H1, A1
    add([0.0, 0.0, 1.0, 0.0], row.get('pred_home_1h'), w_half_score)
    add([0.0, 0.0, 0.0, 1.0], row.get('pred_away_1h'), w_half_score)

    # --- 2H scores: (S_h - H1) and (S_a - A1)
    add([1.0, 0.0, -1.0, 0.0], row.get('pred_home_2h'), w_half_score)
    add([0.0, 1.0, 0.0, -1.0], row.get('pred_away_2h'), w_half_score)

    # --- Totals: game (S_h + S_a), 1H (H1 + A1), 2H (S_h + S_a - H1 - A1)
    add([1.0, 1.0, 0.0, 0.0], row.get('pred_total'), w_totals)
    add([0.0, 0.0, 1.0, 1.0], row.get('pred_1h_total'), w_totals)
    add([1.0, 1.0, -1.0, -1.0], row.get('pred_2h_total'), w_totals)

    # --- Game margin (home - away) = S_h - S_a
    game_margin = [1.0, -1.0, 0.0, 0.0]
    Mg = row.get('pred_margin')
    Mh = row.get('pred_home_margin')   # home-side margin, same sign as Mg
    Ma = row.get('pred_away_margin')   # away-side margin, enters negated
    add(game_margin, Mg, w_margin)
    if use_both_game_margins and w_margin > 0:
        # Side-specific margins each count at half weight.
        add(game_margin, Mh, 0.5 * w_margin)
        add(game_margin, Ma, 0.5 * w_margin, sign=-1.0)
    elif pd.isna(Mg):
        # Fallback only when pred_margin is missing: prefer the home-side value.
        if not add(game_margin, Mh, w_margin):
            add(game_margin, Ma, w_margin, sign=-1.0)

    # --- 1H margin: H1 - A1
    add([0.0, 0.0, 1.0, -1.0], row.get('pred_1h_margin'), w_margin_1h)
    # --- 2H margin: (S_h - S_a) - (H1 - A1)
    add([1.0, -1.0, -1.0, 1.0], row.get('pred_2h_margin'), w_margin_2h)

    # No real observation: return NaNs. This check must come before the
    # priors are appended, otherwise it could never fire with w_half_prior > 0.
    if not A_rows:
        return pd.Series(np.nan, index=out_keys)

    # --- Soft priors to stabilize if halves missing
    if w_half_prior > 0:
        add([-0.5, 0.0, 1.0, 0.0], 0.0, w_half_prior)   # H1 - 0.5*S_h ≈ 0
        add([0.0, -0.5, 0.0, 1.0], 0.0, w_half_prior)   # A1 - 0.5*S_a ≈ 0

    A = np.asarray(A_rows, dtype=float)  # (m,4)
    y = np.asarray(y_vals, dtype=float)  # (m,)
    w = np.asarray(w_vals, dtype=float)  # (m,)

    # Weighted least squares: argmin || W^{1/2}(Ax - y) ||^2
    w_sqrt = np.sqrt(w)
    x_hat, *_ = np.linalg.lstsq(A * w_sqrt[:, None], y * w_sqrt, rcond=None)
    S_h, S_a, H1_hat, A1_hat = x_hat.tolist()

    if clip_nonneg:
        S_h = max(S_h, 0.0)
        S_a = max(S_a, 0.0)
        H1_hat = min(max(H1_hat, 0.0), S_h)   # 0 <= H1 <= S_h
        A1_hat = min(max(A1_hat, 0.0), S_a)   # 0 <= A1 <= S_a

    # Second halves are derived, so halves always sum to the full scores.
    H2_hat = S_h - H1_hat
    A2_hat = S_a - A1_hat

    return pd.Series({
        'cons_home_score': S_h,
        'cons_away_score': S_a,
        'cons_margin':     S_h - S_a,
        'cons_home_1h':    H1_hat,
        'cons_home_2h':    H2_hat,
        'cons_away_1h':    A1_hat,
        'cons_away_2h':    A2_hat,
        'cons_total':      S_h + S_a,
        'cons_1h_total':   H1_hat + A1_hat,
        'cons_2h_total':   H2_hat + A2_hat,
    })

def reconcile_dataframe_all_preds(
    df,
    w_full_score=1.0,
    w_half_score=1.0,
    w_totals=1.0,
    w_margin=1.0,
    w_margin_1h=0.7,
    w_margin_2h=0.7,
    w_half_prior=0.05,
    use_both_game_margins=True,
    clip_nonneg=True,
):
    """
    Apply reconciliation to every row in a predictions DataFrame.

    Expected column *names* (all optional; any subset is fine):
      - Scores: pred_home_score, pred_away_score
      - Halves: pred_home_1h, pred_home_2h, pred_away_1h, pred_away_2h
      - Totals: pred_total, pred_1h_total, pred_2h_total
      - Margins: pred_margin (home - away), pred_home_margin, pred_away_margin,
                 pred_1h_margin, pred_2h_margin

    The weight/flag arguments are forwarded unchanged to
    ``reconcile_game_all_preds`` for every row.

    Returns a new DataFrame (the input is not modified) with a fresh 0..n-1
    index, any missing pred_* columns added as NaN, and the consensus columns
    appended:
      cons_home_score, cons_away_score, cons_margin,
      cons_home_1h, cons_home_2h, cons_away_1h, cons_away_2h,
      cons_total, cons_1h_total, cons_2h_total

    A frame with zero rows yields a zero-row result that still carries the
    consensus columns.
    """
    needed = [
        'pred_home_score','pred_away_score',
        'pred_home_1h','pred_home_2h','pred_away_1h','pred_away_2h',
        'pred_total','pred_1h_total','pred_2h_total',
        'pred_margin','pred_home_margin','pred_away_margin',
        'pred_1h_margin','pred_2h_margin',
    ]
    cons_cols = [
        'cons_home_score', 'cons_away_score', 'cons_margin',
        'cons_home_1h', 'cons_home_2h', 'cons_away_1h', 'cons_away_2h',
        'cons_total', 'cons_1h_total', 'cons_2h_total',
    ]

    # Reset the index BEFORE applying: the per-row output inherits the frame's
    # index, and concat(axis=1) aligns on index labels. Resetting only one side
    # would misalign rows for any input whose index is not already 0..n-1.
    # reset_index returns a new object, so the caller's frame is left untouched.
    df = df.reset_index(drop=True)

    # Ensure all columns exist so .get works inside the row function
    for c in needed:
        if c not in df.columns:
            df[c] = np.nan

    if len(df) == 0:
        # apply(axis=1) on a zero-row frame returns a copy of the frame rather
        # than the expanded consensus columns, so build the empty block directly.
        out = pd.DataFrame(columns=cons_cols, index=df.index, dtype=float)
    else:
        out = df.apply(
            lambda r: reconcile_game_all_preds(
                r,
                w_full_score=w_full_score,
                w_half_score=w_half_score,
                w_totals=w_totals,
                w_margin=w_margin,
                w_margin_1h=w_margin_1h,
                w_margin_2h=w_margin_2h,
                w_half_prior=w_half_prior,
                use_both_game_margins=use_both_game_margins,
                clip_nonneg=clip_nonneg,
            ),
            axis=1
        )
    return pd.concat([df, out], axis=1)
# Reconcile every game in the daily predictions using the default weights.
result = reconcile_dataframe_all_preds(df=daily_preds)

In [6]:
# Display the reconciled frame: the input prediction columns plus the cons_* columns.
result

Unnamed: 0.1,Unnamed: 0,game_id,date_utc,home_team,home_team_id,away_team,away_team_id,pred_home_1h,pred_away_1h,pred_home_2h,...,cons_home_score,cons_away_score,cons_margin,cons_home_1h,cons_home_2h,cons_away_1h,cons_away_2h,cons_total,cons_1h_total,cons_2h_total
0,17,401812600,2025-11-09 00:00:00,Harvard Crimson,108,New Hampshire Wildcats,189,39.352274,29.555477,38.499782,...,81.226994,63.92138,17.305614,43.472639,37.754355,32.200474,31.720905,145.148373,75.673114,69.47526
1,18,401828296,2025-11-09,Campbell Fighting Camels,41,Western Michigan Broncos,341,38.821977,33.564036,40.433768,...,82.057743,71.991638,10.066105,41.297832,40.759911,33.997699,37.993939,154.049381,75.295531,78.75385
2,19,401812574,2025-11-09,American University Eagles,9,Pennsylvania Quakers,224,36.755321,34.442364,37.770994,...,75.122584,74.449823,0.672762,37.224638,37.897946,35.239807,39.210016,149.572407,72.464445,77.107962
3,20,401826732,2025-11-09,App State Mountaineers,10,North Carolina Central Eagles,199,30.907638,32.796383,35.640148,...,67.390158,68.069396,-0.679238,32.023136,35.367022,31.565316,36.50408,135.459554,63.588452,71.871102
4,21,401822758,2025-11-09,James Madison Dukes,132,Coppin State Eagles,60,38.825562,28.896606,41.334775,...,78.730604,59.151691,19.578913,40.995716,37.734888,30.7144,28.437291,137.882295,71.710116,66.172179
5,22,401823022,2025-11-09,Marquette Golden Eagles,158,Indiana Hoosiers,122,34.318764,30.763407,38.341404,...,76.130873,68.002999,8.127874,37.206907,38.923967,32.700603,35.302396,144.133873,69.90751,74.226362
6,23,401823485,2025-11-09,Mercer Bears,165,Lipscomb Bisons,143,38.866209,31.884201,44.025588,...,81.814254,69.052198,12.762056,41.51632,40.297934,34.257785,34.794413,150.866452,75.774105,75.092347
7,24,401818548,2025-11-09,Brown Bears,30,Vermont Catamounts,327,28.802182,31.762016,29.983214,...,60.465171,68.207502,-7.74233,29.45734,31.007831,33.427122,34.780379,128.672673,62.884462,65.78821
8,25,401823417,2025-11-09,Bowling Green Falcons,28,Le Moyne Dolphins,370,38.938962,28.960094,38.741725,...,78.909916,61.341899,17.568017,41.179568,37.730348,30.607123,30.734776,140.251815,71.786691,68.465124
9,26,401823256,2025-11-09,North Dakota Fighting Hawks,201,Cal State Northridge Matadors,39,32.623906,31.753254,36.853971,...,72.051335,68.54768,3.503655,35.421713,36.629622,33.394615,35.153064,140.599015,68.816329,71.782686


In [7]:
# Keep only the game identifiers and the consensus (cons_*) columns used by the later cells.
result = result.loc[:, [
    "date_utc", "home_team", "away_team", "home_team_id", "away_team_id",
    "cons_home_score", "cons_away_score", "cons_margin", "cons_total",
    "cons_home_1h", "cons_home_2h", "cons_away_1h", "cons_away_2h",
]]

In [8]:
# Positional rename to short names; the order must match the current column order of `result`.
result.columns = [
    "date", "home", "away", "home_team_id", "away_team_id",
    "home_score", "away_score", "margin", "total",
    "home_1h", "home_2h", "away_1h", "away_2h",
]

In [9]:
# Translate the odds file's home-team name into a team_id via the mapping table
# (inner join on map_df["odds"]), keeping only the id and the line columns.
totals_df = totals_df.merge(
    map_df[["odds", "team_id"]],
    how="inner",
    left_on="home",
    right_on="odds",
)[["team_id", "open", "current", "movement"]]

In [11]:
# Attach the total lines by home team id. The left join keeps games that have
# no matching totals row (their open/current/movement become NaN).
result = result.merge(
    totals_df,
    how="left",
    left_on='home_team_id',
    right_on='team_id',
)[[
    "date", "home", "home_team_id", "away",
    "home_score", "away_score", "margin", "total",
    "home_1h", "home_2h", "away_1h", "away_2h",
    "open", "current", "movement",
]]

In [13]:
# Positional rename: the generic open/current/movement columns become total_*.
result.columns = [
    'date', 'home', "home_team_id", 'away',
    'home_score', 'away_score', 'margin', 'total',
    'home_1h', 'home_2h', 'away_1h', 'away_2h',
    'total_open', 'total_current', 'total_movement',
]

In [14]:
# Translate the odds file's home-team name into a team_id via the mapping table
# (inner join on map_df["odds"]), keeping only the id and the line columns.
spreads_df = spreads_df.merge(
    map_df[["odds", "team_id"]],
    how="inner",
    left_on="home",
    right_on="odds",
)[["team_id", "open", "current", "movement"]]


In [15]:
# Attach the spread lines by home team id (left join, so games without a spread
# row keep NaN lines), then drop the id columns.
result = result.merge(
    spreads_df,
    how="left",
    left_on='home_team_id',
    right_on='team_id',
)[[
    "date", "home", "away",
    "home_score", "away_score", "margin", "total",
    "home_1h", "home_2h", "away_1h", "away_2h",
    'total_open', 'total_current', 'total_movement',
    "open", "current", "movement",
]]

In [16]:
# Positional rename: the generic open/current/movement columns become spread_*.
result.columns = [
    'date', 'home', 'away',
    'home_score', 'away_score', 'margin', 'total',
    'home_1h', 'home_2h', 'away_1h', 'away_2h',
    'total_open', 'total_current', 'total_movement',
    'spread_open', 'spread_current', 'spread_movement',
]

In [17]:
# Absolute gap between the predicted total and the current total line.
result['total_diff'] = (result['total'] - result['total_current']).abs()
# Absolute gap between the predicted margin and the negated current spread.
result['margin_diff'] = (result['margin'] - (-1 * result['spread_current'])).abs()

In [20]:
# Split the games: rows without a current spread go to `missing`, the rest stay in `result`.
missing = result.loc[result['spread_current'].isna()]
result = result.loc[result['spread_current'].notna()]

# Load the sharp spread/total files for DATE and join each to the full mapping
# table on the odds-side team name.
sharp_spreads = pd.read_csv(
    f"../barttorvik/data/odds/{DATE}/sharp_spreads.csv"
).merge(map_df, left_on="home_team", right_on="odds")
sharp_totals = pd.read_csv(
    f"../barttorvik/data/odds/{DATE}/sharp_totals.csv"
).merge(map_df, left_on="home_team", right_on="odds")

In [21]:
# Inner-join the sharp spread values, then the sharp total values, onto the games
# lacking a spread. Both right-hand frames carry "espn" and "draftkings", so the
# second merge suffixes them: *_x comes from sharp_spreads, *_y from sharp_totals.
# Games with no match in either sharp file are dropped by the inner joins.
missing = missing.merge(
    sharp_spreads[["espn", "draftkings"]],
    how="inner",
    left_on="home",
    right_on="espn",
)
missing = missing.merge(
    sharp_totals[["espn", "draftkings"]],
    how="inner",
    left_on="home",
    right_on="espn",
)

In [22]:
# Display the games that had no current spread, after joining the sharp lines.
missing

Unnamed: 0,date,home,away,home_score,away_score,margin,total,home_1h,home_2h,away_1h,...,total_movement,spread_open,spread_current,spread_movement,total_diff,margin_diff,espn_x,draftkings_x,espn_y,draftkings_y


In [23]:
# Back-fill the current lines from the sharp data: draftkings_x was merged from
# sharp_spreads and draftkings_y from sharp_totals.
missing['spread_current'] = missing['draftkings_x']
missing['total_current'] = missing['draftkings_y']

# Recompute the gaps with the back-filled lines (same formulas as for `result`).
missing['total_diff'] = (missing['total'] - missing['total_current']).abs()
missing['margin_diff'] = (missing['margin'] - (-1 * missing['spread_current'])).abs()

# Drop the merge helper columns so `missing` has the same columns as `result`.
missing = missing.loc[:, [
    'date', 'home', 'away',
    'home_score', 'away_score', 'margin', 'total',
    'home_1h', 'home_2h', 'away_1h', 'away_2h',
    'total_open', 'total_current', 'total_movement',
    'spread_open', 'spread_current', 'spread_movement',
    'total_diff', 'margin_diff',
]]

In [24]:
# Stack the back-filled games on top of the games that already had a spread.
# ignore_index=True assigns fresh 0..n-1 labels: the two pieces carry unrelated,
# overlapping index values, which would otherwise appear as duplicate labels in
# the index column written by to_csv.
result = pd.concat([missing, result], axis=0, ignore_index=True)

In [25]:
# Local copy on the current user's Desktop. The path is built from the home
# directory instead of a hard-coded /Users/<name> prefix so the notebook runs
# under any account.
result.to_csv(os.path.join(os.path.expanduser("~"), "Desktop", "daily_preds.csv"))
# Published copy alongside the day's predictions on S3.
result.to_csv(f"s3://collegebasketballinsiders/predictions/{CLEAN_DATE}/daily_preds.csv")