In [1]:
import pandas as pd

# Load data
df = pd.read_csv("oddsData.csv")

# Create a consistent game_id: sorted teams + date
df['team_pair'] = df[['team', 'opponent']].apply(lambda x: tuple(sorted(x)), axis=1)
df['game_id'] = df['team_pair'].astype(str) + '_' + df['date'].astype(str)

# Drop duplicates (only one row per game is needed)
df = df.drop_duplicates(subset='game_id', keep='first')

# Assign teamA, teamB in alphabetical order
df[['teamA', 'teamB']] = pd.DataFrame(df['team_pair'].tolist(), index=df.index)

# Determine if teamA is the home team
df['teamA_home'] = ((df['team'] == df['teamA']) & (df['home/visitor'] == 'vs')) | \
                   ((df['team'] == df['teamB']) & (df['home/visitor'] == '@'))
df['teamA_home'] = df['teamA_home'].astype(int)

# Assign scores and moneylines using vectorized logic
is_teamA = df['team'] == df['teamA']

df['teamA_score'] = df['score'].where(is_teamA, df['opponentScore'])
df['teamB_score'] = df['opponentScore'].where(is_teamA, df['score'])

df['teamA_moneyLine'] = df['moneyLine'].where(is_teamA, df['opponentMoneyLine'])
df['teamB_moneyLine'] = df['opponentMoneyLine'].where(is_teamA, df['moneyLine'])

# Final cleaned dataframe
cleaned_df = df[[
    'date', 'season', 'teamA', 'teamB', 'teamA_home',
    'teamA_score', 'teamB_score',
    'teamA_moneyLine', 'teamB_moneyLine',
    'total', 'spread', 'secondHalfTotal'
]]

# Optional save or display
cleaned_df.to_csv("cleaned_odds_data.csv", index=False)
print(cleaned_df.head())


         date  season         teamA        teamB  teamA_home  teamA_score  \
0  2007-10-30    2008  Golden State         Utah           1           96   
1  2007-10-30    2008       Houston    LA Lakers           0           95   
3  2007-10-30    2008      Portland  San Antonio           0           97   
6  2007-10-31    2008       Chicago   New Jersey           0          103   
7  2007-10-31    2008     Cleveland       Dallas           1           74   

   teamB_score  teamA_moneyLine  teamB_moneyLine  total  spread  \
0          117           -120.0            100.0  212.0     1.0   
1           93           -230.0            190.0  199.0     5.0   
3          106            900.0          -1400.0  189.5   -13.0   
6          112            105.0           -125.0  186.0    -1.5   
7           92            120.0           -140.0  184.0    -2.5   

   secondHalfTotal  
0            105.5  
1             99.0  
3             95.0  
6             94.0  
7             91.5  


In [2]:
import pandas as pd
import numpy as np

# Load pre-cleaned dataset
cleaned_df = pd.read_csv("cleaned_odds_data.csv")  # <- Use your actual path
cleaned_df['date'] = pd.to_datetime(cleaned_df['date'])

# Compute target: whether teamA won
cleaned_df['teamA_win'] = (cleaned_df['teamA_score'] > cleaned_df['teamB_score']).astype(int)

# Compute implied probability from moneyline
def moneyline_to_implied_prob(ml):
    return 100 / (ml + 100) if ml > 0 else abs(ml) / (abs(ml) + 100)

cleaned_df['teamA_implied_prob'] = cleaned_df['teamA_moneyLine'].apply(moneyline_to_implied_prob)


In [3]:
# Sort chronologically
cleaned_df = cleaned_df.sort_values(by='date')

# Rolling win rate and moneyline average for teamA (past 5 games)
cleaned_df['teamA_winrate_5'] = (
    cleaned_df.groupby('teamA')['teamA_win']
    .transform(lambda x: x.shift().rolling(5, min_periods=1).mean())
)

cleaned_df['teamA_avg_ml_5'] = (
    cleaned_df.groupby('teamA')['teamA_moneyLine']
    .transform(lambda x: x.shift().rolling(5, min_periods=1).mean())
)

# Drop rows with missing features
cleaned_df = cleaned_df.dropna(subset=['teamA_winrate_5', 'teamA_avg_ml_5']).reset_index(drop=True)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# Rolling window parameters
train_window_days = 730  # 2 years
test_window_days = 180   # ~half season
step_days = 180

start_date = cleaned_df['date'].min()
end_date = cleaned_df['date'].max()

features = ['teamA_winrate_5', 'teamA_avg_ml_5', 'teamA_implied_prob']
target = 'teamA_win'

results = []


In [5]:
while start_date + pd.Timedelta(days=train_window_days + test_window_days) <= end_date:
    train_start = start_date
    train_end = train_start + pd.Timedelta(days=train_window_days)
    test_end = train_end + pd.Timedelta(days=test_window_days)

    train_df = cleaned_df[(cleaned_df['date'] >= train_start) & (cleaned_df['date'] < train_end)]
    test_df = cleaned_df[(cleaned_df['date'] >= train_end) & (cleaned_df['date'] < test_end)]

    if len(train_df) < 100 or len(test_df) < 50:
        start_date += pd.Timedelta(days=step_days)
        continue

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(train_df[features], train_df[target])

    test_df = test_df.copy()
    test_df['pred_prob'] = model.predict_proba(test_df[features])[:, 1]
    test_df['implied_prob'] = test_df['teamA_implied_prob']
    test_df['model_edge'] = test_df['pred_prob'] - test_df['implied_prob']
    test_df['bet'] = test_df['model_edge'] > 0.05  # betting threshold

    # Profit simulation ($100 bets)
    def bet_return(row):
        if not row['bet']:
            return 0
        if row['teamA_win']:
            return row['teamA_moneyLine'] / 100 * 100 if row['teamA_moneyLine'] > 0 else 100 * 100 / abs(row['teamA_moneyLine'])
        else:
            return -100

    test_df['profit'] = test_df.apply(bet_return, axis=1)

    results.append({
        'train_start': train_start,
        'train_end': train_end,
        'test_start': train_end,
        'test_end': test_end,
        'bets': test_df['bet'].sum(),
        'profit': test_df['profit'].sum(),
        'roi': test_df['profit'].sum() / (test_df['bet'].sum() * 100) if test_df['bet'].sum() > 0 else 0,
        'accuracy': accuracy_score(test_df[target], test_df['pred_prob'] > 0.5),
        'log_loss': log_loss(test_df[target], test_df['pred_prob'])
    })

    start_date += pd.Timedelta(days=step_days)


In [6]:
results_df = pd.DataFrame(results)
print(results_df)
print(f"\nAverage ROI: {results_df['roi'].mean():.2%}")
print(f"Total Bets: {results_df['bets'].sum()}")
print(f"Total Profit: ${results_df['profit'].sum():.2f}")


   train_start  train_end test_start   test_end  bets       profit       roi  \
0   2007-11-01 2009-10-31 2009-10-31 2010-04-29   501 -2238.104769 -0.044673   
1   2008-10-26 2010-10-26 2010-10-26 2011-04-24   458   650.751971  0.014209   
2   2009-10-21 2011-10-21 2011-10-21 2012-04-18   365  -960.828461 -0.026324   
3   2010-04-19 2012-04-18 2012-04-18 2012-10-15    20  -590.996027 -0.295498   
4   2010-10-16 2012-10-15 2012-10-15 2013-04-13   483  2294.381564  0.047503   
5   2011-10-11 2013-10-10 2013-10-10 2014-04-08   450 -1669.615239 -0.037103   
6   2012-04-08 2014-04-08 2014-04-08 2014-10-05    30  1100.139812  0.366713   
7   2012-10-05 2014-10-05 2014-10-05 2015-04-03   401  2110.221608  0.052624   
8   2013-04-03 2015-04-03 2015-04-03 2015-09-30    35  -504.747830 -0.144214   
9   2013-09-30 2015-09-30 2015-09-30 2016-03-28   400  -517.587772 -0.012940   
10  2014-03-29 2016-03-28 2016-03-28 2016-09-24    52  -732.922887 -0.140947   
11  2014-09-25 2016-09-24 2016-09-24 201