# Synthetic Historical Data Creation

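In this notebook I simulate many alternate versions of each historical season, re-drawing every game's winner from the win probabilities implied by its closing odds, so that we have a larger dataset on which to train and test our picking strategies.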

In [13]:
import numpy as np
import pandas as pd
import random

In [14]:
# Per-game history (teams, winner, closing odds) that every simulation starts from.
simulation_input_path = "../data/df_for_simulation.csv"
df = pd.read_csv(simulation_input_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Season,Week,Date,Home Team,Away Team,Winner,Home Odds Close,Away Odds Close
0,0,2024,18,2025-01-05,Detroit Lions,Minnesota Vikings,Detroit Lions,1.645,2.3
1,1,2024,18,2025-01-05,Arizona Cardinals,San Francisco 49ers,Arizona Cardinals,1.416,2.95
2,2,2024,18,2025-01-05,Atlanta Falcons,Carolina Panthers,Carolina Panthers,1.256,4.1
3,3,2024,18,2025-01-05,Dallas Cowboys,Washington Commanders,Washington Commanders,3.6,1.312
4,4,2024,18,2025-01-05,Denver Broncos,Kansas City Chiefs,Denver Broncos,1.166,5.5


In [15]:
def simulate_game(game_row, rng=None):
    """
    Simulates the outcome of a single game based on implied win probabilities 
    derived from closing betting odds.

    Each side's raw implied probability is 1 / (its decimal odds). The two raw
    values are divided by their sum so that they add up to exactly 1 before a
    winner is sampled.

    Parameters:
        game_row (pd.Series): A row containing 'Home Team', 'Away Team', 
                              'Home Odds Close', and 'Away Odds Close'.
        rng (np.random.Generator or np.random.RandomState, optional): Random
            source whose ``choice`` method is used for the draw. Pass a seeded
            generator for reproducible simulations. Defaults to None, in which
            case NumPy's global random state is used.

    Returns:
        str: The name of the team that is simulated to win the game.
    """
    implied_prob_home = 1 / game_row['Home Odds Close']
    implied_prob_away = 1 / game_row['Away Odds Close']
    implied_total = implied_prob_home + implied_prob_away
    home_prob = implied_prob_home / implied_total
    away_prob = implied_prob_away / implied_total
    teams = [game_row['Home Team'], game_row['Away Team']]
    # Fall back to the global NumPy RNG so existing single-argument callers
    # (e.g. DataFrame.apply) and np.random.seed() keep working unchanged.
    random_source = np.random if rng is None else rng
    chosen_team = random_source.choice(teams, p=[home_prob, away_prob])
    # choice() hands back a NumPy string scalar; return the plain str the
    # docstring promises.
    return str(chosen_team)

In [16]:
# Smoke test: draw one simulated winner for the first game in the table.
first_game = df.iloc[0]
simulate_game(first_game)

'Minnesota Vikings'

In [17]:
# Number of alternate histories to draw for every real season.
num_sims = 500

# Seed NumPy's global RNG, which simulate_game draws from by default, so that
# Restart & Run All reproduces the same synthetic seasons.
np.random.seed(42)

# Start from the real games only. Once this cell has run, `df` also contains
# simulated rows (Season values such as "2024_sim_1"); filtering them out keeps
# a re-run from simulating on top of earlier simulations.
is_simulated = df['Season'].astype(str).str.contains('_sim_', regex=False)
original_df = df.loc[~is_simulated].copy()

all_sims = [original_df]
for i in range(num_sims):
    simmed_history = original_df.copy()
    # Re-draw every game's winner from its odds-implied probabilities.
    simmed_history['Winner'] = simmed_history.apply(simulate_game, axis=1)
    # Tag the season so each simulated history stays distinguishable.
    simmed_history['Season'] = simmed_history['Season'].astype(str) + '_sim_' + str(i+1)
    all_sims.append(simmed_history)

# Real history first, followed by the simulated histories in order.
df = pd.concat(all_sims, ignore_index=True)

In [18]:
# Peek at the last five rows of the combined real + simulated table.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Season,Week,Date,Home Team,Away Team,Winner,Home Odds Close,Away Odds Close
2466418,4918,2006_sim_500,1,2006-09-10,Detroit Lions,Seattle Seahawks,Seattle Seahawks,3.5,1.333333
2466419,4919,2006_sim_500,1,2006-09-10,Cleveland Browns,New Orleans Saints,New Orleans Saints,1.5,2.75
2466420,4920,2006_sim_500,1,2006-09-10,Carolina Panthers,Atlanta Falcons,Atlanta Falcons,1.434783,2.9
2466421,4921,2006_sim_500,1,2006-09-10,Arizona Cardinals,San Francisco 49ers,San Francisco 49ers,1.181818,5.0
2466422,4922,2006_sim_500,1,2006-09-07,Pittsburgh Steelers,Miami Dolphins,Pittsburgh Steelers,1.8,2.05


In [19]:
# Reshape from one row per game to one row per team per game.
# This is built column-wise rather than with iterrows(): the combined table
# holds millions of rows, and a Python-level loop creating two dicts per row
# is needlessly slow.
games = df.reset_index(drop=True)
shared_cols = ['Season', 'Week', 'Date']

home_rows = games[shared_cols].assign(**{
    'Team': games['Home Team'],
    'Odds': games['Home Odds Close'],
    'Won?': games['Home Team'] == games['Winner'],
})
away_rows = games[shared_cols].assign(**{
    'Team': games['Away Team'],
    'Odds': games['Away Odds Close'],
    'Won?': games['Away Team'] == games['Winner'],
})

# Both halves carry the game's positional index, so a stable sort on that index
# places each game's home row immediately before its away row.
new_df = (
    pd.concat([home_rows, away_rows])
    .sort_index(kind='mergesort')
    .reset_index(drop=True)
)
new_df['Implied Prob'] = 1 / new_df['Odds']

In [20]:
# Persist the team-level table. Note that the target path has no file
# extension, and the DataFrame index is written too (to_csv's default).
simulated_histories_path = "../data/simulated_nfl_histories"
new_df.to_csv(simulated_histories_path)