In [151]:
import pandas as pd
import numpy

### Start of Elo Simulations

In [152]:
# Year used in the data file name; it selects which games file is read.
year = 2018
games_csv_path = f'game-data/{year}games.csv'
# The column headed 'Unnamed: 0' holds the row labels and becomes the index.
games_df = pd.read_csv(games_csv_path, index_col='Unnamed: 0')

In [153]:
# List the available columns, then preview the last ten rows of the table.
print(games_df.columns)
games_df.iloc[-10:]

Index(['Date', 'Team A', 'Team B', 'Team A Points', 'Team B Points', 'Winner',
       'Point Differential'],
      dtype='object')


Unnamed: 0,Date,Team A,Team B,Team A Points,Team B Points,Winner,Point Differential
5490,2019-3-31,Arizona,Wyoming,67,45,Arizona,22
5491,2019-4-1,Baylor,Iowa,85,53,Baylor,32
5492,2019-4-1,Notre Dame,Stanford,84,68,Notre Dame,16
5493,2019-4-3,Appalachian State,North Texas,76,59,Appalachian State,17
5494,2019-4-3,Northwestern,James Madison,74,69,Northwestern,5
5495,2019-4-3,Arizona,TCU,59,53,Arizona,6
5496,2019-4-5,Baylor,Oregon,72,67,Baylor,5
5497,2019-4-5,Notre Dame,UConn,81,76,Notre Dame,5
5498,2019-4-6,Arizona,Northwestern,56,42,Arizona,14
5499,2019-4-7,Baylor,Notre Dame,82,81,Baylor,1


In [154]:
# Create the four Elo columns up front as empty-string placeholders; the
# actual ratings are assigned once the game-by-game update has run.
for elo_column in (
    'Pre-Game Team A Elo',
    'Pre-Game Team B Elo',
    'Post-Game Team A Elo',
    'Post-Game Team B Elo',
):
    games_df[elo_column] = ''

# RECALL

# Elo Rating Adjustment with Point Differential

To better reflect the impact of **blowout wins**, we adjust Elo ratings by incorporating **Margin of Victory (MOV)**.

## **Adjusted Elo Formula with MOV**
$$
R_A' = R_A + K \cdot (W_A - P_A) \cdot f(\text{MOV})
$$

$$
R_B' = R_B + K \cdot (W_B - P_B) \cdot f(\text{MOV})
$$

Where:
- $R_A', R_B'$ = Updated Elo ratings for Team A and Team B  
- $R_A, R_B$ = Pre-game Elo ratings  
- $K$ = Scaling factor (typically **20-40**)  
- $W_A, W_B$ = Win values (**1 for a win, 0 for a loss**)  
- $P_A, P_B$ = Expected win probabilities (with $P_B = 1 - P_A$), given by:  

$$
P_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}
$$

- $\text{MOV}$ = Margin of Victory (Point Differential)  
- $f(\text{MOV})$ = A multiplier that increases Elo changes for larger wins (here $R_A$ is the winner's rating and $R_B$ the loser's):

$$
f(\text{MOV}) = \ln(\text{MOV} + 1) \cdot \frac{2.2}{(R_A - R_B) \times 0.001 + 2.2}
$$

## **Explanation**
- **Larger MOV values** increase the Elo adjustment.
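- **Close games** (small MOV) result in **smaller Elo changes**: a 1-point win scales the standard update by only $\ln 2 \approx 0.69$.
- **Blowouts** (high MOV) result in **bigger Elo adjustments**, but the logarithm makes each additional point count for less, which avoids extreme shifts.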


In [155]:
import math

# Distinct team names appearing on either side of a game.
team_As = games_df['Team A'].unique()
team_Bs = games_df['Team B'].unique()

# A team may appear in only one of the two columns, so take the union.
teams = set(team_As) | set(team_Bs)

# Every team starts from the same rating of 1500.
team_elos = dict.fromkeys(teams, 1500)

In [156]:

def expected_win_probability(team_A, team_B):
    """Return the Elo-model probability that ``team_A`` beats ``team_B``.

    Both ratings are looked up in the module-level ``team_elos`` dict at call
    time, so the result reflects whatever ratings are current.

    Args:
        team_A: Key of the first team in ``team_elos``.
        team_B: Key of the second team in ``team_elos``.

    Returns:
        ``1 / (1 + 10 ** ((R_B - R_A) / 400))``; equal ratings give 0.5.

    Raises:
        KeyError: If either team is missing from ``team_elos``.
    """
    # Positive gap means team_B is rated higher, which lowers team_A's chance.
    rating_gap = team_elos[team_B] - team_elos[team_A]
    return 1 / (1 + 10 ** (rating_gap / 400))

def f_mov(R_A, R_B, MOV):
    """Return the margin-of-victory multiplier applied to an Elo update.

    The multiplier is ``ln(MOV + 1) * 2.2 / ((R_A - R_B) * 0.001 + 2.2)``: it
    grows with the logarithm of the margin and shrinks as ``R_A - R_B`` grows.

    Args:
        R_A: Pre-game rating of the first team.
        R_B: Pre-game rating of the second team.
        MOV: Point differential; must be greater than -1 for the log to exist.

    Returns:
        The multiplier as a float (0.0 when ``MOV`` is 0).
    """
    scaled_log_margin = math.log(MOV + 1) * 2.2
    rating_correction = (R_A - R_B) * 0.001 + 2.2
    return scaled_log_margin / rating_correction

def calulate_elo(team_A, team_B, point_differential, K = 20):
    """Return both teams' post-game Elo ratings for a game won by Team A.

    Ratings are read from the module-level ``team_elos`` dict; this function
    does not modify it, so the caller is responsible for storing the result.

    Args:
        team_A: Key of the winning team in ``team_elos``.
        team_B: Key of the losing team in ``team_elos``.
        point_differential: Margin of victory passed to ``f_mov``.
        K: Elo scaling factor.

    Returns:
        Tuple ``(R_A_NEW, R_B_NEW)``. The same number of points is added to
        Team A and removed from Team B.

    Raises:
        KeyError: If either team is missing from ``team_elos``.
    """
    R_A = team_elos[team_A]
    R_B = team_elos[team_B]

    P_A = expected_win_probability(team_A, team_B)

    # Team A wins, so its actual score is 1 and the surprise is (1 - P_A).
    # int() truncates toward zero, which keeps every rating a whole number.
    elo_change = int(K * (1 - P_A) * f_mov(R_A, R_B, point_differential))

    return (R_A + elo_change, R_B - elo_change)


In [157]:
# Per-game rating snapshots, collected in row order and attached as columns
# once every game has been processed.
pre_game_team_A_elo = []
pre_game_team_B_elo = []
post_game_team_A_elo = []
post_game_team_B_elo = []

# NOTE: this loop writes the new ratings back into team_elos, so running the
# cell again without first re-creating team_elos applies every game twice.
# Each game uses the ratings left by the rows before it, so row order matters
# (assumes games_df is sorted chronologically -- verify against the data file).
# Iterating the three needed columns directly avoids building a Series per row
# as iterrows() does.
game_columns = zip(
    games_df['Team A'],
    games_df['Team B'],
    games_df['Point Differential'],
)
for team_A, team_B, point_diff in game_columns:
    # Capture the pre-game ratings before they are overwritten below.
    R_A, R_B = team_elos[team_A], team_elos[team_B]
    R_A_NEW, R_B_NEW = calulate_elo(team_A, team_B, point_diff)
    team_elos[team_A] = R_A_NEW
    team_elos[team_B] = R_B_NEW

    pre_game_team_A_elo.append(R_A)
    pre_game_team_B_elo.append(R_B)
    post_game_team_A_elo.append(R_A_NEW)
    post_game_team_B_elo.append(R_B_NEW)

games_df['Pre-Game Team A Elo'] = pre_game_team_A_elo
games_df['Pre-Game Team B Elo'] = pre_game_team_B_elo
games_df['Post-Game Team A Elo'] = post_game_team_A_elo
games_df['Post-Game Team B Elo'] = post_game_team_B_elo

In [158]:
# Display the games table, now including the pre- and post-game Elo columns.
games_df

Unnamed: 0,Date,Team A,Team B,Team A Points,Team B Points,Winner,Point Differential,Pre-Game Team A Elo,Pre-Game Team B Elo,Post-Game Team A Elo,Post-Game Team B Elo
0,2018-11-9,Notre Dame,Harvard,103,58,Notre Dame,45,1500,1500,1538,1462
1,2018-11-9,Louisville,Chattanooga,75,49,Louisville,26,1500,1500,1532,1468
2,2018-11-9,Mississippi State,Virginia,72,44,Mississippi State,28,1500,1500,1533,1467
3,2018-11-9,Oregon State,Cal Poly,79,54,Oregon State,25,1500,1500,1532,1468
4,2018-11-9,Maryland,Coppin State,93,36,Maryland,57,1500,1500,1540,1460
...,...,...,...,...,...,...,...,...,...,...,...
5495,2019-4-3,Arizona,TCU,59,53,Arizona,6,1726,1756,1747,1735
5496,2019-4-5,Baylor,Oregon,72,67,Baylor,5,2028,1947,2041,1934
5497,2019-4-5,Notre Dame,UConn,81,76,Notre Dame,5,2016,1981,2031,1966
5498,2019-4-6,Arizona,Northwestern,56,42,Arizona,14,1747,1699,1769,1677


In [159]:
# Write the table, with its Elo columns, to the same path pattern it was read
# from; while `year` is unchanged this overwrites the input file.
elo_output_path = f'game-data/{year}games.csv'
games_df.to_csv(elo_output_path)

### Assess Accuracy of Pure Elo Predictions

In [160]:
import random
# The prediction is "the team with the higher pre-game Elo wins". A row is an
# error when Team B was rated higher; this relies on Team A being the winner
# of every row, the same convention the Elo update uses.
pre_game_A = games_df['Pre-Game Team A Elo']
pre_game_B = games_df['Pre-Game Team B Elo']

wrong_picks = int((pre_game_A < pre_game_B).sum())
toss_ups = int((pre_game_A == pre_game_B).sum())

# Equal ratings give no basis for a pick. Score each such game as half an
# error -- the expected value of a fair coin flip -- so the reported rate is
# identical on every run instead of depending on unseeded random draws.
errors = wrong_picks + 0.5 * toss_ups

total_games = len(games_df)
sample_error_rate = errors / total_games if total_games > 0 else 0
print(f"{year} - Sample Error Rate: {sample_error_rate:.4f}")

2018 - Sample Error Rate: 0.2936
