# Pre-processing and Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# Season to process; it selects which per-season games file is read.
year = 2024
games_csv_path = f'game-data/{year}games.csv'
# The CSV column named 'Unnamed: 0' becomes the row index of the frame.
games_df = pd.read_csv(games_csv_path, index_col='Unnamed: 0')

In [13]:
# Add the four Elo columns as empty-string placeholders; they are overwritten
# with real ratings once the rating loop further down has run.
elo_placeholder_columns = (
    'Pre-Game Team Elo',
    'Pre-Game Opponent Elo',
    'Post-Game Team Elo',
    'Post-Game Opponent Elo',
)
for elo_column in elo_placeholder_columns:
    games_df[elo_column] = ''
games_df.head(10)

Unnamed: 0,Date,Rank,Team,Conference,Opponent,Venue,Result,AdjO,AdjD,EffO,...,Game Importance,Game Breakdown,Extra,Team Points,Opponent Points,Point Differential,Pre-Game Team Elo,Pre-Game Opponent Elo,Post-Game Team Elo,Post-Game Opponent Elo
10916,2023-11-06,0,Fordham,A10,Wagner,H,W,96.5,107.1,99.9,...,0.2088,"[""11/6/2023"", 225, ""Wagner"", ""Fordham"", 24, 62...",3,68,64,4,,,,
5653,2023-11-06,0,VCU,A10,McNeese St.,H,L,100.3,114.5,99.9,...,0.671097,"[""11/6/2023"", 200, ""McNeese St."", ""VCU"", 27, 5...",4,65,76,-11,,,,
9063,2023-11-06,0,Southern Utah,WAC,Cal St. Bakersfield,A,L,110.8,120.5,112.4,...,0.394863,"[""11/6/2023"", 200, ""Southern Utah"", ""Cal St. B...",0,72,73,-1,,,,
9062,2023-11-06,0,Cal St. Bakersfield,BW,Southern Utah,H,W,108.3,113.1,114.0,...,0.262231,"[""11/6/2023"", 200, ""Southern Utah"", ""Cal St. B...",3,73,72,1,,,,
2763,2023-11-06,0,Eastern Michigan,MAC,Butler,A,L,90.7,109.7,74.8,...,0.856721,"[""11/6/2023"", 200, ""Eastern Michigan"", ""Butler...",0,55,94,-39,,,,
2762,2023-11-06,0,Butler,BE,Eastern Michigan,H,W,111.7,92.9,127.8,...,0.134942,"[""11/6/2023"", 200, ""Eastern Michigan"", ""Butler...",5,94,55,39,,,,
10892,2023-11-06,0,Richmond,A10,VMI,H,W,116.6,118.1,122.5,...,0.049366,"[""11/6/2023"", 200, ""VMI"", ""Richmond"", 29, 65, ...",1,93,75,18,,,,
5652,2023-11-06,0,McNeese St.,Slnd,VCU,A,W,129.1,94.3,116.8,...,0.798083,"[""11/6/2023"", 200, ""McNeese St."", ""VCU"", 27, 5...",2,76,65,11,,,,
10893,2023-11-06,0,VMI,SC,Richmond,A,L,106.6,120.7,98.8,...,0.79941,"[""11/6/2023"", 200, ""VMI"", ""Richmond"", 29, 65, ...",2,75,93,-18,,,,
9010,2023-11-06,0,Saint Louis,A10,Southern Indiana,H,W,98.7,93.2,103.4,...,0.122909,"[""11/6/2023"", 200, ""Southern Indiana"", ""Saint ...",0,75,63,12,,,,


In [28]:
# Print the full column index; the wide head() preview above elides the middle columns with '...'.
print(games_df.columns)

Index(['Date', 'Rank', 'Team', 'Conference', 'Opponent', 'Venue', 'Result',
       'AdjO', 'AdjD', 'EffO', 'eFG%', 'TO%', 'Reb%', 'FTR', 'EffD',
       'Opp eFG%', 'Opp TO%', 'Opp Reb%', 'Opp FTR', 'G-SC',
       'Opponent Conference', 'Game ID', 'Season Year', 'Game Tempo',
       'Game Unique ID', 'Coach', 'Opponent Coach', 'Unknown',
       'Game Importance', 'Game Breakdown', 'Extra', 'Team Points',
       'Opponent Points', 'Point Differential', 'Pre-Game Team Elo',
       'Pre-Game Opponent Elo', 'Post-Game Team Elo',
       'Post-Game Opponent Elo'],
      dtype='object')


#### Consider new x_i's:
* (Date, Team, Conference, Opponent, Venue, Coach, Opponent Coach) -> will have to project everything else

# Elo Rating Adjustment with Point Differential

To better reflect the impact of **blowout wins**, we adjust Elo ratings by incorporating **Margin of Victory (MOV)**.

## **Adjusted Elo Formula with MOV**
$$
R_A' = R_A + K \cdot (W_A - P_A) \cdot f(\text{MOV})
$$

$$
R_B' = R_B + K \cdot (W_B - P_B) \cdot f(\text{MOV})
$$

Where:
- $R_A', R_B'$ = Updated Elo ratings for Team A and Team B  
- $R_A, R_B$ = Pre-game Elo ratings  
- $K$ = Scaling factor (typically **20-40**)  
- $W_A, W_B$ = Win values (**1 for a win, 0 for a loss**)  
- $P_A, P_B$ = Expected win probabilities, with $P_B = 1 - P_A$ and $P_A$ given by:  

$$
P_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}
$$

- $\text{MOV}$ = Margin of Victory (absolute value of the Point Differential)  
- $f(\text{MOV})$ = A function that increases Elo changes for larger wins, where $R_W$ and $R_L$ are the pre-game ratings of the winner and the loser:

$$
f(\text{MOV}) = \ln(\text{MOV} + 1) \cdot \frac{2.2}{(R_W - R_L) \times 0.001 + 2.2}
$$

## **Explanation**
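- **Larger MOV values** increase the Elo adjustment.
- **Close games** (small MOV) result in **smaller-than-standard Elo changes**: for evenly rated teams a 1-point game scales the update by $\ln 2 \approx 0.69$.
- **Blowouts** (high MOV) result in **bigger Elo adjustments**, but the logarithm makes the multiplier grow slowly, which avoids extreme shifts.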


In [39]:
import math

# Every team starts at the same baseline rating of 1500. Names from the
# 'Opponent' column are seeded as well, so a team that never appears in the
# 'Team' column still has an entry when the rating functions index team_elos.
# 'Team' values come first in the concatenation, which keeps their insertion
# order identical to seeding from 'Team' alone.
team_elos = {
    team: 1500
    for team in pd.concat([games_df['Team'], games_df['Opponent']]).unique()
}
# Cache of games that have already been rated:
# {game_unique_id: (R_A, R_B, R_A', R_B')}
processed_games = {}

def expected_win_probability(team_A, team_B):
    """Return the Elo-expected probability that team_A beats team_B.

    Both ratings are read from the module-level ``team_elos`` dict, so each
    team name must already be a key there; a missing name raises ``KeyError``.
    """
    rating_a, rating_b = team_elos[team_A], team_elos[team_B]
    # Logistic curve on a 400-point scale: equal ratings give 0.5.
    exponent = (rating_b - rating_a) / 400
    return 1 / (1 + 10 ** exponent)

def f_mov(R_A, R_B, MOV):
    """Return the margin-of-victory multiplier applied to an Elo update.

    The multiplier is ``ln(MOV + 1)`` scaled by ``2.2 / ((R_A - R_B) * 0.001 + 2.2)``.
    The scaling factor shrinks as ``R_A`` rises above ``R_B`` and grows when
    ``R_A`` is below ``R_B``. The caller in this notebook passes the winner's
    rating as ``R_A`` and a positive margin as ``MOV``.
    """
    log_margin = math.log(MOV + 1)
    rating_gap_term = (R_A - R_B) * 0.001 + 2.2
    return log_margin * 2.2 / rating_gap_term

def calulate_elo(team_A, team_B, point_differential, game_unique_id, K=20):
    """Compute post-game Elo ratings for one game and cache the result.

    Args:
        team_A: Name of the first team; must be a key of ``team_elos``.
        team_B: Name of the second team; must be a key of ``team_elos``.
        point_differential: team_A's points minus team_B's points. A positive
            value is treated as a team_A win; anything else takes the
            team_B-won branch (a differential of 0 yields no rating change
            because ``f_mov`` returns 0 for a zero margin).
        game_unique_id: Key under which the result is stored in
            ``processed_games``.
        K: Elo scaling factor.

    Returns:
        ``(R_A_NEW, R_B_NEW)``: the post-game ratings of team_A and team_B.

    This function does not write to ``team_elos``; the caller applies the
    returned ratings. It does record ``(R_A, R_B, R_A_NEW, R_B_NEW)`` in
    ``processed_games``.
    """
    R_A = team_elos[team_A]
    R_B = team_elos[team_B]

    P_A = expected_win_probability(team_A, team_B)
    P_B = 1 - P_A

    team_a_won = point_differential > 0
    if team_a_won:
        surprise = 1 - P_A
        multiplier = f_mov(R_A, R_B, point_differential)
    else:
        surprise = 1 - P_B
        # The winner's rating goes first and the margin is made positive.
        multiplier = f_mov(R_B, R_A, -point_differential)

    # Truncated to a whole number; the same amount moves from loser to winner.
    swing = int(K * surprise * multiplier)
    delta_for_a = swing if team_a_won else -swing
    R_A_NEW = R_A + delta_for_a
    R_B_NEW = R_B - delta_for_a

    processed_games[game_unique_id] = (R_A, R_B, R_A_NEW, R_B_NEW)
    return (R_A_NEW, R_B_NEW)


In [40]:
# Reset the rating state so this cell gives the same answer every time it runs.
# Without the reset, a second run finds every game already cached in
# processed_games, takes the mirrored-row branch even for the row that
# originally triggered the update, and swaps Team/Opponent ratings for it.
# NOTE(review): this re-seeds every team at 1500, the same literal used where
# team_elos is first created; keep the two in sync.
processed_games.clear()
team_elos.clear()
team_elos.update(
    (team, 1500)
    for team in pd.concat([games_df['Team'], games_df['Opponent']]).unique()
)

pre_game_team_elo = []
pre_game_opponent_elo = []
post_game_team_elo = []
post_game_opponent_elo = []

# game_unique_id -> the 'Team' value on the row that triggered the rating
# update, so later rows of the same game can be oriented correctly.
first_listed_team = {}

# Games are rated in row order. Iterating the four needed columns directly
# avoids building a Series per row as iterrows() does.
for team_A, team_B, point_diff, game_unique_id in zip(
    games_df['Team'],
    games_df['Opponent'],
    games_df['Point Differential'],
    games_df['Game Unique ID'],
):
    if game_unique_id in processed_games:
        # Game already rated from an earlier row: reuse the cached ratings
        # instead of updating team_elos a second time.
        cached = processed_games[game_unique_id]
        if first_listed_team[game_unique_id] == team_A:
            # Same orientation as the row that was rated.
            R_A, R_B, R_A_NEW, R_B_NEW = cached
        else:
            # Mirrored row: Team and Opponent are swapped relative to the cache.
            R_B, R_A, R_B_NEW, R_A_NEW = cached
    else:
        R_A, R_B = team_elos[team_A], team_elos[team_B]
        R_A_NEW, R_B_NEW = calulate_elo(team_A, team_B, point_diff, game_unique_id)
        team_elos[team_A] = R_A_NEW
        team_elos[team_B] = R_B_NEW
        first_listed_team[game_unique_id] = team_A

    pre_game_team_elo.append(R_A)
    pre_game_opponent_elo.append(R_B)
    post_game_team_elo.append(R_A_NEW)
    post_game_opponent_elo.append(R_B_NEW)

games_df["Pre-Game Team Elo"] = pre_game_team_elo
games_df["Pre-Game Opponent Elo"] = pre_game_opponent_elo
games_df["Post-Game Team Elo"] = post_game_team_elo
games_df["Post-Game Opponent Elo"] = post_game_opponent_elo
# Final ratings, highest first.
print(sorted(team_elos.items(), key=lambda x: x[1], reverse=True))

[('Connecticut', 1987), ('Purdue', 1902), ('Houston', 1864), ('Iowa St.', 1862), ('Auburn', 1835), ('Illinois', 1831), ('North Carolina', 1816), ('Tennessee', 1798), ('Gonzaga', 1797), ('Duke', 1795), ('Indiana St.', 1775), ('Alabama', 1768), ('N.C. State', 1764), ('Creighton', 1762), ('Arizona', 1760), ("Saint Mary's", 1759), ('Drake', 1745), ('Marquette', 1742), ('James Madison', 1741), ('Florida', 1739), ('Grand Canyon', 1737), ('San Diego St.', 1736), ('Nevada', 1732), ('Kentucky', 1730), ('Seton Hall', 1724), ('Colorado', 1719), ('Clemson', 1717), ('Baylor', 1713), ('Dayton', 1708), ('Pittsburgh', 1708), ('McNeese St.', 1707), ('Charleston', 1707), ('Vermont', 1704), ('New Mexico', 1704), ('Florida Atlantic', 1703), ('Texas A&M', 1701), ('Duquesne', 1698), ('Washington St.', 1697), ('South Carolina', 1696), ('Utah St.', 1693), ('Michigan St.', 1686), ('Texas', 1682), ('Samford', 1681), ("St. John's", 1681), ('South Florida', 1680), ('Oregon', 1679), ('BYU', 1679), ('Appalachian St

In [42]:
# Narrow preview: game identity, outcome, and the four computed Elo columns.
elo_preview_columns = [
    'Date', 'Team', 'Opponent', 'Result', 'Point Differential',
    'Pre-Game Team Elo', 'Pre-Game Opponent Elo',
    'Post-Game Team Elo', 'Post-Game Opponent Elo',
]
games_df[elo_preview_columns].head(15)

Unnamed: 0,Date,Team,Opponent,Result,Point Differential,Pre-Game Team Elo,Pre-Game Opponent Elo,Post-Game Team Elo,Post-Game Opponent Elo
10916,2023-11-06,Fordham,Wagner,W,4,1500,1500,1516,1484
5653,2023-11-06,VCU,McNeese St.,L,-11,1500,1500,1476,1524
9063,2023-11-06,Southern Utah,Cal St. Bakersfield,L,-1,1500,1500,1494,1506
9062,2023-11-06,Cal St. Bakersfield,Southern Utah,W,1,1500,1500,1506,1494
2763,2023-11-06,Eastern Michigan,Butler,L,-39,1500,1500,1464,1536
2762,2023-11-06,Butler,Eastern Michigan,W,39,1500,1500,1536,1464
10892,2023-11-06,Richmond,VMI,W,18,1500,1500,1529,1471
5652,2023-11-06,McNeese St.,VCU,W,11,1500,1500,1524,1476
10893,2023-11-06,VMI,Richmond,L,-18,1500,1500,1471,1529
9010,2023-11-06,Saint Louis,Southern Indiana,W,12,1500,1500,1525,1475


In [43]:
# Write the frame, including the Elo columns assigned above, to test.csv; to_csv also writes the index by default.
games_df.to_csv('test.csv')
