In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
from collections import OrderedDict
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibrationDisplay

np.random.seed(42)

In [78]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
from collections import OrderedDict
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibrationDisplay


class ELO:
    GAME_COLUMNS = [
        "timestamp",
        "winner",
        "winner_prev_elo",
        "loser",
        "loser_prev_elo",
        "win_prob",
        "site",
    ]

    def __init__(
        self,
        winners,
        losers,
        sites=None,
        ids=None,
        timestamps=None,
        k=20,
        elo_init=1500,
        elo_diff=400,
        seasonal_mean_reversion=0,
    ):
        self.k = k
        self.elo_init = elo_init
        self.elo_diff = elo_diff

        winners = list(winners)
        losers = list(losers)

        ELO.__check_valid_games__(winners, losers, ids, timestamps)
        ELO.__check_valid_params__(k, elo_init, elo_diff, seasonal_mean_reversion)

        # Assemble temporary empty dataframe to store ELOs
        if ids is None:
            self.ids = range(len(winners))
        else:
            self.ids = ids
        self.winners = winners
        self.losers = losers
        self.competitors = sorted(list(set(winners) | set(losers)))
        self.timestamps = timestamps
        self.seasonal_mean_reversion = seasonal_mean_reversion
        self.sites = sites
        if self.timestamps is not None:
            # sort arrays by timestamp
            self.ids, self.winners, self.losers, self.timestamps, self.sites = zip(
                *sorted(
                    zip(
                        self.ids, self.winners, self.losers, self.timestamps, self.sites
                    ),
                    key=lambda x: x[3],
                )
            )

    # Compute the ELO of every competitor after each match (37s original)
    # 0.04476022720336914s (without dataframe conversion)
    # 0.15063881874084473 (with dataframe conversion)
    def fit_fastest(self):
        start = time.time()
        elo_dict = OrderedDict(
            [(competitor, self.elo_init) for competitor in self.competitors]
        )
        game_array = np.vstack([self.ids, self.winners, self.losers, self.timestamps]).T
        elo_array = [np.ones(len(self.competitors)) * self.elo_init]
        game_probs, winner_prev_elos, loser_prev_elos = [], [], []
        current_year = self.timestamps[0].year if self.timestamps is not None else None
        for i, game in enumerate(game_array):
            game_id, winner, loser, timestamp = game
            winner_prev_elos.append(elo_dict[winner])
            loser_prev_elos.append(elo_dict[loser])
            if self.timestamps is not None and timestamp.year != current_year:
                current_year = timestamp.year
                mean_elo = np.mean(elo_array[-1])
                elo_dict = {
                    team: mean_elo
                    + (1 - self.seasonal_mean_reversion) * (elo - mean_elo)
                    for team, elo in elo_dict.items()
                }
            winner_elo, loser_elo = elo_dict[winner], elo_dict[loser]
            winner_new_elo, loser_new_elo, expected_outcome_prob = (
                ELO.compute_pairwise_elo(
                    winner_elo, loser_elo, elo_diff=self.elo_diff, k=self.k
                )
            )
            game_probs.append(expected_outcome_prob)
            elo_dict[winner], elo_dict[loser] = winner_new_elo, loser_new_elo
            elo_array.append(list(elo_dict.values()))
        self.elo_df = pd.concat(
            [
                pd.DataFrame(
                    np.vstack(
                        [
                            self.ids,
                            self.timestamps,
                            self.winners,
                            winner_prev_elos,
                            self.losers,
                            loser_prev_elos,
                            game_probs,
                            self.sites,
                        ]
                    ).T,
                    columns=[
                        "id",
                        "timestamp",
                        "winner",
                        "winner_prev_elo",
                        "loser",
                        "loser_prev_elo",
                        "win_prob",
                        "site",
                    ],
                ).set_index("id"),
                pd.DataFrame(elo_array[1:], columns=self.competitors, index=self.ids),
            ],
            axis=1,
        )
        self.convert_column_dtypes()
        print("Computed elos in", time.time() - start, "seconds.")

    # Show a calibration curve of the ELO output probabilities after fitting
    def show_calibration(self, start_year=2018, A=5, B=0.2):
        d = self.elo_df.loc[self.elo_df.timestamp > str(start_year)]
        win_sample = d.sample(frac=0.5)
        lose_sample = d.drop(win_sample.index)
        p_true = [1] * len(win_sample) + [0] * len(lose_sample)
        p_pred = win_sample.win_prob.tolist() + (1 - lose_sample.win_prob).tolist()
        p_pred = np.array(p_pred)

        def sigmoid(x, A, B):
            return 1 / (1 + np.exp(A * x + B))

        def invsigmoid(x, A, B=1 / 2):
            return -(1 / A) * np.log((1 + B) / (x + B / 2) - 1) + 1 / 2

        prob_true, prob_pred = calibration_curve(
            p_true, invsigmoid(p_pred, A, B), n_bins=20, strategy="uniform"
        )
        # prob_true, prob_pred = calibration_curve(p_true, p_pred, n_bins=20, strategy='uniform')
        plt.figure(dpi=300)
        CalibrationDisplay(prob_true, prob_pred, p_pred).plot()
        plt.suptitle("Calibration Curve with Inverse Sigmoid")
        plt.title(
            f"A={A}, B={B}, start_year={start_year}, k={self.k}, elo_init={self.elo_init}, elo_diff={self.elo_diff}, smr={self.seasonal_mean_reversion}"
        )
        plt.grid()

    def convert_column_dtypes(self):
        self.elo_df = self.elo_df.astype(
            {
                "winner": "int64",
                "winner_prev_elo": "float64",
                "loser": "int64",
                "loser_prev_elo": "float64",
                "win_prob": "float64",
                "site": "object",
            }
        )

    def get_home_elo_advantage(self):
        home_win_pct = self.get_home_win_pct()
        return np.log(home_win_pct / (1 - home_win_pct)) * self.elo_diff

    def get_home_win_pct(self):
        n_home_wins = sum(self.elo_df.site == "home")
        n_away_wins = sum(self.elo_df.site == "away")
        return n_home_wins / (n_home_wins + n_away_wins)

    def get_accuracy(self, since=None):
        since = str(since) if since else str(self.elo_df.timestamp.min().year)
        since_df = self.elo_df.loc[self.elo_df.timestamp >= since]
        n_correct = sum(since_df.win_prob > 0.5)
        return n_correct / len(since_df)

    @staticmethod
    def compute_pairwise_elo(winner_elo, loser_elo, elo_diff, k):
        expected_outcome_prob = ELO.compute_expected_outcome_prob(
            winner_elo, loser_elo, elo_diff=elo_diff
        )
        return (
            winner_elo + k * (1 - expected_outcome_prob),
            loser_elo - k * (1 - expected_outcome_prob),
            expected_outcome_prob,
        )

    @staticmethod
    def compute_expected_outcome_prob(elo1, elo2, elo_diff=400):
        return 1 / (1 + 10 ** ((elo2 - elo1) / elo_diff))

    @staticmethod
    def __check_valid_params__(k, elo_init, elo_diff, seasonal_mean_reversion):
        assert isinstance(k, int) and k > 0
        assert isinstance(elo_init, int) and elo_init > 0
        assert isinstance(elo_diff, int) and elo_diff > 0
        assert (
            isinstance(seasonal_mean_reversion, float) or seasonal_mean_reversion == 0
        ) and seasonal_mean_reversion <= 1

    @staticmethod
    def __check_valid_games__(winners, losers, ids, timestamps):
        # assert proper data types
        assert pd.api.types.is_list_like(winners)
        assert pd.api.types.is_list_like(losers)
        assert ids is None or pd.api.types.is_list_like(ids)
        assert timestamps is None or pd.api.types.is_list_like(timestamps)

        # check that winners, losers, ids, and timestamps have the same length
        assert len(winners) == len(losers)
        assert ids is None or len(ids) == len(winners)
        assert timestamps is None or len(timestamps) == len(winners)

        # check that all ids are unique
        assert ids is None or len(set(ids)) == len(ids)

        # check that no teams play against themselves
        for i in range(len(winners)):
            assert winners[i] != losers[i]

        # check that the timestamps are valid
        if timestamps is not None:
            for timestamp in timestamps:
                try:
                    pd.to_datetime(timestamp)
                except ValueError:
                    raise ValueError("Invalid timestamp: {}".format(timestamp))

    def show_elos(self, since=None):
        if since is not None:
            return self.elo_df.loc[self.elo_df.timestamp >= str(since)]
        return self.elo_df

    def show_games(self, since=None):
        if since is not None:
            return self.elo_df.loc[self.elo_df.timestamp >= str(since)][
                ELO.GAME_COLUMNS
            ]
        return self.elo_df[ELO.GAME_COLUMNS]

In [3]:
games = pd.read_csv("games_2011_to_2023.csv", parse_dates=["date"])
wins = games.loc[games.result == "W"]
schools = pd.read_csv(
    "general_data/schools.csv",
    skiprows=1,
    header=None,
    names=["school_id", "school_name"],
).set_index("school_id")
elo_df = (
    wins[["date", "contest_id", "site", "school_id", "opponent_school_id"]]
    .sort_values(by="date")
    .set_index("date")
)
elo_df.columns = ["id", "site", "winner", "loser"]

In [133]:
sd = ELO(
    elo_df.winner,
    elo_df.loser,
    sites=elo_df.site,
    ids=elo_df.id,
    timestamps=elo_df.index,
    k=20,
    elo_init=1500,
    elo_diff=400,
    seasonal_mean_reversion=0,
)
sd.fit_fastest()
df = sd.show_games(2018)

Computed elos in 0.23586821556091309 seconds.


In [96]:
sd.get_home_elo_advantage()

121.17246563517983

In [108]:
(
    df.loc[df.site == "home"].winner_prev_elo.mean()
    - df.loc[df.site == "away"].winner_prev_elo.mean()
)

17.39311655166125

In [114]:
df.loc[(df.site == "away") & (df.winner_prev_elo > df.loser_prev_elo)]

Unnamed: 0,timestamp,winner,winner_prev_elo,loser,loser_prev_elo,win_prob,site
142831.0,2018-02-01,738,1384.619876,244,1360.535444,0.534605,away
142841.0,2018-02-03,738,1393.927778,406,1331.778036,0.588499,away
142848.0,2018-02-03,518,1696.807546,148,1437.723474,0.816289,away
142857.0,2018-02-09,579,1429.164319,52,1367.209956,0.588226,away
142866.0,2018-02-10,183,1800.472826,721,1597.423129,0.762937,away
...,...,...,...,...,...,...,...
2432349.0,2023-05-02,369,1658.590541,726,1565.389320,0.631000,away
2432074.0,2023-05-04,392,1930.172173,322,1702.604320,0.787511,away
2432378.0,2023-05-05,369,1665.970540,68,1647.364531,0.526751,away
2432362.0,2023-05-05,721,1538.393760,579,1527.122219,0.516215,away


In [135]:
df["winner_school"] = df.winner.apply(lambda x: schools.loc[x].school_name)
df["loser_school"] = df.loser.apply(lambda x: schools.loc[x].school_name)

In [131]:
pd.set_option("display.max_rows", None)

In [138]:
df.loc[
    (df.site == "home")
    & (df.winner_prev_elo < df.loser_prev_elo)
    & (df.winner_prev_elo > df.loser_prev_elo - 100)
].shape

(241, 9)

In [139]:
df.loc[
    (df.site == "away")
    & (df.winner_prev_elo > df.loser_prev_elo)
    & (df.winner_prev_elo < df.loser_prev_elo + 100)
].shape

(272, 9)

In [136]:
df.loc[(df.win_prob < 0.5) & (df.site != "neutral")]

Unnamed: 0,timestamp,winner,winner_prev_elo,loser,loser_prev_elo,win_prob,site,winner_school,loser_school
142842.0,2018-02-03,739,1579.53278,539,1623.214608,0.437466,away,Villanova,Penn St.
142850.0,2018-02-04,316,1314.078572,726,1523.688182,0.230301,home,Jacksonville,Navy
142879.0,2018-02-10,391,1432.100662,575,1580.014765,0.299126,home,UMBC,Richmond
142874.0,2018-02-10,746,1624.38577,369,1694.92186,0.399862,home,Virginia,Loyola Maryland
142885.0,2018-02-10,738,1402.157804,220,1537.762551,0.31419,away,Vermont,Fairfield
142865.0,2018-02-10,153,1490.645421,386,1512.669601,0.468347,away,Colgate,Marist
142864.0,2018-02-10,418,1414.660485,148,1434.049264,0.472126,home,Michigan,Cleveland St.
142883.0,2018-02-10,322,1649.644511,711,1692.846077,0.438146,home,Johns Hopkins,Towson
142880.0,2018-02-10,590,1363.703359,683,1553.056371,0.251618,away,Sacred Heart,Stony Brook
142903.0,2018-02-10,83,1552.061132,81,1559.104843,0.489865,away,Bucknell,Bryant


In [119]:
np.mean(
    df.loc[df.site == "home"].winner_prev_elo - df.loc[df.site == "home"].loser_prev_elo
)

104.66050488582181

In [118]:
np.mean(
    df.loc[df.site == "away"].winner_prev_elo - df.loc[df.site == "away"].loser_prev_elo
)

141.61766378106026

In [113]:
df.loc[(df.site == "home") & (df.winner_prev_elo < df.loser_prev_elo)]

Unnamed: 0,timestamp,winner,winner_prev_elo,loser,loser_prev_elo,win_prob,site
142850.0,2018-02-04,316,1314.078572,726,1523.688182,0.230301,home
142879.0,2018-02-10,391,1432.100662,575,1580.014765,0.299126,home
142874.0,2018-02-10,746,1624.385770,369,1694.921860,0.399862,home
142864.0,2018-02-10,418,1414.660485,148,1434.049264,0.472126,home
142883.0,2018-02-10,322,1649.644511,711,1692.846077,0.438146,home
...,...,...,...,...,...,...,...
2361437.0,2023-04-29,410,1423.949306,81,1533.503254,0.347365,home
2361417.0,2023-04-29,62,1399.010961,14,1542.904060,0.304001,home
2361367.0,2023-04-29,711,1536.080488,180,1632.204640,0.365091,home
2432365.0,2023-05-02,579,1515.872211,316,1559.530286,0.437500,home


In [109]:
for d in range(0, 20, 1):
    adj = np.where(
        df.site == "home",
        df.winner_prev_elo + d,
        np.where(df.site == "away", df.winner_prev_elo - d, df.winner_prev_elo),
    )
    print(d, np.mean(adj > df.loser_prev_elo))

0 0.7137733142037302
1 0.7137733142037302
2 0.7148493543758967
3 0.7152080344332855
4 0.7159253945480631
5 0.7155667144906743
6 0.7152080344332855
7 0.7152080344332855
8 0.7159253945480631
9 0.7155667144906743
10 0.7148493543758967
11 0.7152080344332855
12 0.7137733142037302
13 0.7141319942611191
14 0.712338593974175
15 0.7119799139167863
16 0.7116212338593975
17 0.7116212338593975
18 0.7116212338593975
19 0.7126972740315638


In [101]:
np.mean(np.abs(df.winner_prev_elo - df.loser_prev_elo))

145.4155637962851

In [90]:
np.mean( > df.loser_prev_elo)

0.7137733142037302

In [60]:
df.timestamp.min().year

2018

In [53]:
dfh = df.loc[df.site == "home"]
dfa = df.loc[df.site == "away"]

In [57]:
df.site.value_counts()

site
home       1525
away       1089
neutral     174
Name: count, dtype: int64

In [33]:
df.loc[df.site == "home"]

Unnamed: 0,timestamp,winner,winner_prev_elo,loser,loser_prev_elo,win_prob,site,14,52,62,...,738,739,741,746,748,813,1320,11504,19651,30136
142840.0,2018-02-03,193,1731.277269,721,1603.913132,0.6755,home,1750.282089,1367.209956,1472.431707,...,1402.157804,1590.783461,1153.284089,1624.385770,1263.156234,1690.031228,1428.60073,1500.00000,1434.394259,1500.00000
142847.0,2018-02-03,81,1556.431853,381,1231.741353,0.86635,home,1750.282089,1367.209956,1472.431707,...,1402.157804,1590.783461,1153.284089,1624.385770,1263.156234,1690.031228,1428.60073,1500.00000,1434.394259,1500.00000
142836.0,2018-02-03,68,1544.362418,556,1465.065133,0.612177,home,1750.282089,1367.209956,1472.431707,...,1402.157804,1590.783461,1153.284089,1624.385770,1263.156234,1690.031228,1428.60073,1500.00000,1434.394259,1500.00000
142835.0,2018-02-03,352,1528.726303,471,1201.891213,0.867773,home,1750.282089,1367.209956,1472.431707,...,1402.157804,1590.783461,1153.284089,1624.385770,1263.156234,1690.031228,1428.60073,1500.00000,1434.394259,1500.00000
142837.0,2018-02-03,587,1556.138247,579,1435.833736,0.666529,home,1750.282089,1367.209956,1472.431707,...,1402.157804,1590.783461,1153.284089,1624.385770,1263.156234,1690.031228,1428.60073,1500.00000,1434.394259,1500.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2433948.0,2023-05-13,513,1828.511361,732,1561.021544,0.823435,home,1530.501514,1361.176001,1405.930448,...,1584.048434,1664.209548,1159.890639,1860.370115,1144.469566,1768.852620,1428.60073,1389.49799,1530.916038,1403.51702
2433937.0,2023-05-13,746,1860.370115,575,1664.281157,0.755613,home,1530.501514,1361.176001,1405.930448,...,1584.048434,1664.209548,1159.890639,1865.257846,1144.469566,1768.852620,1428.60073,1389.49799,1530.916038,1403.51702
2433946.0,2023-05-14,539,1653.749557,554,1698.175111,0.436413,home,1530.501514,1361.176001,1405.930448,...,1584.048434,1664.209548,1159.890639,1865.257846,1144.469566,1768.852620,1428.60073,1389.49799,1530.916038,1403.51702
2433949.0,2023-05-14,322,1698.354536,81,1537.837589,0.715858,home,1530.501514,1361.176001,1405.930448,...,1584.048434,1664.209548,1159.890639,1865.257846,1144.469566,1768.852620,1428.60073,1389.49799,1530.916038,1403.51702
