## Optimizing K with Full Training

This notebook takes in your dataset and finds the K value whose final Elo ratings best fit the data. It also runs the Elo system at that K value.

Compared to OptimizeK_OnTheGo, this notebook is a more standard approach to optimizing K. It takes in all of the training information and then produces the final Elo ratings that best fit it. The idea originally comes from https://opisthokonta.net/?p=1387

-Grant Harkins

In [None]:
#This cell contains the main calculations associated with the Elo method
#The first function calculates the probability of each team winning
#The second function takes those probabilities, and the outcome of the game to determine the new ratings 
import math 
def Probability(rating1, rating2):
    """Return the Elo win probability of the side rated ``rating2``.

    The value is ``1 / (1 + 10 ** ((rating1 - rating2) / 400))``, so it is
    0.5 for equal ratings and grows as ``rating2`` exceeds ``rating1``.
    Call ``Probability(Rb, Ra)`` to get the probability for the side rated
    ``Ra``.
    """
    # Rating gap expressed in units of 400 points (one factor of ten in odds).
    scaledGap = (rating1 - rating2) / 400.0
    return 1.0 / (1 + math.pow(10, scaledGap))
  
# Elo update for a single game.
def EloRating(Ra, Rb, K, tie):
    """Return the updated ratings ``(Ra, Rb)`` after one game.

    Ra, Rb -- current ratings of Player A and Player B.
    K      -- constant that scales how far the ratings move.
    tie    -- True if the game was a tie; otherwise Player A is taken to
              have beaten Player B.
    """
    # Winning probabilities of each player before the game.
    expectedB = Probability(Ra, Rb)
    expectedA = Probability(Rb, Ra)

    # Actual result: half a win each for a tie, otherwise A won and B lost.
    if tie:
        actualA, actualB = 1/2, 1/2
    else:
        actualA, actualB = 1, 0

    # Each rating moves by K times (actual result - expected result).
    newRa = Ra + K * (actualA - expectedA)
    newRb = Rb + K * (actualB - expectedB)
    return newRa, newRb

In [None]:
import pandas as pd
#We got our data from masseyratings.com, so reading the files is based on the structure of those files

# Placeholders: fill in the directory and file name of each input file.
pathGames = '/FILEPATH/' #Filepath for games file
gameFilename = '.txt'
pathTeams = '/FILEPATH/' #Filepath for team file
teamFilename = '.txt'

# The games file is read without a header row and its first line is skipped,
# so the columns are labelled 0, 1, 2, ...
gamesFile = pathGames + gameFilename
games = pd.read_csv(gamesFile, header=None, skiprows=1)

In [None]:
import pandas as pd

# The team file is read without a header row; one row per team.
teamsFile = pathTeams + teamFilename
teamNames = pd.read_csv(teamsFile, header=None)
numTeams = teamNames.shape[0]

In [None]:
# Layout of the games table (columns are labelled by position):
#	column 0 = days since 1/1/0000
#	column 1 = date in YYYYMMDD format
#	column 2 = team1 index
#	column 3 = team1 homefield (1 = home, -1 = away, 0 = neutral)
#	column 4 = team1 score
#	column 5 = team2 index
#	column 6 = team2 homefield (1 = home, -1 = away, 0 = neutral)
#	column 7 = team2 score

# One row per game.
numGames = games.shape[0]

In [None]:
#This function takes in the Elo ratings found at the end of the training set for a certain K, then finds the squared errors for that season
def GetErrors(eloRatings):
    """Return a list with one squared-error value per decisive game.

    For each game in ``games`` that has a winner, the error is
    (P(winner wins) - 1)**2 + (P(loser wins) - 0)**2, with both
    probabilities computed from ``eloRatings``. Games in which neither
    score is greater than the other are skipped.
    """
    errors = []
    for row in range(numGames):
        idA = games.loc[row, 2] - 1 # subtracting 1 since python indexes at 0
        scoreA = games.loc[row, 4]
        idB = games.loc[row, 5] - 1 # subtracting 1 since python indexes at 0
        scoreB = games.loc[row, 7]

        # Work out who won; anything that is not a strict win is skipped.
        if scoreA > scoreB:
            winner, loser = idA, idB
        elif scoreA < scoreB:
            winner, loser = idB, idA
        else:
            continue

        winnerProb = Probability(eloRatings[loser], eloRatings[winner])
        loserProb = Probability(eloRatings[winner], eloRatings[loser])
        errors.append((winnerProb - 1)**2 + (loserProb - 0)**2)
    return errors

In [None]:
#Finding the optimal K

import numpy as np

sigFig = 2 #number of search passes: pass 0 scans whole-number K, each later pass refines by one more decimal place

# Pull the needed columns out of the table once; looking up single cells with
# games.loc inside the K loops repeats the same slow lookups for every K.
team1IDs = games[2].to_numpy() - 1 # subtracting 1 since python indexes at 0
team1Scores = games[4].to_numpy()
team2IDs = games[5].to_numpy() - 1 # subtracting 1 since python indexes at 0
team2Scores = games[7].to_numpy()


def _TrainElo(K):
    """Replay every game in file order at the given K and return the final ratings.

    Every team starts at 0. The returned array has one entry per team,
    indexed by team index minus one.
    """
    ratings = np.zeros(numTeams) #resets the elo ratings for each K
    for team1ID, team1Score, team2ID, team2Score in zip(team1IDs, team1Scores, team2IDs, team2Scores):
        # EloRating expects the winner first, so the arguments (and the
        # returned pair) are swapped when team2 won.
        if team1Score > team2Score:
            team1Rating, team2Rating = EloRating(ratings[team1ID], ratings[team2ID], K, False)
        elif team1Score < team2Score:
            team2Rating, team1Rating = EloRating(ratings[team2ID], ratings[team1ID], K, False)
        else:
            team1Rating, team2Rating = EloRating(ratings[team1ID], ratings[team2ID], K, True)

        ratings[team1ID] = team1Rating
        ratings[team2ID] = team2Rating
    return ratings


for p in range(sigFig):

    if p == 0:
        startK = 15 #The K's to check range from startK to endK
        endK = 30
        step = 1
    else:
        # Centre the new window on the previous best K; it spans half of the
        # previous step on each side and is scanned with a finer step.
        startK = bestK - step*(1/2)
        endK = bestK + step*(1/2)
        step = 10**(-p)

    # round() before int(): float error can make the quotient land just below
    # a whole number, which plain truncation would turn into a missing last K.
    runs = int(round((endK - startK) / step)) + 1

    allErrors = []
    for m in range(runs):
        K = startK + (m * step)
        eloRatings = _TrainElo(K)

        squaredErrors = GetErrors(eloRatings) #finds the squared errors at this K
        meanError = sum(squaredErrors) / len(squaredErrors)

        allErrors.append(meanError) #holds the errors
        #print(f'The mean squared Error for K = {K} is {meanError}') #optional to print MSE for every K

    bestK = (np.argmin(allErrors) * step) + startK
    print(f'The best K in [{startK}, {endK}] is {bestK} with MSE of {allErrors[np.argmin(allErrors)]}')

K = bestK #To print ratings/rankings below with bestK
# The search loop leaves eloRatings at the last K it tried, not the best one,
# so retrain once at bestK before the ratings are printed.
eloRatings = _TrainElo(K)

In [None]:
#printing the ratings/rankings at optimal K

k = 0 #number of teams to print in rankings; if k=0, prints all
iSort = np.argsort(-eloRatings) # team positions ordered from highest to lowest rating

print('\n\n************** ELO Rating Method **************\n')
print('===========================')
print('Rank   Rating      Team   ')
print('===========================')

# k == 0 means "print every team"; otherwise print the first k rows.
numToPrint = numTeams if k == 0 else k
for i in range(numToPrint):
    rank = i + 1
    teamIdx = iSort[i]
    print(f'{rank:4d}   {eloRatings[teamIdx]:.5f}  {teamNames.loc[teamIdx,1]}')

print('')   # extra carriage return