In [1]:
import numpy as np
import matplotlib.pyplot as plt

### Represent state as both a number and state data
Transformation matrix requires that state has to be turned into a number, for readability the class allows both.
How to map game state (inning, half, outs, bases, home score, away score) is somewhat arbitrary, and choices are mostly done for simplicity's sake.

A state is represented as an inning (which includes the half), the bases (represented by one number from converting the 3 digit binary base states), outs, and the score (home team - away team).

    inning = [0,17]
    outs = [0,2]
    bases = [0,8]
    score = [-15,15]

Matrix is finite, so data that has extra innings or a score difference greater than 15 (negative or positive) are thrown out.

Converting from state to number:

    Every unique state maps to a unique number: [0, # of possible states - 1]
    Simple conversion: (score + 15)*18*8*3 + (inning)*8*3 + (bases)*3 + (outs)
    Adding 15 to score is to change the range of score from [-15, 15] to [0, 30]

Lastly to determine when a game has ended a couple flags are used by reserving 3 states (home win, home loss, draw) which normally could never occur:
    
    (Since the away team bats first, score can't be positive in the first inning)
    Home win flag: inning = 0, outs = 0, bases = 0, score = 1
    Away win flag: inning = 0, outs = 0, bases = 0, score = 2
    Draw flag:     inning = 0, outs = 0, bases = 0, score = 3
    
    Note that while the draw flag is still available to use, it shouldn't ever be utilized.

In [11]:
#gameState represents any state in a baseball game, each state can be represented by either members or as one number
#note: end flags, they are inning = 0, outs = 0, bases = 0, score = 1,2,3 (since these states are impossible in normal play)
#if flag score is 1 (home win), 2 (away win), 3 (draw state at end of 17th (9th) inning)
class gameState:
    inning = 0 #[0,17] since each of the 9 innings are either in the top or the bottom
    outs = 0 #[0, 2] number of outs
    bases = 0 #[0,7] binary representation, i.e. 6 (110) has runners on 2nd and 3rd
    score = 0 #score = homeTeamScore - awayTeamScore, for simplicity score = [-15, 15]

    def setFromInfo(self, inn, out, bas, scor): #construct from data, note: base should be a 3 element list like [0,1,1]
        if(inn > 17):
            inn = 16 + int(inn % 2 == 0)
        self.inning = inn
        self.outs = out
        self.bases = 4*bas[2] + 2*bas[1] + bas[0] #conversion from list format to single number
        
        self.score = max(-15, min(15, scor)) #score between -15 and 15
        
    def setFromData(self, inn, out, bas, scor): #construct from data, note: base should be [0,7]
        if(inn > 17):
            inn = 16 + int(inn % 2 == 0)
        self.inning = inn
        self.outs = out
        self.bases = bas
        
        self.score = max(-15, min(15, scor)) #score between -15 and 15

    def setFromNum(self, numEquiv): #construct from number, map number to gameState
        self.outs = numEquiv % 3
        numEquiv = numEquiv // 3
        
        self.bases = numEquiv % 8
        numEquiv = numEquiv // 8
        
        self.inning = (numEquiv % 18)
        numEquiv = numEquiv // 18
        
        self.score = (numEquiv % 31) - 15
        
    def __str__(self): #for convient printing, note bases are printed as 1,2,3 not 3,2,1
        return ("{'Inning': " + str(self.inning) + ", " 
            + "'Outs': " + str(self.outs) + ", "
            + "'Bases': " + str(self.bases % 2) + str((self.bases >> 1) % 2) + str(self.bases >> 2) + ", "
            + "'Score': " + str(self.score) + "}")
    
    def toNum(self): #map gameState to number
            return (self.score + 15)*18*8*3 + (self.inning)*8*3 + (self.bases)*3 + (self.outs)
    
    def setEndFlagState(self, lastState): #set the end flag state from the last state of the game, asssume lastState is an end
        self.inning = 0
        self.outs = 0
        self.bases = 0
        self.score = 3 #draw
        if(lastState.score < 0): #away win
            self.score = 2
        elif(lastState.score > 0): #home win
            self.score = 1
        else: #game ended in overtime and scores are equal
            self.score = 1 #is this correct
                     
    def isEndState(self): #check if the state is an end state
        if(self.inning != 0 or self.outs != 0 or self.bases != 0): #not an end state if any of these aren't true
            return False
        return (self.score >= 1 and self.score <= 3) #is an end flag if score is between [1, 3]
    def getEndResult(self): #return the value of an end state 1 = home win, -1 away win, 0 draw (assumes the state is an end state)        
        if(self.score == 1):
            return 1 #home win
        elif(self.score == 2):
            return -1 #away win
        else:
            return 0 #draw
        
    def getStateSpace(self): #return the number of possible states
        return 3*8*18*31
#gameStates options: 18*3*8*31 = 13392
        

### Get Data from file

The file used is gotten from 2010-2019 play-by-play data from retrosheet.org and using "bevent -f 2-4,8-9,26-28,78 -y 2010 20?????.EV?> FormattedData\2010-2019StateData.txt" with the bevent software provided. Effectively this takes the data and extracts the relevant fields.

Then read each line, turn it into a state, and remember the previous state. Then for every state and previous state 1 is added to the corresponding matrix cell (i.e. matrix[ prev.Num ][ curr.Num ]). Also new game states and data that goes out of range (i.e. score > 15) have to be caught and thrown out.

Lastly turn the matrix into a probability matrix. Since matrix[y][x] yields the number of times state y goes to state x, if you get the sum of matrix[y] and divide every entry in matrix[y] by the sum, then matrix[y][x] is the probability state y will go to state x.

In [3]:
def getStateFromLine(line): #line in format of: inning, top/bottom, out, vis. score, home score, 1st,2nd,3rd, newGameFlag
    lineData = line.split(',')
    
    #convert from the line to the state
    inning = (int(lineData[0])-1)*2 + int(lineData[1]) #map inning and bottom/top for state
    outs = int(lineData[2])
    bases = [int(lineData[5] != '""'), int(lineData[6] != '""'), int(lineData[7] != '""')] #convert to bases
    score = int(lineData[4]) - int(lineData[3]) #home - vis score
    
    newGameStart = lineData[8][1] == 'T' #weird way of checking string, but it works
    
    state = gameState()
    state.setFromInfo(inning,outs,bases,score)
    return state, newGameStart

In [143]:
def getFileCount(fileName): #get the counted matrix from the file
    dataFile = open(fileName, "r")

    stateCurr = gameState()
    statePrev = gameState()
    stateEnd = gameState()
    
    matrixDim = (stateCurr.getStateSpace(),stateCurr.getStateSpace()) #since 31*18*8*3
    matr = np.zeros(matrixDim) #set the size of the large state transition matrix

    line = dataFile.readline()

    gameCount = 1 #1 for initial starting game

    statePrev, newGame = getStateFromLine(line) #avoid running into an error of writing the last game end
    line = dataFile.readline() #read second line, not a newGame
    while(line):
        stateCurr, newGame = getStateFromLine(line)
        if(not newGame):
            matr[statePrev.toNum()][stateCurr.toNum()] += 1 #add one to the entry
        else: #new game, put result of previous game
            stateEnd.setEndFlagState(statePrev) #get the flag from the end state
            #print(lineNum, stateEnd, statePrev)
            matr[statePrev.toNum()][stateEnd.toNum()] += 1 #add one to the end state entry
            gameCount = gameCount + 1

        statePrev = stateCurr
        line = dataFile.readline()

    dataFile.close()
    print("Total games in file: ", gameCount)
    
    return matr
    
def getTransMatrix(countMatrix): #go from count matrix to transition matrix

    matrixDim = np.shape(countMatrix)
    stateTran = np.zeros(matrixDim) #set the size of the large state transition matrix

    #win coordinates should have only 1 entry in their row, gaurenteed chance to return to iteself
    stateEnd = gameState()
    
    stateEnd.setFromData(0,0,0,1) #home win
    stateTran[stateEnd.toNum()] = np.zeros(matrixDim[1]) #set to zeros to remove any "bad" data
    stateEnd.setFromData(0,0,0,2) #home loss
    stateTran[stateEnd.toNum()] = np.zeros(matrixDim[1]) #set to zeros to remove any "bad" data
    stateEnd.setFromData(0,0,0,3) #draw
    stateTran[stateEnd.toNum()] = np.zeros(matrixDim[1]) #set to zeros to remove any "bad" data
    
    #change the matrix to a probability matrix, i.e. matrix[y][x] is probability from going to x, given that you are at y
    for row in range(matrixDim[0]):
        rowSum = sum(countMatrix[row])
        if(rowSum != 0): 
            stateTran[row] = countMatrix[row] / rowSum
        else: #sum is zero, if any other rows lead here, set diagonal to 1
            stateTran[row][row] = 1

    return stateTran

### Interpolate Missing Data, Machine Learning approach

Let $c[i]$ be the number of data points for element i, $d[i]$ be the transition probability for element i purely from the data, $s[i]$ be the transition probability for element i purely from the standard matrix, and $w$ be the likely probability row from $d$ and $s$.

$c$, $d$, $s$, and $w$ are all of shape $(m, 1)$ where m is total options for the transition row.

$t(x)$ can be any function as long as the following is true: $t(0) = 1$ and $t(\infty)=0$. A simple t(x) = $\frac{1}{\sqrt{x+1}}$, note that t(x) can work for either one number or a vector.

Let $ E_{in}(w) = \sum_{i=1}^{m} (w[i] - d[i])^2 $ <br>
Let $ E_{reg}(w) = \sum_{i=1}^{m} (w[i] - s[i])^2 $ <br><br>

Let $ E_{aug}(w) = kE_{in}(w) + (1-k)E_{reg}(w) $ <br>
Substitute: $ E_{aug}(w) =  k \sum_{i=1}^{m} (w[i] - d[i])^2 + (1-k) \sum_{i=m}^{d} (w[i] - s[i])^2 $ <br>
$ E_{aug}(w) = \sum_{i=1}^{m} (k(w[i] - d[i])^2 + (1-k)(w[i] - s[i])^2) $ <br>
Let $k = t(c[i])$ <br>
$ E_{aug}(w) = \sum_{i=1}^{m} (t(c[i])(w[i] - d[i])^2 + (1-t(c[i]))(w[i] - s[i])^2) $ <br>
Without summations: $ E_{aug}(w) = t(c)(w-d)^T(w-d) + (1-t(c))(w-s)^T(w-s)$ <br>

To solve for $w$, get $\nabla E_{aug}(w) = 0$ <br>
$\nabla E_{aug}(w) = t(c)(w-d) + (1-t(c))(w-s) = 0$ <br>
$t(c)w^T - t(c)d^T + (1-t(c))w^T - (1-t(c))s^T = 0$ <br>
$w(t(c) + (1-t(c))) = t(c)d + (1-t(c))s$ <br><br>

$w = t(c)d + (1-t(c))s$ <br>

Simply a weighted combination of the vectors.

In [118]:
def regularizeRow(dataRow, dataCount, standardRow):
    regRow = np.ones(np.shape(dataRow))
    for i in range(len(dataRow)):
        if(dataCount[i] != 0): #already correct value if there are no data points
            regRow[i] = 1/np.sqrt(dataCount[i] + 1)
    
    newRow = ( regRow*standardRow + (1-regRow)*dataRow )
    
    if(sum(newRow) != 0):
        newRow = newRow/sum(newRow) #normalize row since there may be some loss
    return newRow
    

In [144]:
chnCount = getFileCount("F:\\Users\\Daniel\\Machine Learning Work\\Baseball work\\test software\\FormattedData\\2019CHNStateData.txt")
chnTran = getTransMatrix(chnCount)

stanTran = getTransMatrix(getFileCount("F:\\Users\\Daniel\\Machine Learning Work\\Baseball work\\test software\\FormattedData\\2000-2019StateData.txt"))

Total games in file:  81
Total games in file:  48588


In [139]:
#reguralize chnMatrix
chnRegTran = np.zeros(np.shape(chnTran))
for i in range(len(chnRegTran)):
    if(i % 1000 == 0):
        print(i/len(chnRegTran)*100, "%")
    chnRegTran[i] = regularizeRow(chnTran[i], chnCount[i], stanTran[i])

0.0 %
7.467144563918758 %
14.934289127837516 %
22.401433691756274 %
29.868578255675033 %
37.335722819593784 %
44.80286738351255 %
52.27001194743131 %
59.737156511350065 %
67.20430107526882 %
74.67144563918757 %
82.13859020310633 %
89.6057347670251 %
97.07287933094385 %


In [146]:
#shows how regularization changes values
testState = gameState()
testState2 = gameState()
testState.setFromData(0,0,0,2)
print(testState)

print("standard: ")
for i in range(len(stanTran[testState.toNum()])):
    if(stanTran[testState.toNum()][i] != 0):
        testState2.setFromNum(i)
        print(testState2, stanTran[testState.toNum()][i])
print("\n")

print("data: ")
for i in range(len(chnTran[testState.toNum()])):
    if(chnTran[testState.toNum()][i] != 0):
        testState2.setFromNum(i)
        print(testState2, chnTran[testState.toNum()][i], chnCount[testState.toNum()][i])
print("\n")
        
print("Reg: ")
for i in range(len(chnRegTran[testState.toNum()])):
    if(chnRegTran[testState.toNum()][i] != 0):
        testState2.setFromNum(i)
        print(testState2, chnRegTran[testState.toNum()][i])

{'Inning': 0, 'Outs': 0, 'Bases': 000, 'Score': 2}
standard: 
{'Inning': 0, 'Outs': 0, 'Bases': 100, 'Score': 2} 1.0


data: 
{'Inning': 0, 'Outs': 0, 'Bases': 000, 'Score': 2} 1.0 0.0


Reg: 
{'Inning': 0, 'Outs': 0, 'Bases': 000, 'Score': 2} 1.0


In [78]:
def mergeMatrix(home, away): #merge a home and away probability matrix into one, can be used to get match off win probability
    s = gameState() #move through all states and 
    m = np.zeros(home.shape)
    
    for n in range(m.shape[0]):
        s.setFromNum(n)
        if(s.inning % 2 == 0): #away team at bat
            m[n] = away[n]
        else: #away team at bat
            m[n] = home[n]
    return m


In [147]:
import time

startTime = time.time()

chnVsStandard = mergeMatrix(chnRegTran, stanTran)
winMatrix = np.linalg.matrix_power(chnVsStandard, 512) #almost any game should be over by 512 states or "plays"

startState = gameState()
endState = gameState()
startState.setFromData(0,0,0,0)
y = startState.toNum()

for x in range(np.shape(winMatrix)[1]):
    if(winMatrix[y][x] > .000001):
        endState.setFromNum(x)
        print(endState, ", chance: ", winMatrix[y][x])
print("Time elapsed: ", time.time() - startTime)

{'Inning': 0, 'Outs': 0, 'Bases': 000, 'Score': 1} , chance:  0.99993415339108
Time elapsed:  672.6787028312683


In [125]:
print(sum(winMatrix[y]))

for i in range(np.shape(stanTran)[0]):
    if(np.sum(stanTran[i]) > 1.000001):
        print(np.sum(stanTran[i]), i)
        
testState = gameState()
testState.setFromNum(7344)
print(testState)

1345638.28810552
2.0 7344
{'Inning': 0, 'Outs': 0, 'Bases': 000, 'Score': 2}
