In [1]:
# Liam Fruzyna
# MATH 4630
# Final Project

# Data from Sean Lahman's Baseball Database
# http://www.seanlahman.com/baseball-archive/statistics/

In [2]:
# imports
import pandas as pd
import random as rd
from statistics import mean

In [3]:
# function for improved logging
printLogs = False
def log(*objs, force=False):
    if force or printLogs:
        strs = []
        for obj in objs:
            strs.append(str(obj))
        print(' '.join(strs))

In [4]:
# read in and format batting data
b = pd.read_csv('Batting.csv')
b = b[b['yearID'] == 1968]
b['1B'] = b['H'] - b['2B'] - b['3B'] - b['HR']
b['O'] = b['AB'] - (b['H'] + b['SO'])
b['PA'] = b['AB'] + b['BB']
b = b[['playerID', 'teamID', 'lgID', 'PA', 'H', '1B', '2B', '3B', 'HR', 'BB', 'SO', 'O']]
b.head()

Unnamed: 0,playerID,teamID,lgID,PA,H,1B,2B,3B,HR,BB,SO,O
46520,aaronha01,ATL,NL,670.0,174.0,108.0,33.0,4.0,29.0,64.0,62.0,370.0
46521,aaronto01,ATL,NL,304.0,69.0,55.0,10.0,3.0,1.0,21.0,37.0,177.0
46522,abernte02,CIN,NL,20.0,0.0,0.0,0.0,0.0,0.0,3.0,12.0,5.0
46523,adairje01,BOS,AL,217.0,45.0,42.0,1.0,0.0,2.0,9.0,28.0,135.0
46524,adamsmi01,BAL,AL,3.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0


In [5]:
# convert batting data to percentages
bp = b.copy()
bp['H'] = b['H'] / b['PA']
bp['BB'] = b['BB'] / b['PA']
bp['SO'] = b['SO'] / b['PA']
bp['O'] = b['O'] / b['PA']
bp['1B'] = b['1B'] / b['PA']
bp['2B'] = b['2B'] / b['PA']
bp['3B'] = b['3B'] / b['PA']
bp['HR'] = b['HR'] / b['PA']
bp.head()

Unnamed: 0,playerID,teamID,lgID,PA,H,1B,2B,3B,HR,BB,SO,O
46520,aaronha01,ATL,NL,670.0,0.259701,0.161194,0.049254,0.00597,0.043284,0.095522,0.092537,0.552239
46521,aaronto01,ATL,NL,304.0,0.226974,0.180921,0.032895,0.009868,0.003289,0.069079,0.121711,0.582237
46522,abernte02,CIN,NL,20.0,0.0,0.0,0.0,0.0,0.0,0.15,0.6,0.25
46523,adairje01,BOS,AL,217.0,0.207373,0.193548,0.004608,0.0,0.009217,0.041475,0.129032,0.62212
46524,adamsmi01,BAL,AL,3.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.666667,0.0


In [6]:
# read in and format pitching data
p = pd.read_csv('Pitching.csv')
p = p[p['yearID'] == 1968]
p['O'] = p['BFP'] - (p['H'] + p['BB'] + p['SO'])
p = p[['playerID', 'teamID', 'lgID', 'BFP', 'H', 'HR', 'BB', 'SO', 'O']]
p.head()

Unnamed: 0,playerID,teamID,lgID,BFP,H,HR,BB,SO,O
18474,abernte02,CIN,NL,562.0,111,9,55,64,332.0
18475,adamsmi01,BAL,AL,36.0,9,2,4,4,19.0
18476,aguirha01,LAN,NL,167.0,32,0,13,25,97.0
18477,akerja01,OAK,AL,330.0,72,6,33,44,181.0
18478,arrigge01,CIN,NL,853.0,181,13,77,140,455.0


In [7]:
# convert pitching data to percentages
pp = p.copy()
pp['H'] = p['H'] / p['BFP']
pp['BB'] = p['BB'] / p['BFP']
pp['SO'] = p['SO'] / p['BFP']
pp['O'] = p['O'] / p['BFP']
pp['HR'] = p['HR'] / p['BFP']
pp.head()

Unnamed: 0,playerID,teamID,lgID,BFP,H,HR,BB,SO,O
18474,abernte02,CIN,NL,562.0,0.197509,0.016014,0.097865,0.113879,0.590747
18475,adamsmi01,BAL,AL,36.0,0.25,0.055556,0.111111,0.111111,0.527778
18476,aguirha01,LAN,NL,167.0,0.191617,0.0,0.077844,0.149701,0.580838
18477,akerja01,OAK,AL,330.0,0.218182,0.018182,0.1,0.133333,0.548485
18478,arrigge01,CIN,NL,853.0,0.212192,0.01524,0.09027,0.164127,0.533411


In [8]:
# get a list of teams and franchise info
teamIds = b['teamID'].unique()
teamIds

array(['ATL', 'CIN', 'BOS', 'BAL', 'HOU', 'NYN', 'LAN', 'OAK', 'WS2',
       'PHI', 'PIT', 'MIN', 'CHA', 'SFN', 'CLE', 'NYA', 'CHN', 'CAL',
       'SLN', 'DET'], dtype=object)

In [9]:
# build a roster of 8 batters and a pitcher for each team
teams = {}
for team in teamIds:
    # find single most used pitched (batters faced)
    pitchers = pp[pp['teamID'] == team]
    pitcher = pitchers.nlargest(1, columns=['BFP']).iloc[0]
    teams[team + '-pitcher'] = pitcher
    # find top 8 most used batters (at bats), plus pitcher
    batters = bp[bp['teamID'] == team]
    pitcherBat = batters[batters['playerID'] == pitcher['playerID']]
    teams[team + '-batters'] = batters.nlargest(8, columns=['PA']).append(pitcherBat)
teams

{'ATL-pitcher': playerID    niekrph01
 teamID            ATL
 lgID               NL
 BFP              1019
 H            0.223749
 HR          0.0157017
 BB          0.0441609
 SO            0.13739
 O            0.594701
 Name: 18667, dtype: object,
 'ATL-batters':         playerID teamID lgID     PA         H        1B        2B        3B  \
 46536   aloufe01    ATL   NL  710.0  0.295775  0.221127  0.052113  0.007042   
 46520  aaronha01    ATL   NL  670.0  0.259701  0.161194  0.049254  0.005970   
 46957  millafe01    ATL   NL  592.0  0.278716  0.236486  0.037162  0.003378   
 47170  torrejo01    ATL   NL  458.0  0.251092  0.200873  0.024017  0.004367   
 46721  francti01    ATL   NL  397.0  0.249370  0.209068  0.032746  0.002519   
 46913  martima01    ATL   NL  385.0  0.212987  0.192208  0.012987  0.007792   
 46822  jacksso01    ATL   NL  383.0  0.211488  0.182768  0.020888  0.005222   
 46834  johnsde01    ATL   NL  377.0  0.188329  0.135279  0.029178  0.002653   
 46989  niekrp

In [10]:
# example of batter data for the Cubs
teams['CHN-batters']

Unnamed: 0,playerID,teamID,lgID,PA,H,1B,2B,3B,HR,BB,SO,O
46860,kessido01,CHN,NL,693.0,0.226551,0.194805,0.020202,0.010101,0.001443,0.054834,0.124098,0.594517
47213,willibi01,CHN,NL,690.0,0.268116,0.169565,0.043478,0.011594,0.043478,0.069565,0.076812,0.585507
46560,beckegl01,CHN,NL,674.0,0.280415,0.227003,0.041543,0.005935,0.005935,0.045994,0.029674,0.643917
47094,santoro01,CHN,NL,673.0,0.210996,0.142645,0.02526,0.004458,0.038633,0.142645,0.157504,0.488856
46815,hundlra01,CHN,NL,592.0,0.211149,0.162162,0.030405,0.006757,0.011824,0.065878,0.116554,0.606419
46555,bankser01,CHN,NL,579.0,0.234888,0.132988,0.046632,0.0,0.055268,0.046632,0.115717,0.602763
47027,phillad01,CHN,NL,486.0,0.218107,0.139918,0.041152,0.010288,0.026749,0.096708,0.185185,0.5
46839,johnslo01,CHN,NL,211.0,0.236967,0.151659,0.066351,0.014218,0.004739,0.028436,0.109005,0.625592
46828,jenkife01,CHN,NL,106.0,0.150943,0.103774,0.037736,0.0,0.009434,0.056604,0.386792,0.40566


In [11]:
# example of pitcher data for the Cubs
teams['CHN-pitcher']

playerID    jenkife01
teamID            CHN
lgID               NL
BFP              1231
H            0.207149
HR           0.021121
BB          0.0528026
SO            0.21121
O            0.528838
Name: 18598, dtype: object

In [12]:
# function to build a list of odds into a list of brackets
def sumOdds(odds):
    for i in range(1, len(odds)):
        odds[i] += odds[i-1]
    return odds

In [13]:
# object to track and manage which bases are occupied
class Bases:
    def __init__(self):
        self.bases = [False, False, False, False]
        self.runs = 0
        
    def __repr__(self):
        bases = ''
        for b in range(1, 4):
            if self.bases[b]:
                bases += ' ' + str(b)
        return str(self.runs) + ' scored with men on' + bases
        
    def play(self, earned):
        log('Bases:', earned)
        if earned > 0:
            for b in range(len(self.bases)-1, 0, -1):
                if self.bases[b]:
                    reached = b + earned
                    self.bases[b] = False
                    if reached >= 4:
                        self.runs += 1
                    else:
                        self.bases[reached] = True
            if earned == 4:
                self.runs += 1
            else:
                self.bases[earned] = True

In [14]:
# process a single at bat of a pitcher vs a batter
def runAtBat(batter, pitcher):
    log('Batting:', batter['playerID'])
    odds = sumOdds([mean([batter['1B'], pitcher['H']/4]), mean([batter['2B'], pitcher['H']/4]), 
                    mean([batter['3B'], pitcher['H']/4]), mean([batter['HR'], pitcher['HR']]), 
                    mean([batter['BB'], pitcher['BB']]), mean([batter['SO'], pitcher['SO']])])
    log(odds)
    play = rd.random()
    if play <= odds[0]:
        log('Single')
        return 1, 0
    elif play <= odds[1]:
        log('Double')
        return 2, 0
    elif play <= odds[2]:
        log('Triple')
        return 3, 0
    elif play <= odds[3]:
        log('Home Run')
        return 4, 0
    elif play <= odds[4]:
        log('Base on Balls')
        return 1, 0
    elif play <= odds[5]:
        log('Strike Out')
        return 0, 1
    log('Out')
    return 0, 1

In [15]:
# run a single side of an inning (3 outs)
def runInning(offTeam, defTeam, leadOff):
    lineup = teams[offTeam + '-batters']
    pitcher = teams[defTeam + '-pitcher']
    bnum = leadOff
    outs = 0
    bases = Bases()
    while outs < 3:
        batter = lineup.iloc[bnum]
        b, o = runAtBat(batter, pitcher)
        bases.play(b)
        outs += o
        bnum += 1
        log(bases)
        if bnum >= 9:
            bnum = 0
    return bases.runs, bnum

In [16]:
# run a single game, a home team against a visitor
def runGame(homeTeam, awayTeam):
    home = 0
    away = 0
    inning = 1
    homeNext = 0
    awayNext = 0
    while inning <= 9 or home == away:
        log('---')
        log('Top', inning)
        r, n = runInning(awayTeam, homeTeam, awayNext)
        away += r
        awayNext = n
        log('---')
        log('Bottom', inning)
        r, n = runInning(homeTeam, awayTeam, homeNext)
        home += r
        homeNext = n
        inning += 1
    winner = 'HOME'
    if home < away:
        winner = 'AWAY'
    return home, away, winner

In [17]:
# play a series of games between 2 teams
def runSeries(home, away, games, seriesLen):
    for i in range(seriesLen):
        log(away, '@', home, '#' + str(i+1))
        games['homeTeam'].append(home)
        games['awayTeam'].append(away)
        h, a, w = runGame(home, away)
        games['homeScore'].append(h)
        games['awayScore'].append(a)
        games['winner'].append(w)
        log('Final:', h, a)
        log('')

In [18]:
# process a while league of teams, assuming a certain amount of home games against each team in the league
def runLeague(teams, seriesLen):
    games = {'homeTeam': [], 'awayTeam': [], 'homeScore': [], 'awayScore': [], 'winner': []}
    for home in teams:
        for away in teams:
            if home != away:
                runSeries(home, away, games, seriesLen)
    return pd.DataFrame(data=games)

In [19]:
# process a while league of teams, assuming a certain amount of home games against each team in the league
def runPlayoff(teamA, teamB):
    games = {'homeTeam': [], 'awayTeam': [], 'homeScore': [], 'awayScore': [], 'winner': []}
    teams = pd.concat([teamA, teamB], axis=1).T.sort_values(by=['wins', 'homeWins', 'awayWins', 'team'], ascending=False)
    teamA = teams['team'].iloc[0]
    teamB = teams['team'].iloc[1]
    runSeries(teamA, teamB, games, 4)
    runSeries(teamB, teamA, games, 3)
    return pd.DataFrame(data=games)

In [20]:
# generate a leaderboard for a given set of teams
def standings(league):
    board = {'team': [], 'wins': [], 'losses': [], 'homeWins': [], 'homeLosses': [], 'awayWins': [], 'awayLosses': []}
    for team in league['homeTeam'].unique():
        homeGames = league[league['homeTeam'] == team]
        homeWins = len(homeGames[homeGames['winner'] == 'HOME'].index)
        awayGames = league[league['awayTeam'] == team]
        awayWins = len(awayGames[awayGames['winner'] == 'AWAY'].index)
        totalGames = len(homeGames.index) + len(awayGames.index)
        board['team'].append(team)
        board['wins'].append(homeWins + awayWins)
        board['losses'].append(totalGames - (homeWins + awayWins))
        board['homeWins'].append(homeWins)
        board['homeLosses'].append(len(homeGames.index) - homeWins)
        board['awayWins'].append(awayWins)
        board['awayLosses'].append(len(awayGames.index) - awayWins)
    return pd.DataFrame(data=board).sort_values(by=['wins', 'homeWins', 'awayWins', 'team'], ascending=False)

In [21]:
# returns the best team in a standings 
def getWinner(standings):
    return standings.iloc[0]

In [22]:
# process the national and american leagues independently
national = runLeague(b.loc[b['lgID'] == 'NL']['teamID'].unique(), 9)
american = runLeague(b.loc[b['lgID'] == 'AL']['teamID'].unique(), 9)

In [23]:
# national league standings
nlstand = standings(national)
nlstand

Unnamed: 0,team,wins,losses,homeWins,homeLosses,awayWins,awayLosses
9,SLN,108,54,50,31,58,23
3,NYN,88,74,43,38,45,36
7,SFN,87,75,46,35,41,40
8,CHN,81,81,49,32,32,49
1,CIN,81,81,42,39,39,42
2,HOU,81,81,39,42,42,39
6,PIT,79,83,35,46,44,37
0,ATL,71,91,37,44,34,47
4,LAN,69,93,31,50,38,43
5,PHI,65,97,33,48,32,49


In [24]:
# american league standings
alstand = standings(american)
alstand

Unnamed: 0,team,wins,losses,homeWins,homeLosses,awayWins,awayLosses
9,DET,103,59,54,27,49,32
1,BAL,93,69,42,39,51,30
4,MIN,90,72,47,34,43,38
6,CLE,88,74,40,41,48,33
0,BOS,79,83,36,45,43,38
8,CAL,77,85,34,47,43,38
7,NYA,75,87,39,42,36,45
2,OAK,75,87,33,48,42,39
3,WS2,66,96,36,45,30,51
5,CHA,64,98,38,43,26,55


In [25]:
# simulate a world series
ws = runPlayoff(getWinner(nlstand), getWinner(alstand))
worldstand = standings(ws)
worldstand

Unnamed: 0,team,wins,losses,homeWins,homeLosses,awayWins,awayLosses
1,DET,5,2,2,1,3,1
0,SLN,2,5,1,3,1,2


In [26]:
# display world champion
getWinner(worldstand)['team']

'DET'