In [None]:
# imports
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import numpy as np

In [None]:
# Fetch the fully rendered 2014 MLB schedule page through a Chrome session
# and parse the resulting HTML into `soup` for the cells below.
url = "https://www.baseball-reference.com/leagues/MLB/2014-schedule.shtml"
driver = webdriver.Chrome("/Applications/chromedriver")
try:
    driver.get(url)
    content = driver.page_source
finally:
    # Always shut the browser down, even if the page load fails; otherwise
    # every run of this cell leaves a Chrome/chromedriver process behind.
    driver.quit()
# NOTE(review): no parser is named here, so Beautiful Soup picks whichever
# one is installed; consider pinning one for reproducible parsing.
soup = BeautifulSoup(content)

In [None]:
# Full team name (as written on the schedule page) -> three-letter code.
# The order of the entries matters: it fixes each team's matrix position below.
team_abbrev = {
    "Arizona D'Backs": 'ARI',
    'Atlanta Braves': 'ATL',
    'Baltimore Orioles': 'BAL',
    'Boston Red Sox': 'BOS',
    'Chicago Cubs': 'CHC',
    'Chicago White Sox': 'CHW',
    'Cincinnati Reds': 'CIN',
    'Cleveland Indians': 'CLE',
    'Colorado Rockies': 'COL',
    'Detroit Tigers': 'DET',
    'Miami Marlins': 'MIA',
    'Houston Astros': 'HOU',
    'Kansas City Royals': 'KCR',
    'LA Angels of Anaheim': 'LAA',
    'Los Angeles Dodgers': 'LAD',
    'Milwaukee Brewers': 'MIL',
    'Minnesota Twins': 'MIN',
    'New York Mets': 'NYM',
    'New York Yankees': 'NYY',
    'Oakland Athletics': 'OAK',
    'Philadelphia Phillies': 'PHI',
    'Pittsburgh Pirates': 'PIT',
    'San Diego Padres': 'SDP',
    'San Francisco Giants': 'SFG',
    'Seattle Mariners': 'SEA',
    'St. Louis Cardinals': 'STL',
    'Tampa Bay Rays': 'TBR',
    'Texas Rangers': 'TEX',
    'Toronto Blue Jays': 'TOR',
    'Washington Nationals': 'WSN',
}

# Three-letter code -> row/column position (0..29) in the 30x30 game matrices,
# numbered in the same order the codes are listed in `team_abbrev`.
team_index = {code: position for position, code in enumerate(team_abbrev.values())}

In [None]:
# Build `schedule`: a dict mapping each game date (datetime) to a symmetric
# 30x30 matrix whose [i, j] entry counts the games between teams i and j
# listed under that date on the scraped page.
schedule = {}

# Walk down the page skeleton to the container holding one <div> per date.
body = soup.find('body', attrs={'class': 'br'})
wrap = body.find('div', attrs={'id': 'wrap'})
main = wrap.find('div', attrs={'id': 'content', 'role': 'main'})
section_wrapper = main.find('div', attrs={'class': 'section_wrapper'})
section_content = section_wrapper.find('div', attrs={'class': 'section_content'})

for date in section_content.find_all('div', recursive=False):
    # The first text node of the <h3> is the date, e.g. "Monday, September 1, 2014".
    date_string = date.find('h3').find_all(string=True)[0]
    # Strip stray whitespace around the heading so strptime does not reject it.
    date_object = datetime.strptime(date_string.strip(), '%A, %B %d, %Y')
    # Reuse the matrix if this date was already seen, so a repeated date
    # heading adds to the earlier games instead of overwriting them.
    play_matrix = schedule.setdefault(date_object, np.zeros((30, 30)))
    for game in date.find_all('p', attrs={'class': 'game'}, recursive=False):
        # One team is a direct <a> child of the game paragraph; the other is
        # the <a> nested inside <strong>. Which is which does not matter
        # because the matrix is filled symmetrically.
        plain_name = game.find('a', recursive=False).find_all(string=True)[0]
        strong_name = game.find('strong', recursive=False).find('a').find_all(string=True)[0]
        i = team_index[team_abbrev[plain_name]]
        j = team_index[team_abbrev[strong_name]]
        play_matrix[i, j] += 1
        play_matrix[j, i] += 1

In [None]:
def game_matrix(date, prior=True):
    """Total the head-to-head game counts on one side of a cutoff date.

    Sums the daily matrices stored in the module-level ``schedule`` dict:
    days on or before ``date`` when ``prior`` is truthy, days strictly after
    ``date`` otherwise.

    Returns an integer DataFrame whose row and column labels are renamed from
    matrix positions to team codes using the module-level ``team_index``.
    """
    if prior:
        selected = [matrix for day, matrix in schedule.items() if day <= date]
    else:
        selected = [matrix for day, matrix in schedule.items() if day > date]

    # Start from an all-zero matrix so an empty selection still yields 30x30.
    totals = sum(selected, np.zeros((30, 30)))

    position_to_code = dict(zip(team_index.values(), team_index.keys()))
    labelled = pd.DataFrame(totals).rename(index=position_to_code,
                                           columns=position_to_code)
    return labelled.astype(int)

In [None]:
# Save the matrix of games scheduled strictly after September 1, 2014,
# i.e. the games that were still left to play at that point.
cutoff = datetime(2014, 9, 1)
games_remaining = game_matrix(cutoff, prior=False)
games_remaining.to_csv('data/games_left.csv')

## TEST

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
# Load the two inputs of the elimination check. In both files the first CSV
# column becomes the row index (later cells look rows up by team code).
games_left = pd.read_csv('data/games_left.csv', index_col=0)
standing = pd.read_csv('data/standing.csv', index_col=0)

In [None]:
# Team whose elimination is being tested.
team = 'HOU'

# Best case for `team`: its current wins plus every game it still has to play.
W = standing.at[team, 'W'] + sum(games_left[team])

# w[club] = how many more games `club` may win without passing W.
# A negative value means `club` already has more wins than W.
w = {}
for club, record in standing.iterrows():
    headroom = W - record['W']
    w[club] = headroom
    if headroom < 0:
        raise ValueError('Negative edge weight')

In [None]:
# Flow network for the elimination test:
#   s -> "A,B" pair node   capacity = games left between A and B
#   "A,B" -> A, "A,B" -> B no capacity set here
#   A -> t                 capacity = w[A]
# NOTE(review): the pair->team edges are presumably given infinite capacity by
# add_infinite_capacities() in a later cell - confirm against max_flow.py.
G = nx.DiGraph()

# Source and sink, with plotting positions.
G.add_node('s', pos=(0,0.5))
G.add_node('t', pos=(1,0.5))

teams = standing.index
n_teams = len(standing)

# Pass 1: create every node first so each one carries a `pos` attribute.
for a in range(n_teams):
    # team node
    G.add_node('%s' % (teams[a]), pos=(0.75, 10*a))
    # one pair node per unordered pair (a, b) with a < b
    for b in range(a + 1, n_teams):
        G.add_node('%s,%s' % (teams[a], teams[b]), pos=(0.25, 10*(a+b)))

# Pass 2: wire up the edges.
for a in range(n_teams):
    # team -> sink
    G.add_edge(teams[a], 't', capacity=w[teams[a]])

    for b in range(a + 1, n_teams):
        matchup = '%s,%s' % (teams[a], teams[b])

        # source -> pair node
        G.add_edge('s', matchup, capacity=games_left.at[teams[a], teams[b]])

        # pair node -> each of its two teams
        G.add_edge(matchup, '%s' % (teams[a]))
        G.add_edge(matchup, '%s' % (teams[b]))

In [None]:
from max_flow import *

# Wrap G (after add_infinite_capacities has processed it) in a max-flow
# instance and run Ford-Fulkerson without the step-by-step display.
ex = max_flow(add_infinite_capacities(G))
ex.ford_fulkerson(show=False)

# Print every node whose 'check' attribute is truthy after the run.
checked_attr = nx.get_node_attributes(ex.G, 'check')
for node, is_checked in checked_attr.items():
    if is_checked:
        print(node)

In [None]:
# Games still to be played among LAA, OAK and SEA (each pairing counted once).
sum(games_left.at[a, b] for a, b in (('LAA', 'OAK'), ('LAA', 'SEA'), ('SEA', 'OAK')))

In [None]:
# Combined number of further wins LAA, OAK and SEA may take without passing W.
sum(w[club] for club in ('LAA', 'OAK', 'SEA'))