In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from bs4 import BeautifulSoup
import requests
import json
import time
import string
# from tqdm import *
import datetime
import math

# Working on a Baseline

In [3]:
#############################################################################
## READ IN THE CSVs WITH ALL THE DATA AND REDO THE DATETYPE TRANSFORMATION ## 
#############################################################################
traindf = pd.read_csv('/Users/George/DropBox/CS 109/traindf.csv', index_col=0);
traindf.date = pd.to_datetime(traindf.date)
traindf['name'] = traindf['name'].astype(str)
traindf['name'] = map(lambda x: x.upper(), traindf['name'])
traindf = traindf[traindf.name != 'NAN']
traindf.reset_index().drop('index', axis=1, inplace=True);

testdf = pd.read_csv('tempdata/testdf.csv', index_col=0);
testdf.date = pd.to_datetime(testdf.date)
testdf['name'] = testdf['name'].astype(str)
testdf['name'] = map(lambda x: x.upper(), testdf['name'])
testdf = testdf.loc[testdf.date >= datetime.datetime(1999, 1, 1)]

  data = self._reader.read(nrows)


In [4]:
def find_races(horses_name, traindf=traindf):
    return traindf[traindf['name']==horses_name]

def find_avg_payouts(df, race_date=datetime.date.today()):
    out = {}
    df = df[df.date < race_date]
    out['win'] = np.mean(df.win[df.win != 0.0])
    out['place'] = np.mean(df.place[df.place != 0.0])
    out['show'] = np.mean(df.show[df.show != 0.0])
    return out

find_avg_payouts(find_races('SMARTY JONES', traindf=testdf), race_date=datetime.datetime(1998, 12, 12))

{'place': nan, 'show': nan, 'win': nan}

The payouts are already standardized to $2 bets, so we luckily don't have to deal with that problem

In [5]:
unique_test_dates = testdf['date'].unique()

# functions to give horse and track for unique dates of triple crown races

def find_horses(date):
    return testdf['name'].loc[testdf['date'] == date]

def find_track(date):
    return testdf['track'].loc[testdf['date'] == date][0]

#find_horses(datetime.datetime(1998, 5, 15))
#type(testdf['date'][0])

We run a simple regression (aka Average) to get a dict where the keys are a tuple `(date, track, horse)` and the values are themselves dictionaries with keys `win`, `place`, `show` and values as the corresponding averages leading into raceday (excluding information we couldn't have on raceday). 

In [6]:
avg_payouts = {}
for i in testdf['date'].unique():
    race = testdf.loc[testdf.date == i]
    for j in range(len(race)):
        date = race.date.iloc[j]
        track = race.track.iloc[j]
        horse = race.name.iloc[j]
        avg_payouts[(date, track, horse)] = find_avg_payouts(
            find_races(race['name'].iloc[j], traindf=traindf), race_date=date)

In [7]:
## this function takes a payout and returns the first odds digit, assuming a comparison to 1, i.e. x-1 
## of that horse to win when betting occured assuming original bet of $2 and assumed take by race track of 15% 
def payoff_to_odds(payoff, bet_amount=2.0, take = .15):
    return round(((payoff/(1-take) - bet_amount)/bet_amount),4)

def odds_to_percent(odds): 
    return (float(str(odds)[2]))/(float(str(odds)[0])+float((str(odds)[2])))

def normalize_odds(odds): 
    x = odds.split("-")
    if len(x) > 1: 
        return float(x[0])/float(x[1])
    else: 
        return float(x[0])
    
def make_favorite(string): 
    if "favorite" in string: 
        return True 
    else:
        return False 

In [19]:
def baseline(indict):
    profit = []
    cost = []
    count = []
    count2 = []
    for key in indict.keys():
        count2.append(1)
        date = key[0]
        track = key[1]
        horse = key[2]
        
        exp_win = indict[key]['win']
        exp_place = indict[key]['place']
        exp_show = indict[key]['show']
        
        # get the morning line odds for that day to compare
        odds = testdf.loc[(testdf.name == horse) & (testdf.track == track) & (testdf.date == date)]['Odds'].iloc[0]

        if odds > payoff_to_odds(exp_win):
            cost.append(2)
            
            if math.isnan(testdf.loc[(testdf.name == horse) & 
                                     (testdf.track == track) & 
                                     (testdf.date == date)]['win'].iloc[0]):
                count.append(1)
                pass
            
            else:
                profit.append(testdf.loc[(testdf.name == horse) & 
                                     (testdf.track == track) & 
                                     (testdf.date == date)]['win'].iloc[0])
        else:
            count.append(1)
            pass
        
    print profit
    print cost
    return (np.sum(profit) - np.sum(cost), np.sum(profit), np.sum(cost))
            
baseline(avg_payouts)

[20.399999999999999, 0.0, 0.0, 43.799999999999997, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.7999999999999998, 27.600000000000001, 0.0, 0.0, 0.0, 8.8000000000000007, 6.7999999999999998, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 29.600000000000001, 0.0, 0.0, 18.0, 0.0, 0.0, 0.0, 6.5999999999999996, 0.0, 0.0, 0.0, 0.0, 0.0, 27.800000000000001, 0.0, 64.599999999999994, 0.0, 23.0, 0.0, 25.399999999999999, 0.0, 6.5999999999999996, 0.0, 0.0, 0.0, 32.799999999999997, 0.0, 0.0, 102.59999999999999, 0.0, 10.199999999999999, 0.0, 0.0, 0.0, 0.0, 8.5999999999999996, 0.0, 0.0, 0.0, 51.5, 0.0, 14.199999999999999, 0.0, 43.0, 28.0, 0.0, 7.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.6]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

(-320.19999999999993, 625.80000000000007, 946)

Dataframe of race, year, profit/loss on win baseline, profit/loss on place baseline, profit/loss on show baseline

In [21]:
def make_bets_baseline(indict, unique_test_dates=unique_test_dates): 
    outdict = {}
    outdict['method'] = []
    outdict['no_bets_placed'] = []
    outdict['race'] = []
    outdict['year'] = []
    outdict['profit_wins'] = []
    outdict['profit_places'] = []
    outdict['profit_shows'] = []
    outdict['total_profit'] = []
    outdict['cost'] = []
    for unique_date in unique_test_dates:
        newdict = {}
        for key in indict.keys():
            date = key[0]
            track = key[1]
            horse = key[2]
            
            
            if date == unique_date:
                newdict[key] = indict[key]
                  
        # now we have newdict, subsetting based on race (by unique date)
        
        spread_tup = []
        for key in newdict.keys():
            date = key[0]
            track = key[1]
            horse = key[2]

            exp_win = newdict[key]['win']
            exp_place = newdict[key]['place']
            exp_show = newdict[key]['show']

            # get the morning line odds for that day to compare
            odds = testdf.loc[(testdf.name == horse) & 
                              (testdf.track == track) & 
                              (testdf.date == date)]['Odds'].iloc[0]
            
            spread = odds - payoff_to_odds(exp_win)
            spread_tup.append((horse, spread))
            
        sorted_list = sorted(spread_tup,key=lambda x: x[1], reverse=True)
        
        print sorted_list[0:3]
            
#         outdict['method'] = []
#         outdict['no_bets_placed'] = []
#         outdict['race'] = []
#         outdict['year'] = []
#         outdict['profit_wins'] = []
#         outdict['profit_places'] = []
#         outdict['profit_shows'] = []
#         outdict['total_profit'] = []
#         outdict['cost'] = []

make_bets_baseline(avg_payouts)

Unnamed: 0,cost_places,cost_shows,cost_wins,date,method,no_bets_places,no_bets_shows,no_bets_wins,profit_places,profit_shows,profit_wins,race,strategy
0,2,2,2,1999-05-15,baseline,1,1,1,0.0,0.0,0,PIM,sorted
1,2,2,2,2000-05-20,baseline,1,1,1,0.0,0.0,0,PIM,sorted
2,2,2,2,2001-05-19,baseline,1,1,1,0.0,0.0,0,PIM,sorted
3,2,2,2,2002-05-18,baseline,1,1,1,33.0,0.0,0,PIM,sorted
4,2,2,2,2003-05-17,baseline,1,1,1,15.4,0.0,0,PIM,sorted
5,2,2,2,2004-05-15,baseline,1,1,1,0.0,0.0,0,PIM,sorted
6,2,2,2,2005-05-21,baseline,1,1,1,0.0,0.0,0,PIM,sorted
7,2,2,2,2006-05-20,baseline,1,1,1,0.0,0.0,0,PIM,sorted
8,2,2,2,2007-05-19,baseline,1,1,1,0.0,0.0,0,PIM,sorted
9,2,2,2,2008-05-17,baseline,1,1,1,17.2,0.0,0,PIM,sorted


In [22]:
def make_bets_baseline_strat(indict, unique_test_dates=unique_test_dates, method = 'unknown', strat='baseline'): 
    outdict = {}
    outdict['strategy'] = []
    outdict['method'] = []
    outdict['no_bets_wins'] = []
    outdict['no_bets_places'] = []
    outdict['no_bets_shows'] = []
    outdict['race'] = []
    outdict['date'] = []
    outdict['profit_wins'] = []
    outdict['profit_places'] = []
    outdict['profit_shows'] = []
    outdict['cost_wins'] = []
    outdict['cost_places'] = []
    outdict['cost_shows'] = []
        
    for unique_date in unique_test_dates:
        newdict = {}
        for key in indict.keys():
            date = key[0]
            track = key[1]
            horse = key[2]
            # creating newdict, a subsetted dictionary for a unique date in the bigger for-loop
            if date == unique_date:
                newdict[key] = indict[key]
                  
        # now we have newdict, subsetting based on race (by unique date)
        
        spread_tup = []
        for key in newdict.keys():
            date = key[0]
            track = key[1]
            horse = key[2]

            exp_win = newdict[key]['win']
            exp_place = newdict[key]['place']
            exp_show = newdict[key]['show']

            # get the morning line odds for that day to compare
            odds = testdf.loc[(testdf.name == horse) & 
                              (testdf.track == track) & 
                              (testdf.date == date)]['Odds'].iloc[0]
            
            # Getting the spread to maximize, indicating we expect better odds 
            # for a horse than those given on the morning on race day.
            
            spread = odds - payoff_to_odds(exp_win)
            
            if np.isnan(spread):
                spread_tup.append((date, track, horse, 0))
            else:
                spread_tup.append((date, track, horse, spread))
            
        sorted_list = sorted(spread_tup,key=lambda x: x[3], reverse=True)
        
        profit = []
        cost = []
        bets = []
        for i in range(len(sorted_list)):
            if sorted_list[i][3] > 0:
                bets.append(1)
                cost.append(2)
                if math.isnan(testdf.loc[(testdf.name == sorted_list[i][2]) &
                                         (testdf.track == sorted_list[i][1]) & 
                                         (testdf.date == sorted_list[i][0])]['win'].iloc[0]):
                    profit.append(0)
                else:
                    profit.append(testdf.loc[(testdf.name == sorted_list[i][2]) &
                                         (testdf.track == sorted_list[i][1]) & 
                                         (testdf.date == sorted_list[i][0])]['win'].iloc[0])
        
        outdict['no_bets_wins'].append(np.sum(bets))
        outdict['cost_wins'].append(np.sum(cost))
        outdict['profit_wins'].append(np.sum(profit))
        outdict['strategy'].append(strat)
        outdict['method'].append(method)
        outdict['race'].append(sorted_list[0][1])
        outdict['date'].append(sorted_list[0][0])
        outdict['profit_places'].append(0)
        outdict['profit_shows'].append(0)
        outdict['cost_places'].append(0)
        outdict['cost_shows'].append(0)
        outdict['no_bets_places'].append(0)
        outdict['no_bets_shows'].append(0)
    return outdict

baseline_results = pd.DataFrame(make_bets_baseline_strat(avg_payouts, 
                                                         unique_test_dates=unique_test_dates, 
                                                         method='baseline', strat='baseline'))
baseline_results

Unnamed: 0,cost_places,cost_shows,cost_wins,date,method,no_bets_places,no_bets_shows,no_bets_wins,profit_places,profit_shows,profit_wins,race,strategy
0,0,0,10,1999-05-15,baseline,0,0,5,0,0,0.0,PIM,baseline
1,0,0,12,2000-05-20,baseline,0,0,6,0,0,0.0,PIM,baseline
2,0,0,20,2001-05-19,baseline,0,0,10,0,0,6.6,PIM,baseline
3,0,0,22,2002-05-18,baseline,0,0,11,0,0,0.0,PIM,baseline
4,0,0,12,2003-05-17,baseline,0,0,6,0,0,0.0,PIM,baseline
5,0,0,16,2004-05-15,baseline,0,0,8,0,0,0.0,PIM,baseline
6,0,0,24,2005-05-21,baseline,0,0,12,0,0,8.6,PIM,baseline
7,0,0,16,2006-05-20,baseline,0,0,8,0,0,27.8,PIM,baseline
8,0,0,16,2007-05-19,baseline,0,0,8,0,0,8.8,PIM,baseline
9,0,0,22,2008-05-17,baseline,0,0,11,0,0,0.0,PIM,baseline


In [23]:
def make_bets_hedging_strat(indict, unique_test_dates=unique_test_dates, strat = 'hedging', method='unknown'): 
    outdict = {}
    outdict['strategy'] = []
    outdict['method'] = []
    outdict['no_bets_wins'] = []
    outdict['no_bets_places'] = []
    outdict['no_bets_shows'] = []
    outdict['race'] = []
    outdict['date'] = []
    outdict['profit_wins'] = []
    outdict['profit_places'] = []
    outdict['profit_shows'] = []
    outdict['cost_wins'] = []
    outdict['cost_places'] = []
    outdict['cost_shows'] = []
        
    for unique_date in unique_test_dates:
        newdict = {}
        for key in indict.keys():
            date = key[0]
            track = key[1]
            horse = key[2]
            # creating newdict, a subsetted dictionary for a unique date in the bigger for-loop
            if date == unique_date:
                newdict[key] = indict[key]
                  
        # now we have newdict, subsetting based on race (by unique date)
        
        spread_tup = []
        for key in newdict.keys():
            date = key[0]
            track = key[1]
            horse = key[2]

            exp_win = newdict[key]['win']
            exp_place = newdict[key]['place']
            exp_show = newdict[key]['show']

            # get the morning line odds for that day to compare
            odds = testdf.loc[(testdf.name == horse) & 
                              (testdf.track == track) & 
                              (testdf.date == date)]['Odds'].iloc[0]
            
            # Getting the spread to maximize, indicating we expect better odds 
            # for a horse than those given on the morning on race day.
            
            spread = odds - payoff_to_odds(exp_win)
            
            if np.isnan(spread):
                spread_tup.append((date, track, horse, 0))
            else:
                spread_tup.append((date, track, horse, spread))
            
        sorted_list = sorted(spread_tup,key=lambda x: x[3], reverse=True)
        
        win_bets = []
        win_cost = []
        win_profit = []
        place_bets = []
        place_cost = []
        place_profit = []
        show_bets = []
        show_cost = []
        show_profit = []
        
        # Bet on Win
        if sorted_list[0][3] > 0:
            win_bets.append(1)
            win_cost.append(2)
            place_bets.append(1)
            place_cost.append(2)
            show_bets.append(1)
            show_cost.append(2)
            if math.isnan(testdf.loc[(testdf.name == sorted_list[0][2]) &
                                     (testdf.track == sorted_list[0][1]) & 
                                     (testdf.date == sorted_list[0][0])]['win'].iloc[0]):
                win_profit.append(0)
            else:

                win_profit.append(testdf.loc[(testdf.name == sorted_list[0][2]) &
                                     (testdf.track == sorted_list[0][1]) & 
                                     (testdf.date == sorted_list[0][0])]['win'].iloc[0])

                place_profit.append(testdf.loc[(testdf.name == sorted_list[0][2]) &
                                     (testdf.track == sorted_list[0][1]) & 
                                     (testdf.date == sorted_list[0][0])]['place'].iloc[0])

                show_profit.append(testdf.loc[(testdf.name == sorted_list[0][2]) &
                                     (testdf.track == sorted_list[0][1]) & 
                                     (testdf.date == sorted_list[0][0])]['show'].iloc[0])
        else:
            win_bets.append(0)
            win_cost.append(0)
            win_profit.append(0)
            
        # Bet on Place
        if sorted_list[1][3] > 0:
            place_bets.append(1)
            place_cost.append(2)
            show_bets.append(1)
            show_cost.append(2)
            if math.isnan(testdf.loc[(testdf.name == sorted_list[1][2]) &
                                     (testdf.track == sorted_list[1][1]) & 
                                     (testdf.date == sorted_list[1][0])]['place'].iloc[0]):
                place_profit.append(0)
            else:

                place_profit.append(testdf.loc[(testdf.name == sorted_list[1][2]) &
                                     (testdf.track == sorted_list[1][1]) & 
                                     (testdf.date == sorted_list[1][0])]['place'].iloc[0])

                show_profit.append(testdf.loc[(testdf.name == sorted_list[1][2]) &
                                             (testdf.track == sorted_list[1][1]) &
                                             (testdf.date == sorted_list[1][0])]['show'].iloc[0])
        else:
            place_bets.append(0)
            place_cost.append(0)
            place_profit.append(0)

        # Bet on Show
        if sorted_list[2][3] > 0:
            show_bets.append(1)
            show_cost.append(2)
            if math.isnan(testdf.loc[(testdf.name == sorted_list[2][2]) &
                                     (testdf.track == sorted_list[2][1]) & 
                                     (testdf.date == sorted_list[2][0])]['show'].iloc[0]):
                show_profit.append(0)
            else:
                show_profit.append(testdf.loc[(testdf.name == sorted_list[2][2]) &
                                     (testdf.track == sorted_list[2][1]) & 
                                     (testdf.date == sorted_list[2][0])]['show'].iloc[0])
        else:
            show_bets.append(0)
            show_cost.append(0)
            show_profit.append(0)

        outdict['method'].append(method)
        outdict['strategy'].append(strat)
        outdict['race'].append(sorted_list[0][1])
        outdict['date'].append(sorted_list[0][0])
        outdict['no_bets_wins'].append(np.sum(win_bets))
        outdict['no_bets_shows'].append(np.sum(show_bets))
        outdict['no_bets_places'].append(np.sum(place_bets))
        outdict['profit_wins'].append(np.sum(win_profit))
        outdict['profit_shows'].append(np.sum(show_profit))
        outdict['profit_places'].append(np.sum(place_profit))
        outdict['cost_wins'].append(np.sum(win_cost))
        outdict['cost_shows'].append(np.sum(show_cost))
        outdict['cost_places'].append(np.sum(place_cost))
    return outdict

hedging_results = pd.DataFrame(make_bets_hedging_strat(avg_payouts, 
                                                     unique_test_dates=unique_test_dates, 
                                                     method='baseline', strat='hedging'))
hedging_results

Unnamed: 0,cost_places,cost_shows,cost_wins,date,method,no_bets_places,no_bets_shows,no_bets_wins,profit_places,profit_shows,profit_wins,race,strategy
0,4,6,2,1999-05-15,baseline,2,3,1,0.0,18.8,0,PIM,hedging
1,4,6,2,2000-05-20,baseline,2,3,1,0.0,0.0,0,PIM,hedging
2,4,6,2,2001-05-19,baseline,2,3,1,0.0,0.0,0,PIM,hedging
3,4,6,2,2002-05-18,baseline,2,3,1,33.0,14.0,0,PIM,hedging
4,4,6,2,2003-05-17,baseline,2,3,1,15.4,9.0,0,PIM,hedging
5,4,6,2,2004-05-15,baseline,2,3,1,0.0,0.0,0,PIM,hedging
6,4,6,2,2005-05-21,baseline,2,3,1,0.0,0.0,0,PIM,hedging
7,4,6,2,2006-05-20,baseline,2,3,1,0.0,8.0,0,PIM,hedging
8,4,6,2,2007-05-19,baseline,2,3,1,0.0,0.0,0,PIM,hedging
9,4,6,2,2008-05-17,baseline,2,3,1,17.2,10.4,0,PIM,hedging


In [24]:
pd.concat([baseline_results, sorted_results, hedging_results]).to_csv('tempdata/betting_results.csv')