In [12]:
import trueskill as trueskill
import pandas as pd
import numpy as np
from collections import defaultdict

### Load Data

In [66]:
df = pd.read_csv('data/runs.csv')
df_train = df.loc[df['race_id'] < 6000]
df_test = df.loc[df['race_id'] >= 6000]
df[:11]

Unnamed: 0,race_id,horse_no,horse_id,result,won,lengths_behind,horse_age,horse_country,horse_type,horse_rating,...,time2,time3,time4,time5,time6,finish_time,win_odds,place_odds,trainer_id,jockey_id
0,0,1,3917,10,0.0,8.0,3,AUS,Gelding,60,...,21.59,23.86,24.62,,,83.92,9.7,3.7,118,2
1,0,2,2157,8,0.0,5.75,3,NZ,Gelding,60,...,21.99,23.3,23.7,,,83.56,16.0,4.9,164,57
2,0,3,858,7,0.0,4.75,3,NZ,Gelding,60,...,21.59,23.9,24.22,,,83.4,3.5,1.5,137,18
3,0,4,1853,9,0.0,6.25,3,SAF,Gelding,60,...,21.83,23.7,24.0,,,83.62,39.0,11.0,80,59
4,0,5,2796,6,0.0,3.75,3,GB,Gelding,60,...,21.75,23.22,23.5,,,83.24,50.0,14.0,9,154
5,0,6,3296,3,0.0,1.25,3,NZ,Gelding,60,...,22.03,22.9,23.57,,,82.83,7.0,1.8,54,34
6,0,7,911,12,0.0,9.5,3,NZ,Gelding,60,...,21.59,23.94,25.09,,,84.15,99.0,28.0,55,149
7,0,8,2170,1,1.0,0.0,3,AUS,Gelding,60,...,21.87,23.58,23.06,,,82.64,12.0,3.6,47,183
8,0,9,1730,13,0.0,9.75,3,NZ,Gelding,60,...,21.71,23.9,24.94,,,84.2,38.0,13.0,75,131
9,0,10,2998,14,0.0,999.0,3,AUS,Mare,60,...,22.31,24.38,30.46,,,92.2,39.0,12.0,109,145


### TrueSkill Ranking by Horse

In [67]:
# initialize
env = trueskill.TrueSkill()
horses = dict()

In [68]:
races = df_train.groupby('race_id')

for race, group in races:
    horse_ids = group.apply(lambda row : row['horse_id'], axis=1).values.tolist()
    ranks = group.apply(lambda row : row['result'], axis=1).values.tolist()
    rating_groups = []

    for horse_id in horse_ids:
        if horse_id not in horses:
            horses[horse_id] = (env.create_rating(),)
        rating_groups.append(horses[horse_id])
            
    new_ratings = env.rate(rating_groups, ranks=ranks)
    
    for i in range(len(horse_ids)):
        horses[horse_ids[i]] = new_ratings[i]

### Prediction

In [72]:
races = df_test.groupby('race_id')
success, total, new_horses = 0, 0, 0

for race, group in races:
    horse_ids = group.apply(lambda row : row['horse_id'], axis=1).values.tolist()
    actual_winners = group.nsmallest(1, 'result')['horse_id'].tolist()
    predicted_winner = None
        
    for horse_id in horse_ids:
        if horse_id not in actual_winners and horse_id not in horses:
            new_horses += 1
            break
        if horse_id not in horses:
            continue
        elif predicted_winner is None or horses[horse_id][0].mu > horses[predicted_winner][0].mu:
            predicted_winner = horse_id
    
    if predicted_winner in actual_winners:
        success += 1
        
    total += 1
    
print(success / (total - new_horses))

0.25925925925925924
