# 1-Introduction

## Definitions:

For the sake of consistency, it is important to lay out some terms that will be referred to consistently in this project.

1. "Course": Defined by the track, race type and distance.

2. "Race": Defined by the track, date and program number_obj.  It will include the data, "track", horse-trainer pairing and outcome.

3. "Path": Defined by the horses within each race.  If available, it will include the long/lat-coordinates which are used to generate statistics.

# 2-Library, Functions & Data Upload

## Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from lxml import etree, objectify
import re

https://www.google.com/search?client=safari&rls=en&q=xml+python&ie=UTF-8&oe=UTF-8

In [3]:
import geopandas
import shapely
from shapely.geometry import Point, MultiLineString, LineString
from haversine import haversine, haversine_vector

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
#from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [5]:
from tslearn.clustering import TimeSeriesKMeans
from tslearn.utils import to_time_series_dataset
from sklearn.metrics import silhouette_score

## User-Defined Functions

In [6]:
def analyze_path(points):
    points_sorted = points.sort_values(['trakus_index'])
    max_time = points['trakus_index'].max()
    dists = []
    deltas_lon = []
    deltas_lat = []
    vls = []
    accls = []
    dirs = []
    dirs_delta = []
    lines = []
    for i in range(0,len(points)-1):
        t = points_sorted.iloc[i+1]['trakus_index']
        p1 = points_sorted.iloc[i].Point
        p2 = points_sorted.iloc[i+1].Point
        cord1 = p1.coords[0]
        cord2 = p2.coords[0]
        lines.append(LineString([p1,p2]))
        dist = haversine(cord2,cord1)
        delta_lon = cord2[0] - cord1[0]
        delta_lat = cord2[1] - cord1[1]
        deltas_lon.append(delta_lon)
        deltas_lat.append(delta_lat)
        dists.append(dist)
        vls.append(dist/0.25)
        v_delta = vls[-1] - vls[-2] if i > 1 else vls[-1]
        accl = v_delta/0.25
        accls.append(accl)
        direction = np.arctan2(delta_lon,delta_lat)
        dir_delta = direction - dirs[-1] if i > 0 else 0
        dirs.append(direction)
        dirs_delta.append(dir_delta)


    results = {'distances':dists,
               'speed':vls,'acceleration':accls,'direction':dirs,
               'delta_lat':deltas_lat,'delta_lon':deltas_lon,'delta_direction':dirs_delta
               }
    path = MultiLineString(lines)    
    return path, results
    

In [7]:
def calculate_metrics(run):
    pass

In [8]:
def analyze_file(file_name):
    return races,entries

In [9]:
def create_backdata(id,date,data,features=None):
    pass

In [10]:
def create_sequences(df, seq_length):
    xs = []
    ys = []
    for trajectory_id in df['trajectory_id'].unique():
        df_trajectory = df[df['trajectory_id'] == trajectory_id]
        for i in range(len(df_trajectory) - seq_length):
            x = df_trajectory[i:(i + seq_length)][['longitude', 'latitude', 'speed', 'acceleration', 'delta_direction', 'distance_to_leader']].values
            y = df_trajectory.iloc[i + seq_length][['longitude', 'latitude']].values
            xs.append(x)
            ys.append(y)
    return np.array(xs), np.array(ys)

def create_sequences_and_targets(data, seq_length):
    """Creates sequences of data and corresponding next-step targets."""
    xs = []
    ys = []
    # Group data by race and program number to process sequences within each horse's track
    grouped = data.groupby(['race_number', 'program_number'])
    for name, group in grouped:
        # Extract features (longitude and latitude)
        features = group[['normalized_longitude', 'normalized_latitude']].values
        for i in range(len(features) - seq_length):
            # Create input sequence
            x = features[i:(i + seq_length)]
            # Create target (next step)
            y = features[i + seq_length]
            xs.append(x)
            ys.append(y)

## Datasets

In [16]:
files = os.listdir('../Data/2023 Result Charts')
print(len(files))

4906


In [34]:
patt = re.compile(r'([a-z]{2,3})([0-9]{4})([0-9]{2})([0-9]{2})')
base = '../Data/2023 Result Charts/'
def collect_file(file_name):
    track,yr,month,day= patt.match(file_name).groups()
    races = {}
    entries = {}
    tree = etree.parse(base+file_name)
    rs = tree.findall('.//RACE')
    for r in rs:
        number = r.attrib['NUMBER']
        tag = '_'.join([track,yr,month,day,number])
        races[tag], es = collect_race(r,tag)
        entries.update(es)
    return races,entries

def collect_race(r_obj,tag):
    info = {
        'distance' : r_obj.findtext('DISTANCE')+r_obj.findtext('DIST_UNIT'),
        'course_type' : r_obj.findtext('COURSE_ID') + r_obj.findtext('COURSE_DESC'),
        'conditions' : r_obj.findtext('TRK_COND') + r_obj.findtext('WEATHER') + r_obj.findtext('STRT_DESC'),
        'wind' : r_obj.findtext('WIND_DIRECTION')+r_obj.findtext('WIND_SPEED'),
        'track_measures' : r_obj.findtext('RUNUPDIST') + r_obj.findtext('RAILDIST'),
        'description' : r_obj.findtext('FOOTNOTES')
          }
    es = r_obj.findall('.//ENTRY')
    entries={}
    i=0
    for e in es:
        entries[tag+'_'+str(i)] = collect_entry(e)
        i+=1
    return info,entries

def collect_entry(e_obj):
    dkk = {
    'key' : e_obj.findtext('AXCISKEY'),'prog_num' : e_obj.findtext('PROGRAM_NUM'),
    'weight' : e_obj.findtext('WEIGHT'),'age' : e_obj.findtext('AGE'),
    'odds' : e_obj.findtext('DOLLAR_ODDS'), 'position' : e_obj.findtext('START_POSITION'),
    'trainer' : e_obj.find('TRAINER').findtext('KEY'),
    'jockey' : e_obj.find('JOCKEY').findtext('KEY')
    }
    last = e_obj.find('LAST_PP')
    if last is not None:
        last_run = last.find('TRACK').findtext('CODE')+last.findtext('RACE_DATE')+last.findtext('OFL_FINISH')
        dkk['last_run'] = last_run
    results = [e_obj.findtext('OFFICIAL_FIN'),e_obj.findtext('FINISH_TIME'),e_obj.findtext('SPEED_RATING')]
    points = e_obj.findall('POINT_OF_CALL')
    results = results + [{pp.attrib['WHICH']:(pp.findtext('POSITION'),pp.findtext('LENGTHS')) for pp in points}]
    results = results + [e_obj.findtext('COMMENT')]
    dkk['results'] = results

    return dkk

In [36]:
files = os.listdir('../Data/2023 Result Charts')
print(len(files))

4906


In [37]:
all_races_2023={}
all_entries_2023={}
count=0
for f in files:
    try:
        rs,ss = collect_file(f)
        #all_races_2023.update(rs)
        #all_entries_2023.update(ss)
    except:
        count+=-1

In [38]:
count

-3672

In [None]:
track,yr,month,day= patt.match(f).groups()

In [103]:
races = tree.findall('.//RACE')
races

[<Element RACE at 0x2879eaec0>,
 <Element RACE at 0x2879eba40>,
 <Element RACE at 0x110a00f00>,
 <Element RACE at 0x110a00d80>,
 <Element RACE at 0x110a013c0>,
 <Element RACE at 0x110a01ac0>,
 <Element RACE at 0x110a01000>]

In [None]:
r = races[0]
number = r_obj.attrib['NUMBER']


In [None]:
dk = {'distance' : r_obj.findtext('DISTANCE')+r_obj.findtext('DIST_UNIT'),
      'course_type' : r_obj.findtext('COURSE_ID') + r_obj.findtext('COURSE_DESC'),
      'conditions' : r_obj.findtext('TRK_COND') + r_obj.findtext('WEATHER') + r_obj.findtext('STRT_DESC'),
      'wind' : r_obj.findtext('WIND_DIRECTION')+r_obj.findtext('WIND_SPEED'),
      'track_measures' : r_obj.findtext('RUNUPDIST') + r_obj.findtext('RAILDIST'), 
      'description' : r_obj.findtext('FOOTNOTES')
     }

In [None]:
distance = r_obj.findtext('DISTANCE')+r_obj.findtext('DIST_UNIT')
course_type = r_obj.findtext('COURSE_ID') + r_obj.findtext('COURSE_DESC')
conditions = r_obj.findtext('TRK_COND') + r_obj.findtext('WEATHER') + r_obj.findtext('STRT_DESC')
wind = r_obj.findtext('WIND_DIRECTION')+r_obj.findtext('WIND_SPEED')
track_measures = r_obj.findtext('RUNUPDIST') + r_obj.findtext('RAILDIST')
description = r_obj.findtext('FOOTNOTES')

In [None]:
entries = r_obj.findall('.//ENTRY')
e = entries[0]

In [None]:
dkk = {
    'key' : e.findtext('AXCISKEY'),'prog_num' : e.findtext('PROGRAM_NUM'),
    'weight' : e.findtext('WEIGHT'),'age' : e.findtext('AGE'),
    'odds' : e.findtext('DOLLAR_ODDS'), 'position' : e.findtext('START_POSITION'),
    'trainer' : e.find('TRAINER').findtext('KEY'),
    'jockey' : e.find('JOCKEY').findtext('KEY')
    }
last = e.find('LAST_PP')
if last:
    last_run = last.find('TRACK').findtext('CODE')+last.findtext('RACE_DATE')+last.findtext('OFL_FINISH')
    dkk['last_run'] = last_run
results = [e.findtext('OFFICIAL_FIN'),e.findtext('FINISH_TIME'),e.findtext('SPEED_RATING')]
results = results + [{pp.attrib['WHICH']:(pp.findtext('POSITION'),pp.findtext('LENGTHS')) for pp in e.findall('POINT_OF_CALL')}]
results = results + [r_obj.findtext('COMMENT')]
dkk['results'] = results


In [None]:
e.findtext('AXCISKEY')
e.findtext('PROGRAM_NUM')
e.findtext('WEIGHT')
e.findtext('AGE')
e.findtext('DOLLAR_ODDS')
e.findtext('PROGRAM_NUM') + e.findtext('START_POSITION')
last_run = e.find('LAST_PP').find('TRACK').findtext('CODE')+e.find('LAST_PP').findtext('RACE_DATE')+e.find('LAST_PP').findtext('OFL_FINISH')
results = [e.findtext('OFFICIAL_FIN'),e.findtext('FINISH_TIME'),e.findtext('SPEED_RATING')]
results = results + [{pp.attrib['WHICH']:(pp.findtext('POSITION'),pp.findtext('LENGTHS')) for pp in e.findall('POINT_OF_CALL')}]
results = results + [r_obj.findtext('COMMENT')]
trainer = e.find('TRAINER').findtext('KEY')
jockey = e.find('JOCKEY').findtext('KEY')

Tracking Data

In [1]:
heads = ['track','date',
         'race_number','program_number',
         'trakus_index','latitude','longitude',
         'race_distance','course_type','track_condition',
         'run_up_distance','race_type','post_time',
         'purse','weight_carried','jockey',
         'odds','finish'
         ]

In [65]:
data = pd.read_csv('Data/nyra_2019_complete.csv',header=None,low_memory=False)
data.columns = heads
data

Unnamed: 0,track,date,race_number,program_number,trakus_index,latitude,longitude,race_distance,course_type,track_condition,run_up_distance,race_type,post_time,purse,weight_carried,jockey,odds,finish
0,AQU,2019-01-01,9,6,72,40.672902,-73.827607,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
1,AQU,2019-01-01,9,6,73,40.672946,-73.827587,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
2,AQU,2019-01-01,9,6,74,40.672990,-73.827568,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
3,AQU,2019-01-01,9,6,63,40.672510,-73.827781,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
4,AQU,2019-01-01,9,6,64,40.672553,-73.827762,600,D,GD,48,CLM,25000.0,420,120,Andre Shivnarine Worrie,2090,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5228425,AQU,2019-11-23,9,2,167,40.672363,-73.830853,1100,T,GD,72,STK,200000.0,353,124,Joel Rosario,1120,9
5228426,AQU,2019-11-23,9,2,168,40.672321,-73.830873,1100,T,GD,72,STK,200000.0,353,124,Joel Rosario,1120,9
5228427,AQU,2019-11-23,9,2,169,40.672281,-73.830893,1100,T,GD,72,STK,200000.0,353,124,Joel Rosario,1120,9
5228428,AQU,2019-11-23,9,2,170,40.672240,-73.830913,1100,T,GD,72,STK,200000.0,353,124,Joel Rosario,1120,9


In [None]:
starts = pd.read_csv('data/nyra_start_table_new.csv')

# 3-Basic EDA

## Tracks

## Horses & Trainers

In [None]:
jockeys = starts['jockey'].value_counts()
jockeys.plot.hist()
jockeys

In [None]:
horses = starts['horse_id'].value_counts()
horses.plot.hist()
horses

In [None]:
pairings = starts[['horse_id','jockey']].value_counts()
pairings.plot.hist()
pairings

## Races

## Runs

## Bad Data

# 4-Feature Engineering/Wrangling

# 5-Full EDA

# 6-Modeling & Analysis

## Basic Regression

## Clustering

Which features should be used?

In [None]:
features = ['distance', 'speed', 'acceleration']

Prepare for clustering by grouping runs; converting into type for TS analyses; handling bad data.

In [None]:
grouped_data = df.groupby(['track_id', 'race_date', 'race_number', 'program_number'])[features].apply(lambda x: x.values)
grouped_data = grouped_data.tolist()
X = to_time_series_dataset(grouped_data)
X[np.isnan(X)] = 0
X[np.isinf(X)] = 0

Define the range for k (# of clusters) we will be examining.

In [None]:
silhouette_scores = []
range_n_clusters = [2,4,5,6,8,10,15,20]

Check to make sure silhouette score can be calculated for clustering--i.e. there are at least 2 clusters and more than 1 sample in each cluster_obj.

In [None]:
for n_clusters in range_n_clusters:
    #print(f"Trying {n_clusters} clusters...")
    km_dtw = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", max_iter=10, random_state=42)
    labels = km_dtw.fit_predict(X)
    # 
    # Check if there is more than one cluster and each cluster has more than one sample
    if len(np.unique(labels)) > 1 and min(np.bincount(labels)) > 1:
        score = silhouette_score(X.reshape(X.shape[0], -1), labels) # Reshape for silhouette_score
        silhouette_scores.append(score)
        #print(f"Silhouette score for {n_clusters} clusters: {score}")
    else:
        silhouette_scores.append(-1) # Append a low score if conditions are not met
        #print(f"Could not compute silhouette score for {n_clusters} clusters.")

Identify the optimal # of clusters amongst the range explored and fit a new optimal model.

In [None]:
optimal_n_clusters = range_n_clusters[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters: {optimal_n_clusters}")

In [None]:
km_dtw_optimal = TimeSeriesKMeans(n_clusters=optimal_n_clusters, metric="dtw", max_iter=10, random_state=42)
cluster_labels = km_dtw_optimal.fit_predict(X)

Analyze the results:

In [None]:
group_keys = df.groupby(['track_id', 'race_date', 'race_number', 'program_number']).groups.keys()
group_key_list = list(group_keys)

# Create a list of (group_key, label) pairs
group_labels = list(zip(group_key_list, cluster_labels))

# Create a dictionary mapping group key to label
label_dict = {key: label for key, label in group_labels}

In [None]:
df['group_key'] = list(zip(df['track_id'], df['race_date'], df['race_number'], df['program_number']))

# Add the cluster label to the original DataFrame
df['cluster_label'] = df['group_key'].map(label_dict)

# Drop the temporary group_key column
df = df.drop(columns=['group_key'])

display(df.head())

## Geospatial Deep Learning

In [None]:
seq_length = 20
X_train, y_train = create_sequences_and_targets(train_df, seq_length)
X_val, y_val = create_sequences_and_targets(val_df, seq_length)

In [None]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of y_val:", y_val.shape)

In [None]:
inputs = Input(shape=(seq_length, 2))
lstm_out = LSTM(64, return_sequences=True)(inputs)

attention_output = Attention()([lstm_out, lstm_out])
merged_output = keras.layers.concatenate([lstm_out, attention_output])

In [None]:
pooled_attention = keras.layers.GlobalAveragePooling1D()(attention_output)
outputs = Dense(2)(pooled_attention)

In [None]:
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# 7-Conclusions

## Research & Resources