# The Oguri Model
In this notebook, we will ...

**Learning Objectives**

1. ...

**Import modules**
Begin by importing the modules to be used in this notebook.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from dateutil import parser

## Japanese Race Horses

Let's read in the data here:

In [2]:
# Load the Japanese race-horse results.
# header=1 takes the column names from row index 1 of the file; the line above it is skipped.
df_jp = pd.read_csv(filepath_or_buffer='jpracehorses.csv', header=1)

# Preview the first five rows.
df_jp.head(5)

Unnamed: 0,NAME,DATE,RACENAME,COURSE,TRACK,DISTANCE,CONDITION,FINISHTIME,FINISHPOSITION,MARGIN,WEIGHT,SEX
0,Special Week,"Dec 26,1999",ARIMA KINEN (GRAND PRIX) G1,NAKAYAMA,TURF,2500M,FIRM,2:37.2,2,0.0,464 (-4),Stallion
1,Special Week,"Nov 28,1999",JAPAN CUP G1,TOKYO,TURF,2400M,FIRM,2:25.5,1,0.2,468 (-2),Stallion
2,Special Week,"Oct 31,1999",TENNO SHO(AUTUMN) G1,TOKYO,TURF,2000M,FIRM,1:58.0,1,0.1,470 (-16),Stallion
3,Special Week,"Oct 10,1999",KYOTO DAISHOTEN G2,KYOTO,TURF,2400M,FIRM,2:25.1,7,0.8,486 (+6),Stallion
4,Special Week,"Jul 11,1999",TAKARAZUKA KINEN G1,HANSHIN,TURF,2200M,FIRM,2:12.6,2,0.5,480 (+4),Stallion


In [3]:
# Data preprocessing for the Japanese race results.
# References:
#   https://pandas.pydata.org/docs/reference/api/pandas.to_numeric.html
#   https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
#   https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.strftime.html

# DATE: parse the raw "Dec 26,1999" style strings and store them as 'YYYY-MM-DD' text.
# The column is overwritten in place, so a second run of this cell sees the
# already-converted values. Those are parsed with the ISO format as a fallback
# instead of being coerced to NaT, which keeps the cell safe to re-run.
jp_dates_raw = df_jp['DATE']
jp_dates = pd.to_datetime(jp_dates_raw, format="%b %d,%Y", errors='coerce')
jp_dates = jp_dates.fillna(pd.to_datetime(jp_dates_raw, format='%Y-%m-%d', errors='coerce'))
df_jp['DATE'] = jp_dates.dt.strftime('%Y-%m-%d')

# DISTANCE: strip the literal "M" suffix ("2500M" -> 2500); unparseable values become NaN.
df_jp['DISTANCE'] = pd.to_numeric(
    df_jp['DISTANCE'].astype(str).str.replace('M', '', regex=False),
    errors='coerce',
)

# FINISHPOSITION: numeric, with anything non-numeric coerced to NaN.
df_jp['FINISHPOSITION'] = pd.to_numeric(df_jp['FINISHPOSITION'], errors='coerce')
#TODO: Convert finish time into all seconds, and to convert from string into float

# MARGIN: numeric, with anything non-numeric coerced to NaN.
df_jp['MARGIN'] = pd.to_numeric(df_jp['MARGIN'], errors='coerce')

# WEIGHT: keep only the first run of digits ("464 (-4)" -> 464.0); the parenthesised
# part is discarded. expand=False makes extract return a Series rather than a
# one-column DataFrame.
df_jp['WEIGHT'] = df_jp['WEIGHT'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)

In [4]:
# Lift the row-display limit so that a full frame can be shown without truncation.
pd.options.display.max_rows = None

# Alternative views, left disabled:
# display(df_jp)
# df_jp.groupby('NAME')['MARGIN'].mean().sort_values(ascending=False)

# Preview the cleaned Japanese data (first five rows).
df_jp.head(5)

Unnamed: 0,NAME,DATE,RACENAME,COURSE,TRACK,DISTANCE,CONDITION,FINISHTIME,FINISHPOSITION,MARGIN,WEIGHT,SEX
0,Special Week,1999-12-26,ARIMA KINEN (GRAND PRIX) G1,NAKAYAMA,TURF,2500.0,FIRM,2:37.2,2.0,0.0,464.0,Stallion
1,Special Week,1999-11-28,JAPAN CUP G1,TOKYO,TURF,2400.0,FIRM,2:25.5,1.0,0.2,468.0,Stallion
2,Special Week,1999-10-31,TENNO SHO(AUTUMN) G1,TOKYO,TURF,2000.0,FIRM,1:58.0,1.0,0.1,470.0,Stallion
3,Special Week,1999-10-10,KYOTO DAISHOTEN G2,KYOTO,TURF,2400.0,FIRM,2:25.1,7.0,0.8,486.0,Stallion
4,Special Week,1999-07-11,TAKARAZUKA KINEN G1,HANSHIN,TURF,2200.0,FIRM,2:12.6,2.0,0.5,480.0,Stallion


## Hong Kong Race Horses

Let's read in the data here:

In [5]:
# Load the Hong Kong race-horse results.
# header=1 takes the column names from row index 1 of the file; the line above it is skipped.
df_hk = pd.read_csv(filepath_or_buffer='hkracehorses.csv', header=1)

# Preview the first five rows.
df_hk.head(5)

Unnamed: 0,NAME,DATE,RACENAME,COURSE,TRACK,DISTANCE,CONDITION,FINISHTIME,FINISHPOSITION,MARGIN,WEIGHT,SEX
0,AURORA LADY (J066),11/02/25,149,HAPPY VALLEY,Turf,1200,GOOD TO FIRM,1.09.09,3,1-1/4,124,Stallion
1,AURORA LADY (J066),"Oct 22,2025",119,HAPPY VALLEY,Turf,1000,GOOD,0.56.82,4,3-3/4,130,Stallion
2,AURORA LADY (J066),"Jul 16,2025",846,HAPPY VALLEY,Turf,1200,GOOD TO FIRM,1.08.65,3,3-1/2,119,Stallion
3,AURORA LADY (J066),"Jun 28,2025",794,SHA TIN,Turf,1000,GOOD,0.56.56,6,5-1/2,120,Stallion
4,AURORA LADY (J066),06/04/25,736,HAPPY VALLEY,Turf,1200,GOOD TO YIELDING,1.10.07,6,2-1/2,129,Stallion


In [6]:
# Data preprocessing for the Hong Kong race results.

# NAME: drop the parenthesised code after each horse name,
# e.g. "AURORA LADY (J066)" -> "AURORA LADY".
df_hk['NAME'] = df_hk['NAME'].str.replace(r'\s*\([^)]*\)', '', regex=True)

# DATE: the HK file mixes formats ("11/02/25" and "Oct 22,2025"), so a single
# strptime format (as used for the JP set) cannot be applied. dateutil's parser
# handles each entry individually; with its defaults "11/02/25" is read month-first.
# https://stackoverflow.com/questions/48384376/converting-multiple-date-formats-into-one-format-python
# Missing dates are dropped before parsing because parser.parse raises on the
# string 'nan'; they come back as NaN when the result is aligned to the frame's
# index on assignment. pd.to_datetime guarantees a datetime64 Series so that the
# .dt accessor is always available.
hk_dates = pd.to_datetime(df_hk['DATE'].dropna().astype(str).map(parser.parse))
df_hk['DATE'] = hk_dates.dt.strftime('%Y-%m-%d')

# DISTANCE / FINISHPOSITION: numeric, with anything non-numeric coerced to NaN.
df_hk['DISTANCE'] = pd.to_numeric(df_hk['DISTANCE'], errors='coerce').astype(float)
df_hk['FINISHPOSITION'] = pd.to_numeric(df_hk['FINISHPOSITION'], errors='coerce')
#TODO: Convert finish time into all seconds, and to convert from string into float

#TODO: create a python function that converts the margin (ex. 1-1/4 into 1.25) into float format

# WEIGHT: numeric, stored as float; anything non-numeric is coerced to NaN.
df_hk['WEIGHT'] = pd.to_numeric(df_hk['WEIGHT'], errors='coerce').astype(float)

In [7]:
# Preview the cleaned Hong Kong data (first five rows).
df_hk.head(5)

Unnamed: 0,NAME,DATE,RACENAME,COURSE,TRACK,DISTANCE,CONDITION,FINISHTIME,FINISHPOSITION,MARGIN,WEIGHT,SEX
0,AURORA LADY,2025-11-02,149,HAPPY VALLEY,Turf,1200.0,GOOD TO FIRM,1.09.09,3.0,1-1/4,124.0,Stallion
1,AURORA LADY,2025-10-22,119,HAPPY VALLEY,Turf,1000.0,GOOD,0.56.82,4.0,3-3/4,130.0,Stallion
2,AURORA LADY,2025-07-16,846,HAPPY VALLEY,Turf,1200.0,GOOD TO FIRM,1.08.65,3.0,3-1/2,119.0,Stallion
3,AURORA LADY,2025-06-28,794,SHA TIN,Turf,1000.0,GOOD,0.56.56,6.0,5-1/2,120.0,Stallion
4,AURORA LADY,2025-06-04,736,HAPPY VALLEY,Turf,1200.0,GOOD TO YIELDING,1.10.07,6.0,2-1/2,129.0,Stallion


In [11]:
#pd.set_option('display.max_rows', None) # this is required to see all of the rows in the dataset.
#display(df_hk)
#df_hk.head()
#df_hk.groupby('NAME')['FINISHPOSITION'].mean().sort_values(ascending=False)