In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two source tables: one row per horse-run, and one row per race
# (indexed by race_id).
runs_df = pd.read_csv('runs.csv')
races_df = pd.read_csv('races.csv', index_col='race_id')

# Set up dataframe

Create a new dataframe, `df`, that we will use to train the models.

### Remove unnecessary columns from runs_df

In [None]:
# Columns of runs_df that are not used as model inputs.
unused_columns = ['won', 'horse_gear', 'draw']

# the below have too many na values
sparse_columns = (
    ['horse_no']
    + [f'position_sec{i}' for i in range(1, 7)]
    + [f'behind_sec{i}' for i in range(1, 7)]
    + ['time4', 'time5', 'time6', 'place_odds']
)

# DataFrame.drop returns a new frame, so runs_df itself is left untouched.
df = runs_df.drop(columns=unused_columns + sparse_columns)

### Add relevant columns from races_df

In [None]:
# Race-level attributes to attach to every run of that race.
race_columns = ['venue', 'config', 'surface', 'distance', 'going']

# Left join keeps every run; races_df is indexed by race_id, which `on`
# matches against the race_id column of df.
df = df.merge(races_df[race_columns], how='left', on='race_id')

### Create new columns to add to df

#### time metric columns
These columns use the `time1`, `time2` and `time3` fields to express how large a share of the final finish time was spent on each section of the race.

In [None]:
# Express each sectional time as a fraction of the total finish time.
for sectional in ("time1", "time2", "time3"):
    df[sectional] = df[sectional] / df["finish_time"]

# time2 and time3 are already fractions of finish_time at this point, so their
# combined share is a plain sum. The previous version divided the sum by
# finish_time again, normalising the value twice.
df["time23"] = df["time2"] + df["time3"]

#### Add horse, jockey, and trainer placement percentage columns

Add columns capturing the likelihood of a given horse, jockey, and trainer placing (finishing in the top 3) in a race.

###### New Horse columns

In [None]:
# Number of recorded runs per horse. This must be a count: the previous
# version summed the finishing positions, so horse_place_perc was not a
# proportion of races at all.
horse_tot_race = runs_df.groupby('horse_id')['result'].count().reset_index(name='horse_tot_race')

df = pd.merge(df, horse_tot_race, on='horse_id', how='left')

# Number of runs in which the horse placed (finished 3rd or better).
horse_placed = runs_df['result'] <= 3
horse_tot_place = horse_placed.groupby(runs_df['horse_id']).sum().reset_index(name='horse_tot_place')

df = pd.merge(df, horse_tot_place, on='horse_id', how='left')

# Share of the horse's runs that ended in a place, in [0, 1].
# NOTE(review): the statistic is computed over every row of runs_df, i.e. it
# includes the run being predicted and the held-out test races.
df['horse_place_perc'] = df['horse_tot_place'] / df['horse_tot_race']

###### New Jockey columns

In [None]:
# Number of recorded rides per jockey. This must be a count: the previous
# version summed the finishing positions, so jockey_place_perc was not a
# proportion of races at all.
jockey_tot_race = runs_df.groupby('jockey_id')['result'].count().reset_index(name='jockey_tot_race')

df = pd.merge(df, jockey_tot_race, on='jockey_id', how='left')

# Number of rides in which the jockey placed (finished 3rd or better).
jockey_placed = runs_df['result'] <= 3
jockey_tot_place = jockey_placed.groupby(runs_df['jockey_id']).sum().reset_index(name='jockey_tot_place')

df = pd.merge(df, jockey_tot_place, on='jockey_id', how='left')

# Share of the jockey's rides that ended in a place, in [0, 1].
# NOTE(review): the statistic is computed over every row of runs_df, i.e. it
# includes the run being predicted and the held-out test races.
df['jockey_place_perc'] = df['jockey_tot_place'] / df['jockey_tot_race']

###### New Trainer Columns

In [None]:
# Number of recorded runs per trainer. This must be a count: the previous
# version summed the finishing positions, so trainer_place_perc was not a
# proportion of races at all.
trainer_tot_race = runs_df.groupby('trainer_id')['result'].count().reset_index(name='trainer_tot_race')

df = pd.merge(df, trainer_tot_race, on='trainer_id', how='left')

# Number of runs in which the trainer's horse placed (finished 3rd or better).
trainer_placed = runs_df['result'] <= 3
trainer_tot_place = trainer_placed.groupby(runs_df['trainer_id']).sum().reset_index(name='trainer_tot_place')

df = pd.merge(df, trainer_tot_place, on='trainer_id', how='left')

# Share of the trainer's runs that ended in a place, in [0, 1].
# NOTE(review): the statistic is computed over every row of runs_df, i.e. it
# includes the run being predicted and the held-out test races.
df['trainer_place_perc'] = df['trainer_tot_place'] / df['trainer_tot_race']

##### Remove unnecessary columns

In [None]:
# The raw totals and the id columns were only needed to derive the
# *_place_perc features, so remove all of them in one pass.
helper_columns = [
    f'{role}_{suffix}'
    for role in ('horse', 'trainer', 'jockey')
    for suffix in ('tot_place', 'tot_race', 'id')
]
df = df.drop(columns=helper_columns)

### Clean the data

In [None]:
# Define Anomalies function
def iqr_anomalies(data, col, whisker=1.5):
    """Return the rows of ``data`` whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - whisker * IQR``
    or strictly above ``Q3 + whisker * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of ``data[col]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.
    col : str
        Name of the numeric column to test.
    whisker : float, default 1.5
        Multiple of the IQR used for the fences (1.5 is the usual box-plot rule).

    Returns
    -------
    pandas.DataFrame
        The outlying rows, keeping their original index labels.
    """
    q1, q3 = data[col].quantile([0.25, 0.75])
    spread = whisker * (q3 - q1)
    too_low = data[col] < q1 - spread
    too_high = data[col] > q3 + spread
    return data[too_low | too_high]

In [None]:
print("number of rows before cleaning:", df.shape[0])

# drop missing values, then exact duplicate rows
df = df.dropna().drop_duplicates()

# handle anomalies
race_ids = df['race_id'].unique()

# iqr_anomalies returns the outlying rows themselves, so read race_id from
# them directly. The previous `df.iloc[<index labels>]` treated index labels
# as positions; after dropna/drop_duplicates the two no longer agree, which
# selects the wrong rows or raises IndexError.
anomalie_race_ids = iqr_anomalies(df, "finish_time")["race_id"].unique()

# Remove every run of a race that contains at least one anomalous finish time.
anomalie_race_indicies = df[df["race_id"].isin(anomalie_race_ids)].index
df = df.drop(anomalie_race_indicies)

### Encode categorical variables

In [None]:
from sklearn import preprocessing

# One encoder per categorical column; each is kept in its own variable so the
# fitted mapping stays available afterwards.
config_encoder = preprocessing.OrdinalEncoder()
going_encoder = preprocessing.OrdinalEncoder()
venue_encoder = preprocessing.LabelEncoder()
horse_country_encoder = preprocessing.LabelEncoder()
horse_type_encoder = preprocessing.LabelEncoder()

# OrdinalEncoder expects a 2-D input, hence the reshape to a single column.
for column, encoder in (('config', config_encoder), ('going', going_encoder)):
    df[column] = encoder.fit_transform(df[column].values.reshape(-1, 1))

# LabelEncoder works on the 1-D column directly.
for column, encoder in (
    ('venue', venue_encoder),
    ('horse_country', horse_country_encoder),
    ('horse_type', horse_type_encoder),
):
    df[column] = encoder.fit_transform(df[column])

### Save current state of dataframe

We will want to reuse this state of the dataframe later, so save it as its own variable

In [None]:
# Keep a handle on the fully prepared frame before df is narrowed down.
data = df

# One frame per held-out race (race_id 1601 through 1605).
test_1, test_2, test_3, test_4, test_5 = (
    data[data.race_id == held_out_id] for held_out_id in range(1601, 1606)
)

### Limit Number of Races up to Race 1600

only use up to race 1600 for training

In [None]:
# Rows with race_id above 1600 are excluded from training.
is_training_race = df['race_id'] <= 1600
df = df[is_training_race]

print("number of rows after cleaning:", df.shape[0])

#### Now drop race_id and result because they are not needed for training

We are trying to predict the horse's time to finish the race. These columns were not dropped earlier because they were needed to build the test_x dataframes.

In [None]:
# race_id and result are not model inputs; remove both at once.
df = df.drop(columns=['race_id', 'result'])

### Scale data

In [None]:
from sklearn.preprocessing import StandardScaler

# Target: finish time. Features: every remaining column.
y = df['finish_time']
X = df.drop('finish_time', axis=1)

# Keep the fitted scaler so other data can later be transformed with the same
# statistics (previously it was created inline and discarded).
scaler = StandardScaler()

# Preserve the original index so X and y stay aligned by label as well as by
# position; the previous version gave X a fresh RangeIndex.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

### Display Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between all columns of the training frame.
fig, ax = plt.subplots(figsize=(16, 5))
sns.heatmap(df.corr(), ax=ax)

# Train Models

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Linear Regression

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Fit a linear regression on the training split.
linearRegression = LinearRegression()
linearRegression.fit(X=x_train, y=y_train)

In [None]:
# Predicted finish times for the held-out rows, shown as the cell output.
y_pred = linearRegression.predict(X=x_test)
y_pred

In [None]:
# Evaluating Model - Linear Regression
print("Linear Regression Results:")
for label, score in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R2 Score:", r2_score),
):
    print(label, score(y_test, y_pred))

### KNN

In [None]:
# KNN
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# k-nearest-neighbours regressor using the 5 closest training rows.
knnRegression = KNeighborsRegressor(n_neighbors=5)
knnRegression.fit(X=x_train, y=y_train)

In [None]:
# KNN predictions for the held-out rows, shown as the cell output.
y_predKNN = knnRegression.predict(X=x_test)
y_predKNN

In [None]:
# Evaluating Model - KNN
print("KNN Regression Results:")
for label, score in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R2 Score:", r2_score),
):
    print(label, score(y_test, y_predKNN))

### Decision Tree

In [None]:
# Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Depth-limited regression tree; seeded so the fit is reproducible.
treeRegression = DecisionTreeRegressor(random_state=42, max_depth=5)
treeRegression.fit(X=x_train, y=y_train)

In [None]:
# Evaluating Model - Decision Tree
# Predictions for the held-out rows, shown as the cell output.
y_predTree = treeRegression.predict(X=x_test)
y_predTree

In [None]:
# Report the decision tree's error metrics on the held-out rows.
print("Decision Tree Results:")
for label, score in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R2 Score:", r2_score),
):
    print(label, score(y_test, y_predTree))

In [None]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

In [None]:
# Decision Tree Regressor
# 5-fold cross-validated search over tree depth and minimum split size,
# scored by negated mean squared error.
parameterGridTree = {
    "max_depth": [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
}
gridTree = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=parameterGridTree,
    scoring='neg_mean_squared_error',
    cv=5,
)
gridTree.fit(x_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
print("Best Parameters for Decision Tree:", gridTree.best_params_)

In [None]:
# KNN Regressor
# 5-fold cross-validated search over the number of neighbours, scored by
# negated mean squared error.
parameterGridKNN = {'n_neighbors': [3, 5, 7, 10]}
gridKNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=parameterGridKNN,
    scoring='neg_mean_squared_error',
    cv=5,
)
gridKNN.fit(x_train, y_train)

In [None]:
# Neighbour count with the best cross-validated score.
print("Best Parameters for KNN:", gridKNN.best_params_)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100 trees, each limited to depth 10. The seed matches the other seeded
# models in this notebook so results are reproducible across runs (the
# previous version left the forest unseeded).
rfr = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

rfr.fit(x_train, y_train)

# Predictions on the held-out rows; y_pred is read by the metrics cell below.
y_pred = rfr.predict(x_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def metrics(y_test, y_pred):
    """Print MAE, MSE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    report = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),  # same units as the target
        "R² Score": r2_score(y_test, y_pred),
    }
    for label, value in report.items():
        print(f"{label}: {value}")

metrics(y_test, y_pred)

### Bagging Regressor

In [None]:
from sklearn.ensemble import BaggingRegressor

# Bagging with the default base estimator (estimator=None): 50 members, each
# trained on a bootstrap sample of 80% of the training rows. Seeded so the
# bootstrap samples, and therefore the scores, are reproducible.
br = BaggingRegressor(
    estimator=None,
    n_estimators=50,
    max_samples=.8,
    bootstrap=True,
    random_state=42,
)

br.fit(x_train, y_train)

y_pred = br.predict(x_test)

metrics(y_test, y_pred)

### AdaBoost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with the default base estimator (estimator=None), 100 boosting
# rounds and a learning rate of 1. Seeded so the scores are reproducible
# across runs.
ada = AdaBoostRegressor(
    estimator=None,
    n_estimators=100,
    learning_rate=1,
    random_state=42,
)

ada.fit(x_train, y_train)

y_pred = ada.predict(x_test)

metrics(y_test, y_pred)

### Stacking Regressor

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression

# Stack a decision tree and a KNN regressor; no final_estimator is given, so
# StackingRegressor uses its default. The tree is seeded like the other
# decision trees in this notebook so the stacked result is reproducible.
st = StackingRegressor(
    estimators=[
        ('decision_tree', DecisionTreeRegressor(random_state=42)),
        ('knn', KNeighborsRegressor()),
    ]
)

st.fit(x_train, y_train)

y_pred = st.predict(x_test)

metrics(y_test, y_pred)

### SVR

Support Vector Regression

In [None]:
from sklearn.svm import SVR

# Value passed as the `C` argument to both SVR models below.
C = 10

In [None]:
# RBF-kernel support vector regressor.
svr_rbf = SVR(kernel="rbf", C=C, epsilon=0.01)
svr_rbf.fit(x_train, y_train)

y_pred_rbf = svr_rbf.predict(x_test)

print("SVR RBF Results:")
for label, score in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R2 Score:", r2_score),
):
    print(label, score(y_test, y_pred_rbf))

In [None]:
# Polynomial-kernel support vector regressor.
svr_poly = SVR(kernel="poly", C=C, gamma="scale")
svr_poly.fit(x_train, y_train)

y_pred_poly = svr_poly.predict(x_test)

print("SVR Polynomial Results:")
for label, score in (
    ("Mean Squared Error:", mean_squared_error),
    ("Mean Absolute Error:", mean_absolute_error),
    ("R2 Score:", r2_score),
):
    print(label, score(y_test, y_pred_poly))

### Neural Networks


In [None]:
#import libraries
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt

In [None]:
#creating the deep network
def deepResidualNetwork(input_shape):
    """Build and compile a 256-128-64-32-1 fully connected regression network.

    NOTE: despite the name, this is a plain sequential stack; it contains no
    residual (skip) connections.

    Parameters
    ----------
    input_shape : int
        Number of input features.

    Returns
    -------
    A compiled keras ``Sequential`` model with a single linear output unit.
    """
    model = Sequential()

    # Hidden layers use ELU (exponential linear unit) activations.
    # Layer 1: 256 units, batch norm, moderate dropout.
    model.add(Dense(256, activation='elu', input_shape=(input_shape,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Layer 2: 128 units, batch norm, moderate dropout.
    model.add(Dense(128, activation='elu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Layer 3: 64 units with batch norm, no dropout.
    model.add(Dense(64, activation='elu'))
    model.add(BatchNormalization())

    # Layer 4: 32 units.
    model.add(Dense(32, activation='elu'))

    # Output: one unit for the predicted finish time.
    model.add(Dense(1))

    # Adam with learning rate 0.01; Huber loss is less sensitive to outliers
    # than MSE.
    model.compile(optimizer=Adam(learning_rate=0.01), loss='huber')
    return model

In [None]:

#creating the pyramid neural network
def pyramidNeuralNetwork(input_shape):
    """Build and compile a "pyramid" regression network.

    The layer width halves at every step (512, 256, 128, 64, 32) before the
    single-unit output; dropout shrinks as the layers narrow.

    Parameters
    ----------
    input_shape : int
        Number of input features.

    Returns
    -------
    A compiled keras ``Sequential`` model with a single linear output unit.
    """
    model = Sequential()

    # Base of the pyramid: 512 units with the strongest dropout.
    model.add(Dense(512, activation='selu', input_shape=(input_shape,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    # Middle tiers: width halves each time and dropout decreases.
    for units, drop_rate in ((256, 0.3), (128, 0.2)):
        model.add(Dense(units, activation='selu'))
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    # Upper tiers: 64 units with batch norm, then 32 units.
    model.add(Dense(64, activation='selu'))
    model.add(BatchNormalization())
    model.add(Dense(32, activation='selu'))

    # Top of the pyramid: the single output unit.
    model.add(Dense(1))

    # Adam with learning rate 0.01 and Huber loss.
    model.compile(optimizer=Adam(learning_rate=0.01), loss='huber')
    return model

In [None]:
def simpleNeuralNetwork(input_shape):
    """Build and compile a single-layer network.

    The model is one ``Dense(1)`` layer with no activation, mapping the inputs
    directly to the output.

    Parameters
    ----------
    input_shape : int
        Number of input features.

    Returns
    -------
    A compiled keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(1, input_shape=(input_shape,)))

    # Same optimiser and loss as the deeper networks: Adam and Huber.
    model.compile(optimizer=Adam(learning_rate=0.01), loss='huber')
    return model

In [None]:
def get_callbacks(early_stop_patience=10, lr_patience=5):
    """Return the Keras callbacks used while training the networks.

    Both callbacks monitor ``val_loss``, the loss on the validation split.

    Parameters
    ----------
    early_stop_patience : int, default 10
        Epochs without improvement before training is stopped.
    lr_patience : int, default 5
        Epochs without improvement before the learning rate is halved. It
        should be smaller than ``early_stop_patience``: with both set to 10
        (the previous behaviour) the reduction would usually happen on the
        same epoch that training stops, so the lower rate was never used.

    Returns
    -------
    list
        ``[EarlyStopping, ReduceLROnPlateau]``.
    """
    return [
        EarlyStopping(
            monitor='val_loss',
            patience=early_stop_patience,
            restore_best_weights=True  # keep the weights with the best val_loss
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,  # new learning rate = old learning rate * 0.5
            patience=lr_patience,
            min_lr=0.0001  # lower bound for the learning rate
        )
    ]


In [None]:
#training and evaluating the model
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : compiled keras model
    X_train, y_train : training features and targets; Keras holds out 20% of
        them as the validation split.
    X_test, y_test : features and targets used for the final scoring.
    model_name : str
        Accepted for the caller's convenience; not used inside the function.

    Returns
    -------
    tuple
        ``(history, mse, r2, y_pred)``: the Keras training history, the mean
        squared error and R² on the test data, and the raw predictions.
    """
    fit_options = dict(
        validation_split=0.2,       # fraction of the training data used for validation
        epochs=100,                 # upper bound on the number of epochs
        batch_size=32,              # samples per gradient update
        callbacks=get_callbacks(),  # early stopping and learning-rate schedule
        verbose=1,                  # print progress for every epoch
    )
    history = model.fit(X_train, y_train, **fit_options)

    y_pred = model.predict(X_test)
    return (
        history,
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
        y_pred,
    )

In [None]:

# Create testing samples to test the Neural Networks
test_races = pd.concat([test_1, test_2, test_3, test_4, test_5])  # all held-out races
X_test_nn = test_races.drop(['race_id', 'result', 'finish_time'], axis=1)

# Scale the test features with statistics learned from the training features.
# The previous version called `scaler.fit_transform` on the test rows, where
# `scaler` was never defined (NameError) and re-fitting on the test rows would
# have put them on a different scale from the data the networks are trained on.
train_features = df.drop('finish_time', axis=1)
scaler = StandardScaler().fit(train_features)
X_test_nn = pd.DataFrame(
    scaler.transform(X_test_nn[train_features.columns]),
    columns=train_features.columns,
)
y_test_nn = test_races['finish_time']

# all models that we run on the test data
models = {
    'Simple NN': simpleNeuralNetwork(X.shape[1]),
    'Deep Residual NN': deepResidualNetwork(X.shape[1]),
    'Pyramid NN': pyramidNeuralNetwork(X.shape[1]),
}
results = {}      # model name -> training history and test scores
predictions = {}  # model name -> 1-D array of predicted finish times

# train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    history, mse, r2, y_pred = evaluate_model(
        model,
        x_train,
        y_train,
        X_test_nn,
        y_test_nn,
        name
    )
    results[name] = {
        'history': history,
        'mse': mse,
        'r2': r2
    }
    # flatten so each model's predictions are stored as a 1-D array
    predictions[name] = y_pred.flatten()


In [None]:
#all visualization functions
def learningCurve():
    """Plot training and validation loss per epoch for every trained network."""
    plt.figure(figsize=(15, 5))
    for name, result in results.items():
        epoch_losses = result['history'].history
        for key, phase in (('loss', 'Training'), ('val_loss', 'Validation')):
            plt.plot(epoch_losses[key], label=f'{name} - {phase}')
    plt.title('Learning Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Huber Loss')
    plt.legend()
    plt.grid(True)
    plt.show()


def modelPredictions():
    """Scatter the actual finish times against each model's predictions."""
    plt.figure(figsize=(15, 5))
    series = [('Actual', y_test_nn)]
    series += [(f'{name} Predicted', pred) for name, pred in predictions.items()]
    for label, values in series:
        plt.scatter(range(len(values)), values, label=label, alpha=0.5)
    plt.title('Actual vs Predicted Finish Times')
    plt.xlabel('Race Instance')
    plt.ylabel('Finish Time')
    plt.legend()
    plt.grid(True)
    plt.show()


def performanceMetrics():
    """Print MSE, RMSE and R² for every trained network."""
    print("\nModel Performance Metrics:")
    print("-" * 50)
    for name, result in results.items():
        mse = result['mse']
        print(f"\n{name}:")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Root Mean Squared Error: {np.sqrt(mse):.4f}")
        print(f"R² Score: {result['r2']:.4f}")


learningCurve()
modelPredictions()
performanceMetrics()

#show best model: the one with the lowest test MSE
best_model_name, _ = min(results.items(), key=lambda item: item[1]['mse'])
best_model = models[best_model_name]
print(f"\nBest performing model was {best_model_name}")

# Test Models Against New Races

Now that we have trained multiple models, we will compare them by testing them against the 5 test races we extracted earlier

In [None]:
test_races = [data[data.race_id == race_id].copy() for race_id in [1601, 1602, 1603, 1604, 1605]]

# The models were trained on features standardised with the training-set
# statistics, so the test races must be transformed with those same
# statistics. The previous version fit a fresh StandardScaler on each race,
# which rescales a handful of rows relative to each other and zeroes every
# column that is constant within a race (venue, distance, ...).
train_features = df.drop('finish_time', axis=1)
race_scaler = StandardScaler().fit(train_features)

# prefix of the output columns -> fitted estimator
race_models = {"knn": knnRegression, "tree": treeRegression, "svr": svr_rbf}

knn_preds = []

tree_preds = []

svr_preds = []

nn_preds = []  # TODO: filled once a neural network is added to race_models

race_preds = {"knn": knn_preds, "tree": tree_preds, "svr": svr_preds}

for i, race in enumerate(test_races):
    # Use a local name so the notebook-level training matrix `X` is not overwritten.
    race_X = race.drop(["finish_time", "result", "race_id"], axis=1)[train_features.columns]
    race_X = pd.DataFrame(race_scaler.transform(race_X), columns=race_X.columns, index=race.index)

    for prefix, model in race_models.items():
        pred = model.predict(race_X)
        race_preds[prefix].append(pred)

        race[f"{prefix}_pred"] = pred
        # Lowest predicted time is rank 1. method="min" gives tied predictions
        # the same integer position; the default "average" produces values
        # like 1.5 that astype(int) silently truncated.
        race[f"{prefix}_result"] = race[f"{prefix}_pred"].rank(ascending=True, method="min").astype(int)

    # make sure to add your columns to this list
    test_races[i] = race[['finish_time', 'result',
                          'knn_pred', 'knn_result',
                          'tree_pred', 'tree_result',
                          'svr_pred', 'svr_result']]

In [None]:
# Actual vs. predicted results for the first test race (race_id 1601).
test_races[0]

In [None]:
# Actual vs. predicted results for the second test race (race_id 1602).
test_races[1]

In [None]:
# Actual vs. predicted results for the third test race (race_id 1603).
test_races[2]

In [None]:
# Actual vs. predicted results for the fourth test race (race_id 1604).
test_races[3]

In [None]:
# Actual vs. predicted results for the fifth test race (race_id 1605).
test_races[4]

# columns

finish_time	

actual_weight	

config	

declared_weight	

distance	

going	

horse_age	

horse_country	

horse_place_perc	

horse_type	

jockey_place_perc	

time1/finish_time

time2/finish_time

time3/finish_time

surface	

trainer_place_perc	

venue

quartile = result / # horses ....

_______

result & race_id

exploratory data visualization, want both uni and multivariate visualizations for analysis

more data preprocessing and cleaning

experiment with feature selection techniques

show a variety of algorithms: logistic regression, KNN, decision trees, random forests, ensemble techniques, SVMs (all doable with scikit-learn); build NNs and DNNs to experiment with different network architectures; maybe take a representative sample of 20000 or so examples (careful not to lose too much info)

meet as team ASAP and make final plan 

