In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartModel
from torch import nn
import os
from tqdm import tqdm
import joblib

# Pin the NumPy and PyTorch random generators to the same fixed seed (77)
# so repeated runs of this cell draw identical random numbers.
np.random.seed(77)
torch.manual_seed(77)

# Step 1: Define the load_and_preprocess_data function
def load_and_preprocess_data(file_path, is_test=False, existing_scaler=None):
    """Load a race CSV and build the text inputs fed to the model.

    The function repairs the weight/sire_color column shift, renames
    external-dataset columns to the training feature names, fills gaps with
    sentinels ('unknown' for categorical, -1 for numerical features), scales
    the numerical features and serialises every row into a single
    "Feature: value, ..." string.

    Args:
        file_path: Path of the CSV file to read.
        is_test: When True, rows with missing targets are kept,
            ``target_text`` is the literal 'unknown', and ``existing_scaler``
            (when given) is applied instead of fitting a new scaler.
        existing_scaler: Already fitted scaler exposing ``transform``. It is
            only used when ``is_test`` is True.

    Returns:
        A ``(frame, scaler)`` tuple. ``frame`` holds ``input_text``,
        ``target_text``, ``horse``, ``program_number``, ``race_number`` and,
        when all three exist in the file, the target columns. ``scaler`` is
        the scaler that was applied to the numerical features.
    """
    df = pd.read_csv(file_path)
    print(f"Loaded data with {len(df)} rows and {len(df.columns)} columns")
    print("Columns in dataset:", df.columns.tolist())

    # Fix mismatched data: move sire_color to weight and drop non-numeric weight data
    if 'weight' in df.columns and 'sire_color' in df.columns:
        print("Before fix - weight sample:", df['weight'].head().tolist())
        print("Before fix - sire_color sample:", df['sire_color'].head().tolist())

        # The old contents of weight are discarded entirely; sire_color is
        # assumed to contain the true weights, anything non-numeric becomes NaN.
        df['weight'] = pd.to_numeric(df['sire_color'], errors='coerce')

        # Drop sire_color column
        df = df.drop(columns=['sire_color'])
        print("After fix - weight sample:", df['weight'].head().tolist())
        print(f"Dropped 'sire_color' column. New columns: {df.columns.tolist()}")
    else:
        print("Warning: 'weight' or 'sire_color' column missing. Skipping column fix.")

    # Map test dataset columns to training feature names
    column_mapping = {
        'horse_name': 'horse',
        'jockey_name': 'jockey',
        'trainer_name': 'trainer',
        'surface_code': 'surface_x',
        'purse': 'purse_x',
        'odds': 'dollar_odds',
        'sex': 'sex_code'
    }
    # Rename only when the training-style name is not already present;
    # otherwise the frame would end up with two columns sharing one label.
    applicable_mapping = {
        source: target
        for source, target in column_mapping.items()
        if source in df.columns and target not in df.columns
    }
    df = df.rename(columns=applicable_mapping)

    # Convert distance from yards to furlongs (1 furlong = 220 yards)
    if 'distance' in df.columns:
        # Coerce first so a stray non-numeric entry becomes NaN instead of raising
        df['distance_f'] = pd.to_numeric(df['distance'], errors='coerce') / 220.0

    # Define features
    features = [
        'horse', 'jockey', 'trainer', 'program_number', 'surface_x', 'distance_f', 'purse_x',
        'track_code', 'race_date', 'race_number', 'race_type', 'weight', 'age', 'sex_code',
        'dollar_odds', 'recent_finish_pos_1', 'recent_lengths_back_finish_1', 'recent_purse_1',
        'recent_finish_pos_2', 'recent_lengths_back_finish_2', 'recent_purse_2',
        'recent_finish_pos_3', 'recent_lengths_back_finish_3', 'recent_purse_3',
        'recent_finish_pos_4', 'recent_lengths_back_finish_4', 'recent_purse_4',
        'recent_finish_pos_5', 'recent_lengths_back_finish_5', 'recent_purse_5'
    ]
    targets = ['official_finish', 'speed_rating', 'win_time']

    # Define categorical and numerical columns
    categorical_cols = [
        'horse', 'jockey', 'trainer', 'program_number', 'surface_x', 'track_code',
        'race_date', 'race_number', 'race_type', 'sex_code'
    ]
    numerical_cols = [
        'distance_f', 'purse_x', 'dollar_odds', 'weight', 'age',
        'recent_finish_pos_1', 'recent_lengths_back_finish_1', 'recent_purse_1',
        'recent_finish_pos_2', 'recent_lengths_back_finish_2', 'recent_purse_2',
        'recent_finish_pos_3', 'recent_lengths_back_finish_3', 'recent_purse_3',
        'recent_finish_pos_4', 'recent_lengths_back_finish_4', 'recent_purse_4',
        'recent_finish_pos_5', 'recent_lengths_back_finish_5', 'recent_purse_5'
    ]

    # Add missing features for test dataset
    for i in range(1, 6):
        finish_pos_col = f'recent_finish_pos_{i}'
        lengths_col = f'recent_lengths_back_finish_{i}'
        purse_col = f'recent_purse_{i}'
        if finish_pos_col not in df.columns:
            df[finish_pos_col] = df.get(f'recentFinishPosition{i}', -1)
        if lengths_col not in df.columns:
            df[lengths_col] = -1
        if purse_col not in df.columns:
            df[purse_col] = -1

    # The target columns are never added or removed below, so check once.
    has_targets = all(col in df.columns for col in targets)

    # Drop rows with missing target values (only for training data)
    if not is_test and has_targets:
        df = df.dropna(subset=targets).copy()
        print(f"After dropping NaN targets: {len(df)} rows")

    # Initialize missing columns
    for col in features:
        if col not in df.columns:
            df[col] = 'unknown' if col in categorical_cols else -1

    # Handle missing values. Every feature column exists at this point, so the
    # two lists can be walked directly; numerical columns are forced to float
    # so no separate dtype verification pass is needed afterwards.
    for col in categorical_cols:
        df[col] = df[col].fillna('unknown').astype(str)
    for col in numerical_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(-1).astype(float)

    # Normalize numerical features
    cols_to_scale = list(numerical_cols)
    print("Columns to scale:", cols_to_scale)
    if is_test and existing_scaler is not None:
        scaler = existing_scaler
        df[cols_to_scale] = scaler.transform(df[cols_to_scale])
    else:
        scaler = MinMaxScaler()
        df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
        if is_test:
            print("Warning: Fitting a new scaler on test data. For consistent predictions, use the training scaler.")

    # Create text sequences
    numerical_lookup = set(numerical_cols)
    # Display label of each feature, computed once instead of once per row
    feature_labels = {feature: feature.replace('_x', '').capitalize() for feature in features}

    def create_text_sequence(record):
        text_parts = []
        for feature in features:
            value = record[feature]
            label = feature_labels[feature]
            if feature in numerical_lookup:
                try:
                    text_parts.append(f"{label}: {float(value):.2f}")
                except (ValueError, TypeError):
                    text_parts.append(f"{label}: unknown")
            else:
                text_parts.append(f"{label}: {value}")
        return ", ".join(text_parts)

    df['input_text'] = [create_text_sequence(record) for record in df[features].to_dict('records')]

    # Create target text
    if not is_test and has_targets:
        df['target_text'] = [
            f"Finish: {finish}, Speed: {speed}, Time: {win_time}"
            for finish, speed, win_time in zip(df['official_finish'], df['speed_rating'], df['win_time'])
        ]
    else:
        df['target_text'] = 'unknown'

    # Preserve necessary columns
    columns_to_keep = ['input_text', 'target_text', 'horse', 'program_number', 'race_number']
    if has_targets:
        columns_to_keep = columns_to_keep + targets
    return df[columns_to_keep], scaler

# Step 2: Define the RaceDataset class
class RaceDataset(Dataset):
    """Dataset that tokenizes the ``input_text`` / ``target_text`` of each row.

    Args:
        df: Frame with ``input_text`` and ``target_text`` columns. When it also
            holds ``official_finish``, ``speed_rating`` and ``win_time`` those
            values become the item's ``labels``; otherwise labels are zeros.
        tokenizer: Callable accepting ``(text, max_length=, padding=,
            truncation=, return_tensors=)`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Padded/truncated length of the input text.
        target_max_length: Padded/truncated length of the target text.
    """

    # Columns that must all be present for real labels to be emitted
    TARGET_COLUMNS = ['official_finish', 'speed_rating', 'win_time']

    def __init__(self, df, tokenizer, max_length=512, target_max_length=32):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.target_max_length = target_max_length
        self.has_targets = all(col in df.columns for col in self.TARGET_COLUMNS)

    def __len__(self):
        return len(self.df)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` positions as 'pt' tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized input, tokenized target and labels for row ``idx``."""
        # Fetch the row once instead of once per field
        row = self.df.iloc[idx]

        input_encoding = self._encode(row['input_text'], self.max_length)
        target_encoding = self._encode(row['target_text'], self.target_max_length)

        if self.has_targets:
            target_values = row[self.TARGET_COLUMNS].values.astype(float)
            labels = torch.tensor(target_values, dtype=torch.float)
        else:
            labels = torch.zeros(3, dtype=torch.float)

        # squeeze(0) removes only the leading axis added by return_tensors='pt';
        # a bare squeeze() would also drop the sequence axis when its length is 1.
        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'decoder_input_ids': target_encoding['input_ids'].squeeze(0),
            'decoder_attention_mask': target_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Step 3: Define the BartForRegression class
class BartForRegression(nn.Module):
    """Backbone model followed by a linear layer that emits three values per sample.

    Args:
        bart_model: Callable backbone exposing ``config.d_model`` and returning
            an object with a ``last_hidden_state`` attribute.
    """

    def __init__(self, bart_model):
        super().__init__()
        self.bart = bart_model
        hidden_size = self.bart.config.d_model
        self.regression_head = nn.Linear(hidden_size, 3)

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        """Run the backbone and project position 0 of its output to three values."""
        backbone_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'decoder_attention_mask': decoder_attention_mask,
        }
        hidden_states = self.bart(**backbone_inputs).last_hidden_state
        # Only the first sequence position feeds the regression head
        first_position = hidden_states[:, 0, :]
        return self.regression_head(first_position)

# Step 4: Define the evaluate_model function
def evaluate_model(model, test_file_or_df, tokenizer, scaler, device, is_external_test=False, batch_size=8):
    """Run the model over a test set, report metrics and save the predictions.

    Args:
        model: Model called as ``model(input_ids, attention_mask,
            decoder_input_ids, decoder_attention_mask)``; it is expected to
            return three values per sample.
        test_file_or_df: Either a CSV path (``str`` or ``os.PathLike``), which
            is loaded through ``load_and_preprocess_data`` in test mode, or an
            already preprocessed DataFrame.
        tokenizer: Tokenizer handed to ``RaceDataset``.
        scaler: Fitted scaler forwarded to ``load_and_preprocess_data``; only
            used when a path is given.
        device: Device the input tensors are moved to.
        is_external_test: Selects the output file name
            ('external_test_predictions.csv' vs 'internal_test_predictions.csv').
        batch_size: Number of rows per inference batch.

    Returns:
        ``(predictions, actuals)`` where ``actuals`` is ``None`` when the test
        data carries no target columns.

    Raises:
        ValueError: If the test set contains no rows.
    """
    # Accept pathlib-style paths too; previously only plain strings were loaded
    if isinstance(test_file_or_df, (str, os.PathLike)):
        test_df, _ = load_and_preprocess_data(test_file_or_df, is_test=True, existing_scaler=scaler)
    else:
        test_df = test_file_or_df

    test_dataset = RaceDataset(test_df, tokenizer)
    if len(test_dataset) == 0:
        # np.vstack on an empty list would raise the same exception type with
        # a message that does not point at the real cause.
        raise ValueError("Cannot evaluate an empty test set.")
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            decoder_input_ids = batch['decoder_input_ids'].to(device)
            decoder_attention_mask = batch['decoder_attention_mask'].to(device)

            preds = model(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
            predictions.append(preds.cpu().numpy())
            if test_dataset.has_targets:
                # Labels are only compared on the host, so they are not
                # copied to the device and back.
                actuals.append(batch['labels'].numpy())

    predictions = np.vstack(predictions)
    output_file = 'external_test_predictions.csv' if is_external_test else 'internal_test_predictions.csv'

    # Plain arrays keep the identifier columns positionally aligned with the
    # predictions regardless of the index carried by test_df.
    horse = test_df['horse'].to_numpy()
    program_number = test_df['program_number'].to_numpy()
    race_number = test_df['race_number'].to_numpy()

    if test_dataset.has_targets:
        actuals = np.vstack(actuals)

        mse = mean_squared_error(actuals, predictions, multioutput='raw_values')
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(actuals, predictions, multioutput='raw_values')

        print("\nTest Set Evaluation Metrics:")
        for i, metric in enumerate(['Official Finish', 'Speed Rating', 'Win Time']):
            print(f"{metric}:")
            print(f"  MSE: {mse[i]:.4f}")
            print(f"  RMSE: {rmse[i]:.4f}")
            print(f"  MAE: {mae[i]:.4f}")

        results = pd.DataFrame({
            'actual_finish': actuals[:, 0],
            'predicted_finish': predictions[:, 0],
            'actual_speed': actuals[:, 1],
            'predicted_speed': predictions[:, 1],
            'actual_time': actuals[:, 2],
            'predicted_time': predictions[:, 2],
            'horse': horse,
            'program_number': program_number,
            'race_number': race_number
        })
    else:
        print("\nNo target columns in test data. Saving predictions only.")
        results = pd.DataFrame({
            'predicted_finish': predictions[:, 0],
            'predicted_speed': predictions[:, 1],
            'predicted_time': predictions[:, 2],
            'horse': horse,
            'program_number': program_number,
            'race_number': race_number
        })

    results.to_csv(output_file, index=False)
    print(f"\nTest predictions saved to '{output_file}'")
    return predictions, (actuals if test_dataset.has_targets else None)

# Step 5: Fetch the pretrained 'facebook/bart-base' tokenizer and backbone
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
bart_model = BartModel.from_pretrained('facebook/bart-base')

# Step 6: Wrap the backbone in the regression model
model = BartForRegression(bart_model)

# Step 7: Restore the saved weights (deserialized onto the CPU first)
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
try:
    model.load_state_dict(torch.load('best_bart_model.pt', map_location=torch.device('cpu')))
except FileNotFoundError:
    print("Error: 'best_bart_model.pt' not found. Please ensure the model weights file is in the working directory.")
    raise
else:
    print("Loaded model weights from 'best_bart_model.pt'")

# Step 8: Use the GPU when one is available, then switch the model to eval mode
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
model.eval()

# Step 9: Try to reuse the scaler saved at training time; fall back to None
try:
    training_scaler = joblib.load('training_scaler.pkl')
except FileNotFoundError:
    print("Warning: Training scaler not found. Fitting a new scaler on test data, which may lead to inconsistent predictions.")
    training_scaler = None
else:
    print("Loaded training scaler from 'training_scaler.pkl'")

# Step 10: Preprocess the external dataset in test mode
test_file = '/content/CDX0515_filtered.csv'
test_df, scaler = load_and_preprocess_data(
    test_file,
    is_test=True,
    existing_scaler=training_scaler,
)

# Step 11: Run inference and write the predictions CSV
print("\nEvaluating on external test set (CDX0515_filtered.csv)...")
predictions, actuals = evaluate_model(
    model,
    test_df,
    tokenizer,
    scaler,
    device,
    is_external_test=True,
)

print("\nPredictions generated and saved to 'external_test_predictions.csv'.")

Loaded model weights from 'best_bart_model.pt'
Loaded data with 86 rows and 200 columns
Columns in dataset: ['track_code', 'race_date', 'race_number', 'post_position', 'entry', 'distance', 'surface_code', 'race_type', 'claiming_price_category', 'race_class', 'purse', 'claiming_price', 'claiming_price2', 'speed_rating', 'race_conditions', 'horses_list', 'track_code2', 'race_number2', 'breed_type', 'field_size', 'trainer_name', 'trainer_starts', 'trainer_wins', 'trainer_places', 'trainer_shows', 'trainer_roi', 'jockey_name', 'jockey_starts', 'jockey_wins', 'jockey_places', 'jockey_shows', 'owner_name', 'owner_silks', 'program_number', 'odds', 'horse_name', 'age', 'sex', 'color', 'sire_color', 'weight', 'sire', 'sire_sire', 'dam', 'dam_sire', 'breeder', 'state_bred', 'recentRaceDate1', 'recentRaceDate2', 'recentRaceDate3', 'recentRaceDate4', 'recentRaceDate5', 'recentRaceDate6', 'recentRaceDate7', 'recentRaceDate8', 'recentRaceDate9', 'recentRaceDate10', 'recentRaceTrackCode1', 'recentRac

Evaluating: 100%|██████████| 11/11 [01:24<00:00,  7.65s/it]


No target columns in test data. Saving predictions only.

Test predictions saved to 'external_test_predictions.csv'

Predictions generated and saved to 'external_test_predictions.csv'.



