<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Copy_of_MVP_HTMP_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in to Kaggle first: the competition download in the following cell goes
# through kagglehub, and the data source may be private.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'hull-tactical-market-prediction' competition files via kagglehub.
# The return value is presumably the local download location -- TODO confirm.
# NOTE(review): the training cell reads from the hard-coded KAGGLE_INPUT_PATH
# and never uses this variable; confirm both point at the same data when this
# notebook runs outside Kaggle.
hull_tactical_market_prediction_path = kagglehub.competition_download('hull-tactical-market-prediction')

print('Data source import complete.')


In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from pathlib import Path
from kaggle_evaluation.default_inference_server import DefaultInferenceServer
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# --- CONFIGURATION ---
KAGGLE_INPUT_PATH = '/kaggle/input/hull-tactical-market-prediction/'
MODEL_SAVE_PATH = '/tmp/lgbm_model.pkl'
TARGET_COL = 'market_forward_excess_returns'

# Multiplier applied to the raw model output when sizing the allocation.
SCALING_FACTOR = 10000.0

# Each feature family is a prefix followed by a 1-based index; the pairs below
# give the prefix and the highest index, in the order the names are listed.
_FEATURE_FAMILY_SIZES = (
    ('D', 9),
    ('E', 20),
    ('V', 15),
    ('S', 15),
    ('M', 20),
    ('I', 5),
    ('P', 9),
    ('MOM', 10),
    ('T', 15),
)

# Every feature name that may appear in the data; the training step keeps only
# those actually present in the loaded file.
ALL_POSSIBLE_FEATURE_NAMES = [
    f'{prefix}{number}'
    for prefix, size in _FEATURE_FAMILY_SIZES
    for number in range(1, size + 1)
]

# Columns that must never be used as model inputs.
EXCLUDE_COLS = ['date_id', TARGET_COL, 'forward_returns', 'risk_free_rate']

# Filled in by the training phase with the feature columns actually used.
GLOBAL_TRAINING_FEATURE_COLS = []


# --- TRAINING PHASE (Model creation and saving) ---
print("--- PHASE I: MODEL TRAINING ---")
try:
    # 1. Read the training CSV from the competition input directory.
    df_train = pd.read_csv(Path(KAGGLE_INPUT_PATH) / 'train.csv')

    # 2. Keep the known feature names that exist in the file and are not
    #    explicitly excluded; this list is what gets saved next to the model.
    current_feature_cols = [
        name
        for name in ALL_POSSIBLE_FEATURE_NAMES
        if name not in EXCLUDE_COLS and name in df_train.columns
    ]
    GLOBAL_TRAINING_FEATURE_COLS = current_feature_cols

    # 3. Coerce each feature column to numeric; unparseable values become NaN.
    for name in GLOBAL_TRAINING_FEATURE_COLS:
        df_train[name] = pd.to_numeric(df_train[name], errors='coerce')

    # 4. Discard rows without a target, then split into features and target.
    df_train = df_train.dropna(subset=[TARGET_COL])
    X, y = df_train[GLOBAL_TRAINING_FEATURE_COLS], df_train[TARGET_COL]

    print(f"Training on {len(X)} rows and {len(GLOBAL_TRAINING_FEATURE_COLS)} features...")

    # 5. Configure and fit the regressor (hyper-parameters carried over from
    #    the configuration the notebook labels "V3").
    lgbm_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 2000,
        'learning_rate': 0.08,
        'num_leaves': 63,
        'max_depth': 8,
        'n_jobs': -1,
        'random_state': 42,
        'verbose': -1,  # keep LightGBM quiet
    }
    lgbm = lgb.LGBMRegressor(**lgbm_params)
    print("Training LightGBM model...")
    lgbm.fit(X, y)
    print("Training complete.")

    # 6. Persist the fitted model together with its feature list so that
    #    inference rebuilds its inputs from exactly the same columns.
    model_bundle = {'model': lgbm, 'features': GLOBAL_TRAINING_FEATURE_COLS}
    joblib.dump(model_bundle, MODEL_SAVE_PATH)
    print(f"Trained model and feature list saved to: {MODEL_SAVE_PATH}")

except Exception as e:
    # Best-effort fallback: report the failure and save a stand-in model with
    # an empty feature list so the inference phase still has something to load.
    print(f"FATAL ERROR during TRAINING PHASE: {e}")

    class DummyModel:
        """Stand-in model that predicts 0.0 for every input row."""

        def predict(self, X):
            return np.zeros(len(X))

    joblib.dump({'model': DummyModel(), 'features': []}, MODEL_SAVE_PATH)


# --- INFERENCE PHASE (Submission setup and logic) ---
print("\n--- PHASE II: SUBMISSION INFERENCE SETUP ---")
# State shared across predict() calls; filled in lazily on the first call.
MODEL_LOADED = False
GLOBAL_MODEL = None
GLOBAL_INFERENCE_FEATURE_COLS = []

def predict(test) -> float:
    """
    Return the allocation for one batch of test data.

    This is the callback handed to the inference server. On the first call it
    loads the model and its feature list from MODEL_SAVE_PATH; later calls
    reuse the cached objects.

    Args:
        test: A pandas DataFrame, or any object exposing ``to_pandas()``
            (e.g. a polars DataFrame -- TODO confirm what the gateway sends).
            Only the first row's prediction is used.

    Returns:
        A plain ``float`` in [0.0, 2.0], computed as
        ``1.0 + raw_prediction * SCALING_FACTOR`` and clipped to that range.
        The neutral allocation 1.0 is returned when the model cannot be
        loaded, when ``test`` has no rows, or when the model output is not
        finite.
    """
    global MODEL_LOADED, GLOBAL_MODEL, GLOBAL_INFERENCE_FEATURE_COLS

    neutral_allocation = 1.0

    if not MODEL_LOADED:
        try:
            # The file is written by this notebook's training phase; joblib
            # uses pickle, so never point MODEL_SAVE_PATH at untrusted data.
            loaded_data = joblib.load(MODEL_SAVE_PATH)
            GLOBAL_MODEL = loaded_data['model']
            GLOBAL_INFERENCE_FEATURE_COLS = loaded_data['features']

            MODEL_LOADED = True
            print("Model and feature list loaded. Starting live prediction...")
        except Exception as e:
            print(f"FATAL ERROR during model loading: {e}")
            return neutral_allocation

    # --- INFERENCE LOGIC ---
    df_test = test if isinstance(test, pd.DataFrame) else test.to_pandas()

    # Nothing to score: indexing the prediction array below would fail.
    if len(df_test) == 0:
        return neutral_allocation

    # 1. Feature Preparation: rebuild exactly the columns the model was saved
    #    with, in the same order. Columns absent from the test data are filled
    #    with NaN. Collecting them first and building the frame in one go
    #    avoids the fragmentation caused by inserting columns one at a time.
    feature_columns = {}
    for col in GLOBAL_INFERENCE_FEATURE_COLS:
        if col in df_test.columns:
            feature_columns[col] = pd.to_numeric(df_test[col], errors='coerce')
        else:
            feature_columns[col] = np.nan
    X_test = pd.DataFrame(feature_columns, index=df_test.index)

    # 2. Prediction (first row only)
    raw_prediction = GLOBAL_MODEL.predict(X_test)[0]

    # np.clip would pass NaN straight through, producing an invalid
    # allocation, so fall back to the neutral position instead.
    if not np.isfinite(raw_prediction):
        return neutral_allocation

    # 3. Allocation Sizing: shift the scaled prediction around the neutral 1.0
    allocation_size = 1.0 + (raw_prediction * SCALING_FACTOR)

    # Enforce the competition's allowed range (0.0 to 2.0) and return a
    # built-in float, as the annotation promises.
    return float(np.clip(allocation_size, 0.0, 2.0))

# --- 7. RUN THE INFERENCE SERVER (The core submission endpoint) ---

# Hand the `predict` callback defined above to the competition's inference server.
inference_server = DefaultInferenceServer(predict)

# A non-empty KAGGLE_IS_COMPETITION_RERUN environment variable selects the
# serving path; otherwise the local gateway is run against KAGGLE_INPUT_PATH
# (passed as a one-element tuple).
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("Running in RERUN mode...")
    inference_server.serve()
else:
    print("Running local gateway for testing...")
    inference_server.run_local_gateway((KAGGLE_INPUT_PATH,))

print("Submission script finished.")

In [None]:
import pandas as pd
import numpy as np
import os

submission_path = '/kaggle/working/submission.parquet'

## SUBMISSION FILE VALIDATION


def check_allocation_range(values, low=0.0, high=2.0):
    """Check that every allocation lies within ``[low, high]``.

    Args:
        values: The allocation column (a pandas Series or anything that
            ``pd.Series`` accepts).
        low: Smallest allowed allocation (inclusive).
        high: Largest allowed allocation (inclusive).

    Returns:
        ``"PASS"`` when all values are present and inside the range, otherwise
        a ``"FAIL (...)"`` string describing the problem. Missing/NaN values
        and an empty column are reported as failures, because ``min()`` and
        ``max()`` skip NaN and would otherwise let them through unnoticed.
    """
    values = pd.Series(values)
    if values.empty:
        return "FAIL (no rows)"

    missing_count = int(values.isna().sum())
    if missing_count:
        return f"FAIL ({missing_count} missing/NaN value(s))"

    min_val = values.min()
    max_val = values.max()
    if min_val >= low and max_val <= high:
        return "PASS"
    return f"FAIL (Min: {min_val:.4f}, Max: {max_val:.4f})"


# Check if the file was successfully created
if not os.path.exists(submission_path):
    print(f"Validation Error: Submission file not found at {submission_path}")
    print("NOTE: This file is created during the 'Running local gateway for testing...' step.")
else:
    try:
        # Read the submission file
        df_sub = pd.read_parquet(submission_path)

        # --- Validation Checks ---

        # 1. Identify the prediction column (the single float column).
        #    np.floating matches every float width, not only float64.
        float_cols = df_sub.select_dtypes(include=[np.floating]).columns

        if len(float_cols) == 1:
            prediction_col_name = float_cols[0]
            print(f"Column Check: Found single prediction column named '{prediction_col_name}'.")
        else:
            prediction_col_name = 'allocation' # Use standard name for range check fallback
            print(f"Column Check: Found {len(float_cols)} float columns. Using '{prediction_col_name}' for range check.")

        # 2. Check allocation range (0.0 to 2.0)
        if prediction_col_name in df_sub.columns:
            range_check = check_allocation_range(df_sub[prediction_col_name])
            print(f"Allocation Range Check (0.0 to 2.0): {range_check}")
        else:
            # Say so explicitly instead of silently skipping the check.
            print(f"Allocation Range Check (0.0 to 2.0): SKIPPED (column '{prediction_col_name}' not found)")

        # 3. Display the file info
        print("\nFirst 5 Rows of Submission:")
        print(df_sub.head())

        print("\nSubmission Info:")
        df_sub.info()

    except Exception as e:
        print(f"Validation Error: Could not read or process the Parquet file. Error: {e}")