# Uma Prediction Model Training (Colab Version)

In [None]:
!pip install pandas scikit-learn tensorflow lightgbm datasets huggingface_hub python-dotenv

In [None]:
import pandas as pd
import json
import sys
import os

from datasets import load_dataset
from huggingface_hub import HfApi, create_repo
from dotenv import load_dotenv

# Path setup (originally written with Google Colab in mind).
# Assumes the repository has been cloned into the runtime.
#
# If this notebook runs from a subdirectory of the repository
# (e.g. /content/uma_prediction/scripts/model_training), the working
# directory may need to be moved to the repository root first:
#   os.chdir('/content/uma_prediction')  # adjust as needed
#
# In Colab, HF_TOKEN can be provided either from Colab Secrets:
#   from google.colab import userdata
#   os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
# or by setting it manually:
#   os.environ["HF_TOKEN"] = "YOUR_HF_TOKEN_HERE"

# Put the project root on sys.path so that repository modules such as
# scripts/model_training/training_utils.py and
# scripts/data_preprocessing/lgbm_categorical_processor.py can be imported.
# This is computed once for both Colab and non-Colab runs.
# NOTE(review): the root is taken as three levels above the current working
# directory — confirm this matches where the notebook is actually run from.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
if project_root not in sys.path:
    sys.path.append(project_root)

if 'google.colab' in sys.modules:
    print("Running in Google Colab environment.")
    print(f"Added {project_root} to sys.path")

# Also allow imports relative to the current working directory.
sys.path.append(".")

# Import the training helpers from training_utils.py.
from scripts.model_training.training_utils import (
    get_model_path,
    update_training_status,
    preprocess_data,
    train_model,
)

# Load environment variables from a .env file.
load_dotenv()

# Hugging Face repository settings.
HF_MODEL_REPO_ID = os.getenv("REPO_ID")
hf_token = os.getenv("HF_TOKEN")

# An empty HF_TOKEN is as unusable as a missing one, so reject both.
if not hf_token:
    print("Hugging Face token (HF_TOKEN) not found in environment variables or .env file.")
    print("Please ensure .env file exists and contains HF_TOKEN, or set it as an environment variable.")
    # Exit with a non-zero status so the failure is not reported as success.
    sys.exit(1)

hf_api = HfApi(token=hf_token)

# --- Dataset repository ---
DATASET_REPO_ID = os.getenv("DATASET_REPO_ID")

def load_data():
    """Fetch the "train" split of DATASET_REPO_ID as a pandas DataFrame.

    Returns:
        The dataset converted with ``to_pandas()``. If anything goes wrong
        while loading or converting, the error is printed and an empty
        DataFrame is returned instead of raising.
    """
    try:
        # Read from Hugging Face Datasets and convert straight to pandas.
        frame = load_dataset(DATASET_REPO_ID, split="train").to_pandas()
    except Exception as e:
        print(f"Error loading data from Hugging Face Dataset: {e}")
        return pd.DataFrame()

    print(f"Successfully loaded data from Hugging Face Dataset: {DATASET_REPO_ID}")
    return frame


In [None]:
# Create the model repository on Hugging Face if it doesn't exist.
try:
    create_repo(repo_id=HF_MODEL_REPO_ID, repo_type="model", exist_ok=True, token=hf_token)
    print(f"Model repository '{HF_MODEL_REPO_ID}' created or already exists.")
except Exception as e:
    print(f"Error creating/checking model repository: {e}")
    # Non-zero status: this is a failure, not a normal exit.
    sys.exit(1)

update_training_status(
    {"status": "running", "message": "Starting training process..."}
)
df = load_data()
if df.empty:
    update_training_status({"status": "error", "message": "No data loaded."})
    sys.exit(1)

# Hugging Face arguments shared by every train_model call.
hf_kwargs = {
    "hf_api": hf_api,
    "hf_token": hf_token,
    "hf_model_repo_id": HF_MODEL_REPO_ID,
}

# One entry per feature-set variant:
# (extra kwargs for preprocess_data, extra kwargs for train_model).
# The first entry (with horse info) passes nothing extra so both functions
# keep using their own defaults.
horse_info_variants = [
    ({}, {}),
    ({"exclude_horse_info": True}, {"horse_info": "excluded"}),
]

try:
    for target_mode in ["default", "top3"]:
        # Random forest and LightGBM, first with and then without horse info.
        for preprocess_extra, train_extra in horse_info_variants:
            X_rf, y_rf, target_maps_rf = preprocess_data(
                df.copy(), model_type="rf", target_mode=target_mode, **preprocess_extra
            )
            train_model(
                "rf", X_rf, y_rf, target_mode,
                target_maps=target_maps_rf,
                **hf_kwargs, **train_extra,
            )

            X_lgbm, y_lgbm, cats_lgbm_with_categories = preprocess_data(
                df.copy(), model_type="lgbm", target_mode=target_mode, **preprocess_extra
            )
            train_model(
                "lgbm", X_lgbm, y_lgbm, target_mode,
                # LightGBM takes the list of column names; the full mapping
                # returned by preprocess_data is passed alongside it.
                categorical_features=list(cats_lgbm_with_categories),
                categorical_features_with_categories=cats_lgbm_with_categories,
                **hf_kwargs, **train_extra,
            )

        # CNN with categorical features.
        X_cnn, y_cnn, flat_cols, imputation_values, class_weight_dict = preprocess_data(
            df.copy(), model_type="cnn", target_mode=target_mode
        )
        train_model(
            "cnn",
            X_cnn,
            y_cnn,
            target_mode,
            flat_features_columns=flat_cols,
            imputation_values=imputation_values,
            class_weight_dict=class_weight_dict,
            **hf_kwargs,
        )
except Exception as e:
    # Don't leave the status stuck at "running" when a step fails; record the
    # failure and let the original exception propagate.
    update_training_status({"status": "error", "message": f"Training failed: {e}"})
    raise

print("Model training finished.")
update_training_status(
    {"status": "completed", "message": "All models trained successfully."}
)