### start

In [2]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import copy
import logging
import re
from itertools import chain,combinations
import pdb
import math
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import warnings
import boto3
import matplotlib as plt
import sklearn

# load_dotenv(Path("../../../Local/.env"))

# Show floats with up to 12 significant digits when pandas renders numbers
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Silence known-noisy warnings
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'
for _noisy_message in ('.*frozen modules.*', "MallocStackLogging"):
    warnings.filterwarnings('ignore', message=_noisy_message)

# Environment variables set for the rest of the session
# (pygame support prompt, log file location, notification sounds directory)
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'NOTIFICATION_SOUNDS_DIR': "../../../Local",
})

# Dark mode charts.
# NOTE: in this notebook `plt` is bound to the top-level `matplotlib` package
# (`import matplotlib as plt`), so this configures `matplotlib.rcParams`.
_chart_text_color = '#afc6ba'
_chart_background = '#181818'  # Custom background color (dark gray in this case)
plt.rcParams.update({
    'figure.facecolor': _chart_background,
    'axes.facecolor': _chart_background,
    'text.color': _chart_text_color,
    'axes.labelcolor': _chart_text_color,
    'xtick.color': _chart_text_color,
    'ytick.color': _chart_text_color,
    'axes.titlecolor': _chart_text_color,
    'text.usetex': False,
    'mathtext.default': 'regular',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('../src')

import sage_wallet_modeling.wallet_modeler as wm
from sage_wallet_modeling.wallet_preprocessor import SageWalletsPreprocessor
import sage_wallet_modeling.workflow_orchestrator as wo
import sage_wallet_insights.model_evaluation as sime
import sage_utils.config_validation as ucv

# import data-science modules
sys.path.append(str(Path("..") / ".." / "data-science" / "src"))
import wallet_insights.model_evaluation as wime
import utils as u
from utils import ConfigError



# Local modules that are re-imported on demand so source edits are picked up
modules = [wm, wo, sime, ucv, wime, u]

# import utils as u
# Handle to the running IPython shell (the custom error handler is currently disabled)
ipython = IPython.get_ipython()
# ipython.set_custom_exc((Exception,), u.notify_on_failure)

# Stop any ambient audio players that are still running
player = u.AmbientPlayer()
player.stop_all_players()

# Notebook logger at INFO level
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)

# Traceback highlight colour
from IPython.core import ultratb
ultratb.VerboseTB._tb_highlight = "bg:#b45827"


# Load both configs through the sage_utils config_validation loaders
sage_wallets_config = ucv.load_sage_wallets_config(
    Path('../config/sage_wallets_config.yaml')
)
sage_wallets_modeling_config = ucv.load_sage_wallets_modeling_config(
    Path('../config/sage_wallets_modeling_config.yaml')
)


# Consolidate the selected source directories into a single file
u.export_code(
    code_directories=[
        # 'sage_wallet_insights',
        'sage_wallet_modeling',
        # 'sage_utils',
    ],
    # include_config = True,
    # ipynb_notebook = 'DDA-769 coin model score dist toggle.ipynb'
    output_file="temp/sagemaker_code.py"
)

# Re-import every tracked module, then signal that setup has finished
for _module in modules:
    importlib.reload(_module)
u.notify('retro')

logger.milestone("Good morning, let's get to work")

[0m[28/Jul/25 22:02:47] INFO [utils.export_code:1659] Consolidation complete. All files are saved in temp/sagemaker_code.py[0m
[92m[28/Jul/25 22:02:47] MILESTONE [863424031.<module>:125] Good morning, let's get to work[0m


# Modeling and Scoring

### Initiate orchestrator

In [3]:
# Pick up source edits, then re-read both YAML configs from disk.
# NOTE(review): these are plain `yaml.safe_load` reads, whereas the setup cell
# uses the `ucv.load_*` loaders — confirm that bypassing those is intended.
for _module in modules:
    importlib.reload(_module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)


# Date suffixes to process; commented entries are currently switched off
date_suffixes = [
    '231008',
    # '231107',
    # '240306',
    # '250301'
]

workflow_orch = wo.WalletWorkflowOrchestrator(
    sage_wallets_config,
    sage_wallets_modeling_config,
)

### Load, preprocess, and upload all data

#### load data

In [4]:
# Pick up source edits and re-read both YAML configs.
# NOTE(review): `workflow_orch` is not rebuilt in this cell, so it was created
# before this reload and with the previously loaded config objects; the fresh
# modules/configs are presumably not seen by it — confirm this is intended.
for _module in modules:
    importlib.reload(_module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)

workflow_orch.load_all_training_data(date_suffixes)

[92m[28/Jul/25 22:02:51] MILESTONE [workflow_orchestrator.load_all_training_data:120] <DEV> Loading training data for 1 periods: ['231008'][0m
[0m[28/Jul/25 22:02:51] INFO [workflow_orchestrator.load_all_training_data:136] Training data loaded successfully: 8,000 total rows and 9 offsets for each date_suffix.[0m


##### inspect features

In [None]:
# Per-feature NaN counts next to describe() statistics for the x_train frame
# stored under the first key of `workflow_orch.training_data`.
first_key = next(iter(workflow_orch.training_data))
x_train_first = workflow_orch.training_data[first_key]['x_train']

nan_counts = x_train_first.isna().sum()
describe_stats = x_train_first.describe().T

# One row per feature: nan_count followed by the describe() columns.
# Sorted once here; the frame is already ordered when it is displayed.
combined_stats = pd.concat(
    [nan_counts.rename('nan_count'), describe_stats],
    axis=1,
).sort_index()

u.display_full(combined_stats)

#### preprocess data

In [5]:
# Run the orchestrator's preprocessing step on the training data it currently holds
# (requires the load step above to have been executed on this `workflow_orch`).
workflow_orch.preprocess_all_training_data()

[0m[28/Jul/25 22:02:53] INFO [workflow_orchestrator.preprocess_all_training_data:167] Preprocessing 1 date periods...[0m
[0m[28/Jul/25 22:02:53] INFO [wallet_preprocessor.preprocess_training_data:76] Starting preprocessing for SageMaker XGBoost compatibility...[0m
[0m[28/Jul/25 22:02:53] INFO [wallet_preprocessor._handle_missing_values:231] Filled NaN values in 100 columns for x_train.[0m
[0m[28/Jul/25 22:02:53] INFO [wallet_preprocessor._preprocess_y_data:268] Applied classification threshold 0.05 to y_train: 316 positive (31.6%), 684 negative (68.4%)[0m
[0m[28/Jul/25 22:02:53] INFO [wallet_preprocessor._combine_x_y_data:379] Merged y df with target var cw_crypto_net_gain/crypto_inflows/winsorized with X data.[0m
[0m[28/Jul/25 22:02:53] INFO [wallet_preprocessor._save_preprocessed_df:425] Saved preprocessed train split to ../s3_uploads/wallet_training_data_preprocessed/dda_897_td_column_ordering_v1_dev/231008/train.csv[0m
[0m[28/Jul/25 22:02:53] INFO [wallet_preprocessor.

#### upload data

In [6]:
# Upload the preprocessed data with overwrite_existing=False, then look up the
# S3 URIs for the selected date suffixes and keep them in `s3_uris`.
# NOTE(review): the captured log shows a confirmation step before the upload
# starts — presumably interactive; confirm before running unattended.
workflow_orch.upload_training_data(overwrite_existing=False)
s3_uris = workflow_orch.retrieve_training_data_uris(date_suffixes)

[92m[28/Jul/25 22:02:56] MILESTONE [workflow_orchestrator._confirm_upload:603] <DEV> Ready to upload 4000 total rows (6.8 MB) of preprocessed training data across 1 date folders.[0m
[0m[28/Jul/25 22:02:56] INFO [workflow_orchestrator._confirm_upload:623] Target: s3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/[DATE]/[0m
[0m[28/Jul/25 22:03:00] INFO [workflow_orchestrator.upload_training_data:213] Beginning approved upload...[0m
[0m[28/Jul/25 22:03:00] INFO [credentials.load:1352] Found credentials in shared credentials file: ~/.aws/credentials[0m
[0m[28/Jul/25 22:03:02] INFO [workflow_orchestrator._upload_csv_files:670] Uploaded train to s3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/231008/train.csv[0m
[0m[28/Jul/25 22:03:04] INFO [workflow_orchestrator._upload_csv_files:670] Uploaded test to s3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/231008/test.csv[0m
[0m[28/Jul/25 22:03:05] INFO [

### Train all models

In [None]:
# Load the training data for `date_suffixes` again, then train through the
# orchestrator; whatever `train_all_models()` returns is kept in `modeling_results`.
workflow_orch.load_all_training_data(date_suffixes)
modeling_results = workflow_orch.train_all_models()

### Train Single Model

#### retrieve file uris

In [7]:
# Pick up source edits and re-read both YAML configs
for _module in modules:
    importlib.reload(_module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)

# Fresh orchestrator built from the just-loaded configs
workflow_orch = wo.WalletWorkflowOrchestrator(
    sage_wallets_config,
    sage_wallets_modeling_config,
)

# Generate URIs for specific dates without any prior setup;
# the trailing expression shows the mapping as the cell output
s3_uris = workflow_orch.retrieve_training_data_uris(date_suffixes)
s3_uris

{'231008': {'train': 's3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/231008/train.csv',
  'test': 's3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/231008/test.csv',
  'eval': 's3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/231008/eval.csv',
  'val': 's3://wallet-training-data/training-data-preprocessed/dda-897-ngain-infl-dev/231008/val.csv'}}

#### train model

In [8]:
# Pick up source edits and re-read both YAML configs
for _module in modules:
    importlib.reload(_module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)

# Train against the first configured date suffix, using the URIs retrieved earlier
date_suffix = date_suffixes[0]
modeler = wm.WalletModeler(
    sage_wallets_config, sage_wallets_modeling_config, date_suffix, s3_uris
)



In [None]:
# Train through the modeler built above (the captured log shows a SageMaker
# training job being launched). Rebinds `modeling_results`, the same name the
# "Train all models" cell assigns.
modeling_results = modeler.train_model()

[0m[28/Jul/25 22:03:28] INFO [wallet_modeler.train_model:120] Starting SageMaker training sequence...[0m
[0m[28/Jul/25 22:03:33] INFO [image_uris._processor:530] Ignoring unnecessary instance type: None.[0m
[0m[28/Jul/25 22:03:33] INFO [wallet_modeler._configure_estimator:183] SageMaker XGBoost container: 246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.7-1[0m
[0m[28/Jul/25 22:03:33] INFO [wallet_modeler._configure_estimator:189] Container version tag: 1.7-1[0m
[0m[28/Jul/25 22:03:33] INFO [wallet_modeler._configure_estimator:193] Requested framework version: 1.7-1[0m
[0m[28/Jul/25 22:03:33] INFO [wallet_modeler.train_model:145] Launching training job: wallet-xgb-dda-897-ngain-infl-dev-231008-20250728-220333[0m
[0m[28/Jul/25 22:03:33] INFO [wallet_modeler.train_model:146] Model output parent directory: s3://wallet-training-data/sagemaker-models/dda-897-ngain-infl-dev/[0m
[0m[28/Jul/25 22:03:33] INFO [telemetry_logging.wrapper:92] SageMaker Python SDK will

2025-07-29 05:03:34 Starting - Starting the training job...
2025-07-29 05:04:09 Downloading - Downloading input data.

### Score validation set

#### get existing model uri

In [None]:
# Pick up source edits and re-read both YAML configs
for _module in modules:
    importlib.reload(_module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)

# Rebinds `date_suffixes` for scoring; the second entry is the one used
date_suffixes = ['231107', '240306']
date_suffix = date_suffixes[1]

# Modeler built without S3 URIs, then pointed at an existing model
modeler = wm.WalletModeler(
    sage_wallets_config, sage_wallets_modeling_config, date_suffix
)
model_metadata = modeler.load_existing_model()

#### deploy endpoint for predictions

In [None]:
# Ask the modeler for the active endpoints; being the cell's last expression,
# any returned value is rendered as the cell output.
modeler.list_active_endpoints()

In [None]:
# Deploy an endpoint through the modeler so the prediction cells below can call it.
# NOTE(review): presumably creates a billable resource until the
# "delete all endpoints" cell is run — confirm.
modeler.deploy_endpoint()

#### load and preprocess test/val dfs

In [None]:
# Directory holding the queued parquet files for the configured local_directory
_queue_dir = (
    Path("../s3_uploads/wallet_training_data_queue")
    / sage_wallets_config['training_data']['local_directory']
)

# Feature frames for the test and validation splits of the selected date suffix
x_test = pd.read_parquet(_queue_dir / f"x_test_{date_suffix}.parquet")
x_val = pd.read_parquet(_queue_dir / f"x_val_{date_suffix}.parquet")




In [None]:
# Run the test and validation feature frames through the same preprocessor
# instance (built from `sage_wallets_config`).
preprocessor = SageWalletsPreprocessor(sage_wallets_config)
x_test_processed = preprocessor.preprocess_x_df(x_test)
x_val_processed = preprocessor.preprocess_x_df(x_val)

# Log the resulting shapes of both processed frames
logger.info(f"Preprocessed X_test {x_test_processed.shape} and "
            f"X_val {x_val_processed.shape}.")

#### predict

In [None]:
# Score both preprocessed frames via the modeler's endpoint; the second argument
# labels the split ('test' / 'val').
# NOTE(review): the evaluation section reads prediction CSVs from disk —
# presumably written by these calls; confirm.
y_test_pred = modeler.predict_using_endpoint(x_test_processed, 'test')
y_val_pred = modeler.predict_using_endpoint(x_val_processed, 'val')

#### delete all endpoints

In [None]:
# Tear down endpoints through the modeler once predictions are done.
# NOTE(review): the name suggests every endpoint is removed, not only the one
# deployed above — confirm the scope before running in a shared account.
modeler.delete_all_endpoints()

In [None]:
# Check what endpoints remain after deletion. Only the last expression's value is
# rendered as cell output, so the return value of the first call is discarded
# (anything it logs or prints would still appear).
modeler.list_active_endpoints()
modeler.list_all_endpoints()

# Model Evaluation

### Analyze predictions

In [None]:

# Date suffix to evaluate: the second entry of the list below.
# Same values as the "get existing model uri" cell, repeated here so this
# section can be started without running that cell.
date_suffixes = [
    '231107',
    '240306'
]
date_suffix = date_suffixes[1]

In [None]:
# Single function call for complete evaluation
evaluator = sime.run_sagemaker_evaluation(
    sage_wallets_config, sage_wallets_modeling_config, date_suffix
)

In [None]:
def load_sagemaker_predictions(
    data_type: str,
    sage_wallets_config: dict,
    sage_wallets_modeling_config: dict,
    date_suffix: str
) -> tuple[pd.Series, pd.Series]:
    """
    Read the stored endpoint predictions for one split together with the
    matching actual target values.

    Params:
    - data_type (str): Split label used in the file names, e.g. 'test' or 'val'
    - sage_wallets_config (dict): Provides ['training_data']['local_directory']
    - sage_wallets_modeling_config (dict): Provides ['metaparams']['endpoint_preds_dir']
    - date_suffix (str): Date suffix used in the file names

    Returns:
    - tuple: (predictions_series, actuals_series); the predictions are given
        the index of the actuals

    Raises:
    - ValueError: if the predictions CSV has no 'score' column, the actuals file
        has more than one column, or the two series differ in length
    """
    # --- predictions: CSV inside the configured endpoint predictions directory ---
    preds_dir = Path(sage_wallets_modeling_config['metaparams']['endpoint_preds_dir'])
    local_dir = sage_wallets_config['training_data']['local_directory']
    pred_df = pd.read_csv(
        preds_dir / f"endpoint_y_pred_{data_type}_{local_dir}_{date_suffix}.csv"
    )

    has_score = 'score' in pred_df.columns
    if not has_score:
        raise ValueError(f"SageMaker predictions are missing the 'score' column. "
                        f"Available columns: {pred_df.columns}")
    pred_series = pred_df['score']

    # --- actuals: parquet in the local training data queue ---
    queue_dir = Path("../s3_uploads") / "wallet_training_data_queue" / f"{local_dir}"
    actuals_df = pd.read_parquet(queue_dir / f"y_{data_type}_{date_suffix}.parquet")

    n_target_cols = len(actuals_df.columns)
    if n_target_cols > 1:
        raise ValueError(f"Found unexpected columns in y_{data_type}_df. "
                        f"Expected 1 column, found {actuals_df.columns}.")
    actuals_series = actuals_df.iloc[:, 0]

    # --- alignment: positional, so both sides must have the same number of rows ---
    n_pred, n_true = len(pred_series), len(actuals_series)
    if n_pred != n_true:
        raise ValueError(f"Length of y_{data_type}_pred ({len(pred_series)}) does "
                        f"not match length of y_{data_type}_true ({len(actuals_series)}).")

    # Rows are matched by position; the predictions take over the actuals' index
    pred_series.index = actuals_series.index

    return pred_series, actuals_series

In [None]:
# Predictions and actuals for the two scored splits
y_test_pred_series, y_test_true_series = load_sagemaker_predictions(
    'test', sage_wallets_config, sage_wallets_modeling_config, date_suffix
)
y_val_pred_series, y_val_true_series = load_sagemaker_predictions(
    'val', sage_wallets_config, sage_wallets_modeling_config, date_suffix
)

# Remaining frames come straight from the local training data queue
training_data_path = (
    Path("../s3_uploads")
    / "wallet_training_data_queue"
    / f"{sage_wallets_config['training_data']['local_directory']}"
)
X_train, y_train, X_test, X_val = (
    pd.read_parquet(training_data_path / f"{_frame_name}_{date_suffix}.parquet")
    for _frame_name in ("x_train", "y_train", "x_test", "x_val")
)

# Target variable: the validation series name, falling back to y_train's first column
target_variable = y_val_true_series.name or y_train.columns[0]

# Model type is derived from the configured objective.
# NOTE(review): every objective not starting with 'reg' ends up as 'unknown',
# while the evaluation below always uses a regressor evaluator — confirm this
# is intended for non-regression objectives.
objective = sage_wallets_modeling_config['training']['hyperparameters']['objective']
if objective.startswith('reg'):
    model_type = 'regression'
else:
    model_type = 'unknown'

In [None]:
# Identifier and modeling_config describing this SageMaker-trained model
_local_directory = sage_wallets_config['training_data']['local_directory']
model_id = f"sagemaker_{_local_directory}_{date_suffix}"

_sagemaker_metadata = {
    'objective': objective,
    'local_directory': _local_directory,
    'date_suffix': date_suffix
}
modeling_config = {
    'target_variable': target_variable,
    'model_type': model_type,
    'returns_winsorization': 0.005,  # Default for winsorizing returns
    'training_data': {
        'modeling_period_duration': 30  # Default performance window
    },
    'sagemaker_metadata': _sagemaker_metadata
}


# Stand-ins for the pipeline object expected in the results dict.
# Only the attributes defined here are provided.
class MockTransformer:
    # Pass-through: hands back whatever it is given
    def transform(self, X):
        return X


class MockModel:
    # Reports only the configured objective (read when called)
    def get_params(self):
        return {'objective': objective}


class MockPipeline:
    named_steps = {'estimator': MockModel()}

    # Any index/slice yields a fresh pass-through transformer
    def __getitem__(self, key):
        return MockTransformer()


# Minimal wallet_model_results for SageMaker evaluation
wallet_model_results = dict(
    model_id=model_id,
    modeling_config=modeling_config,
    model_type=model_type,

    # Training data
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test_true_series,
    y_pred=y_test_pred_series,
    training_cohort_pred=None,
    training_cohort_actuals=None,

    # Validation data
    X_validation=X_val,
    y_validation=y_val_true_series,
    y_validation_pred=y_val_pred_series,
    validation_target_vars_df=None,

    # Fixed mock pipeline
    pipeline=MockPipeline(),
)

# Build the evaluator, then produce the summary report and the evaluation plot
wallet_evaluator = wime.RegressorEvaluator(wallet_model_results)
wallet_evaluator.summary_report()
wallet_evaluator.plot_wallet_evaluation()

In [None]:
# Display the target variable name resolved in the evaluation-inputs cell above
target_variable