### start

In [2]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import copy
import logging
import re
from itertools import chain,combinations
import pdb
import math
from pathlib import Path
import pickle
import cloudpickle
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import warnings
import boto3
import matplotlib as plt

# load_dotenv(Path("../../../Local/.env"))

# Float display for DataFrame output: general ('g') format with up to
# 12 significant digits.
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Environment switches, set in one place.
# NOTE(review): LOGGING_FILE and NOTIFICATION_SOUNDS_DIR are presumably read by
# the project's utils module — confirm. Both are relative paths, so they
# resolve against the notebook's working directory.
os.environ.update({
    'PYDEVD_DISABLE_FILE_VALIDATION': '1',
    # silence pygame donation request
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'NOTIFICATION_SOUNDS_DIR': "../../../Local",
})

# Suppress warnings whose message matches these patterns
warnings.filterwarnings(action='ignore', message='.*frozen modules.*')
warnings.filterwarnings(action='ignore', message="MallocStackLogging")

# Dark mode charts: dark-gray canvas with muted green text, ticks and labels.
# NOTE(review): `plt` is bound by `import matplotlib as plt`, i.e. it is the
# top-level matplotlib package, not `matplotlib.pyplot`. `rcParams` is
# available on it, but pyplot functions such as `plt.plot` are not.
plt.rcParams.update({
    'figure.facecolor': '#181818',  # Custom background color (dark gray in this case)
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
    'text.usetex': False,
    'mathtext.default': 'regular',
})

# Project helper module used below for audio, logging, export and notifications.
# NOTE(review): this import runs before the `sys.path.append(...)` further down
# in this cell, so on a fresh kernel it only works if `utils` is already
# importable by other means — confirm.
import utils as u
# Set the custom error handler
# (the registration call is commented out, so `ipython` is currently just a
# handle to the running shell and is not used further in this cell)
ipython = IPython.get_ipython()
# ipython.set_custom_exc((Exception,), u.notify_on_failure)

# presumably stops any audio left playing by an earlier run — confirm against
# utils.AmbientPlayer
player = u.AmbientPlayer()
player.stop_all_players()

# configure logger
# The helper is handed the notebook log file path; the level is then set to INFO.
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)

# import local modules
# pyright: reportMissingImports=false
# Make the project's src directory importable. The membership check keeps
# re-runs of this cell from appending a duplicate entry each time, and the
# path is written with a single separator ('../src' rather than '..//src';
# both resolve to the same directory).
if '../src' not in sys.path:
    sys.path.append('../src')

import sagemaker_wallets.wallet_modeler as wm
import sagemaker_wallets.wallet_preprocessor as wp
import sagemaker_wallets.workflow_orchestrator as wo
from utils import ConfigError
# `u` is already bound earlier in this cell; the repeat is harmless and kept.
import utils as u

# Modules re-imported by the `importlib.reload` calls in this notebook.
modules = [
    wm, wp, wo,
    u
]

# Override the highlight style on IPython's VerboseTB class.
# NOTE(review): `_tb_highlight` is a private attribute — presumably the style
# of the highlighted traceback line; may break across IPython versions.
from IPython.core import ultratb
ultratb.VerboseTB._tb_highlight = "bg:#b45827"


# Read both YAML config files as UTF-8 text and parse them with the safe loader.
sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)


# Export the listed code directories via the project helper (the run log
# reports temp/consolidated_code.py as the destination).
u.export_code(
    code_directories=[
        # 'etls',
        'sagemaker_wallets',
    ],
    # include_config = True,
    # ipynb_notebook = 'DDA-769 coin model score dist toggle.ipynb'
)

# Re-import every tracked module so on-disk edits take effect.
for module in modules:
    importlib.reload(module)
u.notify('retro')

logger.milestone("Good morning, let's get to work")

[0m[15/Jul/25 15:46:15] INFO [utils.export_code:1667] Consolidation complete. All files are saved in temp/consolidated_code.py[0m
[92m[15/Jul/25 15:46:15] MILESTONE [2965290251.<module>:113] Good morning, let's get to work[0m


# Initial ETLs

# Code begins

### Initiate orchestrator

In [80]:
# Pick up on-disk edits to the project modules before building anything.
for module in modules:
    importlib.reload(module)

# Re-read both YAML configs so file changes are reflected.
sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)


# Orchestrator used by the load / upload / URI cells that follow.
workflow_orch = wo.WalletWorkflowOrchestrator(sage_wallets_config)

### Load, preprocess, and upload data

#### load data

In [81]:
# Refresh project modules and configs, then load the training periods.
for module in modules:
    importlib.reload(module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)

# Date suffixes identifying the training periods to load.
date_suffixes = ['231107', '240306']
workflow_orch.load_training_data(date_suffixes)

[92m[12/Jul/25 14:33:33] MILESTONE [workflow_orchestrator.load_training_data:91] <PROD> Loading training data for 2 periods: ['231107', '240306'][0m
[0m[12/Jul/25 14:33:34] INFO [workflow_orchestrator.load_training_data:109] Training data loaded successfully: 8 splits, 398,964 total rows[0m


##### inspect features

In [39]:
# Per-feature summary of x_train: NaN count next to the describe() statistics,
# one row per feature column.
nan_counts = workflow_orch.training_data['x_train'].isna().sum()
describe_stats = workflow_orch.training_data['x_train'].describe().T

# Combine into a single DataFrame and sort by feature name once, here.
combined_stats = pd.concat(
    [nan_counts.rename('nan_count'), describe_stats],
    axis=1,
).sort_index()

# Already sorted above, so no second sort_index() is needed before display.
u.display_full(combined_stats)

Unnamed: 0,nan_count,count,mean,std,min,25%,50%,75%,max
cluster|k5_cluster_k0,0,87727,0.219521925975,0.413925117657,0.0,0.0,0.0,0.0,1.0
cluster|k5_cluster_k1,0,87727,0.2671241465,0.442460245051,0.0,0.0,0.0,1.0,1.0
cluster|k5_cluster_k2,0,87727,0.125377591847,0.331148458122,0.0,0.0,0.0,0.0,1.0
cluster|k5_cluster_k3,0,87727,0.207507380852,0.405524280788,0.0,0.0,0.0,0.0,1.0
cluster|k5_cluster_k4,0,87727,0.180468954826,0.384579766889,0.0,0.0,0.0,0.0,1.0
cw_cluster|k5_cluster_k0,0,87727,0.413236517834,0.492417365792,0.0,0.0,0.0,1.0,1.0
cw_cluster|k5_cluster_k1,0,87727,0.135705085094,0.342477082399,0.0,0.0,0.0,0.0,1.0
cw_cluster|k5_cluster_k2,0,87727,0.0938023641524,0.291555225639,0.0,0.0,0.0,0.0,1.0
cw_cluster|k5_cluster_k3,0,87727,0.14589579035,0.353003723997,0.0,0.0,0.0,0.0,1.0
cw_cluster|k5_cluster_k4,0,87727,0.211360242571,0.408275630572,0.0,0.0,0.0,0.0,1.0


#### preprocess data

In [82]:
# Build the preprocessor from the wallets config and run it over the loaded
# training data. Per the run log, each split has NaNs filled and the target
# column merged with its X data.
preprocessor = wp.SageWalletsPreprocessor(sage_wallets_config)
preprocessed_data = preprocessor.preprocess_training_data(workflow_orch.training_data)

[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor.preprocess_training_data:42] Starting preprocessing for SageMaker XGBoost compatibility...[0m
[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor._handle_missing_values:165] Filled NaN values in 100 columns for x_train.[0m
[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor._combine_x_y_data:259] Merged y df with target var cw_crypto_net_gain/crypto_inflows/winsorized with X data.[0m
[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor.preprocess_training_data:77] Preprocessed train: 87,727 rows × 222 cols.[0m
[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor._handle_missing_values:165] Filled NaN values in 100 columns for x_test.[0m
[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor._combine_x_y_data:259] Merged y df with target var cw_crypto_net_gain/crypto_inflows/winsorized with X data.[0m
[0m[12/Jul/25 14:33:34] INFO [wallet_preprocessor.preprocess_training_data:77] Preprocessed test: 16,450 rows × 222 cols.[0m
[0m[12/Jul/25

#### upload data

In [83]:
# Upload and retrieve URIs
# NOTE(review): overwrite_existing=True — presumably replaces files already
# present at the S3 target; confirm before re-running against shared data.
workflow_orch.upload_training_data(preprocessed_data, overwrite_existing=True)
# URIs are looked up for whatever `date_suffixes` currently holds in the kernel.
s3_uris = workflow_orch.retrieve_training_data_uris(date_suffixes)

[0m[12/Jul/25 14:33:34] INFO [workflow_orchestrator.upload_training_data:152] <PROD> Ready to upload 8 preprocessed training data files across 2 date folders.[0m
[0m[12/Jul/25 14:33:34] INFO [workflow_orchestrator.upload_training_data:154] Target variable: cw_crypto_net_gain_crypto_inflows_winsorized[0m
[0m[12/Jul/25 14:33:34] INFO [workflow_orchestrator.upload_training_data:155] Target: s3://wallet-training-data/training-data-preprocessed/dda-897-td-col-ordering/[DATE]/[0m
[0m[12/Jul/25 14:33:37] INFO [workflow_orchestrator.upload_training_data:187] Uploading train_cw_crypto_net_gain_crypto_inflows_winsorized for 231107: 87,727 rows[0m
[0m[12/Jul/25 14:34:00] INFO [workflow_orchestrator.upload_training_data:196] Uploaded train to s3://wallet-training-data/training-data-preprocessed/dda-897-td-col-ordering/231107/train_cw_crypto_net_gain_crypto_inflows_winsorized.csv[0m
[0m[12/Jul/25 14:34:00] INFO [workflow_orchestrator.upload_training_data:187] Uploading test_cw_crypto_net

### Retrieve URIs

In [84]:
# NOTE(review): this rebinds `date_suffixes` (and `s3_uris`) to a single
# period. Any earlier cell that reads `date_suffixes` will see this narrower
# list if it is re-run after this cell.
date_suffixes = [
    '231107',
    # '240306'
]

# Generate URIs for specific dates without any prior setup
s3_uris = workflow_orch.retrieve_training_data_uris(date_suffixes)
# Last expression: displays the nested {date: {split: uri}} dict shown below.
s3_uris

{'231107': {'train': 's3://wallet-training-data/training-data-preprocessed/dda-897-td-col-ordering/231107/train_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'test': 's3://wallet-training-data/training-data-preprocessed/dda-897-td-col-ordering/231107/test_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'eval': 's3://wallet-training-data/training-data-preprocessed/dda-897-td-col-ordering/231107/eval_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'val': 's3://wallet-training-data/training-data-preprocessed/dda-897-td-col-ordering/231107/val_cw_crypto_net_gain_crypto_inflows_winsorized.csv'}}

### DDA-872 devspace

In [85]:
# Refresh project modules and configs before constructing the modeler.
for module in modules:
    importlib.reload(module)

sage_wallets_config = yaml.safe_load(
    Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8')
)
sage_wallets_modeling_config = yaml.safe_load(
    Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8')
)


# Modeler is built from the wallets config and the S3 URIs retrieved earlier.
modeler = wm.WalletModeler(sage_wallets_config, s3_uris)




In [86]:
# Launch the SageMaker XGBoost training job. The cell waits for completion
# (about five minutes in the logged run) and displays the returned dict with
# `model_uri` and `training_job_name`.
modeler.train_model()

[0m[12/Jul/25 14:35:15] INFO [wallet_modeler.train_model:55] Starting SageMaker XGBoost training[0m
[0m[12/Jul/25 14:35:15] INFO [image_uris._processor:530] Ignoring unnecessary instance type: None.[0m
[0m[12/Jul/25 14:35:15] INFO [wallet_modeler.train_model:100] Launching training job: wallet-xgb-231107-143515[0m
[0m[12/Jul/25 14:35:15] INFO [telemetry_logging.wrapper:91] SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.[0m
[0m[12/Jul/25 14:35:15] INFO [session.submit:1053] Creating training-job with name: wallet-xgb-231107-143515[0m


2025-07-12 21:35:19 Starting - Starting the training job...
2025-07-12 21:35:34 Starting - Preparing the instances for training...
2025-07-12 21:35:55 Downloading - Downloading input data...
2025-07-12 21:36:45 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index
[2025-07-12 21:37:46.562 ip-10-0-91-152.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2025-07-12 21:37:46.583 ip-10-0-91-152.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2025-07-12:21:37:46:INFO] Imported framework sagemaker_xgboost_container.training
[2025-07-12:21:37:46:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.
Returning the value itself
[2025-07-12:21:37:46:INFO] No GPUs detected (normal if no gpus installed)
[2025-07-12:21:37:46:INFO] Running XGBoost Sagemaker in algorithm mode
[2025-07-12:21:37:46:INFO] Determined 0 GPU(s) available on the instance.
[2025-07-

[0m[12/Jul/25 14:40:42] INFO [wallet_modeler.train_model:115] Training completed. Model stored at: s3://wallet-training-data/sagemaker-models/wallet-xgb-231107-143515/output/model.tar.gz[0m


{'model_uri': 's3://wallet-training-data/sagemaker-models/wallet-xgb-231107-143515/output/model.tar.gz',
 'training_job_name': 'wallet-xgb-231107-143515'}

In [79]:
# NOTE(review): duplicate of the preceding training cell. Its saved output
# comes from an earlier execution (In [79], job wallet-xgb-231107-143009),
# and re-running it launches another training job. Consider removing it.
modeler.train_model()

[0m[12/Jul/25 14:30:09] INFO [wallet_modeler.train_model:55] Starting SageMaker XGBoost training[0m
[0m[12/Jul/25 14:30:09] INFO [image_uris._processor:530] Ignoring unnecessary instance type: None.[0m
[0m[12/Jul/25 14:30:09] INFO [wallet_modeler.train_model:100] Launching training job: wallet-xgb-231107-143009[0m
[0m[12/Jul/25 14:30:09] INFO [telemetry_logging.wrapper:91] SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.[0m
[0m[12/Jul/25 14:30:09] INFO [session.submit:1053] Creating training-job with name: wallet-xgb-231107-143009[0m


2025-07-12 21:30:11 Starting - Starting the training job...
2025-07-12 21:30:26 Starting - Preparing the instances for training...
2025-07-12 21:30:48 Downloading - Downloading input data...
2025-07-12 21:31:33 Downloading - Downloading the training image......
2025-07-12 21:32:44 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index
[2025-07-12 21:32:34.259 ip-10-0-100-103.us-west-2.compute.internal:8 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2025-07-12 21:32:34.283 ip-10-0-100-103.us-west-2.compute.internal:8 INFO profiler_config_parser.py:111] User has disabled profiler.
[2025-07-12:21:32:34:INFO] Imported framework sagemaker_xgboost_container.training
[2025-07-12:21:32:34:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.
Returning the value itself
[2025-07-12:21:32:34:INFO] No GPUs detected (normal if no gpus installed)
[2025-07-12:21:32:34:INFO] Running XGBoost Sagemaker in algorith

[0m[12/Jul/25 14:33:33] INFO [wallet_modeler.train_model:115] Training completed. Model stored at: s3://wallet-training-data/sagemaker-models/wallet-xgb-231107-143009/output/model.tar.gz[0m


{'model_uri': 's3://wallet-training-data/sagemaker-models/wallet-xgb-231107-143009/output/model.tar.gz',
 'training_job_name': 'wallet-xgb-231107-143009'}

In [None]:
# Quick check of the training feature frame's dimensions (rows, columns).
workflow_orch.training_data['x_train'].shape