### start

In [20]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import copy
import logging
import re
from itertools import chain,combinations
import pdb
import math
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import warnings
import boto3
import matplotlib as plt
import sklearn

# load_dotenv(Path("../../../Local/.env"))

# Render floats with up to 12 significant digits when pandas displays them
pd.set_option('display.float_format', lambda value: f'{value:.12g}')
# pd.reset_option('display.float_format')

# Suppress warnings
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'
warnings.filterwarnings('ignore', message='.*frozen modules.*')
warnings.filterwarnings('ignore', message='MallocStackLogging')

# silence pygame donation request
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
# Paths exported through the environment (presumably read by the project utilities — confirm)
os.environ['LOGGING_FILE'] = "../../../Local/logs/wallet_modeling.log"
os.environ['NOTIFICATION_SOUNDS_DIR'] = "../../../Local"

# Dark mode charts
# NOTE: `plt` is bound to the top-level `matplotlib` package by the import cell
# (`import matplotlib as plt`), not to `matplotlib.pyplot`; `rcParams` exists on both.
plt.rcParams.update({
    'figure.facecolor': '#181818',  # Custom background color (dark gray in this case)
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
    'text.usetex': False,
    'mathtext.default': 'regular',
})

# import local modules
# pyright: reportMissingImports=false
# Guarded so that re-running this cell does not append a duplicate entry to
# sys.path on every execution.
if '..//src' not in sys.path:
    sys.path.append('..//src')

import sagemaker_wallets.wallet_modeler as wm
import sagemaker_wallets.wallet_preprocessor as wp
import sagemaker_wallets.workflow_orchestrator as wo
import sage_utils.config_validation as ucv
import utils as u
from utils import ConfigError

# Modules refreshed by the reload step at the end of this cell and in later cells
modules = [
    wm, wp, wo,
    u, ucv
]

# import utils as u
# Set the custom error handler
ipython = IPython.get_ipython()
# ipython.set_custom_exc((Exception,), u.notify_on_failure)

# Stop any ambient players before continuing
player = u.AmbientPlayer()
player.stop_all_players()

# configure logger
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)

# Traceback highlight colour; `_tb_highlight` is a private IPython attribute
from IPython.core import ultratb
ultratb.VerboseTB._tb_highlight = "bg:#b45827"


# load all configs
sage_wallets_config = yaml.safe_load(Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8'))
sage_wallets_modeling_config = yaml.safe_load(Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8'))


# Consolidate the listed code directories into a single file
u.export_code(
    code_directories=[
        'sagemaker_wallets',
        # 'sage_utils',
    ],
    # include_config = True,
    # ipynb_notebook = 'DDA-769 coin model score dist toggle.ipynb'
    output_file="temp/sagemaker_code.py"
)

# Reload for the side effect only; an explicit loop avoids building a throwaway list
for module in modules:
    importlib.reload(module)
u.notify('retro')

logger.milestone("Good morning, let's get to work")

[0m[16/Jul/25 17:15:34] INFO [utils.export_code:1659] Consolidation complete. All files are saved in temp/sagemaker_code.py[0m
[92m[16/Jul/25 17:15:34] MILESTONE [1720352528.<module>:114] Good morning, let's get to work[0m


# Code begins

### Initiate orchestrator

In [None]:
# Pick up edits to the local modules; an explicit loop is used because the
# reload is wanted only for its side effect, not for a result list.
for module in modules:
    importlib.reload(module)

# Re-read both YAML configs so changes on disk are reflected in this run
sage_wallets_config = yaml.safe_load(Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8'))
sage_wallets_modeling_config = yaml.safe_load(Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8'))


# Date suffixes passed to the orchestrator's load/retrieve calls in later cells
date_suffixes = ['231107', '240306']
workflow_orch = wo.WalletWorkflowOrchestrator(sage_wallets_config)

### Load, preprocess, and upload data

#### load data

In [None]:
# Pick up edits to the local modules; an explicit loop is used because the
# reload is wanted only for its side effect, not for a result list.
# NOTE(review): `workflow_orch` was built before this reload, so it presumably
# keeps using the class definition it was created from — confirm.
for module in modules:
    importlib.reload(module)

# Re-read both YAML configs so changes on disk are reflected in this run
sage_wallets_config = yaml.safe_load(Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8'))
sage_wallets_modeling_config = yaml.safe_load(Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8'))

# Load training data for every date suffix onto the orchestrator
workflow_orch.load_training_data(date_suffixes)

##### inspect features

In [None]:
# # Create combined NaN count and describe statistics
# nan_counts = workflow_orch.training_data['x_train'].isna().sum()
# describe_stats = workflow_orch.training_data['x_train'].describe().T

# # Combine into single DataFrame
# combined_stats = pd.concat([
#     nan_counts.rename('nan_count'),
#     describe_stats
# ], axis=1).sort_index()

# u.display_full(combined_stats.sort_index())

#### preprocess data

In [None]:
# Build the preprocessor from the wallets config, then run it over the training
# data held on the orchestrator (presumably populated by the load-data cell — confirm).
preprocessor = wp.SageWalletsPreprocessor(sage_wallets_config)
preprocessed_data = preprocessor.preprocess_training_data(workflow_orch.training_data)

#### upload data

In [None]:
# Upload and retrieve URIs
# NOTE(review): overwrite_existing=True is passed, so this presumably replaces
# files already uploaded — confirm before re-running.
workflow_orch.upload_training_data(preprocessed_data, overwrite_existing=True)
# Keep the URIs for the requested date suffixes; the modeler cell passes `s3_uris` on.
s3_uris = workflow_orch.retrieve_training_data_uris(date_suffixes)

### Train Model

#### retrieve file uris

In [4]:
# Re-declared here so the training section can be run without the
# load/preprocess/upload cells above.
date_suffixes = ['231107', '240306']
workflow_orch = wo.WalletWorkflowOrchestrator(sage_wallets_config)

# Generate URIs for specific dates without any prior setup
s3_uris = workflow_orch.retrieve_training_data_uris(date_suffixes)
s3_uris

{'231107': {'train': 's3://wallet-training-data/training-data-preprocessed/dda-897/231107/train_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'test': 's3://wallet-training-data/training-data-preprocessed/dda-897/231107/test_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'eval': 's3://wallet-training-data/training-data-preprocessed/dda-897/231107/eval_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'val': 's3://wallet-training-data/training-data-preprocessed/dda-897/231107/val_cw_crypto_net_gain_crypto_inflows_winsorized.csv'},
 '240306': {'train': 's3://wallet-training-data/training-data-preprocessed/dda-897/240306/train_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'test': 's3://wallet-training-data/training-data-preprocessed/dda-897/240306/test_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'eval': 's3://wallet-training-data/training-data-preprocessed/dda-897/240306/eval_cw_crypto_net_gain_crypto_inflows_winsorized.csv',
  'val': 's3://wallet-training-d

#### train model

In [None]:
# Pick up edits to the local modules; an explicit loop is used because the
# reload is wanted only for its side effect, not for a result list.
for module in modules:
    importlib.reload(module)

# Re-read both YAML configs so changes on disk are reflected in this run
sage_wallets_config = yaml.safe_load(Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8'))
sage_wallets_modeling_config = yaml.safe_load(Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8'))

# Train against the first configured date suffix
date_suffix = date_suffixes[0]

# The modeler receives both configs, the chosen date suffix and the S3 URIs
# retrieved in the previous cell.
modeler = wm.WalletModeler(
    sage_wallets_config,
    sage_wallets_modeling_config,
    date_suffix,
    s3_uris
)



In [None]:
# Run training through the modeler built in the previous cell and keep whatever it returns.
modeling_results = modeler.train_model()

### Score validation set

#### get existing model uri

In [9]:
# Pick up edits to the local modules; an explicit loop is used because the
# reload is wanted only for its side effect, not for a result list.
for module in modules:
    importlib.reload(module)

# Re-read both YAML configs so changes on disk are reflected in this run
sage_wallets_config = yaml.safe_load(Path('../config/sage_wallets_config.yaml').read_text(encoding='utf-8'))
sage_wallets_modeling_config = yaml.safe_load(Path('../config/sage_wallets_modeling_config.yaml').read_text(encoding='utf-8'))


# Re-declared here so the scoring section can be run on its own
date_suffixes = ['231107', '240306']
date_suffix = date_suffixes[0]

# Unlike the training cell, no S3 URIs are passed to the modeler here
modeler = wm.WalletModeler(
    sage_wallets_config,
    sage_wallets_modeling_config,
    date_suffix
)


# Per the logged output of this cell, this resolves the most recent model in S3
model_metadata = modeler.load_existing_model()

[0m[16/Jul/25 15:27:58] INFO [credentials.load:1352] Found credentials in shared credentials file: ~/.aws/credentials[0m
[0m[16/Jul/25 15:27:58] INFO [wallet_modeler.load_existing_model:285] Loaded most recent model (timestamp: 20250716-145925): s3://wallet-training-data/sagemaker-models/dda-897/wallet-xgb-dda-897-231107-20250716-145925/output/model.tar.gz[0m


In [None]:
# model_path = modeler.download_existing_model()

[0m[16/Jul/25 15:27:59] INFO [wallet_modeler.download_existing_model:424] Using existing model archive: ../models/dda_897_td_column_ordering_v1/model.tar.gz[0m
[0m[16/Jul/25 15:27:59] INFO [wallet_modeler.download_existing_model:438] Model ready at: ../models/dda_897_td_column_ordering_v1/xgboost-model[0m


In [13]:
# Hard-coded path to the extracted model file; it matches the "Model ready at"
# location reported in the download log above.
model_path = '../models/dda_897_td_column_ordering_v1/xgboost-model'

In [14]:
# Location of the validation features for the current date suffix
validation_path = (
    Path("../s3_uploads/wallet_training_data_queue")
    / sage_wallets_config['training_data']['local_directory']
    / f"x_val_{date_suffix}.parquet"
)
validation_df = pd.read_parquet(validation_path)


In [15]:
# NOTE(review): sklearn is imported immediately before predicting; presumably so
# that XGBoost's sklearn integration is available when the model is loaded — confirm.
import sklearn
# Generate predictions for the validation frame with the modeler's local-model method
predictions = modeler.predict_with_local_model(validation_df)

[0m[16/Jul/25 15:28:11] INFO [wallet_modeler.download_existing_model:424] Using existing model archive: ../models/dda_897_td_column_ordering_v1/model.tar.gz[0m
[0m[16/Jul/25 15:28:11] INFO [wallet_modeler.download_existing_model:438] Model ready at: ../models/dda_897_td_column_ordering_v1/xgboost-model[0m
[0m[16/Jul/25 15:28:11] INFO [wallet_modeler.predict_with_local_model:477] Falling back to pickle.load()[0m


In [19]:
# Display the model path currently held in the kernel
model_path

'../models/dda_897_td_column_ordering_v1/xgboost-model'

In [17]:
import re
import pathlib
import binascii

# Read the whole serialized model into memory
model_file = pathlib.Path(model_path)
blob = model_file.read_bytes()

# Search the raw bytes for "version" followed by an x.y.z number
match = re.search(rb"version.?['\"]?[:]?\s*['\"]?(\d+\.\d+\.\d+)", blob)
if match:
    print("Found:", match.group(1).decode())
else:
    print("Found:", "none")

Found: none


In [7]:
from pathlib import Path

model_file = Path(model_path)   # same variable you've been using
with model_file.open("rb") as f:
    raw = f.read(32)            # first 32 bytes

# Show the leading 20 bytes, first as a bytes repr and then decoded leniently
header = raw[:20]
print(header)
print(header.decode(errors="replace"))

b'\x80\x04\x95\x05\x06\x00\x00\x00\x00\x00\x00\x8c\x0cxgboost'
��      �xgboost


In [6]:
# Confirm scikit-learn can be imported in this kernel
try:
    import sklearn
except ImportError as e:
    print(f"sklearn import failed: {e}")
else:
    print(f"sklearn version: {sklearn.__version__}")
    print("sklearn import successful")

# Report the XGBoost version the kernel is using
import xgboost as xgb
print(f"XGBoost version: {xgb.__version__}")

# Ask XGBoost whether it detected scikit-learn
try:
    from xgboost.sklearn import SKLEARN_INSTALLED
except ImportError:
    print("Can't access XGBoost's sklearn detection")
else:
    print(f"XGBoost thinks sklearn is installed: {SKLEARN_INSTALLED}")

sklearn version: 1.3.2
sklearn import successful
XGBoost version: 1.6.2
XGBoost thinks sklearn is installed: True


In [7]:
import tarfile
from pathlib import Path

# Peek at the first 100 bytes of the extracted model file
with open(model_path, 'rb') as f:
    first_bytes = f.read(100)
print(f"First 100 bytes: {first_bytes}")

# The archive sits next to the extracted model; list everything it contains
models_dir = Path(model_path).parent
tar_path = models_dir / 'model.tar.gz'

with tarfile.open(tar_path, 'r:gz') as tar:
    print(f"All files in tar: {tar.getnames()}")
    for member in tar.getmembers():
        print(f"  {member.name}: {member.size} bytes")

First 100 bytes: b'\x80\x04\x95\x05\x06\x00\x00\x00\x00\x00\x00\x8c\x0cxgboost.core\x94\x8c\x07Booster\x94\x93\x94)\x81\x94}\x94(\x8c\rfeature_names\x94]\x94(\x8c\x02f0\x94\x8c\x02f1\x94\x8c\x02f2\x94\x8c\x02f3\x94\x8c\x02f4\x94\x8c\x02f5\x94\x8c\x02f6\x94\x8c\x02'
All files in tar: ['xgboost-model']
  xgboost-model: 365007 bytes


In [None]:
import tarfile
from pathlib import Path

# The archive sits in the same directory as the extracted model
models_dir = Path(model_path).parent
tar_path = models_dir / 'model.tar.gz'

# Report the archive size on disk, then the names it holds
archive_size = tar_path.stat().st_size
print(f"Tar file size: {archive_size}")
with tarfile.open(tar_path, 'r:gz') as tar:
    archive_names = tar.getnames()
    print(f"Tar contents: {archive_names}")

In [None]:
# Check if the file exists and has reasonable size
from pathlib import Path
model_file = Path(model_path)
print(f"Model file exists: {model_file.exists()}")
print(f"Model file size: {model_file.stat().st_size if model_file.exists() else 'N/A'} bytes")

# Check XGBoost versions
import xgboost as xgb
print(f"Local XGBoost version: {xgb.__version__}")

In [10]:
# Confirm scikit-learn can be imported in this kernel
try:
    import sklearn
except ImportError as e:
    print(f"sklearn import failed: {e}")
else:
    print(f"sklearn version: {sklearn.__version__}")
    print("sklearn import successful")

# Report the XGBoost version the kernel is using
import xgboost as xgb
print(f"XGBoost version: {xgb.__version__}")

# Ask XGBoost whether it detected scikit-learn
try:
    from xgboost.sklearn import SKLEARN_INSTALLED
except ImportError:
    print("Can't access XGBoost's sklearn detection")
else:
    print(f"XGBoost thinks sklearn is installed: {SKLEARN_INSTALLED}")

sklearn version: 1.3.2
sklearn import successful
XGBoost version: 1.7.4
XGBoost thinks sklearn is installed: True


### DDA 884 devspace