### start

In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import copy
import logging
import re
from itertools import chain,combinations
import pdb
from pathlib import Path
import pickle
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import pandas_gbq
from google.cloud import bigquery
import scipy
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    log_loss,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Render floats with up to 12 significant digits
pd.set_option('display.float_format', lambda value: f'{value:.12g}')
# pd.reset_option('display.float_format')

# Ignore warnings whose message begins with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables set before the local modules are imported:
# the pygame support prompt toggle, the log file path and the notification sounds folder
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'NOTIFICATION_SOUNDS_DIR': "../../../Local",
})

# Dark mode charts: dark gray background for figure and axes, one shared foreground color
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import insights.modeling as m
import insights.analysis as ia
import insights.experiments as exp
import feature_engineering.coin_flow_features_orchestrator as cffo

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.trading_features as wtf
import wallet_features.performance_features as wpf
import wallet_features.transfers_features as wts
import wallet_features.scenario_features as wsc
import wallet_features.balance_features as wbf
import wallet_features.macroeconomic_features as wmac
import wallet_features.wallet_features_orchestrator as wfo

# Base modeling
import base_modeling.base_model as bm
import base_modeling.feature_selection as fs
import base_modeling.pipeline as bp
# NOTE(review): `bsc` is bound to the same module as `bp` (base_modeling.pipeline), and both
# aliases appear in the `modules` reload list, so that module is reloaded twice per pass.
# Possibly a different base_modeling module was intended here — confirm.
import base_modeling.pipeline as bsc

# Wallet modeling
import wallet_modeling.wallet_training_data_orchestrator as wtdo
import wallet_modeling.wallet_epochs_orchestrator as weo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.wallet_model as wm
import wallet_modeling.wallet_model_orchestrator as wmo
import wallet_modeling.wallets_config_manager as wcm
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet insights
import wallet_insights.wallet_model_reporting as wimr
import wallet_insights.model_evaluation as wime
import wallet_insights.wallet_validation_analysis as wiva
import wallet_insights.wallet_cluster_analysis as wica

# Coin features
import coin_wallet_features.coin_features_orchestrator as cfo
import coin_wallet_features.wallet_metrics as cwwm
import coin_wallet_features.wallet_metrics_flattening as cwwmf
import coin_wallet_features.wallet_segmentation as cws

# Coin modeling
import coin_modeling.coin_model as cm
import coin_modeling.coin_epochs_orchestrator as ceo
from coin_modeling.coin_config_manager import WalletsCoinConfig

# Coin insights
import coin_insights.coin_validation_analysis as civa
import coin_insights.coin_model_reporting as cimr


# reload all modules
modules = [
    u, dr, pri, cwm, ind, fg, flt, ds, tv, prp, m, ia, exp, cffo,
    wtdo, weo, wtd, wm, wmo, wcm,
    wcl, wmc, wmt, wtf, wpf, wts, wsc, wbf, wmac, wfo,
    bm, fs, bp, bsc,
    wimr, wime, wiva, wica,
    cfo, cwwm, cwwmf, cws,
    cm, ceo,
    civa, cimr,
]

# Load the four coin flow configs from the shared config directory
(
    coin_flow_config,
    coin_flow_metrics_config,
    coin_flow_modeling_config,
    coin_flow_experiments_config
) = u.load_all_configs('../config')
# Load the wallets and wallets-coin configs, then check them against the coin flow config
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wcm.validate_config_alignment(coin_flow_config,wallets_config,wallets_coin_config)

# The two metrics configs go through u.load_config; the features and epochs configs are
# parsed as plain YAML.
# NOTE(review): presumably u.load_config does more than yaml.safe_load — confirm.
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))
wallets_coins_metrics_config = u.load_config('../config/wallets_coins_metrics_config.yaml')

# make parquet dirs if they don't already exist
Path(wallets_config['training_data']['parquet_folder']).mkdir(parents=True, exist_ok=True)
Path(wallets_coin_config['training_data']['parquet_folder']).mkdir(parents=True, exist_ok=True)

# Set the custom error handler: any Exception raised in a cell is passed to u.notify_on_failure
# NOTE(review): IPython.get_ipython() returns None outside an IPython session, so this
# assumes the code runs inside a notebook kernel.
ipython = IPython.get_ipython()
ipython.set_custom_exc((Exception,), u.notify_on_failure)

# configure logger (INFO level, created by u.setup_notebook_logger for the given log path)
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)


# u.export_code(
#     code_directories=[
#         # 'training_data',
#         # 'wallet_modeling',
#         # 'wallet_features',
#         # 'coin_wallet_features',
#         'base_modeling',
#         'coin_modeling',
#         # 'coin_insights',
#         # 'wallet_insights'
#     ],
#     # include_config = True,
#     # ipynb_notebook = 'DDA-769 coin model score dist toggle.ipynb'
# )


# Reload every project module so source edits made since import take effect.
# reload() is called purely for its side effect, hence a plain loop.
for module in modules:
    importlib.reload(module)
u.notify('retro')

logger.info("Good morning, let's get to work")

# Wallet Model Construction

### Load complete wallet datasets

In [None]:
# Pick up source edits and re-read the wallet configs from disk
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# Build the epochs orchestrator from the four configs only (no preloaded dataframes)
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
)

epochs_orchestrator.load_complete_raw_datasets()

### Generate modeling and validation features (parquet loadable)

In [None]:
# Pick up source edits and re-read the configs this cell depends on
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# Read the four complete datasets from the training data parquet folder
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{parquet_folder}/complete_macro_trends_df.parquet")
complete_hybrid_cw_id_df = pd.read_parquet(f"{parquet_folder}/complete_hybrid_cw_id_df.parquet")

# Build the orchestrator with the configs and the preloaded datasets
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df,
    complete_hybrid_cw_id_df,
)

# Generate training and modeling dfs for all windows
(
    wallet_training_data_df,
    modeling_wallet_features_df,
    validation_training_data_df,
    validation_wallet_features_df,
) = epochs_orchestrator.generate_epochs_training_data()


# Check hybrid mapping completeness for the modeling data, and for the validation
# data when it has any rows
if complete_hybrid_cw_id_df is not None:
    wtdo.validate_hybrid_mapping_completeness(wallet_training_data_df, complete_hybrid_cw_id_df)
    if not validation_training_data_df.empty:
        wtdo.validate_hybrid_mapping_completeness(validation_training_data_df, complete_hybrid_cw_id_df)

# Save each output as <parquet_folder>/<name>.parquet, keeping the index
multiwindow_outputs = {
    'multiwindow_wallet_training_data_df': wallet_training_data_df,
    'multiwindow_modeling_wallet_features_df': modeling_wallet_features_df,
    'multiwindow_validation_training_data_df': validation_training_data_df,
    'multiwindow_validation_wallet_features_df': validation_wallet_features_df,
}
for file_stem, output_df in multiwindow_outputs.items():
    output_df.to_parquet(f"{parquet_folder}/{file_stem}.parquet", index=True)

# sorted(list(wallet_training_data_df.columns))

#### parse columns

In [None]:
# Reload project modules and refresh the wallets config in place
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Treat every training column as a feature with importance 1 so the importance
# analyzer can be reused to break the column names into their components
features_dict = {
    'feature': list(wallet_training_data_df.columns),
    'importance': [1] * len(wallet_training_data_df.columns),
}
feature_importances_df = wiva.analyze_wallet_model_importance(features_dict)
feature_importances_df = feature_importances_df.copy()

# Toggle: feature categories to keep
feature_categories_filter = [
    # 'performance',
    # 'timing',
    # 'cw_timing',
    'trading',
    # 'transfers',
    # 'mktcap',
    # 'scenario',
    # 'macro',
    # 'cluster',
]

# Toggle: feature names to keep (only applied if the matching filter line below is enabled)
feature_names_filter = [
    # 'price_sma_2',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
    # 'btc_vdd_multiple',
    'gtrends_memecoin_us',
]

# Toggle: columns to group the summary by
groups = [
    # 'feature_category',
    # 'feature_name',
    # 'feature_comparison',
    # 'feature_aggregation',
    # 'training_segment',
    # 'feature'
]

filtered_importances_df = (feature_importances_df
 [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None')
)

if groups:
    importance_summary_df = (filtered_importances_df
     .groupby(groups)['importance']
     .agg(['sum', 'count'])
     .sort_values(by='sum',ascending=False)
    )
else:
    # With every grouping column toggled off, groupby([]) raises
    # "No group keys passed!", so fall back to a single row of overall totals.
    importance_summary_df = (filtered_importances_df['importance']
     .agg(['sum', 'count'])
     .to_frame()
     .T
    )

importance_summary_df

In [None]:
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

fs.validate_drop_params(wallet_training_data_df,wallets_config)

### Construct wallet model

#### wallet model with validation (parquet loadable)

In [None]:
# Pick up source edits and re-read the wallet configs
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Load modeling and validation files from the training data parquet folder
parquet_folder = wallets_config['training_data']['parquet_folder']
wallet_training_data_df = pd.read_parquet(f"{parquet_folder}/multiwindow_wallet_training_data_df.parquet")
modeling_wallet_features_df = pd.read_parquet(f"{parquet_folder}/multiwindow_modeling_wallet_features_df.parquet")
validation_training_data_df = pd.read_parquet(f"{parquet_folder}/multiwindow_validation_training_data_df.parquet")
validation_wallet_features_df = pd.read_parquet(f"{parquet_folder}/multiwindow_validation_wallet_features_df.parquet")

In [None]:
validation_training_data_df.shape

In [None]:
# Pick up source edits and re-read the wallet configs before fitting
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Construct the wallet model from the modeling data plus the validation data
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    modeling_wallet_features_df,
    validation_training_data_df,
    validation_wallet_features_df,
)

if 'y_train' not in wallet_model_results:
    # Results without 'y_train': show the search report instead of saving artifacts
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts, then print the summary and plot the evaluation
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config,
        },
    )
    wallet_evaluator.summary_report()
    wallet_evaluator.plot_wallet_evaluation()

In [None]:
validation_wallet_features_df

##### wallet model 2

In [None]:
# no w1
# Pick up source edits and re-read the wallet configs before fitting
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Construct the wallet model from the modeling data plus the validation data
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    modeling_wallet_features_df,
    validation_training_data_df,
    validation_wallet_features_df,
)

if 'y_train' not in wallet_model_results:
    # Results without 'y_train': show the search report instead of saving artifacts
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts, then print the summary and plot the evaluation
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config,
        },
    )
    wallet_evaluator.summary_report()
    wallet_evaluator.plot_wallet_evaluation()

In [None]:
# Pick up source edits and re-read the wallet configs before fitting
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Construct the wallet model from the modeling data plus the validation data
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    modeling_wallet_features_df,
    validation_training_data_df,
    validation_wallet_features_df,
)

if 'y_train' not in wallet_model_results:
    # Results without 'y_train': show the search report instead of saving artifacts
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts, then print the summary and plot the evaluation
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config,
        },
    )
    wallet_evaluator.summary_report()
    wallet_evaluator.plot_wallet_evaluation()

#### wallet model without validation (parquet loadable)

In [None]:
# Pick up source edits and re-read the wallet configs before fitting
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Load modeling files only (no validation data in this variant)
parquet_folder = wallets_config['training_data']['parquet_folder']
wallet_training_data_df = pd.read_parquet(f"{parquet_folder}/multiwindow_wallet_training_data_df.parquet")
modeling_wallet_features_df = pd.read_parquet(f"{parquet_folder}/multiwindow_modeling_wallet_features_df.parquet")

# Construct the wallet model from the modeling data alone
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    modeling_wallet_features_df,
)

if 'y_train' not in wallet_model_results:
    # Results without 'y_train': show the search report instead of saving artifacts
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts, then print the summary
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
        },
    )
    wallet_evaluator.summary_report()

#### save individual scores for modeling and coin_modeling features

In [None]:
# Score the wamo and como training data with an existing wallet model and save each
# set of predictions as a single-column parquet file named after the score.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

score_name = wallets_config['modeling']['score_name']
base_path = wallets_config['training_data']['model_artifacts_folder']

# NOTE(review): with the model_id assignment commented out, this cell uses whatever
# model_id is already defined in the notebook session — confirm it is the intended model.
# model_id = '8e55189e-a040-4b68-9d69-83c3f95ee652'
# NOTE(review): this hard-coded value overrides the score_name read from the config above.
score_name = 'dda528_net_gain_max_inv_025'

# Load and predict (wamo training data)
wamo_training_data_df = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                        "/wamo_training_data_df.parquet")
wamo_y_pred = wiva.load_and_predict(model_id,wamo_training_data_df,base_path)
wamo_wallet_scores_df = pd.DataFrame({
    f'score|{score_name}': wamo_y_pred
})
# Written relative to the working directory; to_parquet does not create missing folders
wamo_wallet_scores_df.to_parquet(f"temp/wallet_modeling_score_dfs/{score_name}|wamo.parquet",index=True)

# Load and predict (como training data)
como_training_data_df = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                             "/como_training_data_df.parquet")
como_y_pred = wiva.load_and_predict(model_id,como_training_data_df,base_path)
como_wallet_scores_df = pd.DataFrame({
    f'score|{score_name}': como_y_pred
})
como_wallet_scores_df.to_parquet(f"temp/wallet_modeling_score_dfs/{score_name}|como.parquet",index=True)

u.notify(2)


#### predict training data with existing model only

In [None]:
# Pick up source edits and re-read the wallet configs
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

base_path = wallets_config['training_data']['model_artifacts_folder']
model_id = '85e79c0e-c6a6-4514-97bb-277b945086fd'
score_name = 'dda785_net_flows'

# Load the multiwindow training data and predict with the saved model
training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}"
    "/multiwindow_wallet_training_data_df.parquet"
)
y_pred = wiva.load_and_predict(model_id, training_data_df, base_path)

# De-hybridize the scores, flatten the index and attach the model metadata columns.
# complete_hybrid_cw_id_df must already be loaded in the session.
wallet_scores_df = (
    wtdo.dehybridize_wallet_address(pd.DataFrame({'score': y_pred}), complete_hybrid_cw_id_df)
    .reset_index()
    .assign(
        model_id=model_id,
        scored_at=datetime.now(),
        model_type=wallets_config['modeling']['model_type'],
        target_var=wallets_config['modeling']['target_variable'],
        target_var_threshold=wallets_config['modeling']['target_var_min_threshold'],
    )
)

# Table name: upload timestamp plus the target variable with '/' replaced by '_'
upload_timestamp = datetime.now().strftime('%Y%m%d_%Hh%Mm%Ss')
target_var_slug = wallets_config['modeling']['target_variable'].replace('/','_')
table_name = f"wallets_{upload_timestamp}_{target_var_slug}"

# Upload to BigQuery; 'fail' aborts if the table already exists ('replace'/'append' also valid)
pandas_gbq.to_gbq(
    dataframe=wallet_scores_df,
    destination_table=f'scores.{table_name}',
    project_id='western-verve-411004',
    if_exists='fail',
)


In [None]:
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

complete_hybrid_cw_id_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}"
                                        "/complete_hybrid_cw_id_df.parquet")


In [None]:
all_values_present = complete_hybrid_cw_id_df['hybrid_cw_id'].isin(wallet_scores_df.index.get_level_values('wallet_address')).all()

In [None]:
wallet_scores_df.index.get_level_values('wallet_address').astype(float)

In [None]:
xw_ids = set(complete_hybrid_cw_id_df['hybrid_cw_id'].astype(int))
scores_ids = set(wallet_scores_df.index.get_level_values('wallet_address').astype(int))


In [None]:
len(scores_ids)

In [None]:
len(xw_ids)

In [None]:
scores_ids - xw_ids

In [None]:
xw_ids

In [None]:
missing_values = complete_hybrid_cw_id_df['hybrid_cw_id'][~complete_hybrid_cw_id_df['hybrid_cw_id'].isin(wallet_scores_df.index.get_level_values('wallet_address'))]
missing_values

In [None]:
missing_values

In [None]:
all_values_present

In [None]:
wallet_scores_df.index.get_level_values('wallet_address')

In [None]:
wtdo.dehybridize_wallet_address(wallet_scores_df,complete_hybrid_cw_id_df)

In [None]:
wallet_scores_df

In [None]:
wallet_scores_df = wallet_scores_df.reset_index().rename(columns={'wallet_address': 'hybrid_cw_id'})
wallet_scores_df

In [None]:
wallet_scores_df.merge(complete_hybrid_cw_id_df,on='hybrid_cw_id',how='inner')

In [None]:
len(wallet_scores_df)

In [None]:
# Inline de-hybridization of wallet_scores_df using the hybrid id mapping.
# NOTE(review): this looks like the body of dehybridize_wallet_address pasted in for
# debugging (see the error message text and the commented-out return) — confirm.
df = wallet_scores_df
hybrid_cw_id_df = complete_hybrid_cw_id_df

logger.info("De‑hybridizing DataFrame with shape %s …", df.shape)
df_out = df.copy()
original_index = df_out.index.names

# -------------------------------------------------
# Detect where the hybrid IDs live. If they are an
# index level, move the index into columns so the
# merge below can always work on a column.
# -------------------------------------------------
if 'wallet_address' in df_out.columns:
    used_index = False

elif 'wallet_address' in df_out.index.names:
    # No column collision is possible here: the branch above already handled
    # the case where a wallet_address column exists.
    df_out = df_out.reset_index()
    used_index = True

else:
    raise ValueError(
        "dehybridize_wallet_address: expected 'wallet_address' either as "
        "a column or an index level containing hybrid IDs."
    )

# Rename to hybrid_cw_id so the merge key matches the mapping table and the
# real wallet_address can be written back afterwards without a label clash
df_out = df_out.rename(columns={'wallet_address': 'hybrid_cw_id'})
wallet_col = 'hybrid_cw_id'

# Left-merge the mapping; the code below relies on it supplying wallet_id and coin_id
df_out = df_out.merge(hybrid_cw_id_df, on=wallet_col, how='left')

# Verify mapping completeness: a missing wallet_id means the hybrid id had no mapping row
if df_out['wallet_id'].isna().any():
    missing = df_out['wallet_id'].isna().sum()
    raise ValueError(
        f"Failed to de‑hybridize {missing} rows – missing hybrid_cw_id "
        "mappings."
    )

# Replace hybrid id with original wallet_id and rename for consistency
df_out['wallet_address'] = df_out['wallet_id']
df_out = df_out.drop(columns=['wallet_id', 'hybrid_cw_id'])

# Restore an index only when the input carried wallet_address in its index:
# (wallet_address, coin_id, then any other original index levels)
if used_index:
    df_out = df_out.set_index(
        ['wallet_address', 'coin_id'] +
        [col for col in original_index if col not in ('wallet_address', 'coin_id')]
    )

logger.info("Completed de‑hybridization; resulting shape %s.", df_out.shape)
# return df_out

In [None]:
df_out

In [None]:
mapping_df

### Wallet Model Evaluation

#### load evaluation report

In [None]:
[importlib.reload(module) for module in modules]

model_id = '3493a19d-0ee3-4272-ab52-40afc6ab6d1b'
base_path = wallets_config['training_data']['model_artifacts_folder']
configs_output = 'temp/configs_revival/dda_691_3493a19d'

report = wimr.load_model_report(model_id, base_path, configs_output)

#### importance analysis

In [None]:
wallet_evaluator.importance_summary(1)

In [None]:
# Pick up source edits and re-read the wallet configs
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Reload evaluator, choosing the class from the model type in the same way as the
# segment performance cell, so regression results are not wrapped in a classifier evaluator
if wallet_model_results['model_type'] == 'regression':
    wallet_evaluator = wime.RegressorEvaluator(wallet_model_results)
else:
    wallet_evaluator = wime.ClassifierEvaluator(wallet_model_results)

# Break the model's feature importances into their name components
feature_importances_df = wiva.analyze_wallet_model_importance(wallet_evaluator.metrics['importances'])
feature_importances_df = feature_importances_df.copy()

# Toggle: feature categories to keep (only applied if the matching filter line below is enabled)
feature_categories_filter = [
    # 'performance',
    # 'cw_timing',
    # 'trading',
    # 'transfers',
    # 'cw_mktcap',
    # 'scenario',
    # 'macro',
    # 'cluster',
]

# Toggle: feature names to keep (only applied if the matching filter line below is enabled)
feature_names_filter = [
    # 'price_sma_2',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
    # 'portfolio_mcap_max',
    # 'crypto_net_flows',
]

# Toggle: columns to group the summary by
groups = [
    # 'feature_category',
    # 'feature_name',
    # 'feature_comparison',
    # 'feature_aggregation',
    'training_segment',
    # 'feature'
]

# Total importance and feature count per group, largest total first
(feature_importances_df
#  [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None')
 .groupby(groups)['importance']
 .agg(['sum', 'count'])
 .sort_values(by='sum',ascending=False)
)

#### predict validation data with existing model

In [None]:
model_id = 'bfa55a33-712e-4d82-bb5c-11fc942bcb62'
validation_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}"
                                              "/multiwindow_validation_training_data_df.parquet")

validation_y_pred = wiva.load_and_predict(
    model_id,
    validation_training_data_df,
    wallets_config['training_data']['model_artifacts_folder']
)


In [None]:
validation_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_wallet_features_df.parquet")

validation_y_true = validation_wallet_features_df[wallets_config['modeling']['target_variable']]

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    log_loss,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

def evaluate_classification(y_true: pd.Series, y_prob: pd.Series, threshold: float = 0.0) -> dict:
    """
    Calculate metrics for binary classification based on continuous values.

    Both series are restricted to the ids they share, binarized with the same
    threshold (value > threshold -> 1, otherwise 0), and scored. Error metrics
    (mse, mae) are computed on the original continuous values.

    Params:
    - y_true (Series): Actual continuous values.
    - y_prob (Series): Predicted continuous values.
    - threshold (float): Threshold for converting to binary. Default 0.0.

    Returns:
    - dict: Core performance metrics computed on overlapping ids. Keys are
        'accuracy', 'precision', 'recall', 'f1', 'confusion_matrix' (always a
        2x2 nested list ordered [negative, positive]), 'mse', 'mae', and
        'roc_auc' (only when y_true contains both classes).

    Raises:
    - ValueError: If y_true and y_prob share no ids.
    """
    # Identify common ids between y_true and y_prob
    common_idx = y_true.index.intersection(y_prob.index)
    if len(common_idx) == 0:
        raise ValueError("No overlapping ids between y_true and y_prob")

    # Filter to only overlapping ids
    y_true_common = y_true.loc[common_idx].values
    y_prob_common = y_prob.loc[common_idx].values

    # Convert continuous values to binary for classification metrics
    y_true_binary = (y_true_common > threshold).astype(int)
    y_pred_binary = (y_prob_common > threshold).astype(int)

    # Compute metrics.
    # zero_division=0 keeps the 0.0 result for undefined precision/recall/F1 (e.g. no
    # predicted positives) without emitting a warning; labels=[0, 1] keeps the
    # confusion matrix 2x2 even when only one class is present in the data.
    metrics = {
        'accuracy': accuracy_score(y_true_binary, y_pred_binary),
        'precision': precision_score(y_true_binary, y_pred_binary, zero_division=0),
        'recall': recall_score(y_true_binary, y_pred_binary, zero_division=0),
        'f1': f1_score(y_true_binary, y_pred_binary, zero_division=0),
        'confusion_matrix': confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1]).tolist(),
        'mse': mean_squared_error(y_true_common, y_prob_common),
        'mae': mean_absolute_error(y_true_common, y_prob_common)
    }

    # Add ROC AUC if we have both positive and negative classes
    if len(np.unique(y_true_binary)) > 1:
        metrics['roc_auc'] = roc_auc_score(y_true_binary, y_prob_common)

    return metrics


evaluate_classification(validation_y_true,validation_y_pred)

#### assess segment performance

In [None]:
# Rebuild the evaluator for the current model results and rank feature-defined
# population segments against overall performance.
[importlib.reload(module) for module in modules]
wallets_config.reload()
pd.set_option('display.max_colwidth', None)  # Shows full text in columns


# Reload evaluator, picking the class that matches the model type
if wallet_model_results['model_type'] == 'regression':
    wallet_evaluator = wime.RegressorEvaluator(wallet_model_results)
else:
    wallet_evaluator = wime.ClassifierEvaluator(wallet_model_results)

# Candidate segmentation features on the 'all_windows' segment.
# NOTE(review): this list is overwritten by the 'w5' list directly below, so only the
# 'w5' features are used; reorder or comment out one list to switch.
segmentation_features = [
    # 'mktcap|portfolio_mcap_mean/market_cap_unadj|all_windows',
    'mktcap|volume_wtd_market_cap/market_cap_filled|all_windows',
    # 'timing|btc_mvrv_z_score/buy_weighted|all_windows',
    # 'timing|btc_mvrv_z_score/sell_weighted|all_windows',
    # 'macro|btc_mvrv_z_score_first|all_windows',
    # 'macro|btc_mvrv_z_score_last|all_windows',
    'trading|crypto_net_gain|all_windows',
    'trading|total_volume|all_windows',
    'trading|crypto_net_cash_flows|all_windows',
    'trading|unique_coins_traded|all_windows',
    # 'transfers|first_buy/median_avg_wallet_rank|all_windows',
    'trading|max_investment|all_windows'
]
# Same features on the 'w5' segment; this assignment is the one that takes effect
segmentation_features = [
    # 'mktcap|portfolio_mcap_mean/market_cap_unadj|w5',
    'mktcap|volume_wtd_market_cap/market_cap_filled|w5',
    # 'timing|btc_mvrv_z_score/buy_weighted|w5',
    # 'timing|btc_mvrv_z_score/sell_weighted|w5',
    # 'macro|btc_mvrv_z_score_first|w5',
    # 'macro|btc_mvrv_z_score_last|w5',
    'trading|crypto_net_gain|w5',
    'trading|total_volume|w5',
    'trading|crypto_net_cash_flows|w5',
    'trading|unique_coins_traded|w5',
    # 'transfers|first_buy/median_avg_wallet_rank|w5',
    'trading|max_investment|w5'
]


# get raw segments
segments_df = wallet_evaluator.identify_predictive_populations(
    segmentation_features,
    min_pop_pct=0.02,
    max_segments=25
)

# Sort the segments for display, best 'R2 vs Overall' first.
# NOTE(review): assumes the evaluator's output has an 'R2 vs Overall' column — confirm
# this also holds when the classifier evaluator is selected above.
# segments_df.sort_values('RMSE vs Overall', ascending=True)
segments_df.sort_values('R2 vs Overall', ascending=False)
# segments_df.describe()


#### modeling multi window r2 comparison

In [None]:
# Epoch start date of every row, and the distinct epochs in ascending order
epoch_start_dates = modeling_wallet_scores_df.index.get_level_values('epoch_start_date')
epochs = sorted(epoch_start_dates.unique())

# Print the R² of modeling-cohort predictions for each epoch
for epoch in epochs:
    epoch_mask = epoch_start_dates == epoch
    # Add cohort filter
    cohort_mask = modeling_wallet_scores_df['in_modeling_cohort'] == True
    combined_mask = epoch_mask & cohort_mask

    epoch_scores_df = modeling_wallet_scores_df[combined_mask]
    y_true = epoch_scores_df['actual']
    y_pred = epoch_scores_df['score']

    # Only evaluate epochs that have at least one actual value
    if not y_true.isna().all():
        metrics = wiva.evaluate_predictions(y_true, y_pred)
        print(f"Epoch {epoch}: R² = {metrics['r2']:.3f}")

#### Cluster analysis

In [None]:
# Reload project modules and refresh the wallets config in place
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load parquet
wallet_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
)

# Names of the x_features features with the highest model importance,
# de-duplicated through a set (so their order is not guaranteed)
x_features = 6
top_importances_df = (
    pd.DataFrame(wallet_evaluator.metrics['importances'])
    .sort_values(by='importance', ascending=False)
    .head(x_features)
)
top_feature_metrics = list(top_importances_df['feature'])
comparison_metrics = list(set(top_feature_metrics))

# Cluster numbers
n_clusters = 4

styled_df, cluster_results_df = wica.create_cluster_report(
    wallet_training_data_df,
    wallet_model_results,
    n_clusters,
    comparison_metrics,
    'median',
)

# Drop the full training frame once the report is built and collect garbage
del wallet_training_data_df
gc.collect()

styled_df

In [None]:
# Build a frame of cluster columns plus selected metrics and assign each wallet to a cluster.
# NOTE(review): the cluster report cell deletes wallet_training_data_df after use, so the
# frame must be reloaded before this cell runs; comparison_metrics also comes from that cell.
modeling_df = wallet_training_data_df.copy()

# Metrics always included alongside the cluster columns
base_metrics = [
    'trading|max_investment|all_windows',
    'trading|crypto_net_gain|all_windows',
    'mktcap|end_portfolio_wtd_market_cap|all_windows',
    'performance|crypto_net_gain/max_investment/base|all_windows',
]
# Every column whose name starts with 'cluster|'
cluster_cols = [col for col in modeling_df.columns if col.startswith('cluster|')]
# The set union removes duplicate column names; the resulting column order is not guaranteed
cluster_analysis_df = modeling_df[list(set(cluster_cols + base_metrics + comparison_metrics))].copy()


# Assign wallets to categorical clusters based on the distance values
cluster_assignments_df = wcl.assign_clusters_from_distances(cluster_analysis_df,
                                                        wallets_config['features']['clustering_n_clusters'])
# cluster_analysis_df = cluster_analysis_df.join(cluster_assignments_df,how='inner')



In [None]:
# Cell output: the column names selected for the cluster analysis frame
list(cluster_analysis_df.columns)

In [None]:
# Cell output: the frame returned by wcl.assign_clusters_from_distances
cluster_assignments_df

# Coin Model Construction

## Coin model training data generation

### pull all data and generate all features

In [None]:
# Re-import every module in `modules` and reload both wallet config objects from ../config
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Initiate orchestrator
# Only the two configs loaded above are refreshed in this cell; the remaining config
# arguments must already exist in the notebook namespace.
coin_epochs_orchestrator = ceo.CoinEpochsOrchestrator(
    wallets_coin_config,
    wallets_coins_metrics_config,
    wallets_config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
    coin_flow_config,
    coin_flow_modeling_config,
    coin_flow_metrics_config
)

# Load the raw datasets onto the orchestrator before any epochs are generated
coin_epochs_orchestrator.load_complete_raw_datasets()

In [None]:
# Same setup as the load-only cell before it (reload modules and configs, rebuild the
# orchestrator, load raw datasets), followed by the full epoch orchestration run.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Initiate orchestrator
coin_epochs_orchestrator = ceo.CoinEpochsOrchestrator(
    wallets_coin_config,
    wallets_coins_metrics_config,
    wallets_config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
    coin_flow_config,
    coin_flow_modeling_config,
    coin_flow_metrics_config
)

coin_epochs_orchestrator.load_complete_raw_datasets()

# Called with no arguments here, so the orchestrator's own defaults decide the
# offsets and the output file prefix.
coin_epochs_orchestrator.orchestrate_coin_epochs()

### dda 741 devspace

#### make training data

In [None]:

# Offsets (in days) to generate coin epochs for; comment entries in or out to pick the
# set for this run.
custom_offset_days = [
    # 30,
    # 60,
    # 90,

    120,
    150,
    180,

    # 210,
    # 240,
    # 270

    # 0,
    # -30,
    # -60,
]
# Prefix for the output files. NOTE(review): unlike the commented 'investing_' option this
# value has no trailing underscore, and the scoring cells build their paths as
# f"{file_prefix}multiwindow_...", which gives "investing_valmultiwindow_..." — the write
# and read sides agree, but confirm the missing separator is intended.
# file_prefix='investing_'
file_prefix='investing_val'

# Run the orchestration for just these offsets, writing under the chosen prefix
coin_epochs_orchestrator.orchestrate_coin_epochs(
    custom_offset_days,
    file_prefix
)

#### score training data

In [None]:
# Score the validation coin features with a saved coin model and plot returns by score.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

model_id = 'bb10db73-1fda-4141-b9ca-64716b41db00'

# Validation files written by the custom-offset orchestration run under this prefix
file_prefix='investing_val'
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
como_features_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}multiwindow_como_coin_training_data_df_full.parquet")
como_target_var_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}multiwindow_como_coin_target_var_df.parquet")

# Convert only the coin_id level (assuming it's the first level) to str so the target
# index can be aligned with the scores index when plotting.
como_target_var_df.index = como_target_var_df.index.set_levels(
    como_target_var_df.index.levels[0].astype(str),
    level=0
)

como_scores_df = coin_epochs_orchestrator.score_coin_training_data(
    model_id,
    '../artifacts/coin_modeling',
    como_features_df,
)

# A bare expression in the middle of a cell is never rendered, so display the summary
# explicitly instead of discarding it.
display(como_scores_df.describe())
plot_return_vs_rank(como_scores_df['score'],como_target_var_df['coin_return_winsorized'])

In [None]:
# Score the validation coin features with a second saved coin model and plot returns by score.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

model_id = 'cd4fe30a-dae6-4bd5-aac8-51b9d03030cd'

# Validation files written by the custom-offset orchestration run under this prefix
file_prefix='investing_val'
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
como_features_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}multiwindow_como_coin_training_data_df_full.parquet")
como_target_var_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}multiwindow_como_coin_target_var_df.parquet")

# Convert only the coin_id level (assuming it's the first level) to str so the target
# index can be aligned with the scores index when plotting.
como_target_var_df.index = como_target_var_df.index.set_levels(
    como_target_var_df.index.levels[0].astype(str),
    level=0
)

como_scores_df = coin_epochs_orchestrator.score_coin_training_data(
    model_id,
    '../artifacts/coin_modeling',
    como_features_df,
)

# A bare expression in the middle of a cell is never rendered, so display the summary
# explicitly instead of discarding it.
display(como_scores_df.describe())
plot_return_vs_rank(como_scores_df['score'],como_target_var_df['coin_return_winsorized'])

In [None]:
def plot_return_vs_rank(y_pred_proba: pd.Series, target_variable: pd.Series,
                        ax=None, n_buckets: int = 10):
    """
    Plot a histogram of prediction scores with per-bucket return lines overlaid.

    Scores are split into n_buckets equal-width bins spanning the observed score
    range. The primary (log-scaled) y axis shows the row count per bin; a secondary
    linear axis shows the median, winsorized-mean and mean return of each bin, plus
    a dashed line at the overall mean return. The lowest and highest winsorized bin
    means are annotated with their values.

    Params:
    - y_pred_proba (Series): Prediction scores; its index defines the rows plotted
    - target_variable (Series): Actual returns, reindexed onto y_pred_proba's index
    - ax (matplotlib.axes): Optional axes to plot on, creates one if None
    - n_buckets (int): Number of equal-width score buckets

    Returns:
    - ax (matplotlib.axes): The axes with the plotted data. When the scores cannot
        be binned (e.g. every score is identical) the axes only contain a centered
        explanatory message.
    """
    import textwrap
    import numpy as np
    import pandas as pd

    # Create axes if not provided
    if ax is None:
        # pyplot is only needed when this function has to create the figure itself
        import matplotlib.pyplot as plt
        _, ax = plt.subplots(figsize=(10, 6))

    # Align returns to the scored rows; rows without a return become NaN and are
    # removed by the dropna() below.
    returns = target_variable.reindex(y_pred_proba.index)

    # Winsorize returns: cap both tails at the 1st / 99th percentile
    winsor_limit = 0.01
    returns_winsorized = returns.clip(
        lower=returns.quantile(winsor_limit),
        upper=returns.quantile(1 - winsor_limit),
    )

    # Create dataframe for analysis
    df = pd.DataFrame({
        "proba": y_pred_proba,
        "ret": returns,
        "ret_win": returns_winsorized
    }).dropna()

    # Define equal-width score bins over the observed score range. pd.cut raises
    # ValueError when the edges are not unique (no score spread) or are NaN (no rows).
    try:
        score_min, score_max = df["proba"].min(), df["proba"].max()
        bin_edges = np.linspace(score_min, score_max, n_buckets + 1)
        df["score_bin"] = pd.cut(df["proba"], bins=bin_edges, include_lowest=True)
    except ValueError:
        # Position the message in axes coordinates so it is centered regardless of
        # the data limits of a caller-supplied axes.
        ax.text(0.5, 0.5, 'Insufficient score spread to generate bins.',
                ha='center', va='center', transform=ax.transAxes)
        return ax

    # Compute every per-bin statistic in a single groupby pass
    bin_stats = df.groupby("score_bin", observed=True).agg(
        count=("ret", "size"),
        mean_ret=("ret", "mean"),
        median_ret=("ret", "median"),
        winsorized_ret=("ret_win", "mean"),
    )

    # Drop bins with zero count
    bin_stats = bin_stats[bin_stats["count"] > 0]
    valid_centers = [
        interval.left + (interval.right - interval.left) / 2
        for interval in bin_stats.index
    ]
    valid_counts = bin_stats["count"].values
    valid_mean_ret = bin_stats["mean_ret"].values
    valid_median_ret = bin_stats["median_ret"].values
    valid_winsorized_ret = bin_stats["winsorized_ret"].values
    width = bin_edges[1] - bin_edges[0]

    # Primary axis: histogram of counts
    ax.bar(valid_centers, valid_counts, width=width, alpha=0.6, label="Count")
    ax.set_yscale('log')

    # Secondary axis: return lines on a linear scale
    ax2 = ax.twinx()
    ax2.set_yscale("linear")

    # Plot return metrics (order determines the legend order)
    return_lines = (
        (valid_median_ret, "Median Return", "#8000ff"),
        (valid_winsorized_ret, "Winsorized Return", "#ffe000"),
        (valid_mean_ret, "Mean Return", "#22DD22"),
    )
    for line_values, line_label, line_color in return_lines:
        ax2.plot(valid_centers, line_values, marker='o', linestyle='-',
                 linewidth=2, label=line_label, color=line_color)

    # Overall mean return line
    overall_mean = df["ret"].mean()
    ax2.axhline(overall_mean, linestyle="--", color="#afc6ba",
                linewidth=1, label="Overall mean return")

    # Annotate extremes of winsorized returns at the same x positions as the markers
    if len(valid_winsorized_ret) > 0:
        low_pos = int(np.argmin(valid_winsorized_ret))
        high_pos = int(np.argmax(valid_winsorized_ret))
        y_low = valid_winsorized_ret[low_pos]
        y_high = valid_winsorized_ret[high_pos]
        ax2.annotate(f"{y_low:.2f}", xy=(valid_centers[low_pos], y_low),
                     xytext=(0, -10), textcoords="offset points",
                     ha="center", va="top")
        ax2.annotate(f"{y_high:.2f}", xy=(valid_centers[high_pos], y_high),
                     xytext=(0, 10), textcoords="offset points",
                     ha="center", va="bottom")

    # Labels and title
    ax.set_xlabel("Prediction Score")
    ax.set_ylabel("Number of Wallets")
    label = "Target Variable Returns"
    wrapped_label = "\n".join(textwrap.wrap(label, width=30))
    ax2.set_ylabel(wrapped_label)
    ax.set_title("Prediction Score Distribution and Returns")
    ax.grid(True, linestyle=":", alpha=0.3)

    # Combine legends from both axes
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax.legend(lines + lines2, labels + labels2, loc="upper left")

    return ax

### stepwise coin model generation

#### Train all models and save all scores (parquet loadable)

In [None]:
# Load multiwindow modeling and validation files
# All four parquet files are read from the wallets_config training data folder; they are
# the inputs passed to train_wallet_models in the next cell.
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_training_data_df.parquet")
modeling_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_modeling_wallet_features_df.parquet")
validation_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_training_data_df.parquet")
validation_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_wallet_features_df.parquet")

In [None]:
# Re-import every module in `modules` and reload both wallet config objects from ../config
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Instantiate orchestrator
# NOTE(review): this cell passes the `.config` attribute of both config objects, while the
# scoring cell under "generate and save scores for all models" passes the objects
# themselves — confirm which form WalletModelOrchestrator expects.
model_orchestrator = wmo.WalletModelOrchestrator(
        wallets_config.config,
        wallets_metrics_config,
        wallets_features_config,
        wallets_epochs_config,
        wallets_coin_config.config
)

# Train all models
models_dict = model_orchestrator.train_wallet_models(
    wallet_training_data_df,
    modeling_wallet_features_df,
    validation_training_data_df,
    validation_wallet_features_df
)

##### load or generate training data dfs (parquet loadable)

In [None]:
# Here we create wallet model training data for dates through the end of the original modeling period.
# This will be used to create "current" scores as of the end of the modeling period, that can be
# used to generate features for the "current" coin model built at the end of the modeling period.

# Reload modules and every config this cell depends on
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))

# Load the complete raw datasets from the wallets_config training data folder
complete_profits_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/complete_macro_trends_df.parquet")

# Identify offset needed to generate training data directly following the modeling period to the validation period start
# The validation offset is twice the modeling period duration.
modeling_offset = wallets_config['training_data']['modeling_period_duration']
coin_modeling_epochs_config = {
    'offset_epochs': {
        'offsets': [modeling_offset],
        'validation_offsets': [modeling_offset*2]
    }
}
# Initiate orchestrator
# The ad-hoc epochs config above is passed in place of wallets_epochs_config.
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    coin_modeling_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df
)

# NOTE(review): nothing in this cell creates the four frames saved below (no call such as
# epochs_orchestrator.generate_epochs_training_data()), so the saves only work if the
# frames already exist in the notebook namespace — the generation step looks missing.


# Save files
wamo_training_data_df.to_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/wamo_training_data_df.parquet",index=True)
wamo_modeling_data_df.to_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/wamo_modeling_data_df.parquet",index=True)
como_training_data_df.to_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/como_training_data_df.parquet",index=True)
como_modeling_data_df.to_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/como_modeling_data_df.parquet",index=True)


##### generate and save scores for all models (parquet loadable)

In [None]:
# Reload both wallet config objects from ../config
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Score training data with all models
wamo_training_data_df = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                        "/wamo_training_data_df.parquet")

# Instantiate orchestrator
# The metrics/features/epochs configs are not reloaded here and must already be in the
# notebook namespace.
model_orchestrator = wmo.WalletModelOrchestrator(
        wallets_config,
        wallets_metrics_config,
        wallets_features_config,
        wallets_epochs_config,
        wallets_coin_config
)
# Load dict
# Read from wallet_model_ids.json in the coin config's parquet folder
with open(f"{wallets_coin_config['training_data']['parquet_folder']}/wallet_model_ids.json") as f:
    models_dict = json.load(f)

# Generate predictions for the training data with the loaded models; judging by the
# method name the scores are also persisted — confirm in WalletModelOrchestrator.
model_orchestrator.predict_and_store(models_dict,wamo_training_data_df)

#### Feature generation for predicting coin modeling period outcomes

##### load wallet modeling period files (parquet loadable)

In [None]:
# Reload both wallet config objects from ../config
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Load the four frames saved by the training data generation cell from the coin config's
# parquet folder.
wamo_training_data_df= pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/wamo_training_data_df.parquet")
wamo_modeling_data_df= pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/wamo_modeling_data_df.parquet")
como_training_data_df= pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/como_training_data_df.parquet")
como_modeling_data_df= pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}/como_modeling_data_df.parquet")

In [None]:
# Reload both wallet config objects from ../config
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Unpack the five objects returned by the loader: the training coin cohort plus the
# profits / market data frames used by the feature generation and target variable cells.
(training_coin_cohort, wamo_profits_df, como_market_data_df, como_profits_df, investing_market_data_df
    ) = cfo.load_wallet_data_for_coin_features(wallets_config)

##### generate all features

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Build the features orchestrator; training_coin_cohort comes from
# cfo.load_wallet_data_for_coin_features in an earlier cell.
features_generator = cfo.CoinFeaturesOrchestrator(
    wallets_config,
    wallets_coin_config,
    metrics_config,
    config,
    modeling_config,
    training_coin_cohort
)

# Feature generation for training data set
# The suffix is the coin modeling period start date formatted as YYYYMMDD.
wamo_modeling_suffix = pd.to_datetime(wallets_config['training_data']['coin_modeling_period_start']).strftime('%Y%m%d')
wamo_coin_training_data_df_full = features_generator.generate_coin_features_for_period(
    wamo_profits_df,
    wamo_training_data_df,
    'modeling',
    wamo_modeling_suffix
)

# Feature generation for validation set
como_coin_training_data_df_full = features_generator.generate_coin_features_for_period(
    como_profits_df,
    como_training_data_df,
    'coin_modeling',
    wamo_modeling_suffix  # predict validation outcomes with the same model
)

# Persist both feature sets to the coin config's parquet folder
wamo_coin_training_data_df_full.to_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                           "/wamo_coin_training_data_df_full.parquet",index=True)
como_coin_training_data_df_full.to_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                          "/como_coin_training_data_df_full.parquet",index=True)


##### target vars (parquet loadable)

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
# The wallet configs were previously loaded twice in a row; one load is sufficient.
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Load data
wamo_coin_training_data_df_full = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                           "/wamo_coin_training_data_df_full.parquet")
como_coin_training_data_df_full = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                          "/como_coin_training_data_df_full.parquet")
(training_coin_cohort, wamo_profits_df, como_market_data_df, como_profits_df, investing_market_data_df
    ) = cfo.load_wallet_data_for_coin_features(wallets_config)


# Instantiate orchestrator
features_generator = cfo.CoinFeaturesOrchestrator(
    wallets_config,
    wallets_coin_config,
    metrics_config,
    config,
    modeling_config,
    training_coin_cohort
)

# Target var for wallet modeling period is performance during the coin modeling period
# The last argument restricts the calculation to the index values present in the features.
wamo_coin_target_var_df = features_generator.calculate_target_variables(
    como_market_data_df,
    wallets_config['training_data']['coin_modeling_period_start'],
    wallets_config['training_data']['coin_modeling_period_end'],
    set(wamo_coin_training_data_df_full.index)
)


# Target var for coin modeling period is performance during the investing period
como_coin_target_var_df = features_generator.calculate_target_variables(
    investing_market_data_df,
    wallets_config['training_data']['investing_period_start'],
    wallets_config['training_data']['investing_period_end'],
    set(como_coin_training_data_df_full.index)
)

# NOTE(review): the section heading says "parquet loadable" but neither target variable
# frame is written to disk in this cell — confirm whether a save step is missing.


### parse columns

In [None]:

# List all cols
# Reads the multiwindow feature file from the coin config's parquet folder; the cell
# output is the full list of feature column names.
wamo_coin_training_data_df_full = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                                  "/multiwindow_wamo_coin_training_data_df_full.parquet")
list(wamo_coin_training_data_df_full.columns)

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules


# Create dataframe of column names
# Uses wamo_coin_training_data_df_full as already loaded in the notebook namespace; the
# single 'feature' column is then parsed by cfo.parse_feature_names.
df = pd.DataFrame(wamo_coin_training_data_df_full.columns)
df.columns = ['feature']
feature_details_df = cfo.parse_feature_names(df)

# Select features
# Each list holds the allowed values for one parsed column. Only the filters whose
# .isin() line is uncommented in the expression at the bottom take effect.
segment_category_filter = [
    # 'all_wallets',
    # 'score_quantile',
    # 'score_binary',
    # 'training_clusters',
    # 'time_series',
    # 'wallet_cohorts',
    'macro',
]
segment_family_filter = [
    # 'all',
    # 'cw_return_rate_min_025',
    'global_market_cap',
    'btc_mvrv_z_score',
    'btc_price',
    # 'wallet_cohorts',
]
metric_filter = [
    # 'trading',
    'balances',
]
metric_detail_filter = [
    # 'crypto_net_gain',
    'usd_balance_ending',
]
transformation_category_filter = [
    # 'aggregations',
    'score_wtd',
    # 'score_dist',
]
transformation_base_filter = [
    'cw_return_rate_min_000_score',
]
transformation_method_filter = [
    # 'count',
    # 'sum',
    'kurt'
]

# Columns to group the matching features by
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    # 'metric',
    # 'metric_detail',
    # 'transformation_category',
    # 'transformation_base',
    # 'transformation_method',
    'feature_full',

]
# Count matching features per group. .size() yields an unnamed Series, so after
# wrapping in a DataFrame the count column is named 0 — hence sort_values(by=0).
pd.DataFrame(feature_details_df
 [
  (feature_details_df['segment_category'].isin(feature_details_df['segment_category']))  # Dummy line that always evaluates to True
 & (feature_details_df['segment_category'].isin(segment_category_filter))
 & (feature_details_df['segment_family'].isin(segment_family_filter))
#  & (feature_details_df['metric'].isin(metric_filter))
#  & (feature_details_df['metric_detail'].isin(metric_detail_filter))
#  & (feature_details_df['transformation_category'].isin(transformation_category_filter))
#  & (feature_details_df['transformation_base'].isin(transformation_base_filter))
#  & (feature_details_df['transformation_method'].isin(transformation_method_filter))
    ]
 .fillna('None').groupby(groups)
 .size()
# ).columns
).sort_values(by=0,ascending=False)


### Build coin model

In [None]:
# Load the feature / target variable pairs used to build the coin model: the "wamo" and
# "como" multiwindow sets plus the "investing_"-prefixed set, all from the coin config's
# parquet folder.
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
wamo_coin_training_data_df_full = pd.read_parquet(f"{parquet_folder}/multiwindow_wamo_coin_training_data_df_full.parquet")
wamo_coin_target_var_df         = pd.read_parquet(f"{parquet_folder}/multiwindow_wamo_coin_target_var_df.parquet")
como_coin_training_data_df_full = pd.read_parquet(f"{parquet_folder}/multiwindow_como_coin_training_data_df_full.parquet")
como_coin_target_var_df         = pd.read_parquet(f"{parquet_folder}/multiwindow_como_coin_target_var_df.parquet")
investing_coin_training_data_df_full = pd.read_parquet(f"{parquet_folder}/investing_multiwindow_como_coin_training_data_df_full.parquet")
investing_coin_target_var_df         = pd.read_parquet(f"{parquet_folder}/investing_multiwindow_como_coin_target_var_df.parquet")


In [None]:
# Guard against leakage: the wamo training set and the investing set must not share
# any coin_epoch_start_date value.
if not set(
    wamo_coin_training_data_df_full.index.get_level_values('coin_epoch_start_date')
).isdisjoint(
    investing_coin_training_data_df_full.index.get_level_values('coin_epoch_start_date')
):
    raise ValueError("no overlap allowed")

In [None]:
# Re-import every module in `modules` and reload both wallet config objects from ../config
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Initialize and run model
# The wamo set is passed first and the investing set second; the como set is left
# commented out as the alternative second pair.
coin_model = cm.CoinModel(modeling_config=wallets_coin_config['coin_modeling'])
coin_model_results = coin_model.construct_coin_model(
    wamo_coin_training_data_df_full,wamo_coin_target_var_df,
    # como_coin_training_data_df_full,como_coin_target_var_df
    investing_coin_training_data_df_full,investing_coin_target_var_df
)

# Results containing 'y_train' get artifacts generated and an evaluation plot; otherwise
# the search report is shown instead (presumably the result of a parameter search run —
# confirm against CoinModel).
if 'y_train' in coin_model_results:


    # Generate and save all model artifacts
    coin_model_id, coin_evaluator, coin_scores_df = cimr.generate_and_save_coin_model_artifacts(
        model_results=coin_model_results,
        base_path='../artifacts/coin_modeling',
        configs = {
            'wallets_coin_config': wallets_coin_config.config,
            'wallets_config': wallets_config.config,
            'wallets_epochs_config': wallets_epochs_config,
            'wallets_features_config': wallets_features_config,
            'wallets_metrics_config': wallets_metrics_config,
        }
    )
    # NOTE(review): the "Post model analysis" performance report cell calls
    # plot_coin_evaluation() on a coin evaluator — confirm plot_wallet_evaluation() is
    # the intended method for a coin model here.
    coin_evaluator.plot_wallet_evaluation()
else:
    display(coin_model.generate_search_report())


In [None]:
# Score the held-out validation coin features with the model that was just built and
# plot realized returns by score bucket.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

model_id = coin_model_id
# model_id = 'bb10db73-1fda-4141-b9ca-64716b41db00'

# Validation files written by the custom-offset orchestration run under this prefix
file_prefix='investing_val'
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
como_features_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}multiwindow_como_coin_training_data_df_full.parquet")
como_target_var_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}multiwindow_como_coin_target_var_df.parquet")

# The validation epochs must not overlap the investing epochs passed to the model build
if len(set(como_target_var_df.index.get_level_values('coin_epoch_start_date')).intersection(
set(investing_coin_training_data_df_full.index.get_level_values('coin_epoch_start_date')))) > 0:
    raise ValueError("no overlap allowed")


# Convert only the coin_id level (assuming it's the first level) to str so the target
# index can be aligned with the scores index when plotting. (The unused idx_dtypes
# list that used to be built here has been removed.)
como_target_var_df.index = como_target_var_df.index.set_levels(
    como_target_var_df.index.levels[0].astype(str),
    level=0
)

como_scores_df = coin_epochs_orchestrator.score_coin_training_data(
    model_id,
    '../artifacts/coin_modeling',
    como_features_df,
)
# Log which epoch start dates were scored
logger.info(set(como_features_df.index.get_level_values('coin_epoch_start_date')))
plot_return_vs_rank(como_scores_df['score'],como_target_var_df['coin_return_winsorized'],n_buckets=50)

#### model 2

In [None]:
# Re-import every module in `modules` and reload both wallet config objects from
# ../config (the configs were previously loaded twice in a row; once is sufficient).
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Initialize and run model
# Model 2 passes the como set as the second feature / target pair instead of the
# investing set.
coin_model = cm.CoinModel(modeling_config=wallets_coin_config['coin_modeling'])
coin_model_results = coin_model.construct_coin_model(
    wamo_coin_training_data_df_full,wamo_coin_target_var_df,
    como_coin_training_data_df_full,como_coin_target_var_df
)

# Results containing 'y_train' get artifacts generated and an evaluation plot; otherwise
# the search report is shown instead.
if 'y_train' in coin_model_results:

    # Generate and save all model artifacts
    coin_model_id, coin_evaluator, coin_scores_df = cimr.generate_and_save_coin_model_artifacts(
        model_results=coin_model_results,
        base_path='../artifacts/coin_modeling',
        configs = {
            'wallets_coin_config': wallets_coin_config.config,
            'wallets_config': wallets_config.config,
            'wallets_epochs_config': wallets_epochs_config,
            'wallets_features_config': wallets_features_config,
            'wallets_metrics_config': wallets_metrics_config,
        }
    )
    coin_evaluator.plot_wallet_evaluation()
else:
    display(coin_model.generate_search_report())


### importance analysis

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules

# Build the importances frame from the current coin evaluator and parse the feature
# names; 'importance' is passed as the second argument to parse_feature_names.
importances_df = pd.DataFrame(coin_evaluator.metrics['importances'])
feature_details_df = cfo.parse_feature_names(importances_df,'importance')

# Select features
# Every .isin() filter line in the expression at the bottom is currently commented out,
# so none of these lists affects the output.
segment_category_filter = [
    # 'all_wallets',
    # 'macro',
    # 'score_quantile',
    # 'score_binary',
    # 'training_clusters',
    # 'time_series',
    # 'wallet_cohorts',
]
segment_family_filter = [
    # 'all_wallets',
    # 'net_gain_winsorized_dda619_grid_score',
    'cw_return_rate_regression_score',
    # 'time_series',
    # 'wallet_cohorts',
]
segment_value_filter = [
    'cluster_4',
]
metric_filter = [
    'trading',
    'balances',
]
metric_detail_filter = [
    'crypto_net_gain',
    'usd_balance_241031',
]
transformation_category_filter = [
    # 'aggregations',
    # 'score_wtd',
    'score_dist',
]
transformation_base_filter = [
    # 'aggregations',
    'cw_return_rate_min_040_score',
]
transformation_method_filter = [
    # 'count',
    # 'sum',
    # 'dda528_net_gain_max_inv_025_score_p90'
]

# Columns to group the features by
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    # 'metric',
    # 'metric_detail',
    # 'transformation_category',
    # 'transformation_base',
    # 'transformation_method',
    # 'feature_full',

]
# Total importance and feature count per group, showing the 20 groups with the largest
# summed importance.
pd.DataFrame(feature_details_df
 [
  (feature_details_df['segment_category'].isin(feature_details_df['segment_category']))  # Dummy line that always evaluates to True
#  & (feature_details_df['segment_category'].isin(segment_category_filter))
#  & (feature_details_df['segment_family'].isin(segment_family_filter))
#  & (feature_details_df['segment_value'].isin(segment_value_filter))
#  & (feature_details_df['metric'].isin(metric_filter))
#  & (feature_details_df['metric_detail'].isin(metric_detail_filter))
#  & (feature_details_df['transformation_category'].isin(transformation_category_filter))
#  & (feature_details_df['transformation_base'].isin(transformation_base_filter))
#  & (feature_details_df['transformation_method'].isin(transformation_method_filter))
    ]
 .fillna('None')
 .groupby(groups)['importance']
 .agg(['sum', 'count'])
).sort_values(by='sum',ascending=False).head(20)


### Generate wallet scores for investing period

#### wallet training data for the coin modeling period

In [None]:
# Here we create wallet model training data for dates through the end of the original modeling period.
# This will be used to create "current" scores as of the end of the modeling period, that can be
# used to generate features for the "current" coin model built at the end of the modeling period.

# Reload modules and every config this cell depends on
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))

# Load the complete raw datasets from the wallets_config training data folder
complete_profits_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/complete_macro_trends_df.parquet")

# Identify offset needed to generate training data directly following the modeling period to the validation period start
# The offset is the number of days between training_period_end and modeling_period_end
# (both parsed as YYYY-MM-DD strings).
modeling_offset = (datetime.strptime(wallets_config['training_data']['modeling_period_end'], '%Y-%m-%d') - datetime.strptime(wallets_config['training_data']['training_period_end'], '%Y-%m-%d')).days
coin_modeling_epochs_config = {
    'offset_epochs': {
        'offsets': [modeling_offset]
    }
}
# Initiate orchestrator
# The ad-hoc single-offset epochs config is passed in place of wallets_epochs_config.
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    coin_modeling_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df
)

# Generate TRAINING_DATA_DF for the modeling period offset window
# Only the first two of the four returned objects are kept.
como_training_data_df, como_modeling_data_df, _, _ = epochs_orchestrator.generate_epochs_training_data()

# Save files
como_training_data_df.to_parquet(f"{wallets_config['training_data']['parquet_folder']}/como_training_data_df.parquet",index=True)
como_modeling_data_df.to_parquet(f"{wallets_config['training_data']['parquet_folder']}/como_modeling_data_df.parquet",index=True)


#### save scores for coin modeling training data

In [None]:
# Re-import every module in `modules` and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# NOTE(review): model_id is not assigned in this cell (the assignment below is commented
# out), so load_and_predict uses whatever model_id is already in the notebook namespace —
# confirm it refers to the intended model.
# model_id = 'c1fd04e8-5d57-48d7-9d7d-57b61afff9d5'
score_name = wallets_config['modeling']['score_name']


# Load and predict
# NOTE(review): this reads como_coin_training_data_df_full.parquet from the coin config's
# folder, whereas the preceding cell writes como_training_data_df.parquet to the
# wallets_config folder — confirm this is the intended input.
como_training_data_df = pd.read_parquet(f"{wallets_coin_config['training_data']['parquet_folder']}"
                                           "/como_coin_training_data_df_full.parquet")

base_path = wallets_config['training_data']['model_artifacts_folder']
como_y_pred = wiva.load_and_predict(model_id,como_training_data_df,base_path)

# Create the wallet scores DataFrame: a single column named 'score|<score_name>'
# holding the predictions.
modeling_wallet_scores_df = pd.DataFrame({
    f'score|{score_name}': como_y_pred
})
modeling_wallet_scores_df.to_parquet(f"temp/wallet_modeling_score_dfs/{score_name}.parquet",index=True)

# Signal that the cell has finished (presumably an audible/desktop notification — confirm in u.notify)
u.notify(2)


## Post model analysis

### performance report

In [None]:
# Re-import every module in `modules` and reload both wallet config objects from
# ../config (the configs were previously loaded twice in a row; once is sufficient).
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Initialize evaluator
# Built directly from the coin_model_results currently in the notebook namespace
coin_evaluator = wime.RegressorEvaluator(coin_model_results)

# Text summary, evaluation plots, then the importance summary as the cell output
print(coin_evaluator.summary_report())
coin_evaluator.plot_coin_evaluation()
coin_evaluator.importance_summary(0)

### importance analysis

In [None]:
# Re-import every module in `modules` and reload both wallet config objects from
# ../config (the configs were previously loaded twice in a row; once is sufficient).
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Parse the coin model importances into one row per feature with its name components
feature_details_df = civa.analyze_coin_model_importance(coin_evaluator.metrics['importances'])

# Allowed values per parsed column. Only segment_category_filter is applied in the
# expression at the bottom; the other .isin() lines are commented out.
segment_category_filter = [
    # 'all_wallets',
    'score_quantile',
    # 'time_series',
    # 'wallet_cohorts',
    # 'training_clusters',
]
segment_family_filter = [
    'all_wallets',
    'net_gain_winsorized_dda619_grid_score',
    # 'time_series',
    # 'wallet_cohorts',
]
metric_filter = [
    # 'trading',
    'balances',
]
metric_detail_filter = [
    'crypto_net_gain',
    'usd_balance_241031',
]
transformation_filter = [
    # 'aggregations',
    # 'score_wtd',
]
transformation_method_filter = [
    'net_gain_winsorized_dda619_grid_residual_p10',
    # 'sum',
]

# Columns to group the matching features by
groups = [
    'segment_category',
    # 'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    'transformation',
    'transformation_method',
    # 'feature_full',

]

# Summed importance per group for the selected segment categories, largest first
pd.DataFrame(feature_details_df
 [
 (feature_details_df['segment_category'].isin(segment_category_filter))
#  & (feature_details_df['segment_family'].isin(segment_family_filter))
#  & (feature_details_df['metric'].isin(metric_filter))
#  & (feature_details_df['metric_detail'].isin(metric_detail_filter))
#  & (feature_details_df['transformation'].isin(transformation_filter))
#  & (feature_details_df['transformation_method'].isin(transformation_method_filter))
    ]
 .fillna('None').groupby(groups)
 .sum('importance')
# ).columns
).sort_values(by='importance',ascending=False)


In [None]:
# Load importances
feature_importance_df = pd.DataFrame(coin_evaluator.metrics['importances'])

# Split on pipe delimiters
# Assigning four column names only works when every feature name has exactly four
# pipe-delimited parts; any other count raises a length mismatch error.
split_df = feature_importance_df['feature'].str.split('|', expand=True)
split_df.columns = ['segment_category','segment_family','metric','transformation']

# Split nested components
# Each component is split on '/' and given two column names, so this likewise assumes
# at most one '/' per component — TODO confirm against the feature naming scheme.
segment_families = split_df['segment_family'].str.split('/', expand=True)
segment_families.columns = ['segment_family', 'segment_value']

metrics = split_df['metric'].str.split('/', expand=True)
metrics.columns = ['metric', 'metric_detail']

transformations = split_df['transformation'].str.split('/', expand=True)
transformations.columns = ['transformation', 'transformation_method']

# Combine all components
# Column-wise concat relies on all pieces sharing feature_importance_df's row index.
feature_details_df = pd.concat([
    split_df['segment_category'],
    segment_families,
    metrics,
    transformations,
    feature_importance_df['importance']
], axis=1)

# Cell output: one row per feature with its parsed components and importance
feature_details_df

In [None]:
# Cell output: the raw feature names from the importances frame
list(feature_importance_df['feature'])

In [None]:
# Columns to aggregate importance over; comment entries in or out to change the grain
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    # 'transformation',
    # 'transformation_method',
]

# Cell output: summed importance per group, largest first
feature_details_df.groupby(groups).sum('importance').sort_values(by='importance',ascending=False)

In [None]:
# NOTE(review): result_df is not assigned in any nearby cell — it must already exist in
# the notebook namespace for this display to work.
result_df

## analyze features

### basic correlation

In [None]:
# Pairwise correlations between all columns of the coin modeling data
correlation_matrix = coin_modeling_df.corr()

# Each column's correlation with the target variable, highest value first
target_correlations = correlation_matrix[target_var_column].sort_values(ascending=False)

# Show the 15 highest correlations
target_correlations.head(15)

In [None]:
# Re-import every module listed in `modules` so code edits take effect in this kernel
for module in modules:
    importlib.reload(module)

# Load the wallet and wallet-coin configs from the config directory
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Analyze the 15 columns with the highest correlation to the target variable.
# To analyze every column instead, use coin_modeling_df.columns here.
wallet_metrics = target_correlations[:15].index.values

# number of score buckets
n_quantiles = 5

analyze_df = civa.analyze_metric_segments(
    coin_modeling_df,
    wallet_metrics,
    n_quantiles,
    target_var_column,
)
civa.style_metric_segments(analyze_df)

# Pre Coin Model Analysis

### Wallet aggregated analysis

#### generate validation wallet features

In [None]:
# Re-import every module listed in `modules` so code edits take effect in this kernel
for module in modules:
    importlib.reload(module)

# Load the wallet and wallet-coin configs from the config directory
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Start from an empty frame with one row per wallet in the training cohort
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'

# Trading features computed over the validation period bounds from the config
validation_trading_features_df = wtf.calculate_wallet_trading_features(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    include_twb_metrics=False,
)

# Left join keeps the whole cohort; wallets absent from the trading features
# get 0 in every trading feature column
validation_wallet_features_df = (
    validation_wallet_features_df
    .join(validation_trading_features_df, how='left')
    .fillna({col: 0 for col in validation_trading_features_df.columns})
)

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(
    validation_wallet_features_df,
    include_twb_metrics=False,
)
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')

In [None]:
# Display the validation wallet features frame as the cell output
validation_wallet_features_df

#### wallet validation period trading/performance by score quantile

In [None]:
# Load the wallet scores named in the wallet_segments section of the coin config
wallet_segments_config = wallets_coin_config['wallet_segments']
modeling_wallet_scores_df = cfo.load_wallet_scores(
    wallet_segments_config['wallet_scores'],
    wallet_segments_config['wallet_scores_path'],
)


In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Report settings
num_quantiles = 5
min_wallet_volume_usd = 0

# Validation-period columns summarized for each score quantile
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

# Wallets are bucketed by the score column named in the modeling config
score_column = wallets_config['modeling']['score_name']

wiva.create_quantile_report(
    validation_wallet_features_df,
    modeling_wallet_scores_df[score_column],
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


In [None]:
# NOTE(review): this cell runs the same quantile report as the cell directly
# above it — consider keeping only one of the two.

# Pick up module edits and the latest wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Columns of the validation wallet features to report per score quantile
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

min_wallet_volume_usd = 0  # passed to the report as the minimum wallet volume
num_quantiles = 5          # number of score buckets

wiva.create_quantile_report(
    validation_wallet_features_df,
    modeling_wallet_scores_df[wallets_config['modeling']['score_name']],
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


### old analysis

In [None]:
# Re-import every module listed in `modules` so code edits take effect in this kernel
for module in modules:
    importlib.reload(module)

# Load the wallet and wallet-coin configs from the config directory
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Wallet metrics to analyze
wallet_metrics = [
    'top_100pct/balance_wtd_mean_score',
    'top_10pct/count',
    'top_25pct/count',
    'top_50pct/count',
    'top_100pct/count',
    'top_10pct/count_pct',
    'top_10pct/balance_pct',
    'top_25pct/count_pct',
    'top_25pct/balance_pct',
    'top_50pct/count_pct',
    'top_50pct/balance_pct',
]
# wallet_metrics = list(validation_coin_wallet_features_df.columns)

# Create styled performance analysis
civa.create_top_coins_wallet_metrics_report(
    validation_coin_wallet_features_df,
    percentile=90,
    wallet_metrics=wallet_metrics,
    method='mean',
)


#### plotting coin feature performance vs market cap

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Run the market cap segment analysis on the coin wallet features
segment_results, summary_df = civa.analyze_market_cap_segments(coin_wallet_features_df, top_n=10)

# Plot the summary: heatmap first, then the metric consistency chart
civa.plot_segment_heatmap(summary_df)
civa.plot_metric_consistency(summary_df)


#### coin performance of top n for each bucket

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Analysis parameters come from the coin_validation_analysis config section
coin_validation_config = wallets_config['coin_validation_analysis']
top_n = coin_validation_config['top_n']
max_market_cap = coin_validation_config['max_market_cap']
min_market_cap = coin_validation_config['min_market_cap']

# Run the validation and display the resulting frame
metric_top_coin_performance_df = civa.validate_coin_performance(
    coin_wallet_features_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

metric_top_coin_performance_df

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Performance analysis for the coin wallet features
civa.print_performance_analysis(coin_wallet_features_df)

# Appendix: Single Window Construction


### Training Data Sequence

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Metrics and features configs are read from their YAML files
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
features_config_text = Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
wallets_features_config = yaml.safe_load(features_config_text)

# Build the orchestrator; it receives a deep copy of the wallets config dict
orchestrator_wallets_config = copy.deepcopy(wallets_config.config)
training_data_orchestrator = wtdo.WalletTrainingDataOrchestrator(
    orchestrator_wallets_config,
    wallets_metrics_config,
    wallets_features_config,
)

In [None]:
# Retrieve the datasets for the training period. The four returned objects are
# discarded here (presumably they are persisted under the 'training' parquet
# prefix — confirm in the orchestrator).
training_data_config = wallets_config['training_data']
_, _, _, _ = training_data_orchestrator.retrieve_period_datasets(
    training_data_config['training_period_start'],
    training_data_config['training_period_end'],
    parquet_prefix='training',
)

In [None]:
# Read the full training-period datasets back from the parquet folder
parquet_folder = wallets_config['training_data']['parquet_folder']
training_profits_df_full = pd.read_parquet(
    f"{parquet_folder}/training_profits_df_full.parquet"
)
training_market_data_df_full = pd.read_parquet(
    f"{parquet_folder}/training_market_data_df_full.parquet"
)
training_macro_trends_df_full = pd.read_parquet(
    f"{parquet_folder}/training_macro_trends_df_full.parquet"
)

# Select cohort and prepare training data; the return value is not used here
_ = training_data_orchestrator.prepare_training_data(
    training_profits_df_full,
    training_market_data_df_full,
    training_macro_trends_df_full,
)

# When wallet ID hybridization is enabled, pickle the orchestrator's hybrid ID
# map next to the parquet files
if wallets_config['training_data']['hybridize_wallet_ids']:
    hybrid_map_path = f"{parquet_folder}/hybrid_cw_id_map.pkl"
    pd.to_pickle(training_data_orchestrator.hybrid_cw_id_map, hybrid_map_path)

In [None]:
# Read the prepared training inputs from the parquet folder
parquet_folder = wallets_config['training_data']['parquet_folder']
training_profits_df = pd.read_parquet(
    f"{parquet_folder}/training_profits_df.parquet"
)
training_market_indicators_df = pd.read_parquet(
    f"{parquet_folder}/training_market_indicators_data_df.parquet"
)
training_macro_indicators_df = pd.read_parquet(
    f"{parquet_folder}/training_macro_indicators_df.parquet"
)
training_transfers_df = pd.read_parquet(
    f"{parquet_folder}/training_transfers_sequencing_df.parquet"
)

# Generate training features from the four inputs
training_data_orchestrator.generate_training_features(
    training_profits_df,
    training_market_indicators_df,
    training_macro_indicators_df,
    training_transfers_df,
)

# Signal that the cell has finished
u.notify(3)

### Wallet Model Target Variable and Wallet Cohort

In [None]:
# The coin cohort is the set of distinct coin_ids in the training market indicators parquet
market_indicators_coin_ids = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)
training_coin_cohort = market_indicators_coin_ids['coin_id'].unique()

# Retrieve the modeling-period datasets for that cohort; the four returned
# objects are discarded here
_, _, _, _ = training_data_orchestrator.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    training_coin_cohort,
    parquet_prefix='modeling',
)

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Metrics and features configs are read from their YAML files
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
features_config_text = Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
wallets_features_config = yaml.safe_load(features_config_text)

# The wallet cohort is the index of the full training data parquet; requesting
# no columns means only the index is read
training_data_index_only_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet",
    columns=[],
)
training_wallet_cohort = training_data_index_only_df.index.values

# Build the orchestrator with a deep copy of the wallets config dict and the cohort
orchestrator_wallets_config = copy.deepcopy(wallets_config.config)
training_data_orchestrator = wtdo.WalletTrainingDataOrchestrator(
    orchestrator_wallets_config,
    wallets_metrics_config,
    wallets_features_config,
    training_wallet_cohort,
)

In [None]:
# Full modeling-period profits, read from the parquet folder
modeling_profits_df_full = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/modeling_profits_df_full.parquet"
)

# The hybrid ID map is only loaded when wallet ID hybridization is enabled;
# otherwise None is passed through
hybrid_cw_id_map = (
    pd.read_pickle(f"{wallets_config['training_data']['parquet_folder']}/hybrid_cw_id_map.pkl")
    if wallets_config['training_data']['hybridize_wallet_ids']
    else None
)

# Prepare modeling features for target variables; the return value is not used here
_ = training_data_orchestrator.prepare_modeling_features(
    modeling_profits_df_full,
    hybrid_cw_id_map,
)

# Signal that the cell has finished
u.notify(3)

### Wallet Model Construction and Analysis

#### select target variable (loadable parquet)

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Modeling-period wallet features, which hold the candidate target variables
modeling_wallet_features_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/modeling_wallet_features_df.parquet"
)

# Keep only the cohort flag and the target variable named in the modeling config
target_variable_columns = ['in_modeling_cohort', wallets_config['modeling']['target_variable']]
modeling_cohort_target_var_df = modeling_wallet_features_df[target_variable_columns].copy()

# Training features for the full training wallet cohort
wallet_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
)
logger.info("Training data df shape: %s", wallet_training_data_df.shape)

#### build wallet model or run search

In [None]:
# Re-import every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Retrieve training data for the full training wallet cohort
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")

# Model object configured from the 'modeling' section of the wallets config
wallet_model = wm.WalletModel(wallets_config['modeling'])

# Validate that the training features and the target frame carry the same
# index values on every index level. Each index is sorted once, on a
# zero-column view, so the check does not re-sort per level or copy the
# feature data.
sorted_training_index = wallet_training_data_df.iloc[:, :0].sort_index().index
sorted_target_index = modeling_cohort_target_var_df.iloc[:, :0].sort_index().index
if not all(
    sorted_training_index.get_level_values(level).equals(
        sorted_target_index.get_level_values(level)
    )
    for level in sorted_training_index.names
):
    raise ValueError("Merged training and modeling DataFrames have mismatched indices.")

# Build the model (or run the search), then release the training frame to free memory
wallet_model_results = wallet_model.construct_wallet_model(wallet_training_data_df,modeling_cohort_target_var_df)
del wallet_training_data_df
gc.collect()

# The presence of 'y_train' in the results selects the branch that saves
# artifacts and prints the summary; otherwise the search report is displayed
if 'y_train' in wallet_model_results:

    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs = {
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config
        },
        save_scores=False
    )
    print(wallet_evaluator.summary_report())
else:
    display(wallet_model.generate_search_report())

# Junkyard

# Tests failing

In [None]:
# Re-import every module listed in `modules` so code edits take effect in this kernel
for module in modules:
    importlib.reload(module)

# Load the wallet and wallet-coin configs from the config directory
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
