### start

In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import copy
import logging
import re
from itertools import chain,combinations
import pdb
from pathlib import Path
import pickle
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import pandas_gbq
from google.cloud import bigquery
import scipy
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    log_loss,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Render floats with up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Suppress warnings
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables: silence pygame donation request, then set the
# logging file and notification sounds locations
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'NOTIFICATION_SOUNDS_DIR': "../../../Local",
})

# Dark mode charts: dark gray backgrounds with light green text/ticks
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import insights.modeling as m
import insights.analysis as ia
import insights.experiments as exp
import feature_engineering.coin_flow_features_orchestrator as cffo

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.trading_features as wtf
import wallet_features.performance_features as wpf
import wallet_features.transfers_features as wts
import wallet_features.scenario_features as wsc
import wallet_features.balance_features as wbf
import wallet_features.macroeconomic_features as wmac
import wallet_features.wallet_features_orchestrator as wfo

# Base modeling
import base_modeling.base_model as bm
import base_modeling.feature_selection as fs
import base_modeling.pipeline as bp
import base_modeling.pipeline as bsc

# Wallet modeling
import wallet_modeling.wallet_training_data_orchestrator as wtdo
import wallet_modeling.wallet_epochs_orchestrator as weo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.wallet_model as wm
import wallet_modeling.wallet_model_orchestrator as wmo
import wallet_modeling.wallets_config_manager as wcm
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet insights
import wallet_insights.wallet_model_reporting as wimr
import wallet_insights.model_evaluation as wime
import wallet_insights.wallet_validation_analysis as wiva
import wallet_insights.wallet_cluster_analysis as wica

# Coin features
import coin_wallet_features.coin_features_orchestrator as cfo
import coin_wallet_features.wallet_metrics as cfwm
import coin_wallet_features.wallet_metrics_flattening as cfwmf
import coin_wallet_features.wallet_segmentation as cws

# Coin modeling
import coin_modeling.coin_model as cm
import coin_modeling.coin_epochs_orchestrator as ceo
from coin_modeling.coin_config_manager import WalletsCoinConfig

# Coin insights
import coin_insights.coin_validation_analysis as civa
import coin_insights.coin_model_reporting as cimr


# Modules to hot-reload, one unpacked group per package (order preserved).
# Note: bp and bsc are both aliases of base_modeling.pipeline, so that module
# appears twice in the list.
modules = [
    *(u, dr, pri, cwm, ind, fg, flt, ds, tv, prp, m, ia, exp, cffo),
    *(wtdo, weo, wtd, wm, wmo, wcm),                       # wallet modeling
    *(wcl, wmc, wmt, wtf, wpf, wts, wsc, wbf, wmac, wfo),  # wallet features
    *(bm, fs, bp, bsc),                                    # base modeling
    *(wimr, wime, wiva, wica),                             # wallet insights
    *(cfo, cfwm, cfwmf, cws),                              # coin features
    *(cm, ceo),                                            # coin modeling
    *(civa, cimr),                                         # coin insights
]

# Coin flow configs come back from the shared loader as a 4-tuple
coin_flow_config, coin_flow_metrics_config, coin_flow_modeling_config, coin_flow_experiments_config = (
    u.load_all_configs('../config')
)

# Wallet and wallet-coin configs, checked against the coin flow config
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wcm.validate_config_alignment(coin_flow_config, wallets_config, wallets_coin_config)

# Metrics configs go through u.load_config
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_coins_metrics_config = u.load_config('../config/wallets_coins_metrics_config.yaml')

# Features and epochs configs are read as plain YAML
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# make parquet dirs if they don't already exist
Path(wallets_config['training_data']['parquet_folder']).mkdir(parents=True, exist_ok=True)
Path(wallets_coin_config['training_data']['parquet_folder']).mkdir(parents=True, exist_ok=True)

# Set the custom error handler. get_ipython() returns None when this code is
# executed outside an IPython session (e.g. as an exported script), so only
# register the handler when an interactive shell actually exists.
ipython = IPython.get_ipython()
if ipython is not None:
    ipython.set_custom_exc((Exception,), u.notify_on_failure)

# configure logger
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)


# u.export_code(
#     code_directories=[
#         # 'training_data',
#         'wallet_modeling',
#         'wallet_features',
#         'feature_engineering',
#         'coin_wallet_features',
# #         'base_modeling',
# #         'coin_modeling',
# #         # 'coin_insights',
# #         # 'wallet_insights'
#     ],
#     # include_config = True,
#     # ipynb_notebook = 'DDA-769 coin model score dist toggle.ipynb'
# )

# Reload every project module so source edits are picked up
for module in modules:
    importlib.reload(module)
u.notify('retro')

logger.info("Good morning, let's get to work")

# Wallet Model Construction

### Load complete wallet datasets

In [None]:
for module in modules:
    importlib.reload(module)

# Refresh the configs handed to the wallet epochs orchestrator
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# Initiate orchestrator without any preloaded dataframes, then load raw data
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config, wallets_metrics_config,
    wallets_features_config, wallets_epochs_config,
)
epochs_orchestrator.load_complete_raw_datasets()

### Generate modeling and validation features (parquet loadable)

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# Read the complete datasets back from the parquet folder
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{parquet_folder}/complete_macro_trends_df.parquet")
complete_hybrid_cw_id_df = pd.read_parquet(f"{parquet_folder}/complete_hybrid_cw_id_df.parquet")

# Initiate orchestrator with the preloaded datasets
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config, wallets_metrics_config,
    wallets_features_config, wallets_epochs_config,
    complete_profits_df, complete_market_data_df,
    complete_macro_trends_df, complete_hybrid_cw_id_df,
)

# Generate training and modeling dfs for all windows
(
    wallet_training_data_df,
    wallet_target_vars_df,
    validation_training_data_df,
    validation_target_vars_df,
) = epochs_orchestrator.generate_epochs_training_data()

# Confirm all pairs have a hybrid mapping (validation set only if non-empty)
if complete_hybrid_cw_id_df is not None:
    wtdo.validate_hybrid_mapping_completeness(wallet_training_data_df, complete_hybrid_cw_id_df)
    if not validation_training_data_df.empty:
        wtdo.validate_hybrid_mapping_completeness(validation_training_data_df, complete_hybrid_cw_id_df)

# Save files (the folder is re-read from config, as in the load step above)
wallet_training_data_df.to_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_training_data_df.parquet",
    index=True,
)
wallet_target_vars_df.to_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_target_vars_df.parquet",
    index=True,
)
validation_training_data_df.to_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_training_data_df.parquet",
    index=True,
)
validation_target_vars_df.to_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_target_vars_df.parquet",
    index=True,
)

# sorted(list(wallet_training_data_df.columns))

#### parse columns

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Put every training column into an importance-style dict (uniform weight 1)
# so the importance analyzer can be reused to break the column names apart
features_dict = {
    'feature': list(wallet_training_data_df.columns),
    'importance': [1] * len(wallet_training_data_df.columns),
}
feature_importances_df = wiva.analyze_wallet_model_importance(features_dict)
feature_importances_df = feature_importances_df.copy()

feature_categories_filter = [
    # 'performance',
    # 'timing',
    # 'cw_timing',
    'trading',
    # 'transfers',
    # 'mktcap',
    # 'scenario',
    # 'macro',
    # 'cluster',
]

feature_names_filter = [
    # 'price_sma_2',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
    # 'btc_vdd_multiple',
    'gtrends_memecoin_us',
]

groups = [
    # 'feature_category',
    # 'feature_name',
    # 'feature_comparison',
    # 'feature_aggregation',
    # 'training_segment',
    # 'feature'
]

# pandas raises "No group keys passed!" when groupby receives an empty list,
# so fall back to the category column when every option above is commented out
group_cols = groups or ['feature_category']

(feature_importances_df
 [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None')
 .groupby(group_cols)['importance']
 .agg(['sum', 'count'])
 .sort_values(by='sum',ascending=False)
)

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Check the configured drop params against the current training columns
fs.validate_drop_params(wallet_training_data_df, wallets_config)

### Construct wallet model

#### wallet model w validation (parquet loadable)

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Load modeling files
wallet_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_training_data_df.parquet"
)
wallet_target_vars_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_target_vars_df.parquet"
)

# Load validation files
validation_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_training_data_df.parquet"
)
validation_target_vars_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_target_vars_df.parquet"
)

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Run the experiment and get results
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    wallet_target_vars_df,
    validation_training_data_df,
    validation_target_vars_df,
)

# Results without 'y_train' are shown as a search report; otherwise the
# artifacts are saved and the evaluator summary and plots are produced
if 'y_train' not in wallet_model_results:
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config,
        },
    )
    wallet_evaluator.summary_report()
    wallet_evaluator.plot_wallet_evaluation()

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Model construction is disabled here; the existing wallet_model and
# wallet_model_results objects in the notebook are reused.
# # Run the experiment and get results
# wallet_model = wm.WalletModel(wallets_config['modeling'])
# wallet_model_results = wallet_model.construct_wallet_model(
#     wallet_training_data_df, wallet_target_vars_df,
#     validation_training_data_df, validation_target_vars_df
# )

# Results without 'y_train' are shown as a search report instead
if 'y_train' not in wallet_model_results:
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config,
        },
    )
    wallet_evaluator.summary_report()
    wallet_evaluator.plot_wallet_evaluation()

##### wallet model 2

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Second run: build the model from the freshly reloaded modeling config
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    wallet_target_vars_df,
    validation_training_data_df,
    validation_target_vars_df,
)

# Results without 'y_train' are shown as a search report instead
if 'y_train' not in wallet_model_results:
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config,
        },
    )
    wallet_evaluator.summary_report()
    wallet_evaluator.plot_wallet_evaluation()

#### wallet model without validation (parquet loadable)

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Load modeling files (no validation data in this variant)
wallet_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_training_data_df.parquet"
)
wallet_target_vars_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_target_vars_df.parquet"
)

# Run the experiment and get results
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    wallet_target_vars_df,
)

# Results without 'y_train' are shown as a search report instead
if 'y_train' not in wallet_model_results:
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts (epochs config is not included)
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
        },
    )
    wallet_evaluator.summary_report()

#### upload scores to bigquery

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Identify the model and describe the score being uploaded
model_id = '5267f929-3d7c-40f5-afc7-e40377a60c2a'
score_name = 'dda800 non-cw net flows classifier 0.80'
score_notes = 'test set auc 1.0 hmmm'

# Inputs: hybrid id mapping plus the "_pred" training data parquet
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_hybrid_cw_id_df = pd.read_parquet(f"{parquet_folder}/complete_hybrid_cw_id_df.parquet")
training_data_df = pd.read_parquet(f"{parquet_folder}/multiwindow_wallet_training_data_df_pred.parquet")

wimr.generate_and_upload_wallet_cw_scores(
    wallets_config, training_data_df, complete_hybrid_cw_id_df,
    model_id, score_name, score_notes,
)

#### predict training data with existing model only

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

base_path = wallets_config['training_data']['model_artifacts_folder']
parquet_folder = wallets_config['training_data']['parquet_folder']
model_id = '85e79c0e-c6a6-4514-97bb-277b945086fd'
score_name = 'dda785_net_flows'

# Load and predict
training_data_df = pd.read_parquet(f"{parquet_folder}/multiwindow_wallet_training_data_df.parquet")
y_pred = wiva.load_and_predict(model_id, training_data_df, base_path)
wallet_scores_df = pd.DataFrame({
    'score': y_pred
})

# Load the hybrid id mapping here rather than relying on another cell having
# left it in memory, so this cell can run on its own
complete_hybrid_cw_id_df = pd.read_parquet(f"{parquet_folder}/complete_hybrid_cw_id_df.parquet")

# A single timestamp keeps the scored_at column and the table name in agreement
scored_at = datetime.now()

wallet_scores_df = wtdo.dehybridize_wallet_address(wallet_scores_df, complete_hybrid_cw_id_df)
wallet_scores_df = wallet_scores_df.reset_index()
wallet_scores_df['model_id'] = model_id
wallet_scores_df['scored_at'] = scored_at
wallet_scores_df['model_type'] = wallets_config['modeling']['model_type']
wallet_scores_df['target_var'] = wallets_config['modeling']['target_variable']
wallet_scores_df['target_var_threshold'] = wallets_config['modeling']['target_var_min_threshold']

# Table name: timestamp plus the target variable with '/' replaced by '_'
table_name = f"wallets_{scored_at.strftime('%Y%m%d_%Hh%Mm%Ss')}_{wallets_config['modeling']['target_variable'].replace('/','_')}"

# Upload to BigQuery; 'fail' aborts rather than overwriting an existing table
pandas_gbq.to_gbq(
    dataframe=wallet_scores_df,
    destination_table=f'scores.{table_name}',
    project_id='western-verve-411004',
    if_exists='fail'  # Options: 'fail', 'replace', or 'append'
)


### Wallet Model Evaluation

#### load evaluation report

In [None]:
for module in modules:
    importlib.reload(module)

# Load the saved report for one model; configs_output is passed through to
# the loader as its third argument
report = wimr.load_model_report(
    model_id := '3493a19d-0ee3-4272-ab52-40afc6ab6d1b',
    base_path := wallets_config['training_data']['model_artifacts_folder'],
    configs_output := 'temp/configs_revival/dda_691_3493a19d',
)

#### importance analysis

In [None]:
# Show the importance summary from the evaluator currently held in the notebook
# NOTE(review): the meaning of the positional argument (1) is defined in the
# evaluator class, which is not visible here - confirm before changing it
wallet_evaluator.importance_summary(1)

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Reload evaluator, choosing the class from the model type so regression
# results are not pushed through the classifier evaluator (this matches the
# selection used in the segment performance cell)
if wallet_model_results['model_type'] == 'regression':
    wallet_evaluator = wime.RegressorEvaluator(wallet_model_results)
else:
    wallet_evaluator = wime.ClassifierEvaluator(wallet_model_results)

feature_importances_df = wiva.analyze_wallet_model_importance(wallet_evaluator.metrics['importances'])
feature_importances_df = feature_importances_df.copy()

feature_categories_filter = [
    # 'performance',
    # 'cw_timing',
    # 'trading',
    # 'transfers',
    # 'cw_mktcap',
    # 'scenario',
    # 'macro',
    # 'cluster',
]

feature_names_filter = [
    # 'price_sma_2',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
    # 'portfolio_mcap_max',
    # 'crypto_net_flows',
]

groups = [
    # 'feature_category',
    # 'feature_name',
    # 'feature_comparison',
    # 'feature_aggregation',
    'training_segment',
    # 'feature'
]

# Sum and count importances per group, largest total first (both filters are
# currently disabled)
(feature_importances_df
#  [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None')
 .groupby(groups)['importance']
 .agg(['sum', 'count'])
 .sort_values(by='sum',ascending=False)
)

#### predict validation data with existing model

In [None]:
# Score the validation features with a previously saved model
model_id = 'bfa55a33-712e-4d82-bb5c-11fc942bcb62'
validation_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_training_data_df.parquet"
)

validation_y_pred = wiva.load_and_predict(
    model_id, validation_training_data_df, wallets_config['training_data']['model_artifacts_folder']
)


In [None]:
# Load the validation targets and keep the configured target variable column
validation_target_vars_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/multiwindow_validation_target_vars_df.parquet"
)
validation_y_true = validation_target_vars_df[wallets_config['modeling']['target_variable']]

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    log_loss,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

def evaluate_classification(y_true: pd.Series, y_prob: pd.Series, threshold: float = 0.0) -> dict:
    """
    Calculate metrics for binary classification based on continuous values.

    Both series are restricted to the ids they share, then binarized with the
    same threshold (value > threshold -> 1). Classification metrics use the
    binarized values; mse and mae use the raw continuous values.

    Params:
    - y_true (Series): Actual continuous values, indexed by id.
    - y_prob (Series): Predicted continuous values, indexed by id.
    - threshold (float): Threshold for converting to binary. Default 0.0.

    Returns:
    - dict: accuracy, precision, recall, f1, confusion_matrix (always a 2x2
        nested list ordered [0, 1]), mse and mae computed on overlapping ids,
        plus roc_auc when both classes appear in the binarized y_true.

    Raises:
    - ValueError: If y_true and y_prob share no ids.
    """
    # Identify common ids between y_true and y_prob
    common_idx = y_true.index.intersection(y_prob.index)
    if len(common_idx) == 0:
        raise ValueError("No overlapping ids between y_true and y_prob")

    # Filter to only overlapping ids
    y_true_common = y_true.loc[common_idx].values
    y_prob_common = y_prob.loc[common_idx].values

    # Convert continuous values to binary for classification metrics
    y_true_binary = (y_true_common > threshold).astype(int)
    y_pred_binary = (y_prob_common > threshold).astype(int)

    # Compute metrics. zero_division=0 keeps the score at 0 without emitting
    # an undefined-metric warning when a class is never predicted or present,
    # and explicit labels keep the confusion matrix 2x2 even when only one
    # class occurs.
    metrics = {
        'accuracy': accuracy_score(y_true_binary, y_pred_binary),
        'precision': precision_score(y_true_binary, y_pred_binary, zero_division=0),
        'recall': recall_score(y_true_binary, y_pred_binary, zero_division=0),
        'f1': f1_score(y_true_binary, y_pred_binary, zero_division=0),
        'confusion_matrix': confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1]).tolist(),
        'mse': mean_squared_error(y_true_common, y_prob_common),
        'mae': mean_absolute_error(y_true_common, y_prob_common)
    }

    # Add ROC AUC if we have both positive and negative classes
    if len(np.unique(y_true_binary)) > 1:
        metrics['roc_auc'] = roc_auc_score(y_true_binary, y_prob_common)

    return metrics


# Evaluate the validation predictions with the default threshold of 0.0
evaluate_classification(validation_y_true,validation_y_pred)

#### assess segment performance

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()
pd.set_option('display.max_colwidth', None)  # Shows full text in columns


# Reload evaluator, picking the class that matches the model type
wallet_evaluator = (
    wime.RegressorEvaluator(wallet_model_results)
    if wallet_model_results['model_type'] == 'regression'
    else wime.ClassifierEvaluator(wallet_model_results)
)

# Candidate segmentation features keyed by training segment; the subscript
# after the dict selects which set is used
segmentation_features = {
    'all_windows': [
        # 'mktcap|portfolio_mcap_mean/market_cap_unadj|all_windows',
        'mktcap|volume_wtd_market_cap/market_cap_filled|all_windows',
        # 'timing|btc_mvrv_z_score/buy_weighted|all_windows',
        # 'timing|btc_mvrv_z_score/sell_weighted|all_windows',
        # 'macro|btc_mvrv_z_score_first|all_windows',
        # 'macro|btc_mvrv_z_score_last|all_windows',
        'trading|crypto_net_gain|all_windows',
        'trading|total_volume|all_windows',
        'trading|crypto_net_cash_flows|all_windows',
        'trading|unique_coins_traded|all_windows',
        # 'transfers|first_buy/median_avg_wallet_rank|all_windows',
        'trading|max_investment|all_windows'
    ],
    'w5': [
        # 'mktcap|portfolio_mcap_mean/market_cap_unadj|w5',
        'mktcap|volume_wtd_market_cap/market_cap_filled|w5',
        # 'timing|btc_mvrv_z_score/buy_weighted|w5',
        # 'timing|btc_mvrv_z_score/sell_weighted|w5',
        # 'macro|btc_mvrv_z_score_first|w5',
        # 'macro|btc_mvrv_z_score_last|w5',
        'trading|crypto_net_gain|w5',
        'trading|total_volume|w5',
        'trading|crypto_net_cash_flows|w5',
        'trading|unique_coins_traded|w5',
        # 'transfers|first_buy/median_avg_wallet_rank|w5',
        'trading|max_investment|w5'
    ],
}['w5']


# get raw segments
segments_df = wallet_evaluator.identify_predictive_populations(
    segmentation_features, min_pop_pct=0.02, max_segments=25
)

# Display segments ordered by R2 relative to overall, best first
# segments_df.sort_values('RMSE vs Overall', ascending=True)
segments_df.sort_values('R2 vs Overall', ascending=False)
# segments_df.describe()
# segments_df.describe()


#### modeling multi window r2 comparison

In [None]:
# Epoch label of every row, read once from the index
epoch_values = modeling_wallet_scores_df.index.get_level_values('epoch_start_date')
epochs = sorted(epoch_values.unique())

# Cohort filter (identical for every epoch)
in_cohort = modeling_wallet_scores_df['in_modeling_cohort'].eq(True)

for epoch in epochs:
    epoch_rows = modeling_wallet_scores_df[(epoch_values == epoch) & in_cohort]
    y_true = epoch_rows['actual']
    y_pred = epoch_rows['score']

    # Only report epochs that have at least one actual value
    if not y_true.isna().all():
        metrics = wiva.evaluate_predictions(y_true, y_pred)
        print(f"Epoch {epoch}: R² = {metrics['r2']:.3f}")

#### Cluster analysis

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load parquet
wallet_training_data_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
)

# Names of the x features with the highest importance in the model
x_features = 6
importances_df = pd.DataFrame(wallet_evaluator.metrics['importances'])
top_feature_metrics = list(
    importances_df.sort_values(by='importance', ascending=False).head(x_features)['feature']
)
comparison_metrics = list(set(top_feature_metrics))

# Cluster numbers
n_clusters = 4

styled_df, cluster_results_df = wica.create_cluster_report(
    wallet_training_data_df, wallet_model_results, n_clusters, comparison_metrics, 'median'
)

# Drop the full training frame and collect garbage before displaying
del wallet_training_data_df
gc.collect()

styled_df

In [None]:
# NOTE(review): the cluster report cell deletes wallet_training_data_df, so
# this line raises NameError when run straight after it - reload the frame
# first if needed
modeling_df = wallet_training_data_df.copy()

base_metrics = [
    'trading|max_investment|all_windows',
    'trading|crypto_net_gain|all_windows',
    'mktcap|end_portfolio_wtd_market_cap|all_windows',
    'performance|crypto_net_gain/max_investment/base|all_windows',
]

# Every 'cluster|' column plus the base and comparison metrics, de-duplicated
cluster_cols = [
    col
    for col in modeling_df.columns
    if col.startswith('cluster|')
]
selected_cols = list(set(cluster_cols + base_metrics + comparison_metrics))
cluster_analysis_df = modeling_df[selected_cols].copy()

# Assign wallets to categorical clusters based on the distance values
cluster_assignments_df = wcl.assign_clusters_from_distances(
    cluster_analysis_df,
    wallets_config['features']['clustering_n_clusters'],
)
# cluster_analysis_df = cluster_analysis_df.join(cluster_assignments_df,how='inner')
# cluster_analysis_df = cluster_analysis_df.join(cluster_assignments_df,how='inner')



In [None]:
# Display the column names selected for the cluster analysis
cluster_analysis_df.columns.tolist()

In [None]:
# Display the cluster assignments computed from the distance columns
cluster_assignments_df

# Coin Model Construction

## Coin model training data generation

### pull all data and generate all features

In [None]:
for module in modules:
    importlib.reload(module)

# Reload every config the coin epochs orchestrator takes
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
(
    coin_flow_config,
    coin_flow_metrics_config,
    coin_flow_modeling_config,
    coin_flow_experiments_config,
) = u.load_all_configs('../config')
wcm.validate_config_alignment(coin_flow_config, wallets_config, wallets_coin_config)
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)
wallets_coins_metrics_config = u.load_config('../config/wallets_coins_metrics_config.yaml')

# Initiate orchestrator and load the raw datasets
coin_epochs_orchestrator = ceo.CoinEpochsOrchestrator(
    wallets_coin_config, wallets_coins_metrics_config,
    wallets_config, wallets_metrics_config, wallets_features_config, wallets_epochs_config,
    coin_flow_config, coin_flow_modeling_config, coin_flow_metrics_config,
)
coin_epochs_orchestrator.load_complete_raw_datasets()

In [None]:
for module in modules:
    importlib.reload(module)

# Reload every config the coin epochs orchestrator takes
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
(
    coin_flow_config,
    coin_flow_metrics_config,
    coin_flow_modeling_config,
    coin_flow_experiments_config,
) = u.load_all_configs('../config')
wcm.validate_config_alignment(coin_flow_config, wallets_config, wallets_coin_config)
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)
wallets_coins_metrics_config = u.load_config('../config/wallets_coins_metrics_config.yaml')

# Initiate orchestrator and load the raw datasets
coin_epochs_orchestrator = ceo.CoinEpochsOrchestrator(
    wallets_coin_config, wallets_coins_metrics_config,
    wallets_config, wallets_metrics_config, wallets_features_config, wallets_epochs_config,
    coin_flow_config, coin_flow_modeling_config, coin_flow_metrics_config,
)
coin_epochs_orchestrator.load_complete_raw_datasets()

# Training epochs always run; validation epochs run only when any are configured
coin_epochs_orchestrator.orchestrate_coin_epochs(
    wallets_coin_config['training_data']['coin_epochs_training'], file_prefix='training_'
)
if len(wallets_coin_config['training_data']['coin_epochs_validation']) > 0:
    coin_epochs_orchestrator.orchestrate_coin_epochs(
        wallets_coin_config['training_data']['coin_epochs_validation'], file_prefix='validation_'
    )

### parse columns

In [None]:

# Read the coin training data and display all of its column names
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
coin_training_data_df = pd.read_parquet(
    f"{parquet_folder}/training_multiwindow_coin_training_data_df.parquet"
)
coin_training_data_df.columns.tolist()

In [None]:
for module in modules:  # Reload all modules
    importlib.reload(module)


# Put the column names into a single-column frame and run them through
# cfo.parse_feature_names
df = pd.DataFrame(coin_training_data_df.columns)
df.columns = ['feature']
feature_details_df = cfo.parse_feature_names(df)

# Select features
segment_category_filter = [
    # 'all_wallets',
    # 'score_quantile',
    'score_binary',
    # 'training_clusters',
    # 'market_data',
    # 'wallet_cohorts',
    # 'macro',
]
segment_family_filter = [
    # 'all',
    # 'cw_return_rate_min_025',
    'global_market_cap',
    'btc_mvrv_z_score',
    'btc_price',
    # 'wallet_cohorts',
    'volume',
    'market_cap'
]
metric_filter = [
    # 'trading',
    'balances',
]
metric_detail_filter = [
    # 'crypto_net_gain',
    'usd_balance_ending',
]
transformation_category_filter = [
    # 'aggregations',
    'score_wtd',
    # 'score_dist',
]
transformation_base_filter = [
    'cw_return_rate_min_000_score',
]
transformation_method_filter = [
    # 'count',
    # 'sum',
    'kurt'
]

groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    # 'metric',
    # 'metric_detail',
    # 'transformation_category',
    # 'transformation_base',
    # 'transformation_method',
    # 'feature_full',

]

# Start from a mask that compares the column with itself (a dummy base), then
# AND in whichever filters are enabled
row_mask = feature_details_df['segment_category'].isin(feature_details_df['segment_category'])
row_mask = row_mask & feature_details_df['segment_category'].isin(segment_category_filter)
# row_mask = row_mask & feature_details_df['segment_family'].isin(segment_family_filter)
# row_mask = row_mask & feature_details_df['metric'].isin(metric_filter)
# row_mask = row_mask & feature_details_df['metric_detail'].isin(metric_detail_filter)
# row_mask = row_mask & feature_details_df['transformation_category'].isin(transformation_category_filter)
# row_mask = row_mask & feature_details_df['transformation_base'].isin(transformation_base_filter)
# row_mask = row_mask & feature_details_df['transformation_method'].isin(transformation_method_filter)

# Count features per group and display the largest groups first
group_sizes = feature_details_df[row_mask].fillna('None').groupby(groups).size()
pd.DataFrame(group_sizes).sort_values(by=0, ascending=False)


### Build coin model

In [None]:
# Folder holding the saved multiwindow coin datasets
parquet_folder = wallets_coin_config['training_data']['parquet_folder']

# Files written with the 'training_' prefix: features and target variables
coin_training_data_df = pd.read_parquet(
    f"{parquet_folder}/training_multiwindow_coin_training_data_df.parquet"
)
coin_target_var_df = pd.read_parquet(
    f"{parquet_folder}/training_multiwindow_coin_target_var_df.parquet"
)

# Files written with the 'validation_' prefix: features and target variables
validation_training_data_df = pd.read_parquet(
    f"{parquet_folder}/validation_multiwindow_coin_training_data_df.parquet"
)
validation_target_var_df = pd.read_parquet(
    f"{parquet_folder}/validation_multiwindow_coin_target_var_df.parquet"
)

In [None]:
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Construct the coin model from the training and validation datasets
coin_model = cm.CoinModel(modeling_config=wallets_coin_config['coin_modeling'])
coin_model_results = coin_model.construct_coin_model(
    coin_training_data_df,
    coin_target_var_df,
    validation_training_data_df,
    validation_target_var_df,
)

if 'y_train' not in coin_model_results:
    # Results carry no 'y_train' key: display the search report instead
    display(coin_model.generate_search_report())
else:
    # Configs handed to the artifact writer together with the model results
    artifact_configs = {
        'wallets_coin_config': wallets_coin_config.config,
        'wallets_config': wallets_config.config,
        'wallets_epochs_config': wallets_epochs_config,
        'wallets_features_config': wallets_features_config,
        'wallets_metrics_config': wallets_metrics_config,
    }

    # Generate and save all model artifacts, then plot the evaluation
    coin_model_id, coin_evaluator, coin_scores_df = cimr.generate_and_save_coin_model_artifacts(
        model_results=coin_model_results,
        base_path='../artifacts/coin_modeling',
        configs=artifact_configs,
    )
    coin_evaluator.plot_wallet_evaluation()


##### model 2

In [None]:
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Second model run: same steps as the main build cell, using the freshly reloaded configs
coin_model = cm.CoinModel(modeling_config=wallets_coin_config['coin_modeling'])
coin_model_results = coin_model.construct_coin_model(
    coin_training_data_df,
    coin_target_var_df,
    validation_training_data_df,
    validation_target_var_df,
)

if 'y_train' not in coin_model_results:
    # Results carry no 'y_train' key: display the search report instead
    display(coin_model.generate_search_report())
else:
    # Configs handed to the artifact writer together with the model results
    artifact_configs = {
        'wallets_coin_config': wallets_coin_config.config,
        'wallets_config': wallets_config.config,
        'wallets_epochs_config': wallets_epochs_config,
        'wallets_features_config': wallets_features_config,
        'wallets_metrics_config': wallets_metrics_config,
    }

    # Generate and save all model artifacts, then plot the evaluation
    coin_model_id, coin_evaluator, coin_scores_df = cimr.generate_and_save_coin_model_artifacts(
        model_results=coin_model_results,
        base_path='../artifacts/coin_modeling',
        configs=artifact_configs,
    )
    coin_evaluator.plot_wallet_evaluation()


### upload to bigquery

In [None]:
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Model to score with, plus the name and notes passed along with the upload
model_id = 'c5729c12-acad-4148-b600-0d0e31a8cd31'
score_name = 'dda805 coin return .3 with market data'
score_notes = 'sharp rise after .80'

# The original assignment had no right-hand side (a SyntaxError). An empty prefix
# resolves to the same "training_multiwindow_..." file the other cells read;
# set e.g. 'investing_' to score a differently prefixed dataset.
file_prefix = ''

# Resolve the folder here rather than relying on whichever cell last set parquet_folder
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
training_data_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}training_multiwindow_coin_training_data_df.parquet")

cimr.generate_and_upload_coin_scores(
    wallets_coin_config,
    training_data_df,
    model_id,
    score_name,
    score_notes
)

### importance analysis

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules

# Parse the evaluator's feature importances into feature-name components
importances_df = pd.DataFrame(coin_evaluator.metrics['importances'])
feature_details_df = cfo.parse_feature_names(importances_df,'importance')

# Candidate values for each filter; uncomment entries to include them
segment_category_filter = [
    # 'all_wallets',
    'macro',
    # 'score_quantile',
    # 'score_binary',
    # 'training_clusters',
    # 'market_data',
    # 'wallet_cohorts',
]
segment_family_filter = [
    # 'all_wallets',
    # 'net_gain_winsorized_dda619_grid_score',
    # 'cw_return_rate_regression_score',
    'price',
    # 'wallet_cohorts',
]
segment_value_filter = [
    'cluster_4',
]
metric_filter = [
    'trading',
    'balances',
]
metric_detail_filter = [
    'crypto_net_gain',
    'usd_balance_241031',
]
transformation_category_filter = [
    # 'aggregations',
    # 'score_wtd',
    'score_dist',
]
transformation_base_filter = [
    # 'aggregations',
    'cw_return_rate_min_040_score',
]
transformation_method_filter = [
    # 'count',
    # 'sum',
    # 'dda528_net_gain_max_inv_025_score_p90'
]

# Columns to group the surviving features by
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    # 'metric',
    # 'metric_detail',
    # 'transformation_category',
    # 'transformation_base',
    # 'transformation_method',
    # 'feature_full',

]

# Start from an all-True mask so each condition below can be toggled on its own
row_mask = pd.Series(True, index=feature_details_df.index)
row_mask = row_mask & feature_details_df['segment_category'].isin(segment_category_filter)
# row_mask = row_mask & feature_details_df['segment_family'].isin(segment_family_filter)
# row_mask = row_mask & feature_details_df['segment_value'].isin(segment_value_filter)
# row_mask = row_mask & feature_details_df['metric'].isin(metric_filter)
# row_mask = row_mask & feature_details_df['metric_detail'].isin(metric_detail_filter)
# row_mask = row_mask & feature_details_df['transformation_category'].isin(transformation_category_filter)
# row_mask = row_mask & feature_details_df['transformation_base'].isin(transformation_base_filter)
# row_mask = row_mask & feature_details_df['transformation_method'].isin(transformation_method_filter)

# Sum and count importance per group; show the 20 groups with the largest sum
importance_by_group = (
    feature_details_df[row_mask]
    .fillna('None')
    .groupby(groups)['importance']
    .agg(['sum', 'count'])
)
pd.DataFrame(importance_by_group).sort_values(by='sum', ascending=False).head(20)
# pd.DataFrame(importance_by_group).sort_values(by='feature_full', ascending=True).head(20)


### Generate wallet scores for investing period

#### wallet training data for the coin modeling period

In [None]:
# Build wallet model training data for dates through the end of the original modeling
# period. The resulting scores are "current" as of the end of the modeling period and
# can feed the "current" coin model built at that point.

[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# Load the complete datasets from the wallets parquet folder
wallets_parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{wallets_parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{wallets_parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{wallets_parquet_folder}/complete_macro_trends_df.parquet")

# Offset in days from training_period_end to modeling_period_end
modeling_end_dt = datetime.strptime(wallets_config['training_data']['modeling_period_end'], '%Y-%m-%d')
training_end_dt = datetime.strptime(wallets_config['training_data']['training_period_end'], '%Y-%m-%d')
modeling_offset = (modeling_end_dt - training_end_dt).days

# Epochs config containing only that single offset
coin_modeling_epochs_config = {'offset_epochs': {'offsets': [modeling_offset]}}

# Initiate orchestrator
epochs_orchestrator = weo.WalletEpochsOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    coin_modeling_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df,
)

# Generate training and modeling data for the offset window (last two outputs unused)
como_training_data_df, como_modeling_data_df, _, _ = epochs_orchestrator.generate_epochs_training_data()

# Save files
como_output_folder = wallets_config['training_data']['parquet_folder']
como_training_data_df.to_parquet(f"{como_output_folder}/como_training_data_df.parquet", index=True)
como_modeling_data_df.to_parquet(f"{como_output_folder}/como_modeling_data_df.parquet", index=True)


#### score training data without target var

In [None]:
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Saved model to score with
model_id = '773ee0c8-18a8-4f58-a27e-29db4fdd2f71'

# Locate and load the coin features for the chosen file prefix
file_prefix = 'investing_'
parquet_folder = wallets_coin_config['training_data']['parquet_folder']
coin_features_path = f"{parquet_folder}/{file_prefix}training_multiwindow_coin_training_data_df.parquet"
coin_features_df = pd.read_parquet(coin_features_path)

# Score the features with the model stored under ../artifacts/coin_modeling
coin_scores_df = coin_epochs_orchestrator.score_coin_training_data(
    model_id,
    '../artifacts/coin_modeling',
    coin_features_df,
)
coin_scores_df.describe()
# plot_return_vs_rank(como_scores_df['score'],como_target_var_df['coin_return_winsorized'])

In [None]:
# Reload modules and configs, then upload scores for the given model_id.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# NOTE(review): this reads from the wallets_config parquet folder, while the preceding
# scoring cell reads the same filename from the wallets_coin_config folder — confirm which is intended.
parquet_folder = wallets_config['training_data']['parquet_folder']
# NOTE(review): file_prefix is not assigned in this cell; it carries over from an earlier cell.
coin_features_df = pd.read_parquet(f"{parquet_folder}/{file_prefix}training_multiwindow_coin_training_data_df.parquet")

# Model id plus the score name and notes passed to the upload call
model_id = '773ee0c8-18a8-4f58-a27e-29db4fdd2f71'
score_name = 'dda802 scores for may'
score_notes = 'above 0.6 looks good, above 0.8 looks very strong'

# NOTE(review): coin_features_df loaded above is never used; the call passes training_data_df
# and complete_hybrid_cw_id_df, neither of which is assigned in this cell — verify the
# intended inputs before running.
cimr.generate_and_upload_wallet_cw_scores(
    wallets_config,
    training_data_df,
    complete_hybrid_cw_id_df,
    model_id,
    score_name,
    score_notes,
)

#### save scores for coin modeling training data

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# model_id is expected to be set by an earlier cell unless overridden here
# model_id = 'c1fd04e8-5d57-48d7-9d7d-57b61afff9d5'
score_name = wallets_config['modeling']['score_name']

# Load the saved coin-modeling-period training data
como_parquet_folder = wallets_coin_config['training_data']['parquet_folder']
como_training_data_df = pd.read_parquet(
    f"{como_parquet_folder}"
    "/como_coin_training_data_df_full.parquet"
)

# Predict using the model artifacts folder from the wallets config
base_path = wallets_config['training_data']['model_artifacts_folder']
como_y_pred = wiva.load_and_predict(model_id, como_training_data_df, base_path)

# Store the predictions in a single column named after the score, then save to parquet
score_column = f'score|{score_name}'
modeling_wallet_scores_df = pd.DataFrame({score_column: como_y_pred})
modeling_wallet_scores_df.to_parquet(
    f"temp/wallet_modeling_score_dfs/{score_name}.parquet",
    index=True,
)

u.notify(2)


### Stepwise coin model generation

#### initiate orchestrator

In [None]:
# Reload modules and every config the coin epochs orchestrator takes as input.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
coin_flow_config, coin_flow_metrics_config, coin_flow_modeling_config, coin_flow_experiments_config = u.load_all_configs('../config')
# Runs the project's alignment check across the coin flow, wallets, and wallets-coin configs
wcm.validate_config_alignment(coin_flow_config,wallets_config,wallets_coin_config)
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))
wallets_coins_metrics_config = u.load_config('../config/wallets_coins_metrics_config.yaml')


# Initiate orchestrator (bound to c_orch; the stepwise cells below call its methods directly)
c_orch = ceo.CoinEpochsOrchestrator(
    wallets_coin_config,
    wallets_coins_metrics_config,
    wallets_config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
    coin_flow_config,
    coin_flow_modeling_config,
    coin_flow_metrics_config
)

# Load the complete raw datasets onto the orchestrator
c_orch.load_complete_raw_datasets()

##### _process_coin_epoch()

In [None]:
# Reload modules/configs and set the inputs used by the stepwise cells below.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# Method Params
lookback_duration = 0  # passed to c_orch._prepare_coin_epoch_base_config
include_validation = True  # passed as include_validation_period when training/scoring the wallet epoch

In [None]:
# 1) Prepare config files
# -----------------------
epoch_wallets_config = c_orch._prepare_coin_epoch_base_config(lookback_duration)
epoch_coins_config = c_orch._prepare_epoch_coins_config(epoch_wallets_config)
epoch_date = pd.to_datetime(epoch_wallets_config['training_data']['coin_modeling_period_start'])

# Shortcut: reuse saved feature and target files unless a full rebuild is toggled on
toggle_rebuild_features = epoch_coins_config['features']['toggle_rebuild_all_features']
base_folder = epoch_coins_config['training_data']['parquet_folder']
feat_path = Path(base_folder, "coin_training_data_df_full.parquet")
tgt_path = Path(base_folder, "coin_target_var_df.parquet")

saved_files_present = feat_path.exists() and tgt_path.exists()
if saved_files_present and not toggle_rebuild_features:
    coin_features_df = pd.read_parquet(feat_path)
    coin_target_df = pd.read_parquet(tgt_path)
    logger.milestone(
        "Coin epoch %s training data loaded from existing feature and target files.",
        epoch_date.strftime('%Y-%m-%d')
    )
    # In the source method this is where the early return happens:
    # return epoch_date, coin_features_df, coin_target_df


In [None]:
# Reload modules/configs, then build wallet training data and wallet scores for this epoch.
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')

# 2) Wallet-Level Features
# ------------------------
# Prepare epoch-specific orchestrator without heavy data generation
# (reuses the configs and complete datasets already held by c_orch)
epoch_weo = weo.WalletEpochsOrchestrator(
    base_config=epoch_wallets_config,
    metrics_config=c_orch.wallets_metrics_config,
    features_config=c_orch.wallets_features_config,
    epochs_config=c_orch.wallets_epochs_config,
    complete_profits_df=c_orch.complete_profits_df,
    complete_market_data_df=c_orch.complete_market_data_df,
    complete_macro_trends_df=c_orch.complete_macro_trends_df,
)
# Store the generated per-epoch configs on the orchestrator
epoch_weo.all_epochs_configs = epoch_weo.generate_epoch_configs()

# Generate wallets training & modeling data
epoch_training_dfs = epoch_weo.generate_epochs_training_data()

# Train and score wallet models for this epoch's coin modeling period
wallet_training_data_df = c_orch._train_and_score_wallet_epoch(
    epoch_weo,
    epoch_coins_config,
    epoch_training_dfs,
    include_validation_period=include_validation
)


In [None]:
# 3) Coin-Level Features
# ----------------------
# Generate and save coin features for this epoch
# (the call returns the coin features together with the coin market data)
(
    coin_features_df,
    coin_market_data_df,
) = c_orch._generate_coin_features(
    epoch_weo,
    epoch_coins_config,
    wallet_training_data_df
)

# Generate and save target variables for this epoch
# Deliberately best-effort: any failure is logged as a warning and replaced by an
# empty frame, so the epoch can still be used for features only.
try:
    coin_target_var_df = c_orch._generate_coin_target_vars(
        epoch_weo,
        epoch_coins_config,
        coin_features_df,
        coin_market_data_df
    )
except Exception as e:
    logger.warning(
        "Target variable generation failed for epoch %s: %s",
        epoch_date.strftime('%Y-%m-%d'),
        e
    )
    # fallback to empty targets to allow features-only epochs
    # (no columns; shares the index of coin_features_df)
    coin_target_var_df = pd.DataFrame(index=coin_features_df.index)


#### _generate_coin_features()

In [None]:
# 1) Load base dfs needed for coin feature generation
# The wallet cohort is the 'wallet_address' index level of the wallet training data
training_wallet_cohort = pd.Series(wallet_training_data_df.index.get_level_values('wallet_address'))
(
    profits_df,
    coin_market_data_df,
    training_coin_cohort
) = c_orch._load_wallet_data_for_coin_features(
    epoch_weo.base_config,
    training_wallet_cohort
)

# 2) Prepare datasets
# Macro indicators are requested for the modeling period start/end in the epoch base config
macro_df = c_orch._generate_epoch_macro_indicators(
    epoch_weo.base_config['training_data']['modeling_period_start'],
    epoch_weo.base_config['training_data']['modeling_period_end'],
)


#### dda 805 devspace

In [None]:
# Devspace: prototype for building market indicators over the modeling period.
period_start_date = epoch_weo.base_config['training_data']['modeling_period_start']
period_end_date = epoch_weo.base_config['training_data']['modeling_period_end']


# Clean c_orch.complete_market_data_df with dr.clean_market_data, keeping every row from
# the earliest available date through period_end_date
period_market_data_df = dr.clean_market_data(
    c_orch.complete_market_data_df.reset_index(),
    c_orch.wallets_config,
    earliest_date = c_orch.complete_market_data_df.index.get_level_values('date').min(),  # retain historical data for indicators
    latest_date = period_end_date
)

# The remainder of this cell is disabled work in progress, kept for reference.
# # Use existing training data orchestrator for consistency
# wtdo_instance = wtdo.WalletTrainingDataOrchestrator(
#     c_orch.wallets_config,                # has no impact on indicators output
#     c_orch.wallets_coins_metrics_config,  # coins metrics config
#     c_orch.wallets_features_config        # has no impact on indicators output
# )

# # Call the public indicator generation method
# market_indicators_df = wtdo_instance.generate_indicators_df(
#     period_market_data_df.reset_index(),
#     period_start_date=period_start_date,
#     period_end_date=period_end_date,
#     metric_type='macro_trends',
#     parquet_filename=None
# )

# # # Set date index for consistency with rest of pipeline
# # macro_indicators_df = macro_indicators_df.set_index('date')

# # logger.info(f"Generated {len(macro_indicators_df.columns)} macro indicators for period "
# #             f"{period_start_date} to {period_end_date}")

# # return macro_indicators_df

In [None]:
# Summary statistics for the cleaned period market data
period_market_data_df.describe()

In [None]:
# Display the modeling period end date used as latest_date in the cleaning call
period_end_date

In [None]:
# Summary statistics again (same call as two cells above)
period_market_data_df.describe()

#### code resumes

In [None]:
# 3) Generate features
# Orchestrator for coin feature generation, built from the epoch configs and coin cohort
cfo_inst = cfo.CoinFeaturesOrchestrator(
    epoch_weo.base_config,
    epoch_coins_config,
    c_orch.wallets_coins_metrics_config,
    c_orch.coin_flow_config,
    c_orch.coin_flow_modeling_config,
    c_orch.coin_flow_metrics_config,
    training_coin_cohort,
)

# File prefix is the coin modeling period start formatted as YYYYMMDD
coin_modeling_start = epoch_weo.base_config['training_data']['coin_modeling_period_start']
file_prefix = pd.to_datetime(coin_modeling_start).strftime('%Y%m%d')

# Generate coin features for the "modeling" period
coin_features_df = cfo_inst.generate_coin_features_for_period(
    profits_df,
    wallet_training_data_df,
    macro_df,
    "modeling",
    file_prefix,
)

In [None]:
# 4) Persist results to parquet
base_folder = epoch_coins_config['training_data']['parquet_folder']
coin_features_path = f"{base_folder}/coin_training_data_df_full.parquet"
coin_features_df.to_parquet(coin_features_path)


#### cfo.generate_coin_features_for_period()

In [None]:
# Params
# Map notebook variables onto the names used by the cells that follow
# (they step through cfo.generate_coin_features_for_period()).
profits_df = profits_df  # no-op: the notebook variable already has this name
training_data_df = wallet_training_data_df
macro_indicators_df = macro_df
period = "modeling"
prefix = file_prefix


In [None]:
logger.info("Beginning coin feature generation...")
u.notify('intro_4')

# Period boundaries are read from the orchestrator's wallets config for the chosen period
feature_period_start = cfo_inst.wallets_config['training_data'][f'{period}_period_start']
feature_period_end = cfo_inst.wallets_config['training_data'][f'{period}_period_end']

# Guard: profits_df covers expected date range
u.assert_period(profits_df, feature_period_start, feature_period_end)

# Guard: training_data_df has unique wallet rows (needed for segmentation)
has_duplicate_wallets = training_data_df.index.duplicated().any()
if has_duplicate_wallets:
    raise ValueError("training_data_df contains duplicated wallet rows.")

# Generate metrics for coin-wallet pairs (includes all on-chain wallets)
cw_metrics_df = cfwm.compute_coin_wallet_metrics(
    cfo_inst.wallets_coin_config,
    profits_df,
    feature_period_start,
    feature_period_end,
)



In [None]:
# Assign wallets in training_data_df to segments
# prefix (the epoch's file prefix) is passed as score_suffix
wallet_segmentation_df = cws.build_wallet_segmentation(
    cfo_inst.wallets_coin_config,
    training_data_df,
    score_suffix=prefix
)


In [None]:
# Flatten cw_metrics into single values for each coin-segment pair
# The score distribution settings and thread count come from the wallets-coin config
coin_wallet_features_df = cfwmf.flatten_cw_to_coin_segment_features(
    cw_metrics_df,
    wallet_segmentation_df,
    cfo_inst.training_coin_cohort,
    cfo_inst.wallets_coin_config['features']['score_distributions'],
    cfo_inst.wallets_coin_config['n_threads']['cw_flattening_threads']
)


In [None]:

# Final steps of the stepwise feature build: optionally add macro features and coin flow
# model features to the coin-wallet features. `self` does not exist in a notebook cell,
# so the orchestrator instance created earlier (cfo_inst) is used instead.

# Instantiate full features df
coin_training_data_df_full = coin_wallet_features_df

# Generate and merge macro features if configured
if cfo_inst.wallets_coin_config['features']['toggle_macro_features']:
    macro_features_df = cfo_inst._generate_macro_features(macro_indicators_df)
    # Cross join macro features to coin features DataFrame, prefixing "macro|"
    # (only the first row of macro_features_df is used; each value becomes a constant column)
    prefixed_macro_features = {
        f"macro|{col}": val
        for col, val in macro_features_df.iloc[0].to_dict().items()
    }
    coin_training_data_df_full = coin_training_data_df_full.assign(**prefixed_macro_features)

# Generate and merge Coin Flow Model features if configured
if cfo_inst.wallets_coin_config['features']['toggle_coin_flow_model_features']:

    # Generate and merge all features
    coin_flows_model_features_df = cfo_inst._generate_coin_flow_model_features()
    coin_flows_model_features_df.to_parquet(
        f"{cfo_inst.wallets_coin_config['training_data']['parquet_folder']}"
        f"/{prefix}_coin_flows_model_features_df.parquet",
        index=True
    )
    # NOTE(review): the merge starts from coin_wallet_features_df, not
    # coin_training_data_df_full, so macro columns assigned above look like they are
    # dropped when both toggles are on — confirm against _merge_all_features.
    coin_training_data_df_full = cfo_inst._merge_all_features(
        coin_wallet_features_df,
        coin_flows_model_features_df
    )

u.notify('notification_toast')
logger.info("Successfully generated coin_training_data_df with shape "
            f"({coin_training_data_df_full.shape}).")


## Post model analysis

### performance report

In [None]:
# Reload modules and configs once (the original loaded the configs twice in a row)
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Initialize evaluator from the latest coin model results
coin_evaluator = wime.RegressorEvaluator(coin_model_results)

# Print the summary, draw the evaluation plots, and show the importance summary
print(coin_evaluator.summary_report())
coin_evaluator.plot_coin_evaluation()
coin_evaluator.importance_summary(0)

### importance analysis

In [None]:
# Reload modules and configs once (the original loaded the configs twice in a row)
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Break the evaluator's feature importances into feature-name components
feature_details_df = civa.analyze_coin_model_importance(coin_evaluator.metrics['importances'])

# Candidate values for each filter; uncomment entries to include them
segment_category_filter = [
    # 'all_wallets',
    'score_quantile',
    # 'time_series',
    # 'wallet_cohorts',
    # 'training_clusters',
]
segment_family_filter = [
    'all_wallets',
    'net_gain_winsorized_dda619_grid_score',
    # 'time_series',
    # 'wallet_cohorts',
]
metric_filter = [
    # 'trading',
    'balances',
]
metric_detail_filter = [
    'crypto_net_gain',
    'usd_balance_241031',
]
transformation_filter = [
    # 'aggregations',
    # 'score_wtd',
]
transformation_method_filter = [
    'net_gain_winsorized_dda619_grid_residual_p10',
    # 'sum',
]

# Columns to group the surviving features by
groups = [
    'segment_category',
    # 'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    'transformation',
    'transformation_method',
    # 'feature_full',

]

# Filter, group, and sum the numeric columns. GroupBy.sum's first positional parameter
# is numeric_only, so the original .sum('importance') only worked because a non-empty
# string is truthy; the flag is now passed explicitly.
pd.DataFrame(feature_details_df
 [
 (feature_details_df['segment_category'].isin(segment_category_filter))
#  & (feature_details_df['segment_family'].isin(segment_family_filter))
#  & (feature_details_df['metric'].isin(metric_filter))
#  & (feature_details_df['metric_detail'].isin(metric_detail_filter))
#  & (feature_details_df['transformation'].isin(transformation_filter))
#  & (feature_details_df['transformation_method'].isin(transformation_method_filter))
    ]
 .fillna('None').groupby(groups)
 .sum(numeric_only=True)
# ).columns
).sort_values(by='importance',ascending=False)


In [None]:
# Load importances
feature_importance_df = pd.DataFrame(coin_evaluator.metrics['importances'])

# First level: split each feature name on pipes into its four components
split_df = (
    feature_importance_df['feature']
    .str.split('|', expand=True)
    .set_axis(['segment_category', 'segment_family', 'metric', 'transformation'], axis=1)
)

# Second level: split the slash-delimited pair inside each component
segment_families = (
    split_df['segment_family']
    .str.split('/', expand=True)
    .set_axis(['segment_family', 'segment_value'], axis=1)
)
metrics = (
    split_df['metric']
    .str.split('/', expand=True)
    .set_axis(['metric', 'metric_detail'], axis=1)
)
transformations = (
    split_df['transformation']
    .str.split('/', expand=True)
    .set_axis(['transformation', 'transformation_method'], axis=1)
)

# Put the parsed components side by side with the importance values
component_frames = [
    split_df['segment_category'],
    segment_families,
    metrics,
    transformations,
    feature_importance_df['importance'],
]
feature_details_df = pd.concat(component_frames, axis=1)

feature_details_df

In [None]:
# Show every raw feature name from the importances table
list(feature_importance_df['feature'])

In [None]:
# Columns to aggregate importance by; uncomment entries to group more finely
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    # 'transformation',
    # 'transformation_method',
]

# Sum numeric columns per group, largest importance first. GroupBy.sum's first positional
# parameter is numeric_only, so the original .sum('importance') only worked because a
# non-empty string is truthy; the flag is now passed explicitly.
feature_details_df.groupby(groups).sum(numeric_only=True).sort_values(by='importance',ascending=False)

In [None]:
# NOTE(review): result_df is not assigned in any of the surrounding cells — confirm which
# cell is expected to define it before running.
result_df

## analyze features

### basic correlation

In [None]:
# Pairwise correlations between the columns of coin_modeling_df
correlation_matrix = coin_modeling_df.corr()

# Correlations against the target column, highest first
target_correlations = correlation_matrix[target_var_column].sort_values(ascending=False)

# Show the 15 highest correlations
target_correlations.head(15)
# target_correlations

In [None]:
# Reload modules and configs once (the original loaded the configs twice in a row)
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')



# # Wallet metrics to analyze
# wallet_metrics = [
# ]

# Analyze the first 15 entries of target_correlations.
# The original also assigned coin_modeling_df.columns on the line before, but that value
# was overwritten immediately; it is kept here as a commented alternative.
# wallet_metrics = coin_modeling_df.columns
wallet_metrics = target_correlations[:15].index.values

# number of score buckets
n_quantiles = 5

analyze_df = civa.analyze_metric_segments(
    coin_modeling_df,
    wallet_metrics,
    n_quantiles,
    target_var_column,
)
civa.style_metric_segments(analyze_df)

# Pre Coin Model Analysis

### Wallet aggregated analysis

#### generate validation wallet features

In [None]:
# Reload modules and configs once (the original loaded the configs twice in a row)
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Create a DataFrame with all wallets that should exist
validation_target_vars_df = pd.DataFrame(index=training_wallet_cohort)
validation_target_vars_df.index.name = 'wallet_address'


# Calculate validation period wallet trading metrics
validation_trading_features_df = wtf.calculate_wallet_trading_features(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    include_twb_metrics=False
)
# Left join keeps every cohort wallet; wallets with no trading features get 0 in those columns
validation_target_vars_df = validation_target_vars_df.join(validation_trading_features_df, how='left')\
    .fillna({col: 0 for col in validation_trading_features_df.columns})

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(validation_target_vars_df,include_twb_metrics=False)
validation_target_vars_df = validation_target_vars_df.join(performance_features_df, how='inner')

In [None]:
# Display the combined validation trading and performance features per wallet
validation_target_vars_df

#### wallet validation period trading/performance by score quantile

In [None]:
# Create base df with all wallet addresses and scores
# The score names and scores path both come from the wallet_segments section of the wallets-coin config
modeling_wallet_scores_df = cfo.load_wallet_scores(wallets_coin_config['wallet_segments']['wallet_scores'],
                                            wallets_coin_config['wallet_segments']['wallet_scores_path'])


In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Columns of validation_target_vars_df to summarize within each score quantile
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

# Report settings: number of quantiles and minimum wallet volume (USD)
num_quantiles = 5
min_wallet_volume_usd = 0

# Score column selected by the configured score name
wallet_scores = modeling_wallet_scores_df[wallets_config['modeling']['score_name']]

wiva.create_quantile_report(
    validation_target_vars_df,
    wallet_scores,
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Columns of validation_target_vars_df to summarize within each score quantile
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

# Report settings: number of quantiles and minimum wallet volume (USD)
num_quantiles = 5
min_wallet_volume_usd = 0

# Score column selected by the configured score name
wallet_scores = modeling_wallet_scores_df[wallets_config['modeling']['score_name']]

wiva.create_quantile_report(
    validation_target_vars_df,
    wallet_scores,
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


### old analysis

In [None]:
# Reload modules and configs once (the original loaded the configs twice in a row)
[importlib.reload(module) for module in modules]
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')


# Wallet metrics to analyze
wallet_metrics = [
    'top_100pct/balance_wtd_mean_score',
    'top_10pct/count',
    'top_25pct/count',
    'top_50pct/count',
    'top_100pct/count',
    'top_10pct/count_pct',
    'top_10pct/balance_pct',
    'top_25pct/count_pct',
    'top_25pct/balance_pct',
    'top_50pct/count_pct',
    'top_50pct/balance_pct',
]
# wallet_metrics = list(validation_coin_wallet_features_df.columns)

# Create styled performance analysis (percentile=90, aggregated with method='mean')
civa.create_top_coins_wallet_metrics_report(
    validation_coin_wallet_features_df,
    percentile=90,
    wallet_metrics=wallet_metrics,
    method='mean'
)


#### plotting coin feature performance vs market cap

In [None]:
# Reload modules and the wallets config, then analyze and plot features by market cap segment.
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Get the analysis results
# (two outputs: segment_results and summary_df; only summary_df is used below)
segment_results, summary_df = civa.analyze_market_cap_segments(
    coin_wallet_features_df,
    top_n=10
)

# Or create the visualizations
civa.plot_segment_heatmap(summary_df)
civa.plot_metric_consistency(summary_df)  # Optional secondary visualization


#### coin performance of top n for each bucket

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Read the analysis parameters from the coin_validation_analysis config section
analysis_params = wallets_config['coin_validation_analysis']
top_n = analysis_params['top_n']
max_market_cap = analysis_params['max_market_cap']
min_market_cap = analysis_params['min_market_cap']

# Run analysis
metric_top_coin_performance_df = civa.validate_coin_performance(
    coin_wallet_features_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

metric_top_coin_performance_df

In [None]:
# Reload modules and the wallets config, then print the performance analysis
[importlib.reload(module) for module in modules]
wallets_config.reload()

civa.print_performance_analysis(coin_wallet_features_df)

# Appendix: Single Window Construction


### Training Data Sequence

In [None]:
# Reload modules and configs, then build the single-window training data orchestrator.
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Load orchestrator
# The orchestrator receives a deep copy of the wallets config dict
training_data_orchestrator = wtdo.WalletTrainingDataOrchestrator(
    copy.deepcopy(wallets_config.config),
    wallets_metrics_config,
    wallets_features_config
)

In [None]:
# Retrieve data for the configured training period. The four returned objects
# are discarded here; later cells read the 'training'-prefixed parquet files
# (presumably written by this call — confirm in the orchestrator).
_, _, _, _ = training_data_orchestrator.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    parquet_prefix='training',
)

In [None]:
# Select cohort and prepare training data
parquet_folder = wallets_config['training_data']['parquet_folder']

# Load the three full training datasets from the parquet folder
training_profits_df_full = pd.read_parquet(f"{parquet_folder}/training_profits_df_full.parquet")
training_market_data_df_full = pd.read_parquet(f"{parquet_folder}/training_market_data_df_full.parquet")
training_macro_trends_df_full = pd.read_parquet(f"{parquet_folder}/training_macro_trends_df_full.parquet")

# Hand the datasets to the orchestrator; the return value is not needed here
_ = training_data_orchestrator.prepare_training_data(
    training_profits_df_full,
    training_market_data_df_full,
    training_macro_trends_df_full,
)

# Store hybrid ID map, but only when wallet ID hybridization is enabled in the config
if wallets_config['training_data']['hybridize_wallet_ids']:
    pd.to_pickle(
        training_data_orchestrator.hybrid_cw_id_map,
        f"{parquet_folder}/hybrid_cw_id_map.pkl",
    )

In [None]:
# Generate training features
parquet_folder = wallets_config['training_data']['parquet_folder']

# Load the four inputs the feature generation step consumes
training_profits_df = pd.read_parquet(f"{parquet_folder}/training_profits_df.parquet")
training_market_indicators_df = pd.read_parquet(f"{parquet_folder}/training_market_indicators_data_df.parquet")
training_macro_indicators_df = pd.read_parquet(f"{parquet_folder}/training_macro_indicators_df.parquet")
training_transfers_df = pd.read_parquet(f"{parquet_folder}/training_transfers_sequencing_df.parquet")

# Run feature generation on the orchestrator
training_data_orchestrator.generate_training_features(
    training_profits_df,
    training_market_indicators_df,
    training_macro_indicators_df,
    training_transfers_df,
)

# Fire the notification helper once the step has finished
u.notify(3)

### Wallet Model Target Variable and Wallet Cohort

In [None]:
# Load modeling datasets
# The coin cohort is the set of distinct coin_id values in the training market
# indicators parquet; only that one column is read.
training_coin_cohort = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)['coin_id'].unique()

# Retrieve the modeling period data for that coin cohort; the four returned
# objects are discarded here.
_, _, _, _ = training_data_orchestrator.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    training_coin_cohort,
    parquet_prefix='modeling',
)

In [None]:
# Re-import the project modules and refresh the configs before running the cell
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# The wallet cohort is the index of the full training data; columns=[] skips
# loading any data columns so only the index is read.
training_wallet_cohort = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet",
    columns=[],
).index.values

# Load orchestrator, this time passing the training wallet cohort as well
training_data_orchestrator = wtdo.WalletTrainingDataOrchestrator(
    copy.deepcopy(wallets_config.config),
    wallets_metrics_config,
    wallets_features_config,
    training_wallet_cohort,
)

In [None]:
# Prepare modeling features for target variables
modeling_profits_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_profits_df_full.parquet")

# The hybrid ID map is only loaded when hybridization is enabled; otherwise None is passed
hybrid_cw_id_map = (
    pd.read_pickle(f"{wallets_config['training_data']['parquet_folder']}/hybrid_cw_id_map.pkl")
    if wallets_config['training_data']['hybridize_wallet_ids']
    else None
)

# Hand both inputs to the orchestrator; the return value is not needed here
_ = training_data_orchestrator.prepare_modeling_features(
    modeling_profits_df_full,
    hybrid_cw_id_map,
)

# Fire the notification helper once the step has finished
u.notify(3)

### Wallet Model Construction and Analysis

#### select target variable (loadable parquet)

In [None]:
# Re-import the project modules and refresh the config before running the cell
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Load the wallet target variables from parquet
wallet_target_vars_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_target_vars_df.parquet")

# Keep only the modeling cohort flag and the configured target variable column
# (an independent copy, so later edits do not touch wallet_target_vars_df)
modeling_cohort_target_var_df = wallet_target_vars_df.loc[
    :, ['in_modeling_cohort', wallets_config['modeling']['target_variable']]
].copy()

# Load the full wallet training data and log its shape
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")
logger.info("Training data df shape: %s", wallet_training_data_df.shape)
# sorted(list(wallet_training_data_df.columns))

#### build wallet model or run search

In [None]:
# Re-import the project modules and refresh the config before running the cell
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Retrieve training data for the full training wallet cohort
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")

# Instantiate the wallet model from the 'modeling' config section
wallet_model = wm.WalletModel(wallets_config['modeling'])

# Validate indices match.
# Only the indexes are sorted, and each is sorted once up front; sorting the
# full DataFrames inside the per-level check would copy all of the training
# data on every iteration.
training_index = wallet_training_data_df.index.sort_values()
target_index = modeling_cohort_target_var_df.index.sort_values()
if not all(
    training_index.get_level_values(level).equals(target_index.get_level_values(level))
    for level in training_index.names
):
    raise ValueError("Merged training and modeling DataFrames have mismatched indices.")
del training_index, target_index


# Run the experiment and get results
wallet_model_results = wallet_model.construct_wallet_model(wallet_training_data_df, modeling_cohort_target_var_df)

# Release the training data before artifact generation to lower peak memory
del wallet_training_data_df
gc.collect()

# Results containing 'y_train' get the artifact + summary path; anything else
# is reported through the model's search report.
if 'y_train' in wallet_model_results:

    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wimr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
        },
        save_scores=False,
    )
    # Print summary
    print(wallet_evaluator.summary_report())
else:
    display(wallet_model.generate_search_report())

# Junkyard

# Tests failing

In [None]:
# Re-import the project modules before reloading the configs
for module in modules:
    importlib.reload(module)

# Load the wallets config and wallets coin config from the config directory.
# A single call is enough: repeating it rebinds the same two names with the
# result of the same load.
wallets_config, wallets_coin_config = wcm.load_all_wallets_configs('../config')
