### start

In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import copy
import logging
import re
from itertools import chain,combinations
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import pandas_gbq
from google.cloud import bigquery
import scipy
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Show floats in pandas output with up to 12 significant digits
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Suppress warnings
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables: pygame support-prompt silencing plus log file and
# notification sound locations
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'NOTIFICATION_SOUNDS_DIR': "../../../Local",
})

# Dark mode charts: dark gray backgrounds with one shared foreground color
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.trading_features as wtf
import wallet_features.performance_features as wpf
import wallet_features.transfers_features as wts
import wallet_features.scenario_features as wsc
import wallet_features.balance_features as wbf
import wallet_features.macroeconomic_features as wmac
import wallet_features.wallet_features_orchestrator as wfo

# Base modeling
import base_modeling.base_model as bm
import base_modeling.feature_selection as fs

# Wallet modeling
import wallet_modeling.wallet_training_data_orchestrator as wtdo
import wallet_modeling.wallet_epochs_orchestrator as weo
import wallet_modeling.wallet_training_data as wtd
import wallet_insights.wallet_model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
import wallet_modeling.wallets_config_manager  as wcm
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet insights
import wallet_insights.model_evaluation as wime
import wallet_insights.wallet_experiments_orchestrator as wimo
import wallet_insights.wallet_validation_analysis as wiva
import wallet_insights.wallet_cluster_analysis as wica

# Coin features
import coin_wallet_features.coin_features_orchestrator as cfo
import coin_wallet_features.wallet_base_metrics as cwbm
import coin_wallet_features.wallet_segmentation as cws

# Coin modeling
import coin_modeling.coin_model_reporting as cmr
import coin_modeling.coin_model as cm

# Coin insights
import coin_insights.coin_validation_analysis as civa


# Local modules that get hot-reloaded so edits under ../src take effect
# without restarting the kernel
modules = [
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
    wtdo, weo, wtd, wmr, wm, wem, wcm,
    wcl, wmc, wmt, wtf, wpf, wts, wsc, wbf, wmac, wfo,
    bm, fs,
    wime, wimo, wiva, wica,
    cfo, cwbm, cws,
    cmr, cm,
    civa,
]

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')

wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))

# make parquet dirs if they don't already exist
Path(wallets_config['training_data']['parquet_folder']).mkdir(parents=True, exist_ok=True)
Path(wallets_coin_config['wallet_segments']['parquet_folder']).mkdir(parents=True, exist_ok=True)

# Set the custom error handler. get_ipython() returns None when this code is
# not running inside an IPython session, so only install the hook when a
# shell actually exists.
ipython = IPython.get_ipython()
if ipython is not None:
    ipython.set_custom_exc((Exception,), u.notify_on_failure)

# configure logger
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)


# u.export_code(
#     code_directories=[
#         # 'training_data',
#         'wallet_features',
#         # 'base_modeling',
#         'wallet_modeling',
#         # 'wallet_insights'
#     ],
#     # include_config = True,
#     # ipynb_notebook = 'DDA-660 multithreading epochs.ipynb'
# )



# Reload every local module; a plain loop is used because the reload is a
# side effect and the returned module objects are not needed
for module in modules:
    importlib.reload(module)
u.notify('retro')

logger.info("Good morning, let's get to work")

# Wallet Model Construction

## Multiwindow Model Construction

### Load complete datasets

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))

# Initiate orchestrator (no dataframes passed; they are loaded by the call below)
epochs_orchestrator = weo.MultiEpochOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config
)

epochs_orchestrator.load_complete_raw_datasets()

### Generate modeling features (parquet loadable)

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Single source for the parquet location, used for both the reads and the writes
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{parquet_folder}/complete_macro_trends_df.parquet")

# Initiate orchestrator with the preloaded complete datasets
epochs_orchestrator = weo.MultiEpochOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df,
)


# Generate TRAINING_DATA_DF for all windows
wallet_training_data_df, modeling_wallet_features_df = epochs_orchestrator.generate_epochs_training_data()

# Save files (index=True keeps the index in the parquet output)
wallet_training_data_df.to_parquet(f"{parquet_folder}/multiwindow_wallet_training_data_df.parquet", index=True)
modeling_wallet_features_df.to_parquet(f"{parquet_folder}/multiwindow_modeling_wallet_features_df.parquet", index=True)

# sorted(list(wallet_training_data_df.columns))

# sorted(list(wallet_training_data_df.columns))


#### model without validation (parquet loadable)

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load modeling files
parquet_folder = wallets_config['training_data']['parquet_folder']
wallet_training_data_df = pd.read_parquet(f"{parquet_folder}/multiwindow_wallet_training_data_df.parquet")
modeling_wallet_features_df = pd.read_parquet(f"{parquet_folder}/multiwindow_modeling_wallet_features_df.parquet")

# Run the experiment and get results (no validation datasets are passed here)
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df, modeling_wallet_features_df
)

# Print summary
if 'y_train' in wallet_model_results:

    # Generate and save all model artifacts
    # NOTE(review): the validation cells also pass 'wallets_epochs_config' in
    # `configs`; confirm whether leaving it out here is intentional.
    model_id, wallet_evaluator, modeling_wallet_scores_df = wmr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config
        }
    )
    wallet_evaluator.summary_report()
else:
    # Results without 'y_train': show the model's search report instead
    display(wallet_model.generate_search_report())

### Generate validation features (parquet loadable)

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_epochs_config = yaml.safe_load(Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8'))

# Single source for the parquet location, used for both the reads and the writes
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{parquet_folder}/complete_macro_trends_df.parquet")

# Override base offsets: work on a deep copy so wallets_epochs_config itself
# keeps its original 'offsets'
validation_epochs_config = copy.deepcopy(wallets_epochs_config)
validation_epochs_config['offset_epochs']['offsets'] = validation_epochs_config['offset_epochs']['validation_offsets']

# Initiate orchestrator
epochs_orchestrator = weo.MultiEpochOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    validation_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df
)

# Generate TRAINING_DATA_DF for the validation offsets
validation_training_data_df, validation_wallet_features_df = epochs_orchestrator.generate_epochs_training_data()

# Save files
validation_training_data_df.to_parquet(f"{parquet_folder}/validation_wallet_training_data_df.parquet", index=True)
validation_wallet_features_df.to_parquet(f"{parquet_folder}/validation_wallet_features_df.parquet", index=True)


### Construct model

#### model with validation (parquet loadable)

In [None]:
# Load modeling and validation files
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_training_data_df.parquet")
wallet_training_data_df = u.df_downcast(wallet_training_data_df)
modeling_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_modeling_wallet_features_df.parquet")
modeling_wallet_features_df = u.df_downcast(modeling_wallet_features_df)
validation_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_wallet_training_data_df.parquet")
validation_training_data_df = u.df_downcast(validation_training_data_df)
validation_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_wallet_features_df.parquet")
validation_wallet_features_df = u.df_downcast(validation_wallet_features_df)



In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Run the experiment and get results, passing the validation datasets too
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    # sampled_train_df, sampled_modeling_df,
    wallet_training_data_df, modeling_wallet_features_df,
    validation_training_data_df, validation_wallet_features_df
)

# Print summary
if 'y_train' in wallet_model_results:

    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wmr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config
        }
    )
    wallet_evaluator.summary_report()
else:
    # Results without 'y_train': show the model's search report instead
    display(wallet_model.generate_search_report())

##### model 2

In [None]:
# Pick up code and config edits made since the last run, so this second run
# uses whatever wallets_config['modeling'] currently holds
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Run the experiment and get results, passing the validation datasets too
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(
    # sampled_train_df, sampled_modeling_df,
    wallet_training_data_df, modeling_wallet_features_df,
    validation_training_data_df, validation_wallet_features_df
)

# Print summary
if 'y_train' in wallet_model_results:

    # Generate and save all model artifacts
    model_id, wallet_evaluator, modeling_wallet_scores_df = wmr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs={
            'wallets_config': wallets_config.config,
            'wallets_metrics_config': wallets_metrics_config,
            'wallets_features_config': wallets_features_config,
            'wallets_epochs_config': wallets_epochs_config
        }
    )
    wallet_evaluator.summary_report()
else:
    # Results without 'y_train': show the model's search report instead
    display(wallet_model.generate_search_report())

#### evaluation report

In [None]:
# Reload local modules and the wallets config so the evaluator below is built
# from the latest code
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Reload evaluator
# NOTE(review): the later evaluation cells build wime.RegressionEvaluator from
# the same wallet_model_results; confirm ClassifierEvaluator is intended here.
wallet_evaluator = wime.ClassifierEvaluator(wallet_model_results)
# Print results
# NOTE(review): this logs the return value of summary_report(), while the
# modeling cells call it without using the result; confirm it returns the
# report text rather than printing it.
logger.info(f"\n{wallet_evaluator.summary_report()}")
wallet_evaluator.plot_wallet_evaluation()
wallet_evaluator.importance_summary(0)

#### assess segment performance

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()
pd.set_option('display.max_colwidth', None)  # Shows full text in columns


# Reload evaluator
wallet_evaluator = wime.RegressionEvaluator(wallet_model_results)

# Window suffix of the features used for segmentation. The same feature set
# exists for 'all_windows' and 'w5'; previously both lists were written out
# and the first was silently overwritten by the second, so 'w5' is the
# variant that actually ran.
segmentation_window = 'w5'  # alternative: 'all_windows'

segmentation_features = [
    # f'mktcap|portfolio_mcap_mean/market_cap_unadj|{segmentation_window}',
    f'mktcap|volume_wtd_market_cap/market_cap_filled|{segmentation_window}',
    # f'timing|btc_mvrv_z_score/buy_weighted|{segmentation_window}',
    # f'timing|btc_mvrv_z_score/sell_weighted|{segmentation_window}',
    # f'macro|btc_mvrv_z_score_first|{segmentation_window}',
    # f'macro|btc_mvrv_z_score_last|{segmentation_window}',
    f'trading|crypto_net_gain|{segmentation_window}',
    f'trading|total_volume|{segmentation_window}',
    f'trading|crypto_net_cash_flows|{segmentation_window}',
    f'trading|unique_coins_traded|{segmentation_window}',
    # f'transfers|first_buy/median_avg_wallet_rank|{segmentation_window}',
    f'trading|max_investment|{segmentation_window}',
]


# get raw segments
segments_df = wallet_evaluator.identify_predictive_populations(
    segmentation_features,
    min_pop_pct=0.02,
    max_segments=25
)

# Display the segments sorted by 'R2 vs Overall', highest first (this must
# stay the last expression of the cell for the notebook to render it)
# segments_df.sort_values('RMSE vs Overall', ascending=True)
segments_df.sort_values('R2 vs Overall', ascending=False)
# segments_df.describe()


In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()
pd.set_option('display.max_colwidth', None)  # Shows full text in columns


# Reload evaluator
wallet_evaluator = wime.RegressionEvaluator(wallet_model_results)

# Features used to split wallets into segments (all-windows variants)
segmentation_features = [
    # 'mktcap|portfolio_mcap_mean/market_cap_unadj|all_windows',
    'mktcap|volume_wtd_market_cap/market_cap_filled|all_windows',
    # 'timing|btc_mvrv_z_score/buy_weighted|all_windows',
    # 'timing|btc_mvrv_z_score/sell_weighted|all_windows',
    # 'macro|btc_mvrv_z_score_first|all_windows',
    # 'macro|btc_mvrv_z_score_last|all_windows',
    'trading|crypto_net_gain|all_windows',
    'trading|total_volume|all_windows',
    'trading|crypto_net_cash_flows|all_windows',
    'trading|unique_coins_traded|all_windows',
    # 'transfers|first_buy/median_avg_wallet_rank|all_windows',
    'trading|max_investment|all_windows',
]


# get raw segments
segments_df = wallet_evaluator.identify_predictive_populations(
    segmentation_features,
    min_pop_pct=0.02,
    max_segments=25
)

# Display the segments sorted by 'R2 vs Overall', highest first (this must
# stay the last expression of the cell for the notebook to render it)
# segments_df.sort_values('RMSE vs Overall', ascending=True)
segments_df.sort_values('R2 vs Overall', ascending=False)
# segments_df.describe()


In [None]:
# Re-display the most recently computed segments_df, highest 'R2 vs Overall'
# first, without recomputing the segments
segments_df.sort_values('R2 vs Overall', ascending=False)


#### importance analysis

In [None]:
# Importance summary from the current evaluator.
# NOTE(review): the evaluation report cell passes 0 here; the meaning of the
# argument is defined in wime and is not visible in this notebook.
wallet_evaluator.importance_summary(1)

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Reload evaluator
wallet_evaluator = wime.RegressionEvaluator(wallet_model_results)

# Work on a copy of the importance breakdown returned by wiva
feature_importances_df = wiva.analyze_wallet_model_importance(wallet_evaluator.metrics['importances']).copy()

# Keep only rows whose feature_category is listed here
feature_categories_filter = [
    # 'performance',
    'timing',
    # 'trading',
    # 'transfers',
    # 'mktcap',
    # 'scenario',
    # 'macro',
    # 'cluster',
]

# ...and whose feature_name is listed here
feature_names_filter = [
    'price_sma_2',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
    # 'btc_mvrv_z_score',
]

# Grouping columns for the summed importances
groups = [
    'feature_category',
    'feature_name',
    'feature_comparison',
    'feature_aggregation',
    # 'training_segment',
    'feature'
]

# Combine both filters into one mask. Chaining df[mask_a][mask_b] applies a
# mask built on the full frame to an already-filtered frame, which makes
# pandas reindex the mask and emit a UserWarning.
category_mask = feature_importances_df['feature_category'].isin(feature_categories_filter)
name_mask = feature_importances_df['feature_name'].isin(feature_names_filter)

# fillna('None') keeps rows with missing group keys from being dropped by
# groupby. The first positional argument of groupby .sum() is numeric_only,
# so the former .sum('importance') acted as numeric_only=True; that is now
# spelled out explicitly.
(feature_importances_df[category_mask & name_mask]
 .fillna('None')
 .groupby(groups)
 .sum(numeric_only=True)
 .sort_values(by='importance', ascending=False)
)

#### modeling multi window r2 comparison

In [None]:
# Print R² per epoch, restricted to wallets flagged as in the modeling cohort.
# The index level values and the cohort mask do not change between epochs, so
# they are computed once, outside the loop.
epoch_level_values = modeling_wallet_scores_df.index.get_level_values('epoch_start_date')
cohort_mask = modeling_wallet_scores_df['in_modeling_cohort'].eq(True)
epochs = sorted(epoch_level_values.unique())

for epoch in epochs:
    epoch_mask = epoch_level_values == epoch
    combined_mask = epoch_mask & cohort_mask

    epoch_scores_df = modeling_wallet_scores_df[combined_mask]
    y_true = epoch_scores_df['actual']
    y_pred = epoch_scores_df['score']

    # Skip epochs with no actual values
    if y_true.isna().all():
        continue

    metrics = wiva.evaluate_predictions(y_true, y_pred)
    print(f"Epoch {epoch}: R² = {metrics['r2']:.3f}")

## dda 695 coin prices vs hybrid features

In [None]:
# Pick up code and config edits made since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Source Data
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{parquet_folder}/complete_macro_trends_df.parquet")

# Initiate orchestrator with the preloaded complete datasets; a later cell
# reads its complete_hybrid_cw_id_df attribute
epochs_orchestrator = weo.MultiEpochOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    wallets_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df,
)


In [None]:
# Features
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/multiwindow_wallet_training_data_df.parquet")
wallet_training_data_df = u.df_downcast(wallet_training_data_df)

In [None]:
# Copy the 'wallet_address' index level into a 'hybrid_cw_id' column so it can
# be used as the merge key (this adds a column to wallet_training_data_df in place).
# NOTE(review): presumably the wallet_address level holds hybrid coin-wallet
# ids in this run; confirm against the orchestrator's hybrid id mapping.
wallet_training_data_df['hybrid_cw_id'] = wallet_training_data_df.index.get_level_values('wallet_address')
# Inner merge keeps only rows whose hybrid_cw_id appears in the orchestrator's mapping
coin_training_data_df = wallet_training_data_df.merge(epochs_orchestrator.complete_hybrid_cw_id_df, how='inner', on='hybrid_cw_id')
# Index by coin_id for the per-coin aggregation that follows
coin_training_data_df = coin_training_data_df.set_index('coin_id')

In [None]:
def aggregate_coin_features(coin_training_data_df):
    """
    Aggregate features by coin_id, computing sum, count, mean, and median for
    every numeric (or boolean) feature column.

    Identifier columns ('coin_id', 'hybrid_cw_id', 'wallet_address') are never
    aggregated. Columns with non-numeric dtypes (strings, datetimes, ...) are
    skipped, because 'mean' and 'median' are undefined for them and would make
    the whole aggregation fail.

    Params:
    - coin_training_data_df (DataFrame): Coin-wallet level feature data with
        coin_id available as the index or as a column

    Returns:
    - coin_features_df (DataFrame): Coin-level aggregated features indexed by
        coin_id, with columns named '<feature>_<aggregation>'
    """
    # Reset index to make coin_id a column for proper groupby
    df = coin_training_data_df.reset_index()

    # Aggregations applied to every feature column
    aggregations = ['sum', 'count', 'mean', 'median']

    # Identifier columns are not features and must not be aggregated
    exclude_cols = {'coin_id', 'hybrid_cw_id', 'wallet_address'}

    # Restrict to dtypes that every aggregation above supports
    numeric_cols = [
        col
        for col in df.select_dtypes(include=['number', 'bool']).columns
        if col not in exclude_cols
    ]

    # observed=True limits the output to coin_ids actually present when
    # coin_id is categorical
    coin_features_df = df.groupby('coin_id', observed=True)[numeric_cols].agg(aggregations)

    # Flatten the (feature, aggregation) column MultiIndex into single names
    coin_features_df.columns = [
        f"{feature}_{aggregation}" for feature, aggregation in coin_features_df.columns
    ]

    return coin_features_df

# Collapse the coin-wallet level rows into one feature row per coin_id
coin_features_df = aggregate_coin_features(coin_training_data_df)

In [None]:
def calculate_price_changes(complete_market_data_df, period_start, period_end):
    """
    Calculate the fractional price change between period_start and period_end
    for each coin: (end_price - start_price) / start_price.

    Coins without a price row on either date get NaN. Coins whose start price
    is 0 also get NaN, because a change relative to zero is undefined and
    would otherwise appear as +/-inf and distort downstream statistics.

    Params:
    - complete_market_data_df (DataFrame): Market data with 'coin_id', 'date'
        and 'price' available as columns or index levels
    - period_start (datetime): Start date for calculation
    - period_end (datetime): End date for calculation

    Returns:
    - price_change_df (DataFrame): DataFrame with coin_id index and a single
        'price_change' column
    """
    # Reset index to access coin_id and date as columns
    df = complete_market_data_df.reset_index()

    # Price per coin on each boundary date
    start_prices = df.loc[df['date'] == period_start].set_index('coin_id')['price']
    end_prices = df.loc[df['date'] == period_end].set_index('coin_id')['price']

    # Mask zero start prices so the division yields NaN instead of inf
    valid_start_prices = start_prices.where(start_prices != 0)

    # Series arithmetic aligns on coin_id; coins missing on either side become NaN
    price_change = (end_prices - valid_start_prices) / valid_start_prices

    return price_change.to_frame('price_change')

# Price change window: from the modeling starting balance date to the end of
# the modeling period, both read from the wallets config
period_start = wallets_config['training_data']['modeling_starting_balance_date']
period_end = wallets_config['training_data']['modeling_period_end']
# NOTE(review): this head() is not the last expression of the cell, so a
# notebook will not render it; only the final head() below is displayed.
complete_market_data_df.head()

price_changes_df = calculate_price_changes(complete_market_data_df, period_start, period_end)
price_changes_df.head()

In [None]:
def analyze_price_change_by_quartiles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Analyze relationship between features and price_change using quartile analysis.

    For every usable feature column, rows are split into four buckets at the
    feature's 25th/50th/75th percentiles, and the mean feature value and mean
    price_change are reported per bucket together with the Pearson
    correlation between the feature and price_change.

    A feature column is skipped when it has zero variance, is not numeric, is
    entirely NaN, or has fewer than 20 rows where both the feature and
    price_change are present.

    Params:
    - df (DataFrame): input dataframe with features and price_change column

    Returns:
    - quartile_df (DataFrame): one row per analyzed feature with columns
        'metric', 'count', 'correlation', 'q1..q4_feature_avg',
        'q1..q4_price_avg' and 'monotonic_score', sorted by absolute
        correlation (descending). monotonic_score ranges from -3 to 3: +1 for
        each adjacent quartile step where the mean price_change rises, -1 for
        each step where it falls.

    Raises:
    - KeyError: if df has no 'price_change' column
    """
    min_valid_rows = 20  # Minimum needed for meaningful quartiles
    quartile_labels = ('q1', 'q2', 'q3', 'q4')
    result_columns = (
        ['metric', 'count', 'correlation']
        + [f'{label}_feature_avg' for label in quartile_labels]
        + [f'{label}_price_avg' for label in quartile_labels]
    )

    # Capture the target before dropping constant columns: a constant
    # price_change has zero variance too and must not be lost with them
    target = df['price_change'].copy()

    # Drop feature columns with zero variance
    variance = df.var(numeric_only=True)
    zero_var_cols = [col for col in variance[variance == 0].index if col != 'price_change']
    df = df.drop(columns=zero_var_cols)

    rows = []
    for col in df.columns:
        if col == 'price_change':
            continue

        feature = df[col]

        # Correlation and quantiles are only defined for numeric data
        if not pd.api.types.is_numeric_dtype(feature):
            continue

        # Skip columns that are entirely NaN
        if feature.isna().all():
            continue

        # Keep only rows where both the feature and the target are present
        valid_mask = feature.notna() & target.notna()
        feature_valid = feature[valid_mask]
        target_valid = target[valid_mask]

        if len(feature_valid) < min_valid_rows:
            continue

        # Quartile boundaries of the feature; each bucket is a half-open
        # interval (lower, upper], so heavily tied values can leave a bucket
        # empty, in which case its averages are NaN
        q1_bound, q2_bound, q3_bound = feature_valid.quantile([0.25, 0.5, 0.75]).tolist()
        quartile_masks = [
            feature_valid <= q1_bound,
            (feature_valid > q1_bound) & (feature_valid <= q2_bound),
            (feature_valid > q2_bound) & (feature_valid <= q3_bound),
            feature_valid > q3_bound,
        ]

        row = {
            'metric': col,
            'count': len(feature_valid),
            'correlation': feature_valid.corr(target_valid, method='pearson', min_periods=5),
        }
        for label, quartile_mask in zip(quartile_labels, quartile_masks):
            row[f'{label}_feature_avg'] = feature_valid[quartile_mask].mean()
            row[f'{label}_price_avg'] = target_valid[quartile_mask].mean()
        rows.append(row)

    # Explicit columns keep the schema stable even when no feature qualifies
    result_df = pd.DataFrame(rows, columns=result_columns)

    # Monotonicity: +1 per rising step between adjacent quartiles, -1 per
    # falling step; comparisons involving NaN count as neither
    price_avg_cols = [f'{label}_price_avg' for label in quartile_labels]
    monotonic_score = 0
    for lower_col, upper_col in zip(price_avg_cols, price_avg_cols[1:]):
        rises = (result_df[upper_col] > result_df[lower_col]).astype(int)
        falls = (result_df[upper_col] < result_df[lower_col]).astype(int)
        monotonic_score = monotonic_score + rises - falls
    result_df['monotonic_score'] = monotonic_score

    # Sort by absolute correlation values (descending)
    result_df = result_df.sort_values(
        'correlation', key=lambda correlation: correlation.abs(), ascending=False
    )

    return result_df

In [None]:
# DataFrame.join defaults to a left join on the index, so every coin in
# coin_features_df is kept and coins without a computed price change get NaN
merged_df = coin_features_df.join(price_changes_df)
analysis_df = analyze_price_change_by_quartiles(merged_df)

In [None]:
# Create X and y
# Keep features whose absolute correlation with price_change exceeds 0.005
# NOTE(review): analysis_df is computed from all rows of merged_df, so this
# selection uses rows that later end up in the test split.
features = analysis_df[abs(analysis_df['correlation'])>0.005]['metric']

X = merged_df[features]
# Target: price_change passed through u.winsorize with 0.02
# (a later cell reassigns y to a boolean top-5% label)
y = u.winsorize(merged_df['price_change'],0.02)

In [None]:
def mark_top_n_percent(values: pd.Series, threshold_percent: float) -> pd.Series:
    """
    Creates a boolean mask identifying values in the top n percent.

    The cutoff is the (100 - threshold_percent)th percentile of the non-missing
    values. Missing values are ignored when computing the cutoff and are never
    marked as top values. (Computing the percentile over data containing NaN
    yields NaN, which would mark nothing at all.)

    Params:
    - values (Series): Series of numeric values to evaluate
    - threshold_percent (float): Percentage threshold (0-100)

    Returns:
    - Series: Boolean mask of same length and index as input, True for values
        in top n percent

    Raises:
    - ValueError: if threshold_percent is outside the range 0-100
    """
    if not 0 <= threshold_percent <= 100:
        raise ValueError(
            f"threshold_percent must be between 0 and 100, got {threshold_percent}"
        )

    # Only non-missing values take part in the percentile calculation
    valid_values = values.dropna()
    if valid_values.empty:
        # Nothing to rank: no value can be in the top n percent
        return pd.Series(False, index=values.index, name=values.name, dtype=bool)

    # Calculate the threshold value at the specified percentile
    percentile_threshold = np.percentile(valid_values, 100 - threshold_percent)

    # NaN compares False, so missing values are never marked
    return values >= percentile_threshold

# Replace the winsorized regression target with a boolean label: True for
# coins whose price_change is in the top 5 percent
y = mark_top_n_percent(merged_df['price_change'], 5)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# First drop NaN values from the target
mask = ~y.isna()
X_filtered = X[mask]
y_filtered = y[mask]

# Split the data BEFORE imputing so that no statistic computed from test rows
# leaks into the training features
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Handle NaN values in features: fill both splits with the TRAINING column means
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Scale features (scaler is fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train XGBoost classification model
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Make predictions: hard labels and per-class probabilities
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)

# Calculate performance metrics. average='weighted' weights each class by its
# support, so a rare positive class contributes little to these numbers; the
# classification report below shows the per-class values.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate ROC AUC (handling multiclass if needed)
n_test_classes = len(np.unique(y_test))
if n_test_classes > 2:
    # One-vs-rest ROC AUC for multiclass
    roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')
else:
    # Binary classification ROC AUC uses the second probability column
    roc_auc = roc_auc_score(y_test, y_prob[:, 1])

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Plot ROC curve (for binary classification)
if n_test_classes == 2:
    fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Get feature importance; names come from the unscaled training frame because
# the scaler output is a plain array
importance = model.feature_importances_
feature_names = X_train.columns

# Sort features by importance (descending) and keep the ten largest
sorted_idx = np.argsort(importance)[::-1]
top_features = [(feature_names[i], importance[i]) for i in sorted_idx[:10]]

# Print top 10 features
print("\nTop 10 features:")
for name, imp in top_features:
    print(f"{name}: {imp:.4f}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Baseline XGBoost classification run on the notebook-level X (features) and y (target).

# Drop rows whose target is missing before any modeling step
mask = ~y.isna()
X_filtered = X[mask]
y_filtered = y[mask]

# Split FIRST so that imputation and scaling statistics come from training rows only
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Fill feature NaNs with TRAINING means; filling with full-dataset means (as was
# done previously, before the split) leaks information about the test rows
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Scale features (the scaler is fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train XGBoost classification model
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Make predictions: hard labels plus per-class probabilities
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)

# Calculate performance metrics (weighted averages across classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate ROC AUC; the number of classes in the test split decides the variant
n_test_classes = len(np.unique(y_test))
if n_test_classes > 2:
    # One-vs-rest ROC AUC for multiclass
    roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')
else:
    # Binary classification ROC AUC uses the second probability column
    roc_auc = roc_auc_score(y_test, y_prob[:, 1])

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Plot ROC curve (for binary classification only)
if n_test_classes == 2:
    fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Get feature importance; names come from X_train because scaling returns a bare array
importance = model.feature_importances_
feature_names = X_train.columns

# Sort features by importance, highest first, and keep the top 10
sorted_idx = np.argsort(importance)[::-1]
top_features = [(feature_names[i], importance[i]) for i in sorted_idx[:10]]

# Print top 10 features
print("\nTop 10 features:")
for name, imp in top_features:
    print(f"{name}: {imp:.4f}")

#### classification

#### regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# Baseline XGBoost regression run on the notebook-level X (features) and y (target).

# Drop rows whose target is missing before any modeling step
mask = ~y.isna()
X_filtered = X[mask]
y_filtered = y[mask]

# Split FIRST so that imputation and scaling statistics come from training rows only
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Fill feature NaNs with TRAINING means; filling with full-dataset means (as was
# done previously, before the split) leaks information about the test rows
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Scale features (the scaler is fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train XGBoost model
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print metrics
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

# Get feature importance; names come from X_train because scaling returns a bare array
importance = model.feature_importances_
feature_names = X_train.columns

# Sort features by importance, highest first, and keep the top 10
sorted_idx = np.argsort(importance)[::-1]
top_features = [(feature_names[i], importance[i]) for i in sorted_idx[:10]]

# Print top 10 features
print("\nTop 10 features:")
for name, imp in top_features:
    print(f"{name}: {imp:.4f}")

## validation period predictions with wallet model

#### generate validation period training data

In [None]:
# Reload project modules and re-read the config files so on-disk edits take effect
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_epochs_config = yaml.safe_load(
    Path('../config/wallets_epochs_config.yaml').read_text(encoding='utf-8')
)

# Load the complete datasets from the configured parquet folder
parquet_folder = wallets_config['training_data']['parquet_folder']
complete_profits_df = pd.read_parquet(f"{parquet_folder}/complete_profits_df.parquet")
complete_market_data_df = pd.read_parquet(f"{parquet_folder}/complete_market_data_df.parquet")
complete_macro_trends_df = pd.read_parquet(f"{parquet_folder}/complete_macro_trends_df.parquet")

# Identify offset needed to generate training data up to the validation period start
# modeling_offset = (datetime.strptime(wallets_config['training_data']['modeling_period_end'], '%Y-%m-%d') - datetime.strptime(wallets_config['training_data']['training_period_end'], '%Y-%m-%d')).days
# Work on a deep copy so the loaded epochs config is left untouched, then swap in
# the validation offsets as the offsets to generate
validation_epochs_config = copy.deepcopy(wallets_epochs_config)
offset_epochs_config = validation_epochs_config['offset_epochs']
offset_epochs_config['offsets'] = offset_epochs_config['validation_offsets']

# Initiate orchestrator
epochs_orchestrator = weo.MultiEpochOrchestrator(
    wallets_config.config,
    wallets_metrics_config,
    wallets_features_config,
    validation_epochs_config,
    complete_profits_df,
    complete_market_data_df,
    complete_macro_trends_df,
)

# Generate the training data and wallet features for the validation offsets
(
    validation_training_data_df,
    validation_wallet_features_df,
) = epochs_orchestrator.generate_epochs_training_data()

# Save both outputs (index included) for the prediction cell to load
validation_outputs = (
    ('validation_wallet_training_data_df', validation_training_data_df),
    ('validation_wallet_features_df', validation_wallet_features_df),
)
for output_name, output_df in validation_outputs:
    output_df.to_parquet(f"{parquet_folder}/{output_name}.parquet", index=True)


#### make validation predictions

In [None]:
# Reload project modules and the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Override model_id from model generation if necessary
# model_id = 'da541721-627d-4991-affd-b7822a80c67f'

# Load the validation files written by the generation cell
parquet_folder = wallets_config['training_data']['parquet_folder']
validation_training_data_df = pd.read_parquet(
    f"{parquet_folder}/validation_wallet_training_data_df.parquet"
)
validation_wallet_features_df = pd.read_parquet(
    f"{parquet_folder}/validation_wallet_features_df.parquet"
)

# y_true generation: the target for all wallets, then split by the cohort flag
target_variable = wallets_config['modeling']['target_variable']
cohort_flag = validation_wallet_features_df['in_modeling_cohort']
validation_y_true_full = validation_wallet_features_df[target_variable]
validation_y_true_modeling = validation_wallet_features_df[cohort_flag == 1][target_variable]
validation_y_true_nonmodeling = validation_wallet_features_df[cohort_flag == 0][target_variable]

# y_pred generation using the saved artifacts for model_id (set in an earlier cell)
base_path = wallets_config['training_data']['model_artifacts_folder']
validation_y_pred = wiva.load_and_predict(model_id, validation_training_data_df, base_path)

# Evaluate against the modeling cohort first, then the full population
wiva.evaluate_predictions(validation_y_true_modeling, validation_y_pred)
wiva.evaluate_predictions(validation_y_true_full, validation_y_pred)

#### validation multi window r2 comparison

In [None]:
# Actuals used for the per-epoch comparison
y_true_cohort = validation_y_true_full

# Consolidate scores for all validation epochs: inner-join actuals and predictions
# on the (wallet_address, epoch_start_date) key, then restore that key as the index
score_index_cols = ['wallet_address', 'epoch_start_date']
validation_wallet_scores_df = pd.merge(
    y_true_cohort.reset_index(),
    validation_y_pred.reset_index(),
    on=score_index_cols,
    how='inner',
    suffixes=('_true', '_pred'),
).set_index(score_index_cols)
validation_wallet_scores_df.columns = ['actual', 'score']


# Generate metrics for all validation epochs, in ascending epoch order
epoch_start_dates = validation_wallet_scores_df.index.get_level_values('epoch_start_date')
for epoch in sorted(epoch_start_dates.unique()):
    epoch_scores_df = validation_wallet_scores_df[epoch_start_dates == epoch]
    y_true = epoch_scores_df['actual']
    y_pred = epoch_scores_df['score']

    # Only score epochs that have at least one actual value
    if not y_true.isna().all():
        metrics = wiva.evaluate_predictions(y_true, y_pred)
        print(f"epoch {epoch}: R² = {metrics['r2']:.3f}")

u.notify(34)



## Single Window Construction


### Training Data Sequence

In [None]:
# Reload project modules and re-read the metrics/features configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Load orchestrator with its own deep copy of the wallets config dict
orchestrator_wallets_config = copy.deepcopy(wallets_config.config)
training_data_orchestrator = wtdo.WalletTrainingDataOrchestrator(
    orchestrator_wallets_config,
    wallets_metrics_config,
    wallets_features_config,
)

In [None]:
# Retrieve the datasets for the configured training period; the four return values
# are discarded here (later cells read parquet files with the 'training' prefix)
training_data_config = wallets_config['training_data']
_, _, _, _ = training_data_orchestrator.retrieve_period_datasets(
    training_data_config['training_period_start'],
    training_data_config['training_period_end'],
    parquet_prefix='training',
)

In [None]:
# Select cohort and prepare training data from the full-period parquet files
parquet_folder = wallets_config['training_data']['parquet_folder']
training_profits_df_full = pd.read_parquet(
    f"{parquet_folder}/training_profits_df_full.parquet"
)
training_market_data_df_full = pd.read_parquet(
    f"{parquet_folder}/training_market_data_df_full.parquet"
)
training_macro_trends_df_full = pd.read_parquet(
    f"{parquet_folder}/training_macro_trends_df_full.parquet"
)

# The return value is discarded; the next cell reads its inputs from parquet
_ = training_data_orchestrator.prepare_training_data(
    training_profits_df_full,
    training_market_data_df_full,
    training_macro_trends_df_full,
)

# Store hybrid ID map when wallet ID hybridization is enabled in the config
hybridize_wallet_ids = wallets_config['training_data']['hybridize_wallet_ids']
if hybridize_wallet_ids:
    hybrid_map_path = f"{parquet_folder}/hybrid_cw_id_map.pkl"
    pd.to_pickle(training_data_orchestrator.hybrid_cw_id_map, hybrid_map_path)

In [None]:
# Generate training features from the four prepared training-period parquet files
parquet_folder = wallets_config['training_data']['parquet_folder']
training_profits_df = pd.read_parquet(
    f"{parquet_folder}/training_profits_df.parquet"
)
training_market_indicators_df = pd.read_parquet(
    f"{parquet_folder}/training_market_indicators_data_df.parquet"
)
training_macro_indicators_df = pd.read_parquet(
    f"{parquet_folder}/training_macro_indicators_df.parquet"
)
training_transfers_df = pd.read_parquet(
    f"{parquet_folder}/training_transfers_sequencing_df.parquet"
)

training_data_orchestrator.generate_training_features(
    training_profits_df,
    training_market_indicators_df,
    training_macro_indicators_df,
    training_transfers_df,
)

u.notify(3)

### Wallet Model Target Variable and Wallet Cohort

In [None]:
# Load modeling datasets, restricted to the coins found in the training market indicators
training_data_config = wallets_config['training_data']
market_indicators_path = f"{training_data_config['parquet_folder']}/training_market_indicators_data_df.parquet"
coin_id_column_df = pd.read_parquet(market_indicators_path, columns=['coin_id'])
training_coin_cohort = coin_id_column_df['coin_id'].unique()

# The four return values are discarded; later cells read the 'modeling' parquet files
_, _, _, _ = training_data_orchestrator.retrieve_period_datasets(
    training_data_config['modeling_period_start'],
    training_data_config['modeling_period_end'],
    training_coin_cohort,
    parquet_prefix='modeling',
)

In [None]:
# Reload project modules and re-read the metrics/features configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Read only the index of the training data (columns=[] loads no data columns);
# its values define the training wallet cohort
training_data_path = f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
training_wallet_cohort = pd.read_parquet(training_data_path, columns=[]).index.values

# Load orchestrator, this time passing the training wallet cohort
orchestrator_wallets_config = copy.deepcopy(wallets_config.config)
training_data_orchestrator = wtdo.WalletTrainingDataOrchestrator(
    orchestrator_wallets_config,
    wallets_metrics_config,
    wallets_features_config,
    training_wallet_cohort,
)

In [None]:
# Prepare modeling features for target variables
training_data_config = wallets_config['training_data']
modeling_profits_df_full = pd.read_parquet(
    f"{training_data_config['parquet_folder']}/modeling_profits_df_full.parquet"
)

# The hybrid ID map is only loaded when hybridization is enabled; otherwise None is passed
if training_data_config['hybridize_wallet_ids']:
    hybrid_cw_id_map = pd.read_pickle(
        f"{wallets_config['training_data']['parquet_folder']}/hybrid_cw_id_map.pkl"
    )
else:
    hybrid_cw_id_map = None

_ = training_data_orchestrator.prepare_modeling_features(
    modeling_profits_df_full,
    hybrid_cw_id_map,
)

u.notify(3)

### Wallet Model Construction and Analysis

#### select target variable (loadable parquet)

In [None]:
# Reload project modules and the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
parquet_folder = wallets_config['training_data']['parquet_folder']
target_variable = wallets_config['modeling']['target_variable']

# Load the modeling period wallet features
modeling_wallet_features_df = pd.read_parquet(
    f"{parquet_folder}/modeling_wallet_features_df.parquet"
)

# Keep only the cohort flag and the configured target variable (as an independent copy)
target_var_columns = ['in_modeling_cohort', target_variable]
modeling_cohort_target_var_df = modeling_wallet_features_df[target_var_columns].copy()

# Retrieve training data for the full training wallet cohort
wallet_training_data_df = pd.read_parquet(
    f"{parquet_folder}/wallet_training_data_df_full.parquet"
)
logger.info("Training data df shape: %s", wallet_training_data_df.shape)
# sorted(list(wallet_training_data_df.columns))

#### build wallet model or run search

In [None]:
# Reload project modules and the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Retrieve training data for the full training wallet cohort
training_data_path = f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
wallet_training_data_df = pd.read_parquet(training_data_path)

# Instantiate the model from the 'modeling' section of the config
wallet_model = wm.WalletModel(wallets_config['modeling'])

# Validate indices match: after sorting, every index level of the training data
# must equal the same level of the target variable df
sorted_training_index = wallet_training_data_df.sort_index().index
sorted_target_index = modeling_cohort_target_var_df.sort_index().index
for level in wallet_training_data_df.index.names:
    training_level_values = sorted_training_index.get_level_values(level)
    if not training_level_values.equals(sorted_target_index.get_level_values(level)):
        raise ValueError("Merged training and modeling DataFrames have mismatched indices.")


# Build the model, then free the training data
wallet_model_results = wallet_model.construct_wallet_model(
    wallet_training_data_df,
    modeling_cohort_target_var_df,
)
del wallet_training_data_df
gc.collect()

# Results without a 'y_train' key get the search report instead of model artifacts
# (presumably the key is absent when a parameter search was run — confirm in wm.WalletModel)
if 'y_train' not in wallet_model_results:
    display(wallet_model.generate_search_report())
else:
    # Generate and save all model artifacts
    artifact_configs = {
        'wallets_config': wallets_config.config,
        'wallets_metrics_config': wallets_metrics_config,
        'wallets_features_config': wallets_features_config,
    }
    model_id, wallet_evaluator, modeling_wallet_scores_df = wmr.generate_and_save_wallet_model_artifacts(
        model_results=wallet_model_results,
        base_path='../artifacts/wallet_modeling',
        configs=artifact_configs,
        save_scores=False,
    )
    print(wallet_evaluator.summary_report())

## assess wallet model performance

### performance report

In [None]:
# Reload project modules and the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Run X_train through every pipeline step except the last ONCE and keep the
# resulting column names; previously this transform was executed twice (once for
# the evaluator and again only to print the feature count)
transformed_feature_names = (
    wallet_model_results['pipeline'][:-1]
    .transform(wallet_model_results['X_train'])
    .columns.tolist()
)

# Reload evaluator from the stored model results
wallet_evaluator = wime.RegressionEvaluator(
    y_train=wallet_model_results['y_train'],
    y_test=wallet_model_results['y_test'],
    y_pred=wallet_model_results['y_pred'],
    training_cohort_pred=wallet_model_results['training_cohort_pred'],
    training_cohort_actuals=wallet_model_results['training_cohort_actuals'],
    model=wallet_model_results['pipeline'].named_steps['regressor'],
    feature_names=transformed_feature_names
)
print(len(transformed_feature_names))
# Print results
print(wallet_evaluator.summary_report())
wallet_evaluator.plot_wallet_evaluation()
wallet_evaluator.importance_summary(0)

### importance analysis

In [None]:
# Display, in sorted order, the column names produced by running X_train through
# every pipeline step except the last one
sorted(wallet_model_results['pipeline'][:-1].transform(wallet_model_results['X_train']).columns.tolist())

In [None]:
# Display the evaluator's importance summary. The meaning of the integer argument is
# defined by the evaluator class and is not visible here — TODO confirm.
wallet_evaluator.importance_summary(0)
# wallet_evaluator.importance_summary(1)

In [None]:
# Reload project modules and the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Column names produced by every pipeline step except the last, computed once
transformed_feature_names = (
    wallet_model_results['pipeline'][:-1]
    .transform(wallet_model_results['X_train'])
    .columns.tolist()
)

# Reload evaluator
wallet_evaluator = wime.RegressionEvaluator(
    y_train=wallet_model_results['y_train'],
    y_test=wallet_model_results['y_test'],
    y_pred=wallet_model_results['y_pred'],
    training_cohort_pred=wallet_model_results['training_cohort_pred'],
    training_cohort_actuals=wallet_model_results['training_cohort_actuals'],
    model=wallet_model_results['pipeline'].named_steps['regressor'],
    feature_names=transformed_feature_names
)

# Break the evaluator's importances into a dataframe with the grouping columns used below
feature_importances_df = wiva.analyze_wallet_model_importance(wallet_evaluator.metrics['importances'])

# Categories to keep; toggle entries by (un)commenting
feature_categories_filter = [
    # 'performance',
    # 'timing',
    # 'trading',
    # 'transfers',
    'mktcap',
    # 'scenario',
    # 'cluster',
]

# Optional feature-name filter (only used if the matching line below is uncommented)
feature_names_filter = [
    # 'price_sma_5',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
]

# Columns to group the importances by
groups = [
    # 'record_type',
    'feature_category',
    'feature_name',
    'feature_comparison',
    'feature_aggregation',
    # 'training_segment',
]

# Sum numeric columns per group and rank by total importance. The previous
# `.sum('importance')` passed the string positionally as `numeric_only`, where it
# only worked by being truthy; `numeric_only=True` states the same behaviour explicitly.
(feature_importances_df
 [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None').groupby(groups)
 .sum(numeric_only=True)
 .sort_values(by='importance',ascending=False)
)

### save scores for coin model

In [None]:
# Name under which this model's scores are stored for the coin model
score_name = 'net_gain_max_investment_winsorized_base'
# score_name = 'x'

# Special save score for use in the coin model

# Create wallet scores DataFrame with both cohorts: predictions, actuals, and a
# flag marking wallets whose index appears in y_test
training_cohort_pred = wallet_model_results['training_cohort_pred']
y_test_index = wallet_model_results['y_test'].index
modeling_wallet_scores_df = pd.DataFrame({
    f'score|{score_name}': training_cohort_pred,
    f'actual|{score_name}': wallet_model_results['training_cohort_actuals'],
    'in_modeling_cohort': training_cohort_pred.index.isin(y_test_index),
})

modeling_wallet_scores_df.head()


# scores_df.head()
scores_output_path = f"temp/wallet_modeling_score_dfs/{score_name}.parquet"
modeling_wallet_scores_df.to_parquet(scores_output_path, index=True)

u.notify(2)
# u.notify(15)

### orchestrate experiment

In [None]:

# [importlib.reload(module) for module in modules]
# wallets_config.reload()

# # Load experiments config
# wallets_config_experiment = yaml.safe_load(Path('../config/wallets_config_experiment.yaml').read_text(encoding='utf-8'))

# # Initialize orchestrator with both configs
# orchestrator = wimo.WalletExperimentsOrchestrator(
#     config_base=wallets_config.config,         # your base config dict
#     config_experiment=wallets_config_experiment  # your experiment config dict
# )

# # Run experiment
# results = orchestrator.orchestrate_wallet_experiment(
#     training_data_df=wallet_training_data_df,
#     modeling_wallet_features_df=modeling_wallet_features_df
# )


### Cluster analysis

In [None]:
# Reload project modules and the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load parquet
training_data_path = f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
wallet_training_data_df = pd.read_parquet(training_data_path)


# List of the x features with the highest importance in the model
x_features = 6
ranked_importances_df = (
    pd.DataFrame(wallet_evaluator.metrics['importances'])
    .sort_values(by='importance', ascending=False)
)
top_feature_metrics = list(ranked_importances_df.head(x_features)['feature'])
# De-duplicate the feature names (set ordering is not guaranteed)
comparison_metrics = list(set(top_feature_metrics))



# Cluster numbers
n_clusters = 4

styled_df, cluster_results_df = wica.create_cluster_report(
    wallet_training_data_df,
    wallet_model_results,
    n_clusters,
    comparison_metrics,
    'median',
)

# Free the training data before displaying the report
del wallet_training_data_df
gc.collect()

styled_df

In [None]:
# Load the training data directly from parquet. This cell previously copied
# `wallet_training_data_df`, but the cells above delete that variable after use,
# so running the notebook in order raised a NameError here.
modeling_df = pd.read_parquet(
    f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet"
)

# Metrics always included in the cluster analysis alongside the comparison metrics
base_metrics = [
    'trading|max_investment|all_windows',
    'trading|crypto_net_gain|all_windows',
    'mktcap|end_portfolio_wtd_market_cap|all_windows',
    'performance|crypto_net_gain/max_investment/base|all_windows',
]
# Every column whose name starts with the 'cluster|' prefix
cluster_cols = [col for col in modeling_df.columns if col.startswith('cluster|')]
# set() removes names that appear in more than one of the three lists
cluster_analysis_df = modeling_df[list(set(cluster_cols + base_metrics + comparison_metrics))].copy()


# Assign wallets to categorical clusters based on the distance values
cluster_assignments_df = wcl.assign_clusters_from_distances(cluster_analysis_df,
                                                        wallets_config['features']['clustering_n_clusters'])
# cluster_analysis_df = cluster_analysis_df.join(cluster_assignments_df,how='inner')



In [None]:
# Display the column names selected for the cluster analysis
list(cluster_analysis_df.columns)

In [None]:
# Display the cluster assignments produced by the cell above
cluster_assignments_df

# Validation Period Analysis

#### non-wallet coin model feature generation (slow)

In [None]:
# Reload all modules
for module in modules:
    importlib.reload(module)
wallets_config.reload()
# Reload all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Confirm period boundaries align: the coin config's modeling period must equal
# the wallet config's validation period at both ends
coin_training_data_config = config['training_data']
wallet_training_data_config = wallets_config['training_data']
period_starts_align = (
    coin_training_data_config['modeling_period_start']
    == wallet_training_data_config['validation_period_start']
)
period_ends_align = (
    coin_training_data_config['modeling_period_end']
    == wallet_training_data_config['validation_period_end']
)
if not (period_starts_align and period_ends_align):
    raise ValueError("Coin features modeling period must align with wallet features validation period.")

# Generate features based on the coin config files
coin_features_training_data_df, _, _ = tw.generate_all_time_windows_model_inputs(
    config,
    metrics_config,
    modeling_config,
)

# Remove time window index since we aren't using that for now
coin_features_training_data_df = coin_features_training_data_df.reset_index(
    level='time_window',
    drop=True,
)

# Save to parquet
coin_features_training_data_df.to_parquet(
    "temp/coin_modeling_dfs/coin_non_wallet_features_training_data_df.parquet",
    index=True,
)

u.notify()

### Load modeling dataset files (loadable parquet)

In [None]:
# Reload project modules and re-read the wallets and coin configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)

training_data_config = wallets_config['training_data']
parquet_folder = training_data_config['parquet_folder']

# Load coin cohort (currently carried through training/modeling/validation periods)
training_coin_cohort = pd.read_parquet(
    f"{parquet_folder}/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)['coin_id'].unique()

# Load modeling period scores and data
modeling_market_data_df_full = pd.read_parquet(f"{parquet_folder}/modeling_market_data_df_full.parquet")
modeling_profits_df = pd.read_parquet(f"{parquet_folder}/modeling_profits_df.parquet")

# Filter historical records: keep rows dated on or after the starting balance date
starting_balance_date = training_data_config['modeling_starting_balance_date']
on_or_after_start = modeling_market_data_df_full['date'] >= starting_balance_date
modeling_market_data_df = modeling_market_data_df_full[on_or_after_start]

# Check both datasets against the configured modeling period boundaries
for period_df in (modeling_market_data_df, modeling_profits_df):
    u.assert_period(
        period_df,
        wallets_config['training_data']['modeling_period_start'],
        wallets_config['training_data']['modeling_period_end'],
    )


u.obj_mem()

# Coin Model Construction

## Prepare coin_training_data_df

### assign wallets to segments

In [None]:
# Reload project modules and re-read the wallets and coin configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Create base df with all wallet addresses and scores
wallet_scores_df = cfo.load_wallet_scores(wallets_coin_config['wallet_segments']['wallet_scores'],
                                            wallets_coin_config['wallet_segments']['wallet_scores_path'])
# NOTE: this binds a second name to the same object (no copy), so the column
# added below is also visible through wallet_scores_df
wallet_segmentation_df = wallet_scores_df

# Add "all" segment for full population level aggregations
wallet_segmentation_df['all_wallets|all'] = 'all'
wallet_segmentation_df['all_wallets|all'] = wallet_segmentation_df['all_wallets|all'].astype('category')


# Add score quantile assignments
wallet_segmentation_df = cws.assign_wallet_score_quantiles(
    wallet_segmentation_df,
    wallets_coin_config['wallet_segments']['wallet_scores'],
    wallets_coin_config['wallet_segments']['score_segment_quantiles']
)

# Add training period-based cluster labels if configured
if wallets_coin_config['wallet_segments'].get('training_period_cluster_groups'):
    training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")
    wallet_clusters_df = cws.assign_cluster_labels(
        training_data_df,
        wallets_coin_config['wallet_segments']['training_period_cluster_groups']
    )
    del training_data_df
    # Previously written as a bare `gc.collect` (no parentheses), which only
    # referenced the function and never ran a collection
    gc.collect()

    # Join together and ensure no rows were dropped
    orig_len = len(wallet_segmentation_df)
    wallet_segmentation_df = wallet_segmentation_df.join(wallet_clusters_df,how='inner')
    joined_len = len(wallet_segmentation_df)
    if joined_len < orig_len:
        raise ValueError(f"Join dropped {orig_len - joined_len} rows from original {orig_len} rows")


u.obj_mem()

list(wallet_segmentation_df.columns)


### generate metrics for coin-wallet pairs

In [None]:
# Reload project modules and re-read the wallets and coin configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Create base df with all coin-wallet pairs: an empty frame indexed by the unique
# (coin_id, wallet_address) combinations found in the modeling profits
cw_metrics_df = pd.DataFrame(
    index=modeling_profits_df[['coin_id', 'wallet_address']]
    .drop_duplicates()
    .set_index(['coin_id', 'wallet_address'])
    .index
)

# Only modeling period boundaries work until date imputation logic is added
valid_dates = [
   wallets_config['training_data']['modeling_starting_balance_date'],
   wallets_config['training_data']['modeling_period_end']
]
# Raise explicitly rather than assert: asserts are removed when Python runs with -O,
# and the other validation checks in this notebook raise ValueError
balance_dates = wallets_coin_config['wallet_features']['wallet_balance_dates']
if not all(date in valid_dates for date in balance_dates):
    raise ValueError(f"Balance dates must be one of {valid_dates}")

# Generate balance metric
cw_balances_df = cwbm.calculate_coin_wallet_balances(
   modeling_profits_df,
   wallets_coin_config['wallet_features']['wallet_balance_dates']
)
cw_balances_df = cw_balances_df.add_prefix('balances/')
# Left join keeps every coin-wallet pair; pairs without a balance row get 0
cw_metrics_df = cw_metrics_df.join(cw_balances_df,how='left')\
        .fillna({col: 0 for col in cw_balances_df.columns})


# Generate trading metrics
cw_trading_features_df = cwbm.calculate_coin_wallet_trading_metrics(modeling_profits_df,
                                                                    wallets_config['training_data']['modeling_period_start'],
                                                                    wallets_config['training_data']['modeling_period_end'],
                                                                    wallets_coin_config['wallet_features']['drop_trading_metrics'])
cw_trading_features_df = cw_trading_features_df.add_prefix('trading/')
# Left join keeps every coin-wallet pair; pairs without trading rows get 0
cw_metrics_df = cw_metrics_df.join(cw_trading_features_df,how='left')\
        .fillna({col: 0 for col in cw_trading_features_df.columns})

cw_metrics_df.describe()

### flatten cw_metrics into single values for each coin-segment pair

In [None]:
# Reload project modules and re-read the wallets and coin configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Initialize an empty df indexed by the complete training coin cohort
coin_wallet_features_df = pd.DataFrame(index=training_coin_cohort)
coin_wallet_features_df.index.name = 'coin_id'


# Segment families are all segmentation columns not prefixed with 'scores|'
is_score_column = wallet_segmentation_df.columns.str.startswith('scores|')
segmentation_families = wallet_segmentation_df.columns[~is_score_column]
metric_columns = cw_metrics_df.columns

# Calculate all features for every (metric column, segment family) combination
logger.info("Calculating segment features for each metric column...")
for metric_number, metric_column in enumerate(metric_columns, start=1):

    for segment_family in segmentation_families:

        # Generate coin-level features based on modeling period end wallet scores and balances
        coin_segment_family_features_df = cfo.flatten_cw_to_coin_features(
            cw_metrics_df,
            metric_column,
            wallet_segmentation_df,
            segment_family,
            training_coin_cohort,
        )
        coin_wallet_features_df = coin_wallet_features_df.join(
            coin_segment_family_features_df,
            how='inner',
        )

    logger.info("Completed metric %s/%s: %s...",
                metric_number, len(metric_columns), metric_column)

logger.info("Calculated all metric-segment-aggregation features. Final output shape: %s",
            coin_wallet_features_df.shape )

# Release the intermediate dataframes
del cw_metrics_df,cw_trading_features_df,cw_balances_df,wallet_scores_df#,wallet_segmentation_df
gc.collect()

u.obj_mem()

# save to parquet if next step won't be joined
coin_wallet_features_df.to_parquet(
    "temp/coin_modeling_dfs/coin_training_data_df_full.parquet",
    index=True,
)

coin_wallet_features_df.shape

### Merge to non_wallet_features (if generated)

In [None]:
# Load the saved non-wallet coin features and display their column names
coin_non_wallet_features_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_non_wallet_features_training_data_df.parquet")
list(coin_non_wallet_features_training_data_df.columns)

In [None]:
# Display (rows, columns) of the wallet-based coin features
coin_wallet_features_df.shape

In [None]:
# Display (rows, columns) of the non-wallet coin features
coin_non_wallet_features_training_data_df.shape

In [None]:
# Retrieve data from coin features pipeline
coin_non_wallet_features_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_non_wallet_features_training_data_df.parquet")

# Confirm overlap using the dataframe loaded above. The check previously read
# `coin_features_training_data_df`, which only exists if the slow feature
# generation cell was run in this session and may not match the saved parquet.
coin_features_ids = coin_non_wallet_features_training_data_df.index
coin_wallet_features_ids = coin_wallet_features_df.index
wallet_features_only_ids = set(coin_wallet_features_ids) - set(coin_features_ids)

if not wallet_features_only_ids:
    logger.info("All %s coins with wallet features were found in the non wallet coin features set.",
                len(coin_wallet_features_ids))

else:
    # Lazy %-formatting, matching the other logger calls in this notebook
    logger.warning("Wallet features contain %s coins not in the non wallet coin features",
                   len(wallet_features_only_ids))


# Join together; the inner join keeps only coins present in both feature sets
coin_training_data_df_full = coin_wallet_features_df.join(coin_non_wallet_features_training_data_df,how='inner')
logger.info("Final features shape: %s",coin_training_data_df_full.shape)

# Save to parquet and delete
coin_training_data_df_full.to_parquet("temp/coin_modeling_dfs/coin_training_data_df_full.parquet",index=True)
del coin_training_data_df_full,coin_wallet_features_df
gc.collect()

u.obj_mem()

### review columns

In [None]:
# Reload the saved coin training data for column review
coin_training_data_df_full = pd.read_parquet("temp/coin_modeling_dfs/coin_training_data_df_full.parquet")


In [None]:
# One row per feature column of the coin training data
df = pd.DataFrame(coin_training_data_df_full.columns)
df.columns = ['feature']

# Split on pipe delimiters into the four top-level name components
top_level_names = ['segment_category', 'segment_family', 'metric', 'transformation']
split_df = df['feature'].str.split('|', expand=True)
split_df.columns = top_level_names


def _split_nested(component, detail_names):
    """Split one top-level component on '/' and label the resulting columns."""
    nested_df = split_df[component].str.split('/', expand=True)
    nested_df.columns = detail_names
    return nested_df


# Split nested components
segment_families = _split_nested('segment_family', ['segment_family', 'segment_value'])
metrics = _split_nested('metric', ['metric', 'metric_detail'])
transformations = _split_nested('transformation', ['transformation', 'transformation_method'])

# Combine all components side by side and keep the original feature name
feature_details_df = pd.concat(
    [split_df['segment_category'], segment_families, metrics, transformations],
    axis=1,
)

feature_details_df['feature_full'] = df['feature']
feature_details_df

In [None]:
# Interactive review of feature_details_df: filter the rows, then count how
# many features fall into each combination of the `groups` columns.
# Entries and mask lines are toggled by commenting/uncommenting them.
# Only the segment_category mask is active below; the other filter lists are
# defined but have no effect while their mask lines stay commented out.
segment_category_filter = [
    # 'all_wallets',
    # 'score_quantile',
    # 'training_clusters',
    'time_series',
    'wallet_cohorts',
]
segment_family_filter = [
    # 'all_wallets',
    'net_gain_winsorized_dda619_grid_score',
    # 'time_series',
    # 'wallet_cohorts',
]
metric_filter = [
    'trading',
    'balances',
]
metric_detail_filter = [
    'crypto_net_gain',
    'usd_balance_241031',
]
transformation_filter = [
    # 'aggregations',
    # 'score_wtd',
    'score_dist',
]
transformation_method_filter = [
    'count',
    'sum',
    # 'net_gain_winsorized_dda619_grid_score'
]

# Columns to group by when counting features
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    'metric',
    # 'metric_detail',
    # 'transformation',
    # 'transformation_method',
    'feature_full',

]

# Missing components are filled with the string 'None' before grouping; the
# size() result becomes a one-column frame (column 0) sorted by descending count.
pd.DataFrame(feature_details_df
 [
 (feature_details_df['segment_category'].isin(segment_category_filter))
#  & (feature_details_df['segment_family'].isin(segment_family_filter))
#  & (feature_details_df['metric'].isin(metric_filter))
# #  & (feature_details_df['metric_detail'].isin(metric_detail_filter))
#  & (feature_details_df['transformation'].isin(transformation_filter))
# #  & (feature_details_df['transformation_method'].isin(transformation_method_filter))
    ]
 .fillna('None').groupby(groups)
 .size()
# ).columns
).sort_values(by=0,ascending=False)


## Prepare coin_modeling_df

### Retrieve validation datasets

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Retrieve Validation Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical through validation period datasets

# Retrieve training coin cohort to ensure all training period coins are reflected
training_coin_cohort = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/training_market_indicators_data_df.parquet",
                                       columns=['coin_id'])['coin_id'].unique()

# The three return values are discarded here.
# NOTE(review): presumably the call persists its outputs as parquet files
# using `parquet_prefix` — confirm against wtdo.retrieve_period_datasets.
_,_,_ = wtdo.retrieve_period_datasets(
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    training_coin_cohort,
    parquet_prefix = 'validation'

)

# Release the discarded results; gc.collect must be called (not just
# referenced) for the collection to actually run
del _
gc.collect()
u.obj_mem()


### load validation datasets

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Shared config lookups for this cell
_training_cfg = wallets_config['training_data']
_parquet_dir = _training_cfg['parquet_folder']

# Load the full validation datasets from parquet
validation_market_data_df_full = pd.read_parquet(f"{_parquet_dir}/validation_market_data_df_full.parquet")
validation_profits_df_full = pd.read_parquet(f"{_parquet_dir}/validation_profits_df_full.parquet")


# Keep only prices on or after the validation starting balance date
validation_market_data_df = validation_market_data_df_full.loc[
    validation_market_data_df_full['date'] >= _training_cfg['validation_starting_balance_date']
]
del validation_market_data_df_full
gc.collect()


# Apply wallet-coin hybridization when the config enables it
if _training_cfg['hybridize_wallet_ids'] is True:
    hybrid_cw_id_map = pd.read_pickle(f"{_parquet_dir}/hybrid_cw_id_map.pkl")

    logger.info("Applying wallet-coin hybridization...")
    validation_profits_df_full, _ = wtdo.hybridize_wallet_address(
        validation_profits_df_full,
        hybrid_cw_id_map
    )

# Restrict profits to wallets in the training cohort; reading with columns=[]
# loads only the index of the wallet training data
training_wallet_cohort = pd.read_parquet(
    f"{_parquet_dir}/wallet_training_data_df_full.parquet",
    columns=[],
).index.values
_in_training_cohort = validation_profits_df_full['wallet_address'].isin(training_wallet_cohort)
validation_profits_df = validation_profits_df_full[_in_training_cohort]
del validation_profits_df_full
gc.collect()


# Check both frames against the validation period, then persist them
for _period_df in (validation_market_data_df, validation_profits_df):
    u.assert_period(_period_df,
                    _training_cfg['validation_period_start'],
                    _training_cfg['validation_period_end'])
validation_profits_df.to_parquet(f"{_parquet_dir}/validation_profits_df.parquet",index=False)
validation_market_data_df.to_parquet(f"{_parquet_dir}/validation_market_data_df.parquet",index=False)
# del validation_profits_df,validation_market_data_df
gc.collect()
u.obj_mem()


### apply coin filters (parquet loadable)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

_coin_modeling_cfg = wallets_coin_config['coin_modeling']

# Load the unfiltered coin features
coin_training_data_df_full = pd.read_parquet("temp/coin_modeling_dfs/coin_training_data_df_full.parquet")
logger.info("Starting coins: %s", len(coin_training_data_df_full))

# Holdings filter: minimum wallet count and minimum total balance
min_cohort_wallets = _coin_modeling_cfg['min_cohort_wallets']
min_cohort_balance = _coin_modeling_cfg['min_cohort_balance']

_wallet_count_col = 'all_wallets|all/all|balances/usd_balance_241031|aggregations/count'
_balance_sum_col = 'all_wallets|all/all|balances/usd_balance_241031|aggregations/sum'
_has_enough_wallets = coin_training_data_df_full[_wallet_count_col] >= min_cohort_wallets
_has_enough_balance = coin_training_data_df_full[_balance_sum_col] >= min_cohort_balance

coin_training_data_df = coin_training_data_df_full[_has_enough_wallets & _has_enough_balance]
logger.info("Coins after balance filters: %s", len(coin_training_data_df))
# del coin_training_data_df_full
# gc.collect()

# Market cap filter: keep coins inside the inclusive [min, max] range, and
# also keep coins whose market cap is missing
min_market_cap = _coin_modeling_cfg['min_market_cap']
max_market_cap = _coin_modeling_cfg['max_market_cap']

_market_cap = coin_training_data_df['time_series|market_data|market_cap_last']
coin_training_data_df = coin_training_data_df[
    _market_cap.isna() | _market_cap.between(min_market_cap, max_market_cap)
]
logger.info("Coins after market cap filters: %s", len(coin_training_data_df))

# Persist the filtered coins
coin_training_data_df.to_parquet("temp/coin_modeling_dfs/coin_training_data_df.parquet",index=True)
# del coin_training_data_df
gc.collect()

u.obj_mem()

### Prepare coin model target variable (parquet loadable)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load market data
validation_market_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_market_data_df.parquet")
coin_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_training_data_df.parquet")


# Target variable calculations
# ----------------------------
# Calculate coin return performance during validation period
validation_coin_performance_df = civa.calculate_coin_performance(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end']
)

# Drop rows with np.nan coin_return values, which indicate a 0 starting price
validation_coin_performance_df = validation_coin_performance_df.dropna()

# Add winsorized return
validation_coin_performance_df['coin_return_winsorized'] = u.winsorize(
        validation_coin_performance_df['coin_return'],
        wallets_coin_config['coin_modeling']['returns_winsorization'])


# Add full percentile (meaning it's a percentile of all coins prior to any population filtering)
validation_coin_performance_df['coin_return_pctile_full'] = validation_coin_performance_df['coin_return'].rank(pct=True,ascending=True)


# Validation: every coin in the training features must have a target value
missing_coins = set(coin_training_data_df.index) - set(validation_coin_performance_df.index)
if missing_coins:
    raise ValueError(f"Found {len(missing_coins)} coin_ids in training_data_df without validation period target variables.")


# Target variable attachment
# --------------------------
# Identify target variable column
target_var_column = wallets_coin_config['coin_modeling']['target_variable']

# 'coin_return_pctile' is ranked among the filtered coin_training_data_df coins
# only, so it is computed after the join rather than taken from the
# performance frame; the raw return column is dropped afterwards
if target_var_column == 'coin_return_pctile':
    coin_modeling_df = coin_training_data_df.join(validation_coin_performance_df[['coin_return']])
    coin_modeling_df['coin_return_pctile'] = coin_modeling_df['coin_return'].rank(pct=True,ascending=True)
    coin_modeling_df = coin_modeling_df.drop('coin_return',axis=1)
else:
    coin_modeling_df = coin_training_data_df.join(validation_coin_performance_df[[target_var_column]])
# del coin_training_data_df,validation_coin_performance_df
# gc.collect must be called (not just referenced) for the collection to run
gc.collect()


# Convert the index to string to avoid serialization/export categorical series issues
coin_modeling_df.index = coin_modeling_df.index.astype(str)


u.obj_mem()

## Build coin model

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Build the coin model from the prepared modeling frame
coin_model = cm.CoinModel(modeling_config=wallets_coin_config['coin_modeling'])
coin_model_results = coin_model.construct_coin_model(feature_df=coin_modeling_df)
# del coin_modeling_df
gc.collect()

# Results without a 'y_train' entry get the search report instead of artifacts
# NOTE(review): presumably this is the parameter-search path — confirm in CoinModel
if 'y_train' not in coin_model_results:
    display(coin_model.generate_search_report())
else:
    # Generate and save all model artifacts, then print the evaluation summary
    _artifact_configs = {
        'wallets_coin_config': wallets_coin_config,
        'wallets_config': wallets_config.config
    }
    coin_model_id, coin_evaluator, coin_scores_df = cmr.generate_and_save_coin_model_artifacts(
        model_results=coin_model_results,
        base_path='../artifacts/coin_modeling',
        configs=_artifact_configs,
    )
    print(coin_evaluator.summary_report())


## Post model analysis

### performance report

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Feature names come from running the training data through every pipeline
# step except the final one
_fitted_pipeline = coin_model_results['pipeline']
_transformed_train = _fitted_pipeline[:-1].transform(coin_model_results['X_train'])

# Rebuild the evaluator from the stored model results
coin_evaluator = wime.RegressionEvaluator(
    y_train=coin_model_results['y_train'],
    y_test=coin_model_results['y_test'],
    y_pred=coin_model_results['y_pred'],
    model=_fitted_pipeline.named_steps['regressor'],
    feature_names=_transformed_train.columns.tolist(),
)

# Text summary, evaluation plots, then the importance summary as cell output
print(coin_evaluator.summary_report())
coin_evaluator.plot_coin_evaluation()
coin_evaluator.importance_summary(0)

### importance analysis

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Interactive review of model importances: filter the feature rows, then sum
# importance within each combination of the `groups` columns.
# Entries and mask lines are toggled by commenting/uncommenting them; only the
# segment_category mask is active below.
feature_details_df = civa.analyze_coin_model_importance(coin_evaluator.metrics['importances'])

segment_category_filter = [
    # 'all_wallets',
    'score_quantile',
    # 'time_series',
    # 'wallet_cohorts',
    # 'training_clusters',
]
segment_family_filter = [
    'all_wallets',
    'net_gain_winsorized_dda619_grid_score',
    # 'time_series',
    # 'wallet_cohorts',
]
metric_filter = [
    # 'trading',
    'balances',
]
metric_detail_filter = [
    'crypto_net_gain',
    'usd_balance_241031',
]
transformation_filter = [
    # 'aggregations',
    # 'score_wtd',
]
transformation_method_filter = [
    'net_gain_winsorized_dda619_grid_residual_p10',
    # 'sum',
]

# Columns to group by when summing importance
groups = [
    'segment_category',
    # 'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    'transformation',
    'transformation_method',
    # 'feature_full',

]

# GroupBy.sum's first positional parameter is `numeric_only`, not a column
# name, so request numeric-only summation explicitly rather than relying on a
# truthy string being passed into that slot.
pd.DataFrame(feature_details_df
 [
 (feature_details_df['segment_category'].isin(segment_category_filter))
#  & (feature_details_df['segment_family'].isin(segment_family_filter))
#  & (feature_details_df['metric'].isin(metric_filter))
#  & (feature_details_df['metric_detail'].isin(metric_detail_filter))
#  & (feature_details_df['transformation'].isin(transformation_filter))
#  & (feature_details_df['transformation_method'].isin(transformation_method_filter))
    ]
 .fillna('None').groupby(groups)
 .sum(numeric_only=True)
# ).columns
).sort_values(by='importance',ascending=False)


In [None]:
# Decompose each feature name from the evaluator's importances into its
# pipe- and slash-delimited parts, keeping the importance alongside.
feature_importance_df = pd.DataFrame(coin_evaluator.metrics['importances'])

# Top-level components are separated by pipes
split_df = (
    feature_importance_df['feature']
    .str.split('|', expand=True)
    .set_axis(['segment_category', 'segment_family', 'metric', 'transformation'], axis=1)
)

# The last three components each nest two parts separated by a slash
segment_families = (
    split_df['segment_family']
    .str.split('/', expand=True)
    .set_axis(['segment_family', 'segment_value'], axis=1)
)
metrics = (
    split_df['metric']
    .str.split('/', expand=True)
    .set_axis(['metric', 'metric_detail'], axis=1)
)
transformations = (
    split_df['transformation']
    .str.split('/', expand=True)
    .set_axis(['transformation', 'transformation_method'], axis=1)
)

# Stitch the parts and the importance values together side by side
_detail_parts = [
    split_df['segment_category'],
    segment_families,
    metrics,
    transformations,
    feature_importance_df['importance'],
]
feature_details_df = pd.concat(_detail_parts, axis=1)

feature_details_df

In [None]:
# Display every feature name in feature_importance_df as a plain list
list(feature_importance_df['feature'])

In [None]:
# Columns to group by when summing importance; toggle entries by
# commenting/uncommenting them
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    # 'transformation',
    # 'transformation_method',
]

# GroupBy.sum's first positional parameter is `numeric_only`, not a column
# name, so request numeric-only summation explicitly rather than relying on a
# truthy string being passed into that slot.
feature_details_df.groupby(groups).sum(numeric_only=True).sort_values(by='importance',ascending=False)

In [None]:
# NOTE(review): result_df is not assigned in the surrounding cells — confirm
# it is defined earlier in the notebook before running this cell
result_df

## analyze features

### basic correlation

In [None]:
# Pairwise correlations across all columns of the modeling frame
correlation_matrix = coin_modeling_df.corr()

# Correlation of each column with the target, strongest positive first
target_correlations = correlation_matrix[target_var_column].sort_values(ascending=False)

# Show the 15 columns most positively correlated with the target
target_correlations.head(15)
# target_correlations

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))



# # Wallet metrics to analyze
# wallet_metrics = [
# ]

# Analyze the 15 columns most positively correlated with the target
# (alternative: every column)
# wallet_metrics = coin_modeling_df.columns
wallet_metrics = target_correlations.head(15).index.values

# number of score buckets
n_quantiles = 5

analyze_df = civa.analyze_metric_segments(
    coin_modeling_df,
    wallet_metrics,
    n_quantiles,
    target_var_column,
)

# Styled table is the cell output
civa.style_metric_segments(analyze_df)

# Pre Coin Model Analysis

### Wallet aggregated analysis

#### generate validation wallet features

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Start from an empty frame indexed by every wallet in the training cohort
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Trading features over the validation period
validation_trading_features_df = wtf.calculate_wallet_trading_features(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    include_twb_metrics=False,
)

# Left join keeps every cohort wallet; wallets with no trading rows get 0 in
# each trading feature column
_trading_fill_values = dict.fromkeys(validation_trading_features_df.columns, 0)
validation_wallet_features_df = (
    validation_wallet_features_df
    .join(validation_trading_features_df, how='left')
    .fillna(_trading_fill_values)
)

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(validation_wallet_features_df,include_twb_metrics=False)
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')

In [None]:
# Display validation_wallet_features_df as the cell output
validation_wallet_features_df

#### wallet validation period trading/performance by score quantile

In [None]:
# Create base df with all wallet addresses and scores
modeling_wallet_scores_df = cfo.load_wallet_scores(wallets_coin_config['wallet_segments']['wallet_scores'],
                                            wallets_coin_config['wallet_segments']['wallet_scores_path'])


In [None]:
# Reload project modules and config so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Metrics to report for each score quantile
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

# Report settings
min_wallet_volume_usd = 0
num_quantiles = 5

# Scores are taken from the column named by the configured score name
_score_column = wallets_config['modeling']['score_name']

wiva.create_quantile_report(
    validation_wallet_features_df,
    modeling_wallet_scores_df[_score_column],
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


In [None]:
# Reload project modules and config so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Same quantile report inputs as the cell directly above
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

min_wallet_volume_usd, num_quantiles = 0, 5

# Positional order: wallet features, wallet scores, metrics, quantile count,
# minimum wallet volume
wiva.create_quantile_report(
    validation_wallet_features_df,
    modeling_wallet_scores_df[wallets_config['modeling']['score_name']],
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


### old analysis

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Wallet metrics to include in the report
wallet_metrics = [
    'top_100pct/balance_wtd_mean_score',
    'top_10pct/count',
    'top_25pct/count',
    'top_50pct/count',
    'top_100pct/count',
    'top_10pct/count_pct',
    'top_10pct/balance_pct',
    'top_25pct/count_pct',
    'top_25pct/balance_pct',
    'top_50pct/count_pct',
    'top_50pct/balance_pct',
]
# wallet_metrics = list(validation_coin_wallet_features_df.columns)

# Styled report is the cell output
civa.create_top_coins_wallet_metrics_report(
    validation_coin_wallet_features_df,
    percentile=90,
    wallet_metrics=wallet_metrics,
    method='mean',
)


#### plotting coin feature performance vs market cap

In [None]:
# Reload project modules and config so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()


# Run the market cap segment analysis on the coin wallet features
segment_results, summary_df = civa.analyze_market_cap_segments(coin_wallet_features_df, top_n=10)

# Visualize the summary: heatmap first, then the optional consistency plot
civa.plot_segment_heatmap(summary_df)
civa.plot_metric_consistency(summary_df)


#### coin performance of top n for each bucket

In [None]:
# Reload project modules and config so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Analysis parameters from the coin_validation_analysis config section
_analysis_cfg = wallets_config['coin_validation_analysis']
top_n = _analysis_cfg['top_n']
max_market_cap = _analysis_cfg['max_market_cap']
min_market_cap = _analysis_cfg['min_market_cap']

metric_top_coin_performance_df = civa.validate_coin_performance(
    coin_wallet_features_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

# Resulting frame is the cell output
metric_top_coin_performance_df

In [None]:
# Reload project modules and config so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Run the performance analysis printout on the coin wallet features
civa.print_performance_analysis(coin_wallet_features_df)

# Junkyard

# Tests failing

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
_coin_config_path = Path('../config/wallets_coin_config.yaml')
wallets_coin_config = yaml.safe_load(_coin_config_path.read_text(encoding='utf-8'))
