### start

In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import IPython
import requests
import pandas_gbq
from google.cloud import bigquery
import scipy
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Custom float format for displaying numbers: up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Suppress warnings whose message starts with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# silence pygame donation request
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
# Log file and alert sound locations exposed through environment variables
# (presumably consumed by the project's utils module — verify against utils)
os.environ['LOGGING_FILE']="../../../Local/logs/wallet_modeling.log"
os.environ['ALERT_SOUND_FILEPATH']="../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav"

# Dark mode charts
plt.rcParams['figure.facecolor'] = '#181818'  # Custom background color (dark gray in this case)
plt.rcParams['axes.facecolor'] = '#181818'
plt.rcParams['text.color'] = '#afc6ba'
plt.rcParams['axes.labelcolor'] = '#afc6ba'
plt.rcParams['xtick.color'] = '#afc6ba'
plt.rcParams['ytick.color'] = '#afc6ba'
plt.rcParams['axes.titlecolor'] = '#afc6ba'

# import local modules
# pyright: reportMissingImports=false
# Make the project's src directory importable for the local module imports that follow
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.wallet_features_orchestrator as wfo

# Base modeling
import base_modeling.base_model as bm
import base_modeling.feature_selection as fs

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.wallet_model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet insights
import wallet_insights.model_evaluation as wime
import wallet_insights.wallet_experiments_orchestrator as wimo
import wallet_insights.wallet_validation_analysis as wiva
import wallet_insights.wallet_cluster_analysis as wica

# Coin features
import coin_wallet_features.coin_features_orchestrator as cfo
import coin_wallet_features.wallet_base_metrics as cwbm
import coin_wallet_features.wallet_segmentation as cws

# Coin modeling
import coin_modeling.coin_model_reporting as cmr
import coin_modeling.coin_model as cm

# Coin insights
import coin_insights.coin_validation_analysis as civa


# Modules to reload so local source edits are picked up without restarting the kernel.
# The reload itself runs near the end of this cell and at the top of most later cells.
modules = [
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
    wmo, wtd, wmr, wm, wem,
    wcl, wmc, wmt, wpf, wtf, wts, wfo,
    bm, fs,
    wime, wimo, wiva, wica,
    cfo, cwbm, cws,
    cmr, cm,
    civa,
]

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')

# NOTE(review): reload() immediately after load_from_yaml() looks redundant — confirm
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# make parquet dirs if they don't already exist
Path(wallets_config['training_data']['parquet_folder']).mkdir(parents=True, exist_ok=True)
Path(wallets_coin_config['wallet_segments']['parquet_folder']).mkdir(parents=True, exist_ok=True)

# Set the custom error handler: Exception subclasses raised in cells are passed to u.notify_on_failure
# (get_ipython() returns None outside an IPython session, so this cell assumes a notebook kernel)
ipython = IPython.get_ipython()
ipython.set_custom_exc((Exception,), u.notify_on_failure)

# configure logger
logger = u.setup_notebook_logger('../logs/notebook_logs.log')
logger.setLevel(logging.INFO)


# Reload every module in the list above
[importlib.reload(module) for module in modules]
# u.notify('startup')
# u.notify('intro_3')
u.notify('retro')

logger.info("Good morning, let's get to work")

In [None]:
# Export code via u.export_code. Every code_directories entry is currently commented out,
# so an empty list is passed together with the notebook filename below.
u.export_code(
    code_directories=[
        # 'training_data',
        # 'wallet_features',
        # 'wallet_modeling',
        # 'wallet_insights'
    ],
    # include_config = True,
    ipynb_notebook = 'DDA-518 orchestrators amd hybrid keys.ipynb'
)


# Presumably reports memory held by in-scope objects — see utils.obj_mem
u.obj_mem()

# Wallet Model Construction

## Training Data Sequence

### retrieve training datasets

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Complete Pre-Training Profits/Market Data
# -----------------------------------------
# Retrieve training period datasets and save them to wallets_config['training_data']['parquet_folder']
# The three return values are discarded here; the next cell reads 'training'-prefixed parquet files.
_,_,_ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    parquet_prefix = 'training')

# `_` still references the last discarded return value; drop it and collect garbage
del _
gc.collect()
u.obj_mem()

### prepare indicators, cohort, training data dfs (loadable parquet)

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Load raw datasets from parquet
training_profits_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/training_profits_df_full.parquet")
training_market_data_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/training_market_data_df_full.parquet")

# Run consolidated preparation
# The return value is discarded; the next cell loads prepared dfs from parquet_folder
# (presumably written by this call — verify against wallet_modeling_orchestrator)
_ = wmo.prepare_training_data(
    training_profits_df_full,
    training_market_data_df_full,
    wallets_metrics_config,
    parquet_folder=wallets_config['training_data']['parquet_folder']
)

# Drop the raw inputs now that preparation has run
del training_profits_df_full,training_market_data_df_full
u.obj_mem()

### generate training features (loadable parquet)

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Load prepared data
# NOTE: parquet_folder stays defined as a notebook global; the devspace cell below reuses it
parquet_folder = wallets_config['training_data']['parquet_folder']
training_profits_df = pd.read_parquet(f"{parquet_folder}/training_profits_df.parquet")
training_market_indicators_df = pd.read_parquet(f"{parquet_folder}/training_market_indicators_data_df.parquet")
training_transfers_df = pd.read_parquet(f"{parquet_folder}/training_transfers_sequencing_df.parquet")
# Unique wallet addresses present in the profits data (order not preserved by set())
training_wallet_cohort = list(set(training_profits_df['wallet_address']))

# Generate all features
wmo.generate_training_features(
    training_profits_df,
    training_market_indicators_df,
    training_transfers_df,
    training_wallet_cohort,
    parquet_folder
)

u.notify(3)

# Release the inputs now that feature generation has run
del training_profits_df,training_market_indicators_df,training_transfers_df,training_wallet_cohort
gc.collect()
u.obj_mem()

In [None]:
# Retrieve training data for the full training wallet cohort
wallet_training_data_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")
# Display the column names as the cell output
list(wallet_training_data_df_full.columns)

In [None]:
# Retrieve training data for the full training wallet cohort
# NOTE(review): this cell duplicates the one directly above it
wallet_training_data_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")
# Display the column names as the cell output
list(wallet_training_data_df_full.columns)

### dda 586 devspace

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# NOTE(review): parquet_folder is not defined in this cell; it must already exist as a
# notebook global from the "generate training features" cell, otherwise this raises NameError.
training_profits_df = pd.read_parquet(f"{parquet_folder}/training_profits_df.parquet")


# Compute trading features over the configured training period
trading_features_df = wtf.calculate_wallet_trading_features(training_profits_df,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    wallets_config['features']['include_twb_metrics']
)
# Display summary statistics as the cell output
trading_features_df.describe()

In [None]:
# Keep a snapshot of the computed trading features so the next cell can restore from it on re-runs
trading_features_df_full = trading_features_df.copy()

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Restore from the snapshot taken in the previous cell
trading_features_df = trading_features_df_full.copy()
# NOTE(review): hard-coded here, whereas the trading features cell reads
# wallets_config['features']['include_twb_metrics'] — confirm this is intentional
include_twb_metrics = False

# Numerator features reflecting the profit/return rate/cash inflows
profits_features_df = wpf.calculate_profits_features(trading_features_df)
profits_features_df = profits_features_df.add_prefix('profits_')

# Denominator features reflecting the balance/investment/outlays
balance_features_df = wpf.calculate_balance_features(trading_features_df,
                                                    include_twb_metrics)
balance_features_df = balance_features_df.add_prefix('balance_')

# Combine to make ratios
ratios_features_df = profits_features_df.join(balance_features_df)
ratios_features_df = wpf.calculate_performance_ratios(ratios_features_df)

# Generate features using transformations of ratios
performance_features_df = wpf.transform_performance_ratios(ratios_features_df,
                                                        balance_features_df)

# Check null values
null_check = performance_features_df.isnull().sum()
if null_check.any():
    raise ValueError(f"Null values found in columns: {null_check[null_check > 0].index.tolist()}")

# Check for infinite values
inf_columns = (
    performance_features_df.columns[
        performance_features_df.isin([np.inf, -np.inf]).any()
    ].tolist())
if inf_columns:
    raise ValueError(f"Infinite values found in columns: {inf_columns}")

# Check wallet_address index consistency
if not performance_features_df.index.equals(trading_features_df.index):
    raise ValueError("Wallet address mismatch between trading_features_df and performance_features_df")
# Display the validated features as the cell output
performance_features_df

In [None]:
# Display the 'profits_'-prefixed features built in the previous cell
profits_features_df

## Wallet Model Target Variable and Wallet Cohort

### Retrieve modeling period datasets

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve Modeling Profits and Market Data
# ----------------------------------------------------------
# Retrieve training coin cohort to restrict modeling period data to only training period coins
# (only the coin_id column is read from the parquet file)
training_coin_cohort = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/training_market_indicators_data_df.parquet",
                                       columns=['coin_id'])['coin_id'].unique()
# Retrieve full historical through modeling period datasets
# The three return values are discarded; later cells read 'modeling'-prefixed parquet files.
_,_,_ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    coin_cohort=training_coin_cohort,
    parquet_prefix = 'modeling'
)

# `_` still references the last discarded return value; drop it and collect garbage
del _
gc.collect()
u.obj_mem()

### prepare modeling features

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Load profits data
modeling_profits_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_profits_df_full.parquet")

# Load hybrid map if using hybridization; otherwise None is passed to the orchestrator
hybrid_cw_id_map = None
if wallets_config['training_data']['hybridize_wallet_ids']:
    hybrid_cw_id_map = pd.read_pickle(f"{wallets_config['training_data']['parquet_folder']}/hybrid_cw_id_map.pkl")

# Run orchestrator
modeling_wallet_features_df = wmo.prepare_modeling_features(
    modeling_profits_df_full,
    hybrid_cw_id_map
)

# Release the raw profits data
del modeling_profits_df_full
gc.collect()
u.obj_mem()

## Wallet Model Construction and Analysis

### feature selection (loadable parquet)

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load parquet
wallet_training_data_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df_full.parquet")
modeling_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_wallet_features_df.parquet")

# Add modeling cohort boolean, keep only rows flagged 1, then drop the helper column
modeling_wallet_training_data_df = wallet_training_data_df_full.join(modeling_wallet_features_df['in_modeling_cohort'], how='inner')
modeling_wallet_training_data_df = modeling_wallet_training_data_df[modeling_wallet_training_data_df['in_modeling_cohort']==1]
modeling_wallet_training_data_df = modeling_wallet_training_data_df.drop(columns='in_modeling_cohort',axis=1)
logger.info("Full training data df shape: %s", wallet_training_data_df_full.shape)
logger.info("Modeling cohort training data df shape: %s", modeling_wallet_training_data_df.shape)

# # Remove low variance features
# modeling_wallet_training_data_df = fs.remove_low_variance_features(modeling_wallet_training_data_df,
#                 wallets_config['modeling']['feature_selection']['variance_threshold'],
#                 wallets_config['modeling']['feature_selection']['protected_features'])

# Remove correlated features
modeling_wallet_training_data_df = fs.remove_correlated_features(modeling_wallet_training_data_df,
                wallets_config['modeling']['feature_selection']['correlation_threshold'],
                wallets_config['modeling']['feature_selection']['protected_features'])


# # Filter training data df to only the selected columns
# wallet_training_data_df = wallet_training_data_df_full[modeling_wallet_training_data_df.columns]
# NOTE(review): with the column filter above commented out, the correlation pruning result is
# not applied; the saved df is the full df and the "Pruned" log line reports the full shape.
wallet_training_data_df = wallet_training_data_df_full
logger.info("Pruned training data df shape: %s", wallet_training_data_df.shape)

# Save to parquet and delete
wallet_training_data_df.to_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet",index=True)
del wallet_training_data_df_full,modeling_wallet_training_data_df,wallet_training_data_df
gc.collect()

u.obj_mem()

In [None]:
# Load the training data saved by the feature selection cell
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet")
# Display the column names as the cell output
list(wallet_training_data_df.columns)

### select target variable (loadable parquet)

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Load the modeling period wallet features
modeling_wallet_features_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_wallet_features_df.parquet")

# Keep only the modeling cohort flag and the configured target variable column
modeling_cohort_target_var_df = modeling_wallet_features_df[['in_modeling_cohort', wallets_config['modeling']['target_variable']]].copy()

# Winsorize the target only when it is the dollar-denominated 'crypto_net_gain' column
if wallets_config['modeling']['target_variable'] == 'crypto_net_gain':


    modeling_cohort_target_var_df['crypto_net_gain'] = u.winsorize(
        modeling_cohort_target_var_df[wallets_config['modeling']['target_variable']],
        wallets_config['features']['returns_winsorization']
    )

u.notify(3)

### build wallet model or run search

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Retrieve training data for the full training wallet cohort
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet")


# Run the experiment and get results
# modeling_cohort_target_var_df must already exist from the "select target variable" cell
wallet_model = wm.WalletModel(wallets_config['modeling'])
wallet_model_results = wallet_model.construct_wallet_model(wallet_training_data_df,modeling_cohort_target_var_df)
# Release the training data; wallet_model_results is kept for the later cells
del wallet_training_data_df
gc.collect()

### save model artifacts

In [None]:
# Generate and save all model artifacts
# Requires wallet_model_results from the model construction cell; the three configs are
# passed along with the results (presumably stored next to the artifacts — verify in wmr).
model_id, wallet_evaluator, modeling_wallet_scores_df = wmr.generate_and_save_wallet_model_artifacts(
    model_results=wallet_model_results,
    base_path='../artifacts/wallet_modeling',
    configs = {
        'wallets_config': wallets_config.config,
        'wallets_metrics_config': wallets_metrics_config,
        'wallets_features_config': wallets_features_config
    }
)

# save score
modeling_wallet_scores_df.to_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_wallet_scores_df.parquet",index=True)

print(wallet_evaluator.summary_report())

### save scores for coin model

In [None]:
# Name used for the score/actual column labels and for the output parquet filename
score_name = 'net_gain_winsorized_checkpoint'
# score_name = 'x'

# Special save score for use in the coin model

# Create wallet scores DataFrame with both cohorts
# NOTE: this rebinds modeling_wallet_scores_df, replacing the frame returned by the artifacts cell.
# NOTE(review): in_modeling_cohort is True only for wallets in the y_test index — confirm that
# the train split of the modeling cohort is meant to be False here.
modeling_wallet_scores_df = pd.DataFrame({
    f'score|{score_name}': wallet_model_results['training_cohort_pred'],
    f'actual|{score_name}': wallet_model_results['training_cohort_actuals'],
    'in_modeling_cohort': wallet_model_results['training_cohort_pred'].index.isin(wallet_model_results['y_test'].index)
})

# Not the last expression in the cell, so this preview is not displayed
modeling_wallet_scores_df.head()


# scores_df.head()
# The temp/wallet_modeling_score_dfs directory must already exist relative to the working directory
modeling_wallet_scores_df.to_parquet(f"temp/wallet_modeling_score_dfs/{score_name}.parquet",index=True)

u.notify(2)
# u.notify(15)

## assess wallet model performance

### performance report

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Rebuild the evaluator from the stored model results so that edits to the reloaded
# wime module take effect without retraining.
# The regressor is taken from the fitted pipeline, matching the importance analysis cells.
# The previous `model=model` referenced a name that is never defined in this notebook.
wallet_evaluator = wime.RegressionEvaluator(
    y_train=wallet_model_results['y_train'],
    y_test=wallet_model_results['y_test'],
    y_pred=wallet_model_results['y_pred'],
    training_cohort_pred=wallet_model_results['training_cohort_pred'],
    training_cohort_actuals=wallet_model_results['training_cohort_actuals'],
    model=wallet_model_results['pipeline'].named_steps['regressor'],
    # Feature names come from running X_train through every pipeline step except the regressor
    feature_names=wallet_model_results['pipeline'][:-1].transform(wallet_model_results['X_train']).columns.tolist()
)

# Print results
print(wallet_evaluator.summary_report())
wallet_evaluator.plot_wallet_evaluation()
wallet_evaluator.importance_summary(0)

### orchestrate experiment

In [None]:

# [importlib.reload(module) for module in modules]
# wallets_config.reload()

# # Load experiments config
# wallets_config_experiment = yaml.safe_load(Path('../config/wallets_config_experiment.yaml').read_text(encoding='utf-8'))

# # Initialize orchestrator with both configs
# orchestrator = wimo.WalletExperimentsOrchestrator(
#     config_base=wallets_config.config,         # your base config dict
#     config_experiment=wallets_config_experiment  # your experiment config dict
# )

# # Run experiment
# results = orchestrator.orchestrate_wallet_experiment(
#     training_data_df=wallet_training_data_df,
#     modeling_wallet_features_df=modeling_wallet_features_df
# )


### importance analysis

In [None]:
# Load the training data saved by the feature selection cell
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet")
# Display the column names as the cell output
list(wallet_training_data_df.columns)

In [None]:
# Display the evaluator's importance summary; requires wallet_evaluator from an earlier cell
wallet_evaluator.importance_summary(0)
# wallet_evaluator.importance_summary(1)

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Rebuild the evaluator from the stored model results, using the pipeline's fitted regressor
wallet_evaluator = wime.RegressionEvaluator(
    y_train=wallet_model_results['y_train'],
    y_test=wallet_model_results['y_test'],
    y_pred=wallet_model_results['y_pred'],
    training_cohort_pred=wallet_model_results['training_cohort_pred'],
    training_cohort_actuals=wallet_model_results['training_cohort_actuals'],
    model=wallet_model_results['pipeline'].named_steps['regressor'],
    feature_names=wallet_model_results['pipeline'][:-1].transform(wallet_model_results['X_train']).columns.tolist()
)
feature_importances_df = wiva.analyze_wallet_model_importance(wallet_evaluator.metrics['importances'])
feature_importances_df = feature_importances_df.copy()

# Feature categories to include in the summary; commented entries are excluded
feature_categories_filter = [
    'performance',
    # 'timing',
    # 'trading',
    # 'transfers',
    # 'mktcap',
    # 'cluster',
]

# Currently unused: the feature_name filter in the expression below is commented out
feature_names_filter = [
    # 'price_sma_5',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
]

# Columns to group the importances by
groups = [
    # 'record_type',
    'feature_category',
    'feature_name',
    'feature_comparison',
    'feature_aggregation',
    # 'training_segment',
]

# Filter to the selected categories, fill nulls with the string 'None' so those rows stay in
# the groupby, then sum per group and sort by importance descending.
# NOTE(review): 'importance' is passed positionally to sum(), whose first parameter is
# numeric_only — confirm this is the intended call.
(feature_importances_df
 [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None').groupby(groups)
 .sum('importance')
 .sort_values(by='importance',ascending=False)
)

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Rebuild the evaluator from the stored model results, using the pipeline's fitted regressor
wallet_evaluator = wime.RegressionEvaluator(
    y_train=wallet_model_results['y_train'],
    y_test=wallet_model_results['y_test'],
    y_pred=wallet_model_results['y_pred'],
    training_cohort_pred=wallet_model_results['training_cohort_pred'],
    training_cohort_actuals=wallet_model_results['training_cohort_actuals'],
    model=wallet_model_results['pipeline'].named_steps['regressor'],
    feature_names=wallet_model_results['pipeline'][:-1].transform(wallet_model_results['X_train']).columns.tolist()
)
feature_importances_df = wiva.analyze_wallet_model_importance(wallet_evaluator.metrics['importances'])
feature_importances_df = feature_importances_df.copy()

# Feature categories to include in the summary (here 'performance' and 'trading')
feature_categories_filter = [
    'performance',
    # 'timing',
    'trading',
    # 'transfers',
    # 'mktcap',
    # 'cluster',
]

# Currently unused: the feature_name filter in the expression below is commented out
feature_names_filter = [
    # 'price_sma_5',
    # 'price_rsi_5',
    # 'volume_sma_5',
    # 'market_cap_filled',
    # 'mktcap',
    # 'cluster',
]

# Columns to group the importances by (feature_aggregation is excluded in this variant)
groups = [
    # 'record_type',
    'feature_category',
    'feature_name',
    'feature_comparison',
    # 'feature_aggregation',
    # 'training_segment',
]

# Filter to the selected categories, fill nulls with the string 'None' so those rows stay in
# the groupby, then sum per group and sort by importance descending.
# NOTE(review): 'importance' is passed positionally to sum(), whose first parameter is
# numeric_only — confirm this is the intended call.
(feature_importances_df
 [feature_importances_df['feature_category'].isin(feature_categories_filter)]
#  [feature_importances_df['feature_name'].isin(feature_names_filter)]
 .fillna('None').groupby(groups)
 .sum('importance')
 .sort_values(by='importance',ascending=False)
)

In [None]:
# Display the ungrouped importance breakdown built by the importance analysis cells
feature_importances_df

In [None]:
# Display the evaluator's raw 'importances' metrics entry
wallet_evaluator.metrics['importances']

### Cluster analysis

In [None]:
# Pick up local source edits and refresh the wallets config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Load parquet
wallet_training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet")


# List of the x features with the highest importance in the model
x_features = 6
top_feature_metrics = list((pd.DataFrame(wallet_evaluator.metrics['importances'])
                      .sort_values(by='importance',ascending=False)
                      .head(x_features)['feature']))
# De-duplicate the feature names; set() does not preserve the importance ordering
comparison_metrics = list(set(top_feature_metrics))



# Cluster numbers
n_clusters=4

# Build the cluster report; 'median' is passed as the final argument
# (presumably the aggregation used per cluster — verify in wallet_cluster_analysis)
styled_df,cluster_results_df = wica.create_cluster_report(wallet_training_data_df, wallet_model_results, n_clusters, comparison_metrics, 'median')

# NOTE: wallet_training_data_df is removed from the notebook namespace here
del(wallet_training_data_df)
gc.collect()

# Display the styled report as the cell output
styled_df

In [None]:
# Load the training data directly from parquet. The preceding cluster report cell deletes
# wallet_training_data_df after use, so copying that global raised NameError when the
# notebook was run in order.
modeling_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet")

# Baseline metrics always included in the cluster analysis
base_metrics = [
    'trading|max_investment|all_windows',
    'trading|crypto_net_gain|all_windows',
    'mktcap|end_portfolio_wtd_market_cap|all_windows',
    'performance|crypto_net_gain/max_investment/base|all_windows',
]
# Cluster columns plus the base metrics and the top-importance metrics from the cluster
# report cell (comparison_metrics), de-duplicated through set()
cluster_cols = [col for col in modeling_df.columns if col.startswith('cluster|')]
cluster_analysis_df = modeling_df[list(set(cluster_cols + base_metrics + comparison_metrics))].copy()


# Assign wallets to categorical clusters based on the distance values
cluster_assignments_df = wcl.assign_clusters_from_distances(cluster_analysis_df,
                                                        wallets_config['features']['clustering_n_clusters'])
# cluster_analysis_df = cluster_analysis_df.join(cluster_assignments_df,how='inner')



In [None]:
# Display the columns selected for the cluster analysis
list(cluster_analysis_df.columns)

In [None]:
# Display the cluster assignments returned by wcl.assign_clusters_from_distances
cluster_assignments_df

# Validation Period Analysis

### Retrieve validation and modeling datasets

#### Retrieve validation datasets

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Retrieve Validation Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical through validation period datasets

# Retrieve training coin cohort to ensure all training period coins are reflected
# TODO: assess whether this cohort filter should be removed
training_coin_cohort = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/training_market_indicators_data_df.parquet",
                                       columns=['coin_id'])['coin_id'].unique()
# The cohort is passed by keyword, matching the modeling period retrieval cell.
# The three return values are discarded; later cells read 'validation'-prefixed parquet files.
_,_,_ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    coin_cohort=training_coin_cohort,
    parquet_prefix = 'validation'
)

# `_` still references the last discarded return value; drop it and collect garbage.
# gc.collect was previously written without parentheses, so no collection ran.
del _
gc.collect()
u.obj_mem()


#### non-wallet coin model feature generation (slow)

In [None]:
# [importlib.reload(module) for module in modules]  # Reload all modules
# config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# # Generate features based on the coin config files
# coin_features_training_data_df, _, _ = tw.generate_all_time_windows_model_inputs(config,metrics_config,modeling_config)

# # Remove time window index since we aren't using that for now
# coin_features_training_data_df = coin_features_training_data_df.reset_index(level='time_window', drop=True)

# # Save to parquet
# coin_features_training_data_df.to_parquet("temp/coin_modeling_dfs/coin_features_training_data_df.parquet",index=True)

# u.notify()

### Load modeling and validation dataset files (loadable parquet)

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load coin cohort (currently carried through training/modeling/validation periods)
training_coin_cohort = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/training_market_indicators_data_df.parquet",
                                       columns=['coin_id'])['coin_id'].unique()

# Load modeling period scores and data
modeling_wallet_scores_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_wallet_scores_df.parquet")
modeling_market_data_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_market_data_df_full.parquet")
modeling_profits_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/modeling_profits_df.parquet")

# Filter historical records: keep market data dated on or after the modeling starting balance date
modeling_market_data_df = modeling_market_data_df_full[
    modeling_market_data_df_full['date'] >= wallets_config['training_data']['modeling_starting_balance_date']
]

# Check both dfs against the configured modeling period
# (exact checks are defined in utils.assert_period)
u.assert_period(modeling_market_data_df,
                wallets_config['training_data']['modeling_period_start'],
                wallets_config['training_data']['modeling_period_end'])
u.assert_period(modeling_profits_df,
                wallets_config['training_data']['modeling_period_start'],
                wallets_config['training_data']['modeling_period_end'])


u.obj_mem()

In [None]:
# Pick up local source edits and refresh configs before running this step
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load parquet
validation_market_data_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_market_data_df_full.parquet")
validation_profits_df_full = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_profits_df_full.parquet")


# Remove pre-validation period prices
validation_market_data_df = validation_market_data_df_full[validation_market_data_df_full['date']
                                                       >=wallets_config['training_data']['validation_starting_balance_date']]
del validation_market_data_df_full
gc.collect()


# Handle hybridization if configured
# NOTE(review): this uses `is True`, whereas the modeling features cell uses plain truthiness
# for the same config key — a truthy non-bool value would be handled differently here.
if wallets_config['training_data']['hybridize_wallet_ids'] is True:
    hybrid_cw_id_map = pd.read_pickle(f"{wallets_config['training_data']['parquet_folder']}/hybrid_cw_id_map.pkl")

    logger.info("Applying wallet-coin hybridization...")
    validation_profits_df_full, _ = wmo.hybridize_wallet_address(
        validation_profits_df_full,
        hybrid_cw_id_map
    )


# Filter to only training wallet cohort
# columns=[] requests no data columns, so only the index of the training data is used
training_wallet_cohort = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet", columns=[]).index.values
validation_profits_df = validation_profits_df_full[validation_profits_df_full['wallet_address'].isin(training_wallet_cohort)]
del validation_profits_df_full
gc.collect()


# Assert period, save files, remove from memory
# (the final del is commented out, so both filtered dfs remain in memory after saving)
u.assert_period(validation_market_data_df,
                wallets_config['training_data']['validation_period_start'],
                wallets_config['training_data']['validation_period_end'])
u.assert_period(validation_profits_df,
                wallets_config['training_data']['validation_period_start'],
                wallets_config['training_data']['validation_period_end'])
validation_profits_df.to_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_profits_df.parquet",index=False)
validation_market_data_df.to_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_market_data_df.parquet",index=False)
# del validation_profits_df,validation_market_data_df
gc.collect()
u.obj_mem()


# Coin Model Construction

## Prepare coin_training_data_df

### assign wallets to segments

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Create base df with all wallet addresses and scores
wallet_scores_df = cfo.load_wallet_scores(wallets_coin_config['wallet_segments']['wallet_scores'],
                                            wallets_coin_config['wallet_segments']['wallet_scores_path'])
# NOTE: this binds a second name to the same object (no copy), so the column
# added below also appears on wallet_scores_df.
wallet_segmentation_df = wallet_scores_df

# Add "all" segment for full population level aggregations
wallet_segmentation_df['all_wallets|all'] = 'all'
wallet_segmentation_df['all_wallets|all'] = wallet_segmentation_df['all_wallets|all'].astype('category')


# Add score quantile assignments
wallet_segmentation_df = cws.assign_wallet_score_quantiles(
    wallet_segmentation_df,
    wallets_coin_config['wallet_segments']['wallet_scores'],
    wallets_coin_config['wallet_segments']['score_segment_quantiles']
)


# Add training period-based cluster labels
training_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/wallet_training_data_df.parquet")
wallet_clusters_df = cws.assign_cluster_labels(
    training_data_df,
    wallets_coin_config['wallet_segments']['training_period_cluster_groups']
)
del training_data_df
# Fixed: the original referenced `gc.collect` without calling it, so no
# collection ever ran after the training data was released.
gc.collect()


# Join together and ensure no rows were dropped
orig_len = len(wallet_segmentation_df)
wallet_segmentation_df = wallet_segmentation_df.join(wallet_clusters_df,how='inner')
joined_len = len(wallet_segmentation_df)
if joined_len < orig_len:
    raise ValueError(f"Join dropped {orig_len - joined_len} rows from original {orig_len} rows")


u.obj_mem()

wallet_segmentation_df.shape


### generate metrics for coin-wallet pairs

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Base frame: one row per distinct (coin_id, wallet_address) pair, no columns yet
cw_pairs_index = (
    modeling_profits_df[['coin_id', 'wallet_address']]
    .drop_duplicates()
    .set_index(['coin_id', 'wallet_address'])
    .index
)
cw_metrics_df = pd.DataFrame(index=cw_pairs_index)

# Only modeling period boundaries work until date imputation logic is added
valid_dates = [
   wallets_config['training_data']['modeling_starting_balance_date'],
   wallets_config['training_data']['modeling_period_end']
]
balance_dates = wallets_coin_config['wallet_features']['wallet_balance_dates']
assert all(date in valid_dates for date in balance_dates), \
   f"Balance dates must be one of {valid_dates}"

# Balance metrics: prefixed with 'balances/', left-joined onto the pair frame,
# with pairs that have no balance row filled with 0
cw_balances_df = cwbm.calculate_coin_wallet_balances(modeling_profits_df, balance_dates)
cw_balances_df = cw_balances_df.add_prefix('balances/')
cw_metrics_df = cw_metrics_df.join(cw_balances_df, how='left')
cw_metrics_df = cw_metrics_df.fillna(dict.fromkeys(cw_balances_df.columns, 0))


# Trading metrics: same prefix / left join / zero-fill treatment under 'trading/'
cw_trading_features_df = cwbm.calculate_coin_wallet_trading_metrics(
    modeling_profits_df,
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    wallets_coin_config['wallet_features']['drop_trading_metrics'],
)
cw_trading_features_df = cw_trading_features_df.add_prefix('trading/')
cw_metrics_df = cw_metrics_df.join(cw_trading_features_df, how='left')
cw_metrics_df = cw_metrics_df.fillna(dict.fromkeys(cw_trading_features_df.columns, 0))

cw_metrics_df.describe()

### flatten cw_metrics into single values for each coin-segment pair

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Start from an empty frame indexed by every coin in the training cohort
coin_wallet_features_df = pd.DataFrame(index=training_coin_cohort)
coin_wallet_features_df.index.name = 'coin_id'


# Segment families are all segmentation columns except the raw 'scores|' columns
segmentation_columns = wallet_segmentation_df.columns
segmentation_families = segmentation_columns[~segmentation_columns.str.startswith('scores|')]
metric_columns = cw_metrics_df.columns

# For every metric column, flatten it per segment family and join the result on coin_id
logger.info("Calculating segment features for each metric column...")
for i, metric_column in enumerate(metric_columns, start=1):

    for segment_family in segmentation_families:

        # Generate coin-level features based on modeling period end wallet scores and balances
        coin_segment_family_features_df = cfo.flatten_cw_to_coin_features(
            cw_metrics_df,
            metric_column,
            wallet_segmentation_df,
            segment_family,
            training_coin_cohort
        )
        coin_wallet_features_df = coin_wallet_features_df.join(coin_segment_family_features_df, how='inner')

    logger.info("Completed metric %s/%s: %s...",
                i, len(metric_columns), metric_column)

logger.info("Calculated all metric-segment-aggregation features. Final output shape: %s",
            coin_wallet_features_df.shape )

# Release the intermediate frames; wallet_segmentation_df is intentionally kept
del cw_metrics_df,cw_trading_features_df,cw_balances_df,wallet_scores_df#,wallet_segmentation_df
gc.collect()

u.obj_mem()

# save to parquet if next step won't be joined
coin_wallet_features_df.to_parquet("temp/coin_modeling_dfs/coin_training_data_df_full.parquet",index=True)

coin_wallet_features_df.shape

### Merge to coin_training_data_df_full (if generated)

In [None]:
# # Retrieve data from coin features pipeline
# coin_features_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_features_training_data_df.parquet")

# # Confirm overlap
# coin_features_ids = coin_features_training_data_df.index
# coin_wallet_features_ids = coin_wallet_features_df.index
# wallet_features_only_ids = set(coin_wallet_features_ids) - set(coin_features_ids)

# if len(wallet_features_only_ids) == 0:
#     logger.info("All %s coins with wallet features were found in the base features set.",
#                 len(coin_wallet_features_ids))

# else:
#     raise ValueError(f"Wallet features contain {len(wallet_features_only_ids)} coins not in the other coin features")


# # Join together
# coin_training_data_df_full = coin_wallet_features_df.join(coin_features_training_data_df,how='inner')
# logger.info("Final features shape: %s",coin_training_data_df_full.shape)

# # Save to parquet and delete
# coin_training_data_df_full.to_parquet("temp/coin_modeling_dfs/coin_training_data_df_full.parquet",index=True)
# del coin_training_data_df_full,coin_wallet_features_df
# gc.collect()

# u.obj_mem()

## Prepare coin_modeling_df

### apply coin filters (parquet loadable)

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load the full coin-level training features written by the flattening step
coin_training_data_df_full = pd.read_parquet("temp/coin_modeling_dfs/coin_training_data_df_full.parquet")
logger.info("Starting coins: %s", len(coin_training_data_df_full))

# Holdings thresholds from config
min_cohort_wallets = wallets_coin_config['coin_modeling']['min_cohort_wallets']
min_cohort_balance = wallets_coin_config['coin_modeling']['min_cohort_balance']

# NOTE(review): both column names embed a hard-coded balance date ('241120');
# presumably they must track the configured balance dates - confirm.
cohort_wallet_count = coin_training_data_df_full['all_wallets|all/all|balances/usd_balance_241120|aggregations/count']
cohort_balance_sum = coin_training_data_df_full['all_wallets|all/all|balances/usd_balance_241120|aggregations/sum']

# Keep coins meeting both the wallet-count and the total-balance minimums
meets_holdings_minimums = (cohort_wallet_count >= min_cohort_wallets) & (cohort_balance_sum >= min_cohort_balance)
coin_training_data_df = coin_training_data_df_full[meets_holdings_minimums]
logger.info("Coins after balance filters: %s", len(coin_training_data_df))
# del coin_training_data_df_full
# gc.collect()

# # Filter based on market cap
# min_market_cap = wallets_coin_config['coin_modeling']['min_market_cap']
# max_market_cap = wallets_coin_config['coin_modeling']['max_market_cap']

# coin_training_data_df = coin_training_data_df[
#     (coin_training_data_df['time_series|market_data|market_cap_last'] >= min_market_cap)
#     & (coin_training_data_df['time_series|market_data|market_cap_last'] <= max_market_cap)
# ]
# logger.info("Coins after market cap filters: %s", len(coin_training_data_df))


u.obj_mem()

### apply feature selection to columns

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

logger.info("Initial training data df shape: %s", coin_training_data_df.shape)

# Step 1: drop low variance features using the configured threshold / scaling flag
coin_training_data_df = fs.remove_low_variance_features(
    coin_training_data_df,
    variance_threshold=wallets_coin_config['coin_modeling']['feature_selection']['variance_threshold'],
    scale_before_selection=wallets_coin_config['coin_modeling']['feature_selection']['scale_before_selection'],
)

# Step 2: drop correlated features using the configured correlation threshold
coin_training_data_df = fs.remove_correlated_features(
    coin_training_data_df,
    wallets_coin_config['coin_modeling']['feature_selection']['correlation_threshold'],
)

logger.info("Final training data df shape after feature selection: %s", coin_training_data_df.shape)


# Persist the selected features; the frame itself stays in memory
coin_training_data_df.to_parquet("temp/coin_modeling_dfs/coin_training_data_df.parquet",index=True)
# del coin_training_data_df
gc.collect()

u.obj_mem()

### Prepare coin model target variable (parquet loadable)

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load validation-period market data and the feature-selected coin training data
validation_market_data_df = pd.read_parquet(f"{wallets_config['training_data']['parquet_folder']}/validation_market_data_df.parquet")
coin_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_training_data_df.parquet")


# Target variable calculations
# ----------------------------
# Calculate coin return performance during validation period
validation_coin_performance_df = civa.calculate_coin_performance(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end']
)

# Drop rows containing any NaN (dropna() with default arguments); per the original
# note, np.nan coin_return values indicate a 0 starting price
validation_coin_performance_df = validation_coin_performance_df.dropna()

# Add winsorized return, using the configured winsorization setting
validation_coin_performance_df['coin_return_winsorized'] = u.winsorize(
        validation_coin_performance_df['coin_return'],
        wallets_coin_config['coin_modeling']['returns_winsorization'])


# # Add full percentile (meaning it's a percentile of all coins prior to any population filtering)
# validation_coin_performance_df['coin_return_pctile_full'] = validation_coin_performance_df['coin_return'].rank(pct=True,ascending=True)


# # Validation: check if any coin_ids missing from final features
# missing_coins = set(coin_training_data_df.index) - set(validation_coin_performance_df.index)
# if missing_coins:
#     raise ValueError(f"Found {len(missing_coins)} coin_ids in training_data_df without validation period target variables.")


# # Target variable attachment
# # --------------------------
# # Identify target variable column
# target_var_column = wallets_coin_config['coin_modeling']['target_variable']

# # Calculate the percentile among the coin_training_data_df coins
# if target_var_column == 'coin_return_pctile':
#     coin_modeling_df = coin_training_data_df.join(validation_coin_performance_df[['coin_return']])
#     coin_modeling_df['coin_return_pctile'] = coin_modeling_df['coin_return'].rank(pct=True,ascending=True)
#     coin_modeling_df = coin_modeling_df.drop('coin_return',axis=1)
# else:
#     coin_modeling_df = coin_training_data_df.join(validation_coin_performance_df[[target_var_column]])
# # del coin_training_data_df,validation_coin_performance_df
# gc.collect


# # Convert the index to string to avoid serialization/export categorical series issues
# coin_modeling_df.index = coin_modeling_df.index.astype(str)


# u.obj_mem()

## Build coin model

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Construct the coin model from the prepared modeling frame
coin_model_base = cm.CoinModel(modeling_config=wallets_coin_config['coin_modeling'])
coin_model_results = coin_model_base.construct_coin_model(feature_df=coin_modeling_df)
# del coin_modeling_df
gc.collect()

# The trained estimator is the pipeline step named 'regressor'
coin_model_pipeline = coin_model_results['pipeline']
coin_model = coin_model_pipeline.named_steps['regressor']

# Generate and save all model artifacts, passing along both configs
artifact_configs = {
    'wallets_coin_config': wallets_coin_config,
    'wallets_config': wallets_config.config
}
coin_model_id, coin_evaluator, coin_scores_df = cmr.generate_and_save_coin_model_artifacts(
    model_results=coin_model_results,
    base_path='../artifacts/coin_modeling',
    configs=artifact_configs,
)

# save score
coin_scores_df.to_parquet("temp/coin_modeling_dfs/coin_scores_df.parquet",index=True)


# Rebuild the evaluator from the results; this replaces the coin_evaluator returned above.
# Feature names come from running X_train through every pipeline step except the last.
transformed_feature_names = coin_model_pipeline[:-1].transform(coin_model_results['X_train']).columns.tolist()
coin_evaluator = wime.RegressionEvaluator(
    y_test=coin_model_results['y_test'],
    y_pred=coin_model_results['y_pred'],
    model=coin_model,
    feature_names=transformed_feature_names
)

# Generate reports
print(coin_evaluator.summary_report())

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Initialize evaluator from the coin model results; feature names come from
# running X_train through every pipeline step except the last
coin_evaluator = wime.RegressionEvaluator(
    y_test=coin_model_results['y_test'],
    y_pred=coin_model_results['y_pred'],
    model=coin_model,
    feature_names=coin_model_results['pipeline'][:-1].transform(coin_model_results['X_train']).columns.tolist()
)

# Fixed: the original built coin_evaluator above but then reported on
# `wallet_evaluator`, a different evaluator object that this cell never creates.
print(coin_evaluator.summary_report())
coin_evaluator.plot_wallet_evaluation()
coin_evaluator.importance_summary(0)

## Post model analysis

### importance analysis

In [None]:
# Load importances
feature_importance_df = pd.DataFrame(coin_evaluator.metrics['importances'])

# Split on pipe delimiters
split_df = feature_importance_df['feature'].str.split('|', expand=True)
split_df.columns = ['segment_category','segment_family','metric','transformation']

# Split nested components
segment_families = split_df['segment_family'].str.split('/', expand=True)
segment_families.columns = ['segment_family', 'segment_value']

metrics = split_df['metric'].str.split('/', expand=True)
metrics.columns = ['metric', 'metric_detail']

transformations = split_df['transformation'].str.split('/', expand=True)
transformations.columns = ['transformation', 'transformation_method']

# Combine all components
feature_details_df = pd.concat([
    split_df['segment_category'],
    segment_families,
    metrics,
    transformations,
    feature_importance_df['importance']
], axis=1)

feature_details_df.head()



In [None]:
# Grouping keys: every parsed component of the feature name
groups = [
    'segment_category',
    'segment_family',
    'segment_value',
    'metric',
    'metric_detail',
    'transformation',
    'transformation_method',
]

# Total importance per group, largest first. The column is selected explicitly:
# the first positional parameter of GroupBy.sum() is `numeric_only`, so the
# previous `.sum('importance')` only worked because the string is truthy.
(
    feature_details_df
    .groupby(groups)[['importance']]
    .sum()
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Grouping keys: segment and metric components only (others toggled off)
groups = [
    'segment_category',
    'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    # 'transformation',
    # 'transformation_method',
]

# Total importance per group, largest first. The column is selected explicitly:
# the first positional parameter of GroupBy.sum() is `numeric_only`, so the
# previous `.sum('importance')` only worked because the string is truthy.
(
    feature_details_df
    .groupby(groups)[['importance']]
    .sum()
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Scratch cell: a bare one-element list literal that the notebook simply displays.
# Presumably a note of a feature-name prefix of interest - nothing consumes it.
[
    'score_quantile|net_gain_winsorized_checkpoint_score/'
]

In [None]:
# Display the raw importances stored on the coin evaluator's metrics
coin_evaluator.metrics['importances']

In [None]:
# Display the parsed feature-name components alongside their importance
feature_details_df

In [None]:
# Grouping keys: metric components only (others toggled off)
groups = [
    # 'segment_category',
    # 'segment_family',
    # 'segment_value',
    'metric',
    'metric_detail',
    # 'transformation',
    # 'transformation_method',
]

# Top 20 groups by total importance. The column is selected explicitly:
# the first positional parameter of GroupBy.sum() is `numeric_only`, so the
# previous `.sum('importance')` only worked because the string is truthy.
(
    feature_details_df
    .groupby(groups)[['importance']]
    .sum()
    .sort_values(by='importance', ascending=False)
    .head(20)
)

In [None]:
# Grouping keys: transformation components only (others toggled off)
groups = [
    # 'segment_category',
    # 'segment_family',
    # 'segment_value',
    # 'metric',
    # 'metric_detail',
    'transformation',
    'transformation_method',
]

# Total importance per group, largest first. The column is selected explicitly:
# the first positional parameter of GroupBy.sum() is `numeric_only`, so the
# previous `.sum('importance')` only worked because the string is truthy.
(
    feature_details_df
    .groupby(groups)[['importance']]
    .sum()
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Work on copies so split_df itself is left untouched
df = split_df.copy()
result_df = split_df.copy()

# Expand every '/'-delimited column into its first component plus numbered levels.
# Fixed: the original listed 'segment_parent', which is not a column of split_df
# (its columns are segment_category / segment_family / metric / transformation),
# so the loop raised KeyError on its first iteration.
for col in ['segment_family', 'metric', 'transformation']:
    # Split on '/'; expand=True yields integer-labelled columns 0, 1, ...
    split_cols = df[col].str.split('/', expand=True)

    # First component stays in original column
    result_df[col] = split_cols[0]

    # Additional components get level-numbered columns: <col>_l1, <col>_l2, ...
    for i in range(1, len(split_cols.columns)):
        result_df[f'{col}_l{i}'] = split_cols[i]


In [None]:
# Display the level-expanded feature-name components
result_df

## analyze features

### basic correlation

In [None]:
import pandas as pd

# Pairwise correlations across every column of the coin modeling frame
correlation_matrix = coin_modeling_df.corr()

# Correlation of each column with the target variable, strongest positive first
target_column_correlations = correlation_matrix[target_var_column]
target_correlations = target_column_correlations.sort_values(ascending=False)

# Show the 15 highest entries
target_correlations.iloc[:15]
# target_correlations
# target_correlations

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))



# # Wallet metrics to analyze
# wallet_metrics = [
# ]

# Analyze the 15 columns with the highest target correlation.
# The original also assigned `coin_modeling_df.columns` first, but that value was
# overwritten on the very next line and never used; it is kept as a commented toggle.
# wallet_metrics = coin_modeling_df.columns
wallet_metrics = target_correlations[:15].index.values

# number of score buckets
n_quantiles = 5

analyze_df = civa.analyze_metric_segments(
    coin_modeling_df,
    wallet_metrics,
    n_quantiles,
    target_var_column,
)
civa.style_metric_segments(analyze_df)

## Pre Coin Model Analysis

### Wallet aggregated analysis

#### generate validation wallet features

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Empty frame with one row per wallet in the training cohort
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Trading features over the validation period; wallets without a row are zero-filled
validation_period_start = wallets_config['training_data']['validation_period_start']
validation_period_end = wallets_config['training_data']['validation_period_end']
validation_trading_features_df = wtf.calculate_wallet_trading_features(
    validation_profits_df,
    validation_period_start,
    validation_period_end,
)
validation_wallet_features_df = validation_wallet_features_df.join(validation_trading_features_df, how='left')
validation_wallet_features_df = validation_wallet_features_df.fillna(
    dict.fromkeys(validation_trading_features_df.columns, 0)
)

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(validation_wallet_features_df)
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')

In [None]:
# Display the validation-period wallet features
validation_wallet_features_df

#### wallet validation period trading/performance by score quantile

In [None]:
# Reload project modules and config so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Columns of validation_wallet_features_df to report on
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

# Report settings
min_wallet_volume_usd = 0
num_quantiles = 5

# Wallet scores are read from the column named by the modeling config
score_column = wallets_config['modeling']['score_name']
wallet_scores = modeling_wallet_scores_df[score_column]

wiva.create_quantile_report(
    validation_wallet_features_df,
    wallet_scores,
    metrics,
    num_quantiles,
    min_wallet_volume_usd
)


In [None]:
# Reload project modules and config so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Columns of validation_wallet_features_df to report on
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    'max_investment',
    'crypto_net_gain',
    'crypto_net_flows',
    'total_volume',
]

# Report settings
min_wallet_volume_usd = 0
num_quantiles = 5

# Wallet scores are read from the column named by the modeling config
score_column = wallets_config['modeling']['score_name']
wallet_scores = modeling_wallet_scores_df[score_column]

wiva.create_quantile_report(
    validation_wallet_features_df,
    wallet_scores,
    metrics,
    num_quantiles,
    min_wallet_volume_usd
)


### old analysis

In [None]:
# Reload project modules and configs so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Columns of validation_coin_wallet_features_df to include in the report
wallet_metrics = [
    'top_100pct/balance_wtd_mean_score',
    'top_10pct/count',
    'top_25pct/count',
    'top_50pct/count',
    'top_100pct/count',
    'top_10pct/count_pct',
    'top_10pct/balance_pct',
    'top_25pct/count_pct',
    'top_25pct/balance_pct',
    'top_50pct/count_pct',
    'top_50pct/balance_pct',
]
# wallet_metrics = list(validation_coin_wallet_features_df.columns)

# Styled report, requested with percentile=90 and method='mean'
civa.create_top_coins_wallet_metrics_report(
    validation_coin_wallet_features_df,
    percentile=90,
    wallet_metrics=wallet_metrics,
    method='mean',
)


#### plotting coin feature performance vs market cap

In [None]:
# Reload project modules and config so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()


# Market cap segment analysis of the coin wallet features, requested with top_n=10
market_cap_analysis = civa.analyze_market_cap_segments(coin_wallet_features_df, top_n=10)
segment_results, summary_df = market_cap_analysis

# Visualize the summary: heatmap first, then the optional consistency plot
civa.plot_segment_heatmap(summary_df)
civa.plot_metric_consistency(summary_df)


#### coin performance of top n for each bucket

In [None]:
# Reload project modules and config so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Analysis parameters come from the 'coin_validation_analysis' config section
top_n = wallets_config['coin_validation_analysis']['top_n']
max_market_cap = wallets_config['coin_validation_analysis']['max_market_cap']
min_market_cap = wallets_config['coin_validation_analysis']['min_market_cap']

metric_top_coin_performance_df = civa.validate_coin_performance(
    coin_wallet_features_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

metric_top_coin_performance_df

In [None]:
# Reload project modules and config so on-disk edits are picked up by this cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Print the performance analysis for the coin wallet features
civa.print_performance_analysis(coin_wallet_features_df)

# Junkyard

# Tests failing

In [None]:
# Reload project modules and both configs so on-disk edits are picked up
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
_coin_config_path = Path('../config/wallets_coin_config.yaml')
wallets_coin_config = yaml.safe_load(_coin_config_path.read_text(encoding='utf-8'))


In [None]:
class ProfitsValidator:
    """
    Validates profits DataFrame follows expected format and constraints.
    Only validates training period data.

    The individual ``check_*`` methods are read-only: they never modify the frame
    they are given. ``validate_all`` performs one explicit, in-place conversion of
    the ``date`` column to datetimes before running the checks.
    """

    # Balances at or above this value pass the negative-balance check
    # (presumably to tolerate small rounding noise - confirm).
    NEGATIVE_BALANCE_TOLERANCE = -0.1

    def validate_all(self, profits_df, training_period_start, training_period_end):
        """
        Run all validation checks and return dict of results.

        Params:
        - profits_df (DataFrame): needs coin_id, wallet_address, date and usd_balance columns.
            Its 'date' column is converted to datetimes IN PLACE.
        - training_period_start: start of the training period, anything pd.to_datetime accepts
        - training_period_end: end of the training period, anything pd.to_datetime accepts

        Returns:
        - dict mapping each check name to its pass/fail result
        """
        # Normalize dates once, up front. This is deliberately in place: callers of
        # validate_all have always received the frame back with datetime dates, and
        # it also means the duplicate check compares real dates rather than raw strings.
        profits_df['date'] = pd.to_datetime(profits_df['date'])

        period_start = pd.to_datetime(training_period_start)
        dates = {
            # the starting balance date is the day before the period start
            'training_starting_balance_date': period_start - timedelta(days=1),
            'training_period_start': period_start,
            'training_period_end': pd.to_datetime(training_period_end),
        }

        return {
            'no_duplicates': self.check_no_duplicates(profits_df),
            'period_boundaries': self.check_period_boundaries(profits_df, dates),
            'no_negatives': self.check_no_negative_balances(profits_df),
            'date_range': self.check_date_range(profits_df, dates),
            'no_missing': self.check_no_missing_values(profits_df)
        }

    def check_no_duplicates(self, profits_df):
        """Return True when no (coin_id, wallet_address, date) combination repeats."""
        key_columns = ['coin_id', 'wallet_address', 'date']
        return not profits_df.duplicated(subset=key_columns).any()

    def check_period_boundaries(self, profits_df, dates):
        """Return True when every coin-wallet pair has a record on the training period end date."""
        # Convert into a local Series so the caller's frame is left untouched
        record_dates = pd.to_datetime(profits_df['date'])
        pair_columns = ['coin_id', 'wallet_address']

        n_pairs = len(profits_df[pair_columns].drop_duplicates())
        period_end_rows = profits_df.loc[record_dates == dates['training_period_end'], pair_columns]
        return len(period_end_rows.drop_duplicates()) == n_pairs

    def check_no_negative_balances(self, profits_df):
        """Return True when every usd_balance is at or above the tolerance."""
        return (profits_df['usd_balance'] >= self.NEGATIVE_BALANCE_TOLERANCE).all()

    def check_date_range(self, profits_df, dates):
        """
        Return True when the earliest record is not before the starting balance date
        and the latest record falls exactly on the training period end.
        """
        # Convert into a local Series so the caller's frame is left untouched
        record_dates = pd.to_datetime(profits_df['date'])
        return bool(record_dates.min() >= dates['training_starting_balance_date'] and
                    record_dates.max() == dates['training_period_end'])

    def check_no_missing_values(self, profits_df):
        """Return True when the frame contains no missing values in any column."""
        return not profits_df.isna().any().any()



# pylint:disable=line-too-long
def test_profits_data():
    """
    Returns raw profits data that can be remapped for many-to-many testing.

    Returns:
    - test_profits_df (DataFrame): one row per coin_id-wallet_address-date with
        usd_balance, usd_net_transfers, is_imputed and a derived usd_inflows column
    - training_period_start (str): 'YYYY-MM-DD' start of the training period
    - training_period_end (str): 'YYYY-MM-DD' end of the training period
    """
    training_period_start = '2024-01-01'
    training_period_end = '2024-10-01'

    # All dates are zero-padded 'YYYY-MM-DD' strings
    profits_data = [
        # w01_multiple_coins - btc & eth (multiple transactions, multiple coins)
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 120, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 180, 'usd_net_transfers': 0, 'is_imputed': True},

        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 200, 'usd_net_transfers': 200, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 300, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 280, 'usd_net_transfers': 0, 'is_imputed': True},

        # w02_net_loss - btc (net loss)
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-05-01', 'usd_balance': 250, 'usd_net_transfers': -100, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-10-01', 'usd_balance': 100, 'usd_net_transfers': 0, 'is_imputed': True},

        # w03_sell_all_and_rebuy - eth
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-03-01', 'usd_balance': 0,  'usd_net_transfers': -50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-08-01', 'usd_balance': 40, 'usd_net_transfers': 40, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-10-01', 'usd_balance': 42, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04_only_period_end - sol (only final row)
        {'coin_id': 'sol', 'wallet_address': 'w04_only_period_end', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 70, 'is_imputed': False},

        # w04a_only_period_end_w_balance - eth
        {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2023-12-31', 'usd_balance': 30, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-10-01', 'usd_balance': 90, 'usd_net_transfers': 50, 'is_imputed': False},

        # w04b_only_period_start_buy
        {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-10-01', 'usd_balance': 900, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04c_only_period_start_buy_w_existing_balance
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2023-12-31', 'usd_balance': 40, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-01-01', 'usd_balance': 350, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-10-01', 'usd_balance': 1050, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04d_only_period_start_sell
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2023-12-31', 'usd_balance': 200, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-01-01', 'usd_balance': 0, 'usd_net_transfers': -200, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04e_only_period_start_sell_partial
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2023-12-31', 'usd_balance': 510, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': -10, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-10-01', 'usd_balance': 600, 'usd_net_transfers': 0, 'is_imputed': True},

        # w05_only_imputed - sol (only imputed rows at start and end)
        {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2023-12-31', 'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},

        # w06_tiny_transactions - very small transactions relative to portfolio size
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2023-12-31', 'usd_balance': 1250, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': 1, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -350, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w07_tiny_transactions2 - very small transactions relative to portfolio size
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2023-12-31', 'usd_balance': 400, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': -20, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -150, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w08_offsetting_transactions - large offsetting transactions in the middle of the period
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2023-12-31', 'usd_balance': 500, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-01', 'usd_balance': 10400, 'usd_net_transfers': 10000, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-02', 'usd_balance': 400, 'usd_net_transfers': -10000, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-10-01', 'usd_balance': 750, 'usd_net_transfers': 0, 'is_imputed': True},

        # w09_memecoin_winner - Large swings in portfolio value
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': -500, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-05-01', 'usd_balance': 50, 'usd_net_transfers': -100, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-10-01', 'usd_balance': 10, 'usd_net_transfers': 0, 'is_imputed': True},

        # w10_memecoin_loser - Large swings in portfolio value
        {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': -20, 'is_imputed': False},

        # w11_sells_early
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-04-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
        # Fixed: this date was written '2024-5-01', the only non zero-padded date in the set
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-05-01', 'usd_balance': 0, 'usd_net_transfers': -300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w12_buys_late
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-09-01', 'usd_balance': 500, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-10-01', 'usd_balance': 550, 'usd_net_transfers': 0, 'is_imputed': True},
    ]

    test_profits_df = pd.DataFrame(profits_data)

    # usd_inflows keeps usd_net_transfers only where it is positive and the row
    # is not imputed; every other row gets 0
    test_profits_df['usd_inflows'] = test_profits_df['usd_net_transfers'].where(
        (test_profits_df['usd_net_transfers'] > 0) &
        (~test_profits_df['is_imputed']),
        0
    )

    return test_profits_df, training_period_start, training_period_end




# @pytest.fixture
def test_profits_df(test_profits_data):
    """
    Validates the raw test profits data and drops rows that have neither a balance nor transfers.

    Params:
    - test_profits_data (tuple): (profits DataFrame, training_period_start, training_period_end)

    Returns:
    - (tuple): a filtered copy of the profits DataFrame, followed by the unchanged
        training_period_start and training_period_end values
    """
    raw_profits_df, training_period_start, training_period_end = test_profits_data
    profits_df = raw_profits_df.copy()

    # Confirm the test data is well formed; every check result must be truthy
    check_results = ProfitsValidator().validate_all(
        profits_df, training_period_start, training_period_end
    )
    assert all(check_results.values()), "Test data failed validation checks."

    # Rows with a 0 balance and 0 net transfers are dropped only after validation has
    # passed; per the original note this matches what wmo.retrieve_datasets() does
    has_zero_balance = profits_df['usd_balance'] == 0
    has_zero_transfers = profits_df['usd_net_transfers'] == 0
    profits_df = profits_df[~(has_zero_balance & has_zero_transfers)]

    return profits_df, training_period_start, training_period_end


# Build the test data by calling the helper functions directly as plain functions.
# NOTE: this rebinds the name `test_profits_df` from the function defined above to the
# returned DataFrame, so the function definition must be re-executed before this line
# can be run again.
test_profits_df, training_period_start, training_period_end = test_profits_df(test_profits_data())


In [None]:
# Spot-check the test profits rows belonging to a single wallet
w = 'w08_offsetting_transactions'
test_profits_df[test_profits_df['wallet_address']==w]

In [None]:

# Compute trading features from the test profits data over the training period.
# `wtf` is not defined in this cell; presumably a wallet trading features module
# imported elsewhere in the notebook — TODO confirm.
wallet_trading_features_df = wtf.calculate_wallet_trading_features(test_profits_df,
                                                                    training_period_start,
                                                                    training_period_end)
# Bare expression so the notebook displays the resulting DataFrame
wallet_trading_features_df

In [None]:
# Step through the individual wtf helper calls on the test data so that the
# intermediate DataFrames can be inspected.
# NOTE(review): presumably mirrors the steps inside
# wtf.calculate_wallet_trading_features — confirm against that function.
period_start_date = training_period_start
period_end_date = training_period_end

# Work on a copy of the test data and pass it through wtf.ensure_index
# (assumes ensure_index validates/sets the index used by the calls below — TODO confirm)
profits_df = test_profits_df.copy()
profits_df = wtf.ensure_index(profits_df,
                            period_start_date, period_end_date)

# Calculate configured metrics
# ----------------------------
# Add crypto balance/transfers/gain helper columns
profits_df = wtf.calculate_crypto_balance_columns(profits_df,
                                                period_start_date, period_end_date)

# Calculate net_gain and max_investment columns
# (result is stored but not displayed in this cell)
gain_and_investment_df = wtf.calculate_gain_and_investment_columns(profits_df)

# Calculate metrics that ignore imputed transactions
# (result is stored but not displayed in this cell)
observed_activity_df = wtf.calculate_observed_activity_columns(profits_df,
                                                            period_start_date, period_end_date)


# Display the rows for one wallet; selecting by level requires 'wallet_address'
# to be an index level of profits_df at this point
profits_df.xs(key='w08_offsetting_transactions', level='wallet_address')
