### start

In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
import scipy
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Render floats in dataframes with up to 12 significant digits
# (undo with pd.reset_option('display.float_format'))
pd.set_option('display.float_format', '{:.12g}'.format)

# Hide warnings whose message starts with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables: silence the pygame donation request and set the log / alert sound paths
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with light green-gray text, labels and ticks
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# Make the local source directory importable for the module imports that follow
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.wallet_features_orchestrator as wfo

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.wallet_validation_analysis as wiwv

# Coin features
import coin_wallet_features.wallet_segmentation as cws
import coin_wallet_features.wallet_balance_features as cwb

# Coin modeling
import coin_modeling.coin_model as cm

# Coin insights
import coin_insights.coin_model_evaluation as cime
import coin_insights.coin_validation_analysis as civa


# Local modules that get re-imported at the top of each cell, in this order
modules = [
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,  # core pipeline
    wmo, wtd, wmr, wm, wem,                                      # wallet modeling
    wcl, wmc, wmt, wpf, wtf, wts, wfo,                           # wallet features
    wime, wiwv,                                                  # wallet insights
    cws, cwb,                                                    # coin features
    cm,                                                          # coin modeling
    cime, civa,                                                  # coin insights
]
for module in modules:
    importlib.reload(module)

# Load every config used by the notebook
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Configure the logger at INFO level
logger = dc.setup_logger()
# logger = u.setup_local_logging(logger)
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
# Export the code of the selected source directories via the project helper.
# Currently disabled directories: 'training_data', 'wallet_modeling', 'wallet_insights'
u.export_code(
    code_directories=['wallet_features'],
    # include_config = True,
    # ipynb_notebook = 'DDA-456 wallet validation performance.ipynb'
)

u.obj_mem()

# Wallet Model Construction

## Training Data Sequence

### retrieve training datasets

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Complete Pre-Training Profits/Market Data
# -----------------------------------------
# Retrieve training period datasets and save them to temp/wallet_modeling_dfs.
# The three returned objects are discarded; later cells reload the data from parquet.
_, _, _ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    parquet_prefix='training',
)

u.obj_mem()

### define cohort and clean training datasets (loadable parquet)

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Add Indicators to Market Data
# ----------------------------------------------------------
# Load the market data parquet that includes pre-training history
training_market_data_df_full = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_data_df_full.parquet"
)

# Generate indicators and save file (the return value is not needed in this cell)
_ = wmo.generate_training_indicators_df(training_market_data_df_full, wallets_metrics_config)


# Identify Wallet Cohort
# ----------------------------------------------------------
# Keep only market data rows on or after the starting balance date
training_market_data_df = training_market_data_df_full.loc[
    lambda df: df['date'] >= wallets_config['training_data']['training_starting_balance_date']
]
u.assert_period(
    training_market_data_df,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
)
del training_market_data_df_full
gc.collect()

# Retrieve full profits history
training_profits_df_full = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_profits_df_full.parquet"
)

# Define wallet cohort and return cohort-filtered training_profits_df
training_profits_df, training_wallet_cohort = wmo.define_training_wallet_cohort(
    training_profits_df_full,
    training_market_data_df,
)
u.assert_period(
    training_profits_df,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
)
training_profits_df.to_parquet("temp/wallet_modeling_dfs/training_profits_df.parquet", index=True)

# Release the large frames now that the filtered profits are on disk
del training_profits_df_full, training_profits_df, training_market_data_df
gc.collect()

# Retrieve Transfers Data
# ----------------------------------------------------------
# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_training_cohort
training_transfers_sequencing_df = wts.retrieve_transfers_sequencing()
training_transfers_sequencing_df.to_parquet(
    "temp/wallet_modeling_dfs/training_transfers_sequencing_df.parquet",
    index=True,
)
del training_transfers_sequencing_df
gc.collect()

u.obj_mem()

### generate training features (loadable parquet)

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Load the parquet files written by the previous cells
training_profits_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_profits_df.parquet"
)
training_market_indicators_data_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet"
)
training_transfers_sequencing_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_transfers_sequencing_df.parquet"
)
# The cohort is every distinct wallet address present in the profits data
training_wallet_cohort = list(set(training_profits_df['wallet_address']))


# Generate Features for the Full Training Period
# ----------------------------------------------------------
logger.info("Generating features for full training period...")
training_wallet_features_df = wfo.calculate_wallet_features(
    training_profits_df,
    training_market_indicators_data_df,
    training_transfers_sequencing_df,
    training_wallet_cohort,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
)

# Start training_data_df from the full-period features, suffixed to mark the window they cover
training_data_df = training_wallet_features_df.add_suffix("|all_windows")

u.obj_mem()

#### generate windowed training features

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Generate Cohort-Filtered Profits Data for Training Windows
# ----------------------------------------------------------
# Generate wallet_cohort-filtered profits_df for all training windows
training_windows_profits_dfs = wmo.split_training_window_profits_dfs(
    training_profits_df,
    training_market_indicators_data_df,
    training_wallet_cohort,
)
# The full-period profits are no longer needed once they are split into windows
del training_profits_df
gc.collect()

# Generate Features for Each Individual Window
# ----------------------------------------------------------
# Windows are numbered from 1; the number becomes the column suffix
for i, window_profits_df in enumerate(training_windows_profits_dfs, start=1):
    logger.info("Generating features for window %s...", i)

    # Extract the window_df boundary dates that were validated by split_training_window_profits_dfs().
    # The earliest date is the opening balance row, so the window starts the day after it.
    window_opening_balance_date = window_profits_df['date'].min()
    window_start_date = window_opening_balance_date + timedelta(days=1)
    window_end_date = window_profits_df['date'].max()

    # Generate the features from window-filtered profits, the full training period
    # market/transfers dfs, the full training cohort and the window-specific dates
    window_wallet_features_df = wfo.calculate_wallet_features(
        window_profits_df,
        training_market_indicators_data_df,
        training_transfers_sequencing_df,
        training_wallet_cohort,
        window_start_date.strftime('%Y-%m-%d'),
        window_end_date.strftime('%Y-%m-%d'),
    )

    # Add column suffix and join to training_data_df
    training_data_df = training_data_df.join(
        window_wallet_features_df.add_suffix(f'|w{i}'),
        how='left',
    )
    window_wallet_features_df = window_wallet_features_df.add_suffix(f'|w{i}')


# Drop loop temporaries and the input frames that later cells do not use
del window_profits_df, window_wallet_features_df, training_market_indicators_data_df, training_transfers_sequencing_df
gc.collect()

u.obj_mem()

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Generate Clusters Using All Other Features
# ----------------------------------------------------------
# Build cluster features from the base training data and namespace them with 'cluster|'
training_cluster_features_df = (
    wcl.create_basic_cluster_features(training_data_df)
    .add_prefix('cluster|')
)
training_data_df = training_data_df.join(training_cluster_features_df, how='inner')


# Save TRAINING_DATA_DF
# ----------------------------------------------------------
# The inner join above can drop wallets, so verify all input wallets exist in the final output
missing_wallets = set(training_wallet_cohort) - set(training_data_df.index)
if missing_wallets:
    raise ValueError(f"Lost {len(missing_wallets)} wallets from original cohort during feature generation. First few missing: {list(missing_wallets)[:5]}")
logger.info("Feature generation complete. Final training_df shape: %s", training_data_df.shape)


# Save and clear from memory
training_data_df.to_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    index=True,
)
del training_data_df, training_cluster_features_df
gc.collect()
u.obj_mem()

## Wallet Model Target Variable and Wallet Cohort

### Retrieve modeling period datasets

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Retrieve Modeling Profits and Market Data
# ----------------------------------------------------------
# Retrieve training coin cohort to restrict modeling period data to only training period coins
# (only the coin_id column is read from the parquet file)
training_coin_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)['coin_id'].unique()

# Retrieve full historical through modeling period datasets
modeling_profits_df_full, modeling_market_data_df_full, modeling_coin_cohort = wmo.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    coin_cohort=training_coin_cohort,
)

# Remove prices dated before the modeling starting balance date
modeling_market_data_df = modeling_market_data_df_full.loc[
    lambda df: df['date'] >= wallets_config['training_data']['modeling_starting_balance_date']
]
del modeling_market_data_df_full, training_coin_cohort
gc.collect()


# Filter to only training wallet cohort; columns=[] reads just the index of training_data_df
training_wallet_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    columns=[],
).index.values
modeling_profits_df = modeling_profits_df_full.loc[
    lambda df: df['wallet_address'].isin(training_wallet_cohort)
]
del modeling_profits_df_full
gc.collect()


# Assert period, save files, remove from memory
u.assert_period(
    modeling_market_data_df,
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
)
u.assert_period(
    modeling_profits_df,
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
)
modeling_profits_df.to_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet", index=False)
modeling_market_data_df.to_parquet("temp/wallet_modeling_dfs/modeling_market_data_df.parquet", index=False)
del modeling_profits_df, modeling_market_data_df
gc.collect()

### define modeling cohort and features (loadable parquet)

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Create training_cohort-Indexed modeling_wallet_features_df
# -----------------------------------------------------------
# Start from an empty frame indexed by the training wallet cohort
# (columns=[] reads only the index of training_data_df)
training_wallet_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    columns=[],
).index.values
modeling_wallet_features_df = pd.DataFrame(
    index=pd.Index(training_wallet_cohort, name='wallet_address')
)

# Store feature sets with their prefixes for bulk renaming
feature_column_names = {}


# Identify Modeling Period Cohort
# -----------------------------------------------------------
# Retrieve trading features for all wallets in training_cohort with boolean for in_modeling_cohort
modeling_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet")
modeling_trading_features_df = wmo.identify_modeling_cohort(modeling_profits_df)
# Left join keeps every training wallet; wallets without trading features get 0 in those columns
modeling_wallet_features_df = (
    modeling_wallet_features_df
    .join(modeling_trading_features_df, how='left')
    .fillna(dict.fromkeys(modeling_trading_features_df.columns, 0))
)


# Generate Modeling Period Performance Features
# -----------------------------------------------------------
# Calculate performance metrics for the training cohort (wallets with 0 activity still impact rank orders)
modeling_performance_features_df = wpf.calculate_performance_features(modeling_wallet_features_df)
modeling_wallet_features_df = (
    modeling_wallet_features_df
    .join(modeling_performance_features_df, how='left')
    .fillna(dict.fromkeys(modeling_performance_features_df.columns, 0))
)


# Save the features and release everything loaded in this cell
modeling_wallet_features_df.to_parquet(
    "temp/wallet_modeling_dfs/modeling_wallet_features_df.parquet",
    index=True,
)
del modeling_profits_df, modeling_wallet_features_df, modeling_trading_features_df, modeling_performance_features_df
gc.collect()

u.obj_mem()

## Wallet Model Construction and Analysis

### select target variable and build model (loadable parquet)

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Retrieve training data for the full training wallet cohort
training_data_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet"
)
modeling_wallet_features_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/modeling_wallet_features_df.parquet"
)

# Keep only the cohort flag and the configured target variable column
target_variable = wallets_config['modeling']['target_variable']
modeling_cohort_target_var_df = modeling_wallet_features_df[['in_modeling_cohort', target_variable]]

# Run the experiment and get results
wallet_model = wm.WalletModel(wallets_config)
wallet_model_results = wallet_model.run_experiment(training_data_df, modeling_cohort_target_var_df)
del training_data_df
gc.collect()

# Extract the trained model from the pipeline's 'regressor' step
model = wallet_model_results['pipeline'].named_steps['regressor']

# Generate and save all model artifacts
model_id, evaluator, modeling_wallet_scores_df = wmr.generate_and_save_model_artifacts(
    model_results=wallet_model_results,
    base_path='../wallet_modeling',
)

# Rename the 'score' column to the configured score name and save
score_name = wallets_config['modeling']['score_name']
modeling_wallet_scores_df = modeling_wallet_scores_df.rename(columns={'score': score_name})
modeling_wallet_scores_df.to_parquet(
    "temp/wallet_modeling_dfs/modeling_wallet_scores_df.parquet",
    index=True,
)


u.notify()

### assess wallet model performance

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Rebuild the evaluator from the stored experiment results and the trained model
evaluator_inputs = {
    'y_train': wallet_model_results['y_train'],
    'y_true': wallet_model_results['y_test'],
    'y_pred': wallet_model_results['y_pred'],
    'training_cohort_pred': wallet_model_results['training_cohort_pred'],
    'training_cohort_actuals': wallet_model_results['training_cohort_actuals'],
    'model': model,
    'feature_names': wallet_model_results['X_train'].columns.tolist(),
}
wallet_evaluator = wime.RegressionEvaluator(**evaluator_inputs)

# Print the summary, draw the evaluation plots and show the importance summary
print(wallet_evaluator.summary_report())
wallet_evaluator.plot_evaluation()
wallet_evaluator.importance_summary()

### Cluster analysis

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load parquet
training_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet")


# Names of the x_features features with the highest importance in the model, highest first
x_features = 6
top_feature_metrics = list(
    pd.DataFrame(wallet_evaluator.metrics['importances'])
    .sort_values(by='importance', ascending=False)
    .head(x_features)['feature']
)
# De-duplicate while keeping the importance ranking. list(set(...)) returned the names in an
# arbitrary order that could differ between kernel sessions, reshuffling the report columns.
comparison_metrics = list(dict.fromkeys(top_feature_metrics))


# Cluster numbers
n_clusters = 4

# Build the cluster report, aggregating the comparison metrics with the 'median' method
styled_df, cluster_results_df = wime.create_cluster_report(
    training_data_df,
    wallet_model_results,
    n_clusters,
    comparison_metrics,
    'median',
)

del training_data_df
gc.collect()

# Last expression of the cell so the notebook renders the styled report
styled_df

# Validation Period Analysis

## Load validation and modeling datasets

### Retrieve validation datasets

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Retrieve Validation Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical through validation period datasets

# Retrieve training coin cohort to ensure all training period coins are reflected
# TODO: assess whether this cohort filter should be removed
training_coin_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)['coin_id'].unique()

# NOTE(review): the cohort is passed positionally here but as coin_cohort= in the modeling
# period cell — confirm the third parameter of retrieve_period_datasets is coin_cohort
validation_profits_df, validation_market_data_df_full, validation_coin_cohort = wmo.retrieve_period_datasets(
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    training_coin_cohort,
)

# Remove prices dated before the validation starting balance date
validation_market_data_df = validation_market_data_df_full.loc[
    lambda df: df['date'] >= wallets_config['training_data']['validation_starting_balance_date']
]
del validation_market_data_df_full
gc.collect()


# Assert period, save files, remove from memory
u.assert_period(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)
u.assert_period(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)
validation_profits_df.to_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet", index=False)
validation_market_data_df.to_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet", index=False)
del validation_profits_df, validation_market_data_df
gc.collect()

### Load modeling dataset files (loadable parquet)

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)

# Load coin cohort (currently carried through training/modeling/validation periods)
training_coin_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)['coin_id'].unique()


# Load modeling period scores and data
modeling_wallet_scores_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_wallet_scores_df.parquet")
modeling_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_market_data_df.parquet")
modeling_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet")

# Check that both loaded frames match the configured modeling period
u.assert_period(
    modeling_market_data_df,
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
)
u.assert_period(
    modeling_profits_df,
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
)


u.obj_mem()

### Load validation dataset files (loadable parquet)

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)

# Load the validation period parquet files
validation_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet")
validation_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet")
# columns=[] reads only the index of training_data_df, which is used as the wallet cohort
training_wallet_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    columns=[],
).index.values

# Check that both loaded frames match the configured validation period
u.assert_period(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)
u.assert_period(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)


u.obj_mem()

## Wallet aggregated analysis

### generate validation wallet features

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Start from an empty frame indexed by every wallet that should exist in the output
validation_wallet_features_df = pd.DataFrame(
    index=pd.Index(training_wallet_cohort, name='wallet_address')
)


# Calculate validation period trading features
validation_trading_features_df = wtf.calculate_wallet_trading_features(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)
# Left join keeps every cohort wallet; wallets without trading features get 0 in those columns
validation_wallet_features_df = (
    validation_wallet_features_df
    .join(validation_trading_features_df, how='left')
    .fillna(dict.fromkeys(validation_trading_features_df.columns, 0))
)

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(validation_wallet_features_df)
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')

### wallet validation period trading/performance by score quantile

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Validation period metrics to summarize for each prediction band
metrics = [
    # net gain relative to max investment
    'crypto_net_gain/max_investment/winsorized',
    'crypto_net_gain/max_investment/base',
    'crypto_net_gain/max_investment/ntile_rank',
    # net gain relative to active_twb
    'crypto_net_gain/active_twb/winsorized',
    'crypto_net_gain/active_twb/base',
    # absolute amounts
    'max_investment',
    'crypto_net_gain',
    'net_crypto_investment',
    'total_volume',
]

# Report parameters: volume threshold in USD and number of score quantiles
min_wallet_volume_usd = 0
num_quantiles = 5

# Last expression of the cell so the notebook displays the report
wiwv.create_quantile_report(
    validation_wallet_features_df,
    wallet_model_results['training_cohort_pred'],
    metrics,
    num_quantiles,
    min_wallet_volume_usd,
)


## Coin aggregated analysis

### coin-aggregated wallet metrics by coin performance

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load modeling period profits and wallet scores
modeling_profits_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/modeling_profits_df.parquet"
)
modeling_wallet_scores_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/modeling_wallet_scores_df.parquet"
)
# Every distinct coin_id that appears in the modeling period profits
all_coin_ids = list(set(modeling_profits_df['coin_id']))

# Generate coin-level features based on modeling period end wallet scores and balances
coin_wallet_balance_features_modeling_end_df = cwb.calculate_coin_wallet_balance_features(
    modeling_profits_df,
    modeling_wallet_scores_df,
    wallets_config['training_data']['modeling_period_end'],
    all_coin_ids,
)

# Calculate coin return performance during the validation period
validation_coin_performance_df = civa.calculate_coin_performance(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)
# Inner join keeps only coins present in both the balance features and the performance data
validation_coin_wallet_features_df = coin_wallet_balance_features_modeling_end_df.join(
    validation_coin_performance_df,
    how='inner',
)


### apply filters

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)

# Minimum thresholds from the coin modeling config
min_cohort_wallets = wallets_coin_config['coin_modeling']['min_cohort_wallets']
min_cohort_balance = wallets_coin_config['coin_modeling']['min_cohort_balance']


# Keep coins whose 'top_100pct' wallet count and balance both strictly exceed the minimums
validation_coin_wallet_features_df = validation_coin_wallet_features_df.loc[
    lambda df: (df['top_100pct/count'] > min_cohort_wallets)
    & (df['top_100pct/balance'] > min_cohort_balance)
]

### generate reports

In [None]:
# Display the column index of the filtered coin-level features as a reference for the reports below
validation_coin_wallet_features_df.columns

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Wallet metrics to analyze: the balance-weighted mean score, then count and balance for each
# top-percent band, then the count/balance percentage columns for the bands below 100pct
wallet_metrics = (
    ['top_100pct/balance_wtd_mean_score']
    + [f'top_{pct}pct/count' for pct in (10, 25, 50, 100)]
    + [f'top_{pct}pct/balance' for pct in (10, 25, 50, 100)]
    + [f'top_{pct}pct/count_pct' for pct in (10, 25, 50)]
    + [f'top_{pct}pct/balance_pct' for pct in (10, 25, 50)]
)

# number of score buckets
n_quantiles = 5

analyze_df = civa.analyze_metric_segments(
    validation_coin_wallet_features_df,
    wallet_metrics,
    n_quantiles,
)
# Last expression of the cell so the notebook renders the styled result
civa.style_metric_segments(analyze_df)

## old analysis

In [None]:
# Display the coin-level feature column names as a plain list
list(validation_coin_wallet_features_df.columns)

In [None]:
# Re-import local modules and re-read the wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Wallet metrics to analyze: the balance-weighted mean score, the wallet count for each
# top-percent band, then count_pct/balance_pct pairs for the bands below 100pct
wallet_metrics = (
    ['top_100pct/balance_wtd_mean_score']
    + [f'top_{pct}pct/count' for pct in (10, 25, 50, 100)]
    + [
        f'top_{pct}pct/{measure}'
        for pct in (10, 25, 50)
        for measure in ('count_pct', 'balance_pct')
    ]
)
# wallet_metrics = list(validation_coin_wallet_features_df.columns)

# Create styled performance analysis (last expression of the cell so the notebook renders it)
civa.create_top_coins_wallet_metrics_report(
    validation_coin_wallet_features_df,
    percentile=90,
    wallet_metrics=wallet_metrics,
    method='mean',
)


### plotting coin feature performance vs market cap

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Get the analysis results.
# The original cell passed `coin_wallet_features_df`, a name no cell in this notebook defines,
# so it raised NameError on a fresh kernel. The coin-level frame built and filtered above is
# validation_coin_wallet_features_df.
# NOTE(review): confirm this frame has the columns analyze_market_cap_segments expects.
segment_results, summary_df = civa.analyze_market_cap_segments(
    validation_coin_wallet_features_df,
    top_n=10
)

# Visualize the summary
civa.plot_segment_heatmap(summary_df)
civa.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Analysis parameters from the coin validation analysis config
top_n = wallets_config['coin_validation_analysis']['top_n']
max_market_cap = wallets_config['coin_validation_analysis']['max_market_cap']
min_market_cap = wallets_config['coin_validation_analysis']['min_market_cap']

# The original cell passed `coin_wallet_features_df`, a name no cell in this notebook defines,
# so it raised NameError on a fresh kernel. The coin-level frame built and filtered above is
# validation_coin_wallet_features_df.
# NOTE(review): confirm this frame has the columns validate_coin_performance expects.
metric_top_coin_performance_df = civa.validate_coin_performance(
    validation_coin_wallet_features_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

# Last expression of the cell so the notebook displays the result
metric_top_coin_performance_df

In [None]:
# Re-import local modules and re-read the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# The original cell passed `coin_wallet_features_df`, a name no cell in this notebook defines,
# so it raised NameError on a fresh kernel. The coin-level frame built and filtered above is
# validation_coin_wallet_features_df.
# NOTE(review): confirm this frame has the columns print_performance_analysis expects.
civa.print_performance_analysis(validation_coin_wallet_features_df)

# Coin Model Construction

### non-wallet coin model feature generation (slow)

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Generate features based on the coin config files; only the first of the three
# returned values is kept.
coin_features_training_data_df, _, _ = tw.generate_all_time_windows_model_inputs(config,metrics_config,modeling_config)

# Remove time window index since we aren't using that for now
coin_features_training_data_df = coin_features_training_data_df.reset_index(level='time_window', drop=True)

# Save to parquet (index included) so later cells can load it without rerunning this slow step
coin_features_training_data_df.to_parquet("temp/coin_modeling_dfs/coin_features_training_data_df.parquet",index=True)

# Signal that the cell has finished (presumably an audible alert - confirm in u.notify)
u.notify()

## assign wallets to segments

In [None]:
# Reload project modules and configs so on-disk edits are picked up
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Create base df with all wallet addresses and scores; score columns get a
# 'score|' prefix so they can be told apart from segmentation columns later.
wallet_scores = [wallets_config['modeling']['score_name']]
wallet_segmentation_df = modeling_wallet_scores_df[wallet_scores].copy().add_prefix('score|')

# Add "all" segment for full population level aggregations
wallet_segmentation_df['all_wallets'] = 'wallets'


# Add score quantile assignments
score_segment_quantiles = wallets_coin_config['wallet_segmentation']['score_segment_quantiles']
# NOTE(review): this overwrites the column names of the shared
# modeling_wallet_scores_df in place and assumes it has exactly two columns in
# this order. The score column was already selected by name above, so confirm
# whether this rename is still needed.
modeling_wallet_scores_df.columns = [wallets_config['modeling']['score_name'],'in_modeling_cohort']
score_series = modeling_wallet_scores_df[wallets_config['modeling']['score_name']]

wallet_score_quantiles_df = cws.assign_wallet_quantiles(score_series, score_segment_quantiles)
wallet_segmentation_df = wallet_segmentation_df.join(wallet_score_quantiles_df,how='inner')


# Add training period-based cluster labels
training_period_n_clusters = wallets_coin_config['wallet_segmentation']['training_period_n_clusters']
training_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet")
wallet_clusters_df = wcl.assign_clusters_from_distances(training_data_df,[training_period_n_clusters])

wallet_clusters_df = wallet_clusters_df.add_prefix('training_')
wallet_segmentation_df = wallet_segmentation_df.join(wallet_clusters_df,how='inner')

# Drop the training data reference and actually run a collection; the bare
# attribute `gc.collect` (without parentheses) never invoked the collector.
del training_data_df
gc.collect()


# Last expression of the cell, so the notebook renders a preview of the result
wallet_segmentation_df.head()

In [None]:
# List every segmentation column, i.e. every column that is not a 'score|' score column
segmentations = wallet_segmentation_df.columns[~wallet_segmentation_df.columns.str.startswith('score|')]
for s in segmentations:
    print(s)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Define complete coin list and initialize a column-less df with one row per coin
coin_wallet_features_df = pd.DataFrame(index=training_coin_cohort)
coin_wallet_features_df.index.name = 'coin_id'


# Loop through all segmentations (every non-'score|' column) and generate features
segmentation_columns = wallet_segmentation_df.columns[~wallet_segmentation_df.columns.str.startswith('score|')]

for s_col in segmentation_columns:

    # Generate coin-level features based on modeling period end wallet scores and balances
    # NOTE(review): all_coin_ids is not assigned in this cell, so it must already
    # exist from another cell; the base df above is indexed by training_coin_cohort
    # instead - confirm the two coin lists are meant to differ.
    segment_balance_df = cwb.calculate_segment_wallet_balance_features(
        modeling_profits_df,
        wallet_segmentation_df,
        s_col,
        wallets_config['training_data']['modeling_period_end'],
        all_coin_ids
    )

    # Inner join: only coins present in both frames survive each iteration
    coin_wallet_features_df = coin_wallet_features_df.join(segment_balance_df, how='inner')


# Last expression of the cell, so the notebook renders the accumulated column names
coin_wallet_features_df.columns

In [None]:
# Reload project modules and configs so on-disk edits are picked up
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Define complete coin list and initialize a column-less df with one row per coin
all_coin_ids = list(set(modeling_profits_df['coin_id']))
coin_wallet_features_df = pd.DataFrame(index=all_coin_ids)
coin_wallet_features_df.index.name = 'coin_id'

# Generate coin-level features based on modeling period end wallet scores and balances,
# using a single segmentation column (the 'training_k4_cluster' labels)
segment_balance_df = cwb.calculate_segmented_wallet_features(
    modeling_profits_df,
    wallet_segmentation_df['training_k4_cluster'],
    wallets_config['training_data']['modeling_period_end'],
    all_coin_ids
)
# Only the last expression of a cell is rendered, so the describe() result is discarded
segment_balance_df.describe()
segment_balance_df.head()

In [None]:
# Scratch: evaluates to the bound method itself (it is not called), so nothing is renamed
segment_balance_df.add_prefix

In [None]:
# Scratch: select the quantile column for `score_name` (defined in another cell)
# and show the resulting Series' name
wallet_segmentation_df[f'score_{score_name}_quantile'].name

In [None]:
# Scratch prototype of a per-segment wallet balance calculation. The first four
# assignments stand in for the arguments such a function would receive.
profits_df = modeling_profits_df.copy()
wallet_segments = wallet_segmentation_df['score_quantile']
balance_date = wallets_config['training_data']['modeling_period_end']
all_coin_ids = all_coin_ids  # placeholder for the function argument; no effect here


# Normalize the date, then keep only the rows recorded on that date
balance_date = pd.to_datetime(balance_date)
balances_df = profits_df.loc[profits_df['date'].eq(balance_date)].copy()

# Attach each wallet's segment label to its balance rows (wallets without a
# segment keep a missing label because of the left merge)
analysis_df = pd.merge(
    balances_df[['coin_id', 'wallet_address', 'usd_balance']],
    wallet_segments.rename('segment'),
    how='left',
    left_on='wallet_address',
    right_index=True,
)

# Output frame: one row per coin, no columns yet
result_df = pd.DataFrame(index=all_coin_ids)
result_df.index.name = 'coin_id'

# Per-coin totals across all wallets, used as denominators for percentages
totals = analysis_df.groupby('coin_id', observed=True).agg(**{
    'total/balance': ('usd_balance', 'sum'),
    'total/count': ('wallet_address', 'count'),
})

# Per-coin balance and wallet count for each individual segment
for segment in wallet_segments.unique():
    segment_metrics = (
        analysis_df.loc[analysis_df['segment'].eq(segment)]
        .groupby('coin_id', observed=True)
        .agg(**{
            f'{segment}/balance': ('usd_balance', 'sum'),
            f'{segment}/count': ('wallet_address', 'count'),
        })
    )

    # Share of the coin's total balance held by this segment
    segment_metrics[f'{segment}/balance_pct'] = (
        segment_metrics[f'{segment}/balance'].div(totals['total/balance']).fillna(0)
    )

#     segment_metrics[f'{segment}/count_pct'] = (
#         segment_metrics[f'{segment}/count'] / totals['total/count']
#     ).fillna(0)

#     result_df = result_df.join(segment_metrics, how='left')

# # Fill missing values with 0 for all metrics
# result_df = result_df.fillna(0)

# # Validate no missing coins
# missing_coins = set(all_coin_ids) - set(result_df.index)
# if missing_coins:
#     raise ValueError(f"Found {len(missing_coins)} coin_ids missing from analysis")

# return result_df


## Prepare coin_training_data_df

### generate coin wallet balance features

In [None]:
# Pick up on-disk edits to the project modules and configs
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Base frame: one column-less row per coin found in the modeling profits data
all_coin_ids = list(set(modeling_profits_df['coin_id']))
coin_wallet_features_df = pd.DataFrame(index=pd.Index(all_coin_ids, name='coin_id'))

# Coin-level features from wallet scores and balances at the modeling period end,
# with column names wrapped as 'wallet_balance|<feature>|modeling_end'
coin_wallet_balance_features_modeling_end_df = (
    cwb.calculate_coin_wallet_balance_features(
        modeling_profits_df,
        modeling_wallet_scores_df,
        wallets_config['training_data']['modeling_period_end'],
        all_coin_ids,
    )
    .add_prefix('wallet_balance|')
    .add_suffix('|modeling_end')
)
coin_wallet_features_df = coin_wallet_features_df.join(
    coin_wallet_balance_features_modeling_end_df,
    how='inner',
)

# # Generate coin-level features based on modeling period start wallet scores and balances
# coin_wallet_balance_features_modeling_start_df = cwb.calculate_coin_wallet_balance_features(
#     modeling_profits_df,
#     modeling_wallet_scores_df,
#     wallets_config['training_data']['modeling_period_start'],
#     all_coin_ids
# )
# coin_wallet_balance_features_modeling_start_df = coin_wallet_balance_features_modeling_start_df.add_prefix('wallet_balance|')
# coin_wallet_balance_features_modeling_start_df = coin_wallet_balance_features_modeling_start_df.add_suffix('|modeling_start')
# coin_wallet_features_df = coin_wallet_features_df.join(coin_wallet_balance_features_modeling_start_df,how='inner')

# Only the final expression (the shape) is rendered by the notebook
coin_wallet_features_df.describe()
coin_wallet_features_df.columns
coin_wallet_features_df.shape

In [None]:
# Render the wallet scores df for inspection
modeling_wallet_scores_df

In [None]:
def assign_wallet_quantiles(
    wallet_scores_df: pd.DataFrame,
    quantiles: Union[List[float], None] = None,
) -> pd.DataFrame:
    """
    Assigns each wallet to a single quantile bucket based on score.

    Params:
    - wallet_scores_df (DataFrame): Wallet scores data indexed by wallet_address
        Must contain 'score' column
    - quantiles (List[float], optional): Fractions that define the top-score
        cutoffs, e.g. 0.1 puts a bucket edge at the 90th score percentile.
        Order does not matter. If None, the values are read from
        wallets_coin_config['features']['top_wallets_quantiles'].

    Returns:
    - DataFrame: Copy of wallet_scores_df with new 'score_quantile' column
        indicating which percentile bucket the wallet's score falls in
        (e.g. '0_90pct', '90_100pct'). The input df is not modified.
    """
    # Fall back to the notebook-level config when no quantiles are passed in
    if quantiles is None:
        quantiles = wallets_coin_config['features']['top_wallets_quantiles']

    # Convert "top q" fractions into ascending percentile edges between 0 and 1
    bin_edges = [0] + sorted(1 - q for q in quantiles) + [1]

    # Whole-percent value of each edge. round() is used rather than int() because
    # int() truncates, so a float product just under an integer would be labeled
    # one percent too low.
    edge_pcts = [round(edge * 100) for edge in bin_edges]

    # One label per adjacent pair of edges (e.g. '90_100pct')
    bin_labels = [f'{lower}_{upper}pct' for lower, upper in zip(edge_pcts, edge_pcts[1:])]

    # Assign quantile labels using pd.qcut
    # NOTE(review): if duplicates='drop' actually removes an edge, the label count
    # no longer matches the bin count and qcut is expected to raise - confirm.
    result_df = wallet_scores_df.copy()
    result_df['score_quantile'] = pd.qcut(
        result_df['score'],
        q=bin_edges,
        labels=bin_labels,
        duplicates='drop'
    )

    return result_df

# Try the prototype on the wallet scores df; the returned df is rendered by the notebook
assign_wallet_quantiles(modeling_wallet_scores_df)

In [None]:
# Render the column names of the modeling-period-end balance features
coin_wallet_balance_features_modeling_end_df.columns

### add quantile columns

In [None]:
# n_quantiles = 4

# coin_wallet_features_df = cwb.add_quantile_columns(coin_wallet_features_df,4)
# coin_wallet_features_df.shape

### Merge to coin_training_data_df_full

In [None]:
# Load the coin features that the coin features pipeline saved to parquet
coin_features_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_features_training_data_df.parquet")

# Every coin with wallet features must also be present in the base feature set
coin_features_ids = coin_features_training_data_df.index
coin_wallet_features_ids = coin_wallet_features_df.index
wallet_features_only_ids = set(coin_wallet_features_ids).difference(coin_features_ids)

# Fail fast on unmatched coins; otherwise log the confirmed overlap
if wallet_features_only_ids:
    raise ValueError(f"Wallet features contain {len(wallet_features_only_ids)} coins not in the other coin features")

logger.info(
    "All %s coins with wallet features were found in the base features set.",
    len(coin_wallet_features_ids),
)


# Combine wallet features with the base coin features
coin_training_data_df_full = coin_wallet_features_df.join(
    coin_features_training_data_df,
    how='inner',
)
coin_training_data_df_full.shape

# NOTE(review): this replaces the joined frame with the wallet features alone, so
# the base coin features are not carried forward - confirm this is intentional.
coin_training_data_df_full = coin_wallet_features_df.copy()
coin_training_data_df_full.describe()

### apply filters

In [None]:
# Pick up on-disk edits to the project modules and configs
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


logger.info("Starting coins: %s", len(coin_training_data_df_full))

# Minimum cohort holdings a coin needs in order to be kept
min_cohort_wallets = wallets_coin_config['coin_modeling']['min_cohort_wallets']
min_cohort_balance = wallets_coin_config['coin_modeling']['min_cohort_balance']

# Keep coins that meet both the wallet count and the balance threshold
coin_training_data_df = coin_training_data_df_full.loc[
    coin_training_data_df_full['wallet_balance|top_100pct/count|modeling_end'].ge(min_cohort_wallets)
    & coin_training_data_df_full['wallet_balance|top_100pct/balance|modeling_end'].ge(min_cohort_balance)
]
logger.info("Coins after balance filters: %s", len(coin_training_data_df))

# # Filter based on market cap
# min_market_cap = wallets_coin_config['coin_modeling']['min_market_cap']
# max_market_cap = wallets_coin_config['coin_modeling']['max_market_cap']

# coin_training_data_df = coin_training_data_df[
#     (coin_training_data_df['time_series|market_data|market_cap_last'] >= min_market_cap)
#     & (coin_training_data_df['time_series|market_data|market_cap_last'] <= max_market_cap)
# ]
# logger.info("Coins after balance filters: %s", len(coin_training_data_df))

# Rendered by the notebook as the cell's output
coin_training_data_df.describe()

## Prepare coin_modeling_df

### Prepare target variable (parquet loadable)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Load market data
validation_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet")


# Calculate coin return performance during validation period
validation_coin_performance_df = civa.calculate_coin_performance(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end']
)

# Drop rows (coins) containing np.nan values; a nan coin_return indicates a 0 starting price
validation_coin_performance_df = validation_coin_performance_df.dropna()

# Add a winsorized copy of the returns, using the configured winsorization setting
validation_coin_performance_df['coin_return_winsorized'] = u.winsorize(
        validation_coin_performance_df['coin_return'],
        wallets_coin_config['coin_modeling']['returns_winsorization'])

# Validation: every coin in the training features must have a validation period target
missing_coins = set(coin_training_data_df.index) - set(validation_coin_performance_df.index)
if missing_coins:
    raise ValueError(f"Found {len(missing_coins)} coin_ids in training_data_df without validation period target variables.")


# Attach the configured target variable column to the training features (joined on index)
target_var_column = wallets_coin_config['coin_modeling']['target_variable']
coin_modeling_df = coin_training_data_df.join(validation_coin_performance_df[[target_var_column]])

# Rendered by the notebook as the cell's output
coin_modeling_df.describe()

### analyze correlations

In [None]:
import pandas as pd

# Calculate pairwise correlations between all columns of coin_modeling_df
# (features plus the target variable column)
correlation_matrix = coin_modeling_df.corr()

# Extract correlations with the target variable, strongest positive first
target_correlations = correlation_matrix[target_var_column].sort_values(ascending=False)

# The top-25 slice is not the last expression, so it is evaluated but not rendered;
# the notebook shows the full sorted series below instead
target_correlations[:25]
target_correlations

## Build coin model

In [None]:
# Reload project modules and configs so on-disk edits are picked up
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Initialize and run model
coin_model = cm.CoinModel(wallets_coin_config=wallets_coin_config)
coin_model_results = coin_model.run_experiment(feature_df=coin_modeling_df)


# Initialize evaluator from the experiment's test predictions and the fitted regressor step
coin_evaluator = cime.CoinRegressionEvaluator(
    y_test=coin_model_results['y_test'],
    y_pred=coin_model_results['y_pred'],
    model=coin_model.pipeline.named_steps['regressor'],
    feature_names=coin_model_results['X_train'].columns.tolist()
)

# Generate reports from the evaluator built in this cell. The name `evaluator` is
# not assigned here, so using it would either fail or report on a stale object
# left over from another cell.
print(coin_evaluator.summary_report())
coin_evaluator.plot_evaluation()

# Signal that the cell has finished
u.notify()

# Junkyard

# Tests failing

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()