In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Show floats with up to 12 significant digits when displaying numbers
pd.set_option('display.float_format', lambda value: f'{value:.12g}')
# pd.reset_option('display.float_format')

# Suppress MallocStackLogging warnings
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables: silence the pygame donation request and set the alert sound path
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with a single light foreground color
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.features_orchestrator as wfo

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.wallet_validation_analysis as wiwv
import wallet_insights.coin_validation_analysis as wicv
import wallet_insights.coin_validation_model as wicm


# reload all modules
# (importlib.reload re-executes each already-imported local module so source edits are picked up;
# `modules` is reused by later cells that repeat the reload)
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
           wmo, wtd, wmr, wm, wem,
           wcl, wmc, wmt, wpf, wtf, wts, wfo,
           wime, wiwv, wicv, wicm]
[importlib.reload(module) for module in modules]

# load all configs
# (paths are relative to the notebook's working directory)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
# Export the source of the selected code directories via the project helper; the commented-out
# entries and keyword arguments are alternatives that can be toggled on as needed.
u.export_code(
    code_directories=[
        # 'training_data',
        'wallet_features',
        # 'wallet_modeling',
        # 'wallet_insights'
    ],
    # include_config = True,
    # ipynb_notebook = 'DDA-456 wallet validation performance.ipynb'
)

# Project helper; presumably reports memory used by large in-session objects — confirm in utils
u.obj_mem()

# Wallet Model Construction

## Training Data Sequence

### retrieve training datasets

In [None]:
# Reload local modules and re-read the wallet configs before retrieving data
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Complete Pre-Training Profits/Market Data
# -----------------------------------------
# Retrieve training period datasets and save them to temp/wallet_modeling_dfs
# (the return value is intentionally discarded here; later cells read the parquet files back)
wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    parquet_prefix = 'training')


In [None]:
# Reload local modules and re-read the wallet configs before retrieving data
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Training period: also yields the coin cohort that is passed to the next two retrievals
training_profits_df,training_market_data_df,coin_cohort = wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'])

# Modeling period, using the training coin cohort
modeling_profits_df,modeling_market_data_df,_ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    coin_cohort=coin_cohort)

# Combined period: training start through modeling end, using the training coin cohort
combined_profits_df,combined_market_data_df,_ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    coin_cohort=coin_cohort)

### useable assertions

In [None]:
# Temporarily adjust log level
logger.info("Generating training, modeling, and combined datasets...")
logger.setLevel(logging.WARNING)

# The retrievals run inside try/finally so the logger is returned to INFO even when one of
# them raises; otherwise a failed run would leave the rest of the session silenced.
try:
    # Target the dev schema to avoid a very long runtime
    # NOTE: this override is not reverted in this cell; it stays in wallets_config afterwards
    wallets_config['training_data']['dataset'] = 'dev'

    # Get initial training data and coin cohort
    training_profits_df, training_market_df, coin_cohort = wmo.retrieve_period_datasets(
        wallets_config['training_data']['training_period_start'],
        wallets_config['training_data']['training_period_end']
    )

    # Get modeling period data
    modeling_profits_df, modeling_market_df, _ = wmo.retrieve_period_datasets(
        wallets_config['training_data']['modeling_period_start'],
        wallets_config['training_data']['modeling_period_end'],
        coin_cohort=coin_cohort
    )

    # Get combined period data
    combined_profits_df, combined_market_df, _ = wmo.retrieve_period_datasets(
        wallets_config['training_data']['training_period_start'],
        wallets_config['training_data']['modeling_period_end'],
        coin_cohort=coin_cohort
    )

    # Tuple order matches the unpacking used by the assertion cells:
    # (training profits, training market, modeling profits, modeling market,
    #  combined profits, combined market)
    period_datasets = (
        training_profits_df, training_market_df,
        modeling_profits_df, modeling_market_df,
        combined_profits_df, combined_market_df
    )
finally:
    logger.setLevel(logging.INFO)

# Only reached when every retrieval succeeded
logger.info("All dev data retrieved.")


In [None]:
def test_wallet_coin_balance_continuity(period_datasets):
    """
    Test that all wallet-coin pair balances match at the training/modeling boundary
    using vectorized operations. Allows for 0.0001% difference due to floating point math.
    """
    training_df, _, modeling_df, _, _, _ = period_datasets

    # Get boundary data
    training_last = training_df['date'].max()
    training_end_df = training_df[training_df['date']==training_last]

    modeling_first = modeling_df['date'].min()
    modeling_start_df = modeling_df[modeling_df['date']==modeling_first]

    # Create merged df on composite key
    balance_compare_df = pd.merge(
        training_end_df[['wallet_address', 'coin_id', 'usd_balance']],
        modeling_start_df[['wallet_address', 'coin_id', 'usd_balance']],
        on=['wallet_address', 'coin_id'],
        suffixes=('_train', '_model')
    )

    # Convert to float64 for consistency
    balance_compare_df['usd_balance_train'] = balance_compare_df['usd_balance_train'].astype('float64')
    balance_compare_df['usd_balance_model'] = balance_compare_df['usd_balance_model'].astype('float64')

    # Filter out zero balance pairs to avoid div by zero
    nonzero_mask = ~((balance_compare_df['usd_balance_train'] == 0) &
                        (balance_compare_df['usd_balance_model'] == 0))
    balance_compare_df = balance_compare_df[nonzero_mask]

    # Calculate both absolute and percentage differences
    balance_compare_df['abs_diff'] = abs(
        balance_compare_df['usd_balance_train'] -
        balance_compare_df['usd_balance_model']
    )

    balance_compare_df['pct_diff'] = abs(
        balance_compare_df['usd_balance_train'] /
        balance_compare_df['usd_balance_model'] - 1
    )

    # Flag significant mismatches (both conditions must be true)
    significant_diffs = balance_compare_df[
        (balance_compare_df['abs_diff'] > 0.1) &
        (balance_compare_df['pct_diff'] > 0.00001)
    ]

    assert len(significant_diffs) == 0, \
        "Found wallet-coin pairs with significant balance mismatches (>$0.01 and >0.0001%)"


In [None]:
# NOTE(review): balance_compare_df is only assigned as a local variable inside
# test_wallet_coin_balance_continuity, so this lookup presumably raises NameError unless the
# name was also defined at notebook level in an earlier session — confirm.
balance_compare_df.sort_values('pct_diff',ascending=False)

In [None]:
combined_profits_df.shape

In [None]:
combined_df.shape

In [None]:
# The three checks below are test bodies unrolled to notebook level for debugging; the
# commented-out `def` lines record the intended test names. They read the tuple built by
# the dev data retrieval cell, which is named `period_datasets`.

# def test_coin_set_consistency(period_datasets):
#     """Test that coin sets match between periods"""
training_df, _, modeling_df, _, combined_df, _ = period_datasets
training_coins = set(training_df['coin_id'])
modeling_coins = set(modeling_df['coin_id'])
combined_coins = set(combined_df['coin_id'])
assert training_coins == combined_coins
# Passes only when no training coin is absent from the modeling period
assert len(training_coins - modeling_coins) == 0

# def test_transfer_amount_consistency(period_datasets):
#     """Test that transfer amounts sum correctly"""
training_df, _, modeling_df, _, combined_df, _ = period_datasets
training_transfers = abs(training_df['usd_net_transfers']).astype('float64').sum()
modeling_transfers = abs(modeling_df['usd_net_transfers']).astype('float64').sum()
combined_transfers = abs(combined_df['usd_net_transfers']).astype('float64').sum()
# Combined absolute transfers must equal training + modeling within 0.01
assert abs(combined_transfers - (training_transfers + modeling_transfers)) < 0.01

# def test_time_period_boundaries(period_datasets):
#     """Test that period boundaries align correctly"""
training_df, _, modeling_df, _, _, _ = period_datasets
training_last = training_df['date'].max()
modeling_first = modeling_df['date'].min()
assert training_last == modeling_first


In [None]:
# Unpack the profits frames from the tuple built by the dev data retrieval cell
# (it is named `period_datasets`; `datasets` is not defined anywhere in this notebook)
training_df, _, modeling_df, _, combined_df, _ = period_datasets
training_transfers = abs(training_df['usd_net_transfers']).astype('float64').sum()
modeling_transfers = abs(modeling_df['usd_net_transfers']).astype('float64').sum()
combined_transfers = abs(combined_df['usd_net_transfers']).astype('float64').sum()

# Displayed as the cell output: how far the combined total is from training + modeling
combined_transfers - (training_transfers + modeling_transfers)

In [None]:
training_transfers

In [None]:
modeling_transfers

In [None]:
combined_transfers

In [None]:
modeling_transfers

In [None]:
combined_transfers

In [None]:
training_transfers + modeling_transfers

In [None]:
# Unpack the profits frames from the tuple built by the dev data retrieval cell
# (it is named `period_datasets`; `datasets` is not defined anywhere in this notebook)
training_df, _, modeling_df, _, _, _ = period_datasets

# Check date boundaries align
training_last = training_df['date'].max()
modeling_first = modeling_df['date'].min()
assert training_last == modeling_first

# Check balances match at boundary
training_end_df = training_df[training_df['date']==training_last]
modeling_start_df = modeling_df[modeling_df['date']==modeling_first]

training_end_balance = training_end_df['usd_balance'].astype('float64').sum()
modeling_start_balance = modeling_start_df['usd_balance'].astype('float64').sum()

# Balance difference must be within 0.0001% (a ratio tolerance of 0.000001)
assert abs(training_end_balance / modeling_start_balance - 1) < 0.000001


In [None]:
# Unpack the profits frames from the tuple built by the dev data retrieval cell
# (it is named `period_datasets`; `datasets` is not defined anywhere in this notebook)
training_df, _, modeling_df, _, _, _ = period_datasets
training_last = training_df['date'].max()
modeling_first = modeling_df['date'].min()
# The last training date must be the first modeling date
assert training_last == modeling_first


In [None]:
training_last

In [None]:
modeling_first

In [None]:
combined_transfers

In [None]:
training_transfers + modeling_transfers

In [None]:
combined_transfers - (training_transfers + modeling_transfers)

In [None]:
# Assert coins match
training_coins = set(training_profits_df['coin_id'])
modeling_coins = set(modeling_profits_df['coin_id'])
combined_coins = set(combined_profits_df['coin_id'])

# Training set and combined set have identical coins
assert training_coins == combined_coins
# NOTE(review): this passes only when no training coin is missing from the modeling set, i.e.
# training_coins is a subset of modeling_coins. The previous comment here described the reverse
# ("all modeling coins are included in the training coins") — confirm which direction is intended.
assert len(training_coins - modeling_coins) == 0


In [None]:
# Reload modules and config
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Calculate total transfers across both periods and ensure they match the combined total
# (each total is the sum of absolute usd_net_transfers, cast to float64 before summing)
training_transfers = abs(training_profits_df['usd_net_transfers']).astype('float64').sum()
modeling_transfers = abs(modeling_profits_df['usd_net_transfers']).astype('float64').sum()
combined_transfers = abs(combined_profits_df['usd_net_transfers']).astype('float64').sum()

# Assert the difference is within $0.01
assert abs(combined_transfers - (training_transfers+modeling_transfers)) < 0.01

In [None]:
# Boundary dates: last date in the training profits and first date in the modeling profits
training_last = training_profits_df['date'].max()
modeling_first = modeling_profits_df['date'].min()

# Assert that the last and first dates match
assert training_last == modeling_first


# Rows on the shared boundary date from each period
training_end_df = training_profits_df[training_profits_df['date']==training_last]
modeling_start_df = modeling_profits_df[modeling_profits_df['date']==modeling_first]

# Total usd_balance across all rows on the boundary date
training_end_balance = training_end_df['usd_balance'].astype('float64').sum()
modeling_start_balance = modeling_start_df['usd_balance'].astype('float64').sum()

# Assert the ending balances are within 0.0001% of each other to account for floating points
assert abs(training_end_balance / modeling_start_balance - 1) < 0.000001

In [None]:
training_end_balance


In [None]:
training_end_df.describe()

In [None]:
modeling_start_df.describe()

In [None]:
# Proper deduplication considering coin-wallet pairs
# NOTE: this rebinds training_end_balance / modeling_start_balance from the scalar totals
# computed in the boundary assertion cell to per-pair Series indexed by (wallet_address, coin_id)
training_end_balance = (training_end_df
    .groupby(['wallet_address', 'coin_id'], observed=True)['usd_balance']
    .sum()
    .astype('float64'))

modeling_start_balance = (modeling_start_df
    .groupby(['wallet_address', 'coin_id'], observed=True)['usd_balance']
    .sum()
    .astype('float64'))

# Check pairs with discrepancies
# (the two Series are aligned on their shared index; a pair missing from one side gets NaN)
pair_analysis = pd.DataFrame({
    'training_end': training_end_balance,
    'modeling_start': modeling_start_balance
}).reset_index()

# Pairs whose boundary balances differ by more than 0.01
pair_analysis['diff'] = pair_analysis['training_end'] - pair_analysis['modeling_start']
discrepancies = pair_analysis[pair_analysis['diff'].abs() > 0.01]

In [None]:
pair_analysis

In [None]:
c = '0f17de35-54d9-4fba-85fc-7b6fb131f609'
w = 17597

In [None]:
u.cw_filter_df(combined_profits_df,c,w)

In [None]:
u.cw_filter_df(training_profits_df,c,w)

In [None]:
u.cw_filter_df(modeling_profits_df,c,w)

In [None]:
modeling_start_balance

In [None]:
training_end_df['usd_balance'].astype('float64').sum()

In [None]:
training_end

In [None]:
modeling_start

In [None]:
modeling_start_df['usd_balance'].astype('float64').sum()

### codespace

In [None]:
def calculate_transfers(df: pd.DataFrame) -> pd.DataFrame:
    """
    Sum usd_net_transfers for the wallet-coin pair currently selected in the notebook.

    The pair is read from the notebook-level variables `w` (wallet address) and `c`
    (coin id), which the query string references via `@w` / `@c`; both must be defined
    before calling.

    Params:
    - df (DataFrame): input profits data with wallet_address, coin_id and
        usd_net_transfers columns

    Returns:
    - result (DataFrame): single-row df with a float64 train_transfers column
    """
    pair_total = df.query('wallet_address == @w and coin_id == @c')['usd_net_transfers'].sum()

    # A dict containing only scalars gives the DataFrame constructor no index to build rows
    # from and raises ValueError, so the scalar is wrapped in a one-element list.
    result = pd.DataFrame({
        'train_transfers': [pair_total],
    }).astype('float64')  # Force higher precision

    return result

In [None]:
# Reload modules and config
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Calculate transfers using vectorized operations
# (signed sums of usd_net_transfers over rows where the value is not null)
training_xfers = training_profits_df[training_profits_df['usd_net_transfers'].notna()]['usd_net_transfers'].sum()
modeling_xfers = modeling_profits_df[modeling_profits_df['usd_net_transfers'].notna()]['usd_net_transfers'].sum()
combined_xfers = combined_profits_df[combined_profits_df['usd_net_transfers'].notna()]['usd_net_transfers'].sum()

# Print each total, then the gap between the combined total and training + modeling
print(training_xfers)
print(modeling_xfers)
print(combined_xfers)
print(combined_xfers - (training_xfers+modeling_xfers))

# # Test specific wallet-coin combination
# c = '1c3d1f11-b299-4eb1-84fa

In [None]:
c = '1c3d1f11-b299-4eb1-84fa-abe6e013cb03'
w = 23754641


In [None]:

# Direct vectorized calculation with float64 precision
# (`w` and `c` are the notebook-level wallet address and coin id referenced by the query strings)
train_sum = training_profits_df.query('wallet_address == @w and coin_id == @c')['usd_net_transfers'].astype('float64').sum()
combined_sum = combined_profits_df.query('wallet_address == @w and coin_id == @c')['usd_net_transfers'].astype('float64').sum()

print(f"\nTraining sum: {train_sum}")
print(f"Combined sum: {combined_sum}")

# Create result dataframe with consistent precision
# NOTE: model_transfers is hard-coded to 0 and the diff columns compare combined vs training only
result = pd.DataFrame({
    'wallet_address': [w],
    'coin_id': [c],
    'train_transfers': [train_sum],
    'model_transfers': [0],  # Assuming no model transfers based on previous output
    'combined_transfers': [combined_sum],
    'transfer_diff': [combined_sum - train_sum],
    'abs_diff': [abs(combined_sum - train_sum)]
}).astype({'wallet_address': 'int64', 'train_transfers': 'float64', 'combined_transfers': 'float64'})

print("\nFinal result:")
print(result)

In [None]:
c = '1c3d1f11-b299-4eb1-84fa-abe6e013cb03'
w = 23754641

u.cw_filter_df(training_profits_df,c,w)['usd_net_transfers'].sum()

In [None]:
c = '1c3d1f11-b299-4eb1-84fa-abe6e013cb03'
w = 23754641

u.cw_filter_df(combined_profits_df,c,w)['usd_net_transfers'].sum()

In [None]:
# Work on copies so the notebook-level profits frames are left untouched
train_df = training_profits_df.copy()
model_df = modeling_profits_df.copy()
combined_df = combined_profits_df.copy()

# Group and sum transfers by wallet and coin
train_sums = train_df.groupby(['wallet_address', 'coin_id'],observed=True)['usd_net_transfers'].sum()
model_sums = model_df.groupby(['wallet_address', 'coin_id'],observed=True)['usd_net_transfers'].sum()
combined_sums = combined_df.groupby(['wallet_address', 'coin_id'],observed=True)['usd_net_transfers'].sum()

# Combine into single df and fill NaN with 0
# (the three Series are aligned on the (wallet_address, coin_id) index; pairs missing from a
# period become 0 after fillna)
result = pd.concat([
    train_sums.rename('train_transfers'),
    model_sums.rename('model_transfers'),
    combined_sums.rename('combined_transfers')
], axis=1).fillna(0)

# Calculate difference
# (combined total minus training + modeling, per pair, plus its absolute value for sorting)
result['transfer_diff'] = result['combined_transfers'] - (result['train_transfers'] + result['model_transfers'])
result['abs_diff'] = abs(result['transfer_diff'])

In [None]:
result.sort_values(by='abs_diff',ascending=False).head(5).tail(1)

In [None]:
c = '1c3d1f11-b299-4eb1-84fa-abe6e013cb03'
w = 23754641

assert c in coin_cohort

u.cw_filter_df(train_df,c,w)['usd_net_transfers'].cumsum()

In [None]:
c = '1c3d1f11-b299-4eb1-84fa-abe6e013cb03'
w = 23754641

assert c in coin_cohort

u.cw_filter_df(combined_df,c,w)['usd_net_transfers'].cumsum()

In [None]:
def compare_transfer_calcs(df: pd.DataFrame, wallet_id: str, coin_id: str) -> pd.DataFrame:
    """
    Compare the running total of a wallet-coin pair's transfers against their overall sum.

    Params:
    - df (DataFrame): transfer data with wallet_address, coin_id and usd_net_transfers columns
    - wallet_id (str): wallet to analyze (callers in this notebook pass an integer id)
    - coin_id (str): coin to analyze

    Returns:
    - comparison_df (DataFrame): one row per matching input row, in input order, with the
        cumulative sum (running_total), the overall sum repeated on every row (static_total),
        and running_total minus static_total (difference)
    """
    # Select the pair's transfers once and derive both calculations from the same Series
    is_wallet = df['wallet_address'] == wallet_id
    is_coin = df['coin_id'] == coin_id
    pair_transfers = df.loc[is_wallet & is_coin, 'usd_net_transfers']

    cumulative = pair_transfers.cumsum()
    grand_total = pair_transfers.sum()

    comparison_columns = {
        'running_total': cumulative,
        'static_total': grand_total,
        'difference': cumulative - grand_total,
    }
    return pd.DataFrame(comparison_columns)

def check_transfer_sequence(df: pd.DataFrame, wallet_id: str, coin_id: str) -> pd.DataFrame:
    """
    Return one wallet-coin pair's transfers ordered by date.

    Params:
    - df (DataFrame): transfer data with wallet_address, coin_id, date and
        usd_net_transfers columns
    - wallet_id (str): target wallet (callers in this notebook pass an integer id)
    - coin_id (str): target coin

    Returns:
    - sequence_df (DataFrame): the date and usd_net_transfers columns of the matching rows,
        sorted ascending by date
    """
    pair_rows = (df['wallet_address'] == wallet_id) & (df['coin_id'] == coin_id)
    timeline = df.loc[pair_rows, ['date', 'usd_net_transfers']]
    return timeline.sort_values(by='date')

# Let's analyze both training and modeling data
# (c is the coin id and w the wallet address of the pair under investigation)
c = '1c3d1f11-b299-4eb1-84fa-abe6e013cb03'
w = 23754641


# Original transfer comparison
# (running total vs overall sum for the pair, in each of the three period frames)
print("=== Transfer Total Analysis ===")
print("\nTraining Data Analysis:")
train_compare = compare_transfer_calcs(train_df, w, c)
print(train_compare)

print("\nModeling Data Analysis:")
model_compare = compare_transfer_calcs(model_df, w, c)
print(model_compare)

print("\nCombined Data Analysis:")
combined_compare = compare_transfer_calcs(combined_df, w, c)
print(combined_compare)

# Add temporal sequence analysis
# (date-ordered transfers for the same pair, in each of the three period frames)
print("\n=== Temporal Sequence Analysis ===")
print("\nTraining Data Timeline:")
train_seq = check_transfer_sequence(train_df, w, c)
print(train_seq)

print("\nModeling Data Timeline:")
model_seq = check_transfer_sequence(model_df, w, c)
print(model_seq)

print("\nCombined Data Timeline:")
combined_seq = check_transfer_sequence(combined_df, w, c)
print(combined_seq)

In [None]:
# Get raw period data
# (modeling period bounds from the wallets config; no coin cohort is passed at this stage)
profits_df_raw, market_data_df_raw = wtd.retrieve_raw_datasets(
        wallets_config['training_data']['modeling_period_start'],
        wallets_config['training_data']['modeling_period_end'])

# Show the raw rows for the coin/wallet pair under investigation (c, w)
u.cw_filter_df(profits_df_raw,c,w)

In [None]:

# NOTE(review): profits_df and market_data_df are only assigned inside this branch, so the
# final lookup relies on coin_cohort being set (or on values left over from an earlier cell).
if coin_cohort is not None:
    # Filter to existing cohort before processing
    profits_df = profits_df_raw[profits_df_raw['coin_id'].isin(coin_cohort)]
    market_data_df = market_data_df_raw[market_data_df_raw['coin_id'].isin(coin_cohort)]

    # Debug marker showing that the cohort branch ran
    print('x')

# Show the cohort-filtered rows for the coin/wallet pair under investigation (c, w)
u.cw_filter_df(profits_df,c,w)

In [None]:
# Reload local modules and the wallets config before formatting
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Format and optionally save the datasets
# (None is passed as the last argument; presumably this skips saving — confirm against
# wtd.format_and_save_datasets)
profits_df_formatted, market_data_df_formatted = wtd.format_and_save_datasets(
    profits_df,
    market_data_df,
    wallets_config['training_data']['modeling_period_start'],
    None
)

# Show the formatted rows for the coin/wallet pair under investigation (c, w)
u.cw_filter_df(profits_df_formatted,c,w)

In [None]:
wallets_config['training_data']['training_period_start']

### define cohort and clean training datasets (loadable parquet)

In [None]:
# Reload local modules and re-read the wallet configs before building the training datasets
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))



# Add Indicators to Market Data
# ----------------------------------------------------------
# Load relevant parquet dfs with pre-training history
training_market_data_df_full = pd.read_parquet("temp/wallet_modeling_dfs/training_market_data_df_full.parquet")

# Generate indicators and save file
wmo.generate_training_indicators_df(training_market_data_df_full,wallets_metrics_config)

# Identify Wallet Cohort
# ----------------------------------------------------------
# Identify the date we need starting balances from
# (`datetime` here is the class from `from datetime import datetime`, which shadows the
# earlier `import datetime` module import; the starting balance date is one day before
# the training period start)
training_period_start = datetime.strptime(wallets_config['training_data']['training_period_start'],'%Y-%m-%d')
training_starting_balance_date = training_period_start - timedelta(days=1)

# Remove market data from prior to the starting balance date
training_market_data_df = training_market_data_df_full[training_market_data_df_full['date']
                                                       >=training_starting_balance_date]
# Free the unfiltered frame once the filtered one exists
del training_market_data_df_full
gc.collect()

# Retrieve full profits history
training_profits_df_full = pd.read_parquet("temp/wallet_modeling_dfs/training_profits_df_full.parquet")

# Define wallet cohort
training_wallet_cohort = wmo.define_wallet_cohort(training_profits_df_full,training_market_data_df)



# Generate Cohort-Filtered Profits Data for Training Windows
# ----------------------------------------------------------
# Generate wallet_cohort-filtered profits_df for all training windows
training_profits_df, training_windows_profits_dfs = wmo.split_training_window_profits_dfs(training_profits_df_full,
                                                                         training_market_data_df,training_wallet_cohort)
# The full-history inputs are no longer needed after the split
del training_profits_df_full,training_market_data_df
gc.collect()



# Retrieve Transfers Data
# ----------------------------------------------------------
# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_training_cohort
training_transfers_sequencing_df = wts.retrieve_transfers_sequencing()

### generate training features

In [None]:
# Reload local modules and the wallets config before generating features
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Generate Features for the Full Training Period
# ----------------------------------------------------------
logger.info("Generating features for full training period...")
# Market indicators are read back from the parquet file in temp/wallet_modeling_dfs
training_market_indicators_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet")
training_wallet_features_df = wfo.calculate_wallet_features(training_profits_df,
                                                            training_market_indicators_data_df,
                                                            training_transfers_sequencing_df,
                                                            training_wallet_cohort)

# Define the start of training_data_df appending a suffix for the window
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# NOTE: training_profits_df is removed here, so later cells can no longer reference it
del training_profits_df,training_wallet_features_df
gc.collect()

## Feature boundary dev space

In [None]:
# Working copies of the training inputs for the feature dev space below
# NOTE(review): the full-period feature generation cell ends with `del training_profits_df`,
# so this cell presumably has to run before that `del` (or after the df is rebuilt) — confirm.
profits_df = training_profits_df.copy()
market_indicators_data_df = training_market_indicators_data_df.copy()
transfers_sequencing_df = training_transfers_sequencing_df.copy()
wallet_cohort = training_wallet_cohort.copy()

In [None]:
# Initialize output dataframe
# (empty frame indexed by the wallet cohort; feature_column_names is created but not used
# within this cell)
wallet_features_df = pd.DataFrame(index=wallet_cohort)
wallet_features_df.index.name = 'wallet_address'
feature_column_names = {}

# Trading features (inner join)
# Requires both starting_balance_date and period_end_date imputed rows
# -----------------------------------------------------------------------
profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
trading_features_df.head()

In [None]:
wallets_config['training_data']

In [None]:
# Parse the configured starting balance date and start again from a fresh copy of the profits df
starting_balance_date = datetime.strptime(wallets_config['training_data']['training_starting_balance_date'],'%Y-%m-%d')
profits_df = training_profits_df.copy()


# Apply the filter and update the values
# (rows on the starting balance date are marked as imputed with zero net transfers and inflows)
profits_df.loc[profits_df['date'] == starting_balance_date, ['is_imputed', 'usd_net_transfers', 'usd_inflows']] = [True, 0, 0]
profits_df[profits_df['date'] == starting_balance_date].describe()

In [None]:
# NOTE(review): bare name with no assignment visible in this notebook; evaluating it presumably
# raises NameError. Possibly a leftover or a deliberate stop for "run all" — confirm and remove.
can

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
trading_features_df.sum()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


trading_features_df_new = wtf.calculate_wallet_trading_features_new(profits_df)
trading_features_df_new.sum()

In [None]:
w = 56836
profits_df[profits_df['wallet_address']==w].sort_values(by=['coin_id','date'])

## code resumes

In [None]:
# Generate Features for Each Individual Window
# ----------------------------------------------------------
# Generate features for each window
# (windows are numbered from 1; the number becomes the `_w{i}` column suffix)
for i, window_profits_df in enumerate(training_windows_profits_dfs, 1):
    logger.info("Generating features for window %s...", i)

    # Generate the features
    window_wallet_features_df = wfo.calculate_wallet_features(window_profits_df, training_market_indicators_data_df,
                                                             training_transfers_sequencing_df, training_wallet_cohort)

    # Check for NaN values and identify problematic columns
    nan_columns = window_wallet_features_df.columns[window_wallet_features_df.isna().any()].tolist()
    if nan_columns:
        raise ValueError(f"NaN values detected in window {i} in columns: {nan_columns}")

    # Add column suffix and join to training_data_df
    window_wallet_features_df = window_wallet_features_df.add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')

    # Check for NaN values and identify problematic columns
    # (a left join introduces NaN for any wallet missing from this window's features)
    nan_columns = training_data_df.columns[training_data_df.isna().any()].tolist()
    if nan_columns:
        raise ValueError(f"NaN values detected in training_data_df after window {i} in columns: {nan_columns}")


# Release the loop variables and the shared inputs
# NOTE: the loop variables only exist if the loop ran at least once
del window_profits_df,window_wallet_features_df,training_market_indicators_data_df,training_transfers_sequencing_df
gc.collect()

u.obj_mem()

In [None]:
# Generate Clusters Using All Other Features
# ----------------------------------------------------------
# Append clustering features based on all numeric features in the base training data
training_cluster_features_df = wcl.create_basic_cluster_features(training_data_df)
training_cluster_features_df = training_cluster_features_df.add_prefix('cluster_')
# Inner join: a wallet without cluster features would be dropped, which the check below catches
training_data_df = training_data_df.join(training_cluster_features_df, how='inner')



# Save TRAINING_DATA_DF
# ----------------------------------------------------------
# Verify all input wallets exist in final output
missing_wallets = set(training_wallet_cohort) - set(training_data_df.index)
if missing_wallets:
    raise ValueError(f"Lost {len(missing_wallets)} wallets from original cohort during feature generation. First few missing: {list(missing_wallets)[:5]}")

# Save and clear from memory
# (index=True keeps the wallet index in the parquet file; later cells read it back from there)
training_data_df.to_parquet("temp/wallet_modeling_dfs/training_data_df.parquet",index=True)
del training_data_df,training_cluster_features_df
gc.collect()


logger.info("Feature generation complete.")
logger.info(f"Current large object memory usage: {u.obj_mem()['size_mb'].sum():.1f} MB")
u.obj_mem()

## Wallet Modeling

### Retrieve modeling datasets

In [None]:
# Reload local modules and re-read the wallet configs before retrieving data
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve Modeling Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical through modeling period datasets
# (the return value is discarded; the next cell reads the 'modeling'-prefixed parquet files)
wmo.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    parquet_prefix = 'modeling'
)


In [None]:
# Reload local modules and re-read the wallet configs before preparing the modeling datasets
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Load parquet files
modeling_market_data_df_full = pd.read_parquet("temp/wallet_modeling_dfs/modeling_market_data_df_full.parquet")
modeling_profits_df_full = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df_full.parquet")


# Remove pre-modeling period prices
# (the date column is compared directly against the configured period start value)
modeling_market_data_df = modeling_market_data_df_full[modeling_market_data_df_full['date']
                                                       >=wallets_config['training_data']['modeling_period_start']]
del modeling_market_data_df_full
gc.collect()


# Filter to only training wallet cohort
# (columns=[] loads no data columns, so only the saved index of wallet addresses is read)
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
modeling_profits_df = modeling_profits_df_full[modeling_profits_df_full['wallet_address'].isin(training_wallet_cohort)]
del modeling_profits_df_full
gc.collect()


# Impute rows for period end
modeling_profits_df = pri.impute_profits_for_multiple_dates(modeling_profits_df,
                                                            modeling_market_data_df,
                                                            [wallets_config['training_data']['modeling_period_end']],
                                                            n_threads=24)


# Assert period, save files, remove from memory
u.assert_period(wallets_config,modeling_profits_df,'modeling')
u.assert_period(wallets_config,modeling_market_data_df,'modeling')
modeling_profits_df.to_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet",index=False)
modeling_market_data_df.to_parquet("temp/wallet_modeling_dfs/modeling_market_data_df.parquet",index=False)
del modeling_profits_df,modeling_market_data_df
gc.collect()

### define modeling cohort and features (loadable parquet)

In [None]:

# Create training_cohort-Indexed modeling_wallet_features_df
# -----------------------------------------------------------
# Create a DataFrame with training wallet cohort as the index
# columns=[] loads no data columns, so only the saved index of wallet addresses is read.
# NOTE(review): the following cell repeats these same three statements.
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
modeling_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
modeling_wallet_features_df.index.name = 'wallet_address'

# Display (rows, columns); the frame has no columns yet
modeling_wallet_features_df.shape

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Create training_cohort-Indexed modeling_wallet_features_df
# -----------------------------------------------------------
# Create a DataFrame with training wallet cohort as the index
# (columns=[] loads no data columns, so only the saved index is read)
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
modeling_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
modeling_wallet_features_df.index.name = 'wallet_address'

# Store feature sets with their prefixes for bulk renaming
# (initialized empty here; not populated within this cell)
feature_column_names = {}


# Identify Modeling Period Cohort
# -----------------------------------------------------------
# Retrieve modeling wallet cohort after applying modeling period activity filters
modeling_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet")
modeling_wallet_cohort_trading_features_df = wmo.identify_modeling_cohort(modeling_profits_df)
# Left join keeps every training-cohort wallet; wallets missing from the joined frame get 0 in its columns
modeling_wallet_features_df = modeling_wallet_features_df.join(modeling_wallet_cohort_trading_features_df, how='left')\
    .fillna({col: 0 for col in modeling_wallet_cohort_trading_features_df.columns})


# Generate Modeling Period Performance Features
# -----------------------------------------------------------
# Calculate performance metrics for the modeling cohort only
# ('in_modeling_cohort' is presumably produced by wmo.identify_modeling_cohort — confirm)
# NOTE(review): .copy() before .drop() looks redundant, since drop() without inplace returns a new frame
modeling_modeling_cohort_performance_features_df = (wpf.calculate_performance_features(
    modeling_wallet_features_df[modeling_wallet_features_df['in_modeling_cohort']==1])
    .copy()
    .drop(['max_investment', 'total_net_flows'], axis=1))

# Calculate performance metrics for the training cohort (wallets with 0 activity still impact rank orders)
modeling_training_cohort_performance_features_df = (wpf.calculate_performance_features(
    modeling_wallet_features_df)
    .copy()
    .drop(['max_investment', 'total_net_flows'], axis=1))

In [None]:
modeling_wallet_features_df.describe()

In [None]:
# Scratch recomputation of return metrics from the flow columns, rounded to 6 decimals
metrics_df = modeling_wallet_features_df[['max_investment','total_net_flows','cash_net_flows']].copy().round(6)
returns_winsorization = wallets_config['modeling']['returns_winsorization']
# NOTE(review): epsilon is assigned but not used anywhere in this cell
epsilon = 1e-10

# Calculate base return, including unrealized price change impacts
# (wallets with max_investment == 0 are assigned a return of 0 instead of dividing by zero)
metrics_df['return'] = np.where(abs(metrics_df['max_investment']) == 0,0,
                                metrics_df['total_net_flows'] / metrics_df['max_investment'])

# Calculate realized return, based on actual cash flows only
metrics_df['realized_return'] = np.where(abs(metrics_df['max_investment']) == 0,0,
                                metrics_df['cash_net_flows'] / metrics_df['max_investment'])

# Apply winsorization
# ('return_unwinsorized' is only created when returns_winsorization > 0; 'realized_return' is left as-is)
if returns_winsorization > 0:
    metrics_df['return_unwinsorized'] = metrics_df['return']
    metrics_df['return'] = u.winsorize(metrics_df['return'],returns_winsorization)



In [None]:
modeling_wallet_cohort_trading_features_df.loc[w]

In [None]:
# Inspect one wallet's profits rows after the cash flow transfers logic is applied.
# NOTE(review): `w` is also read by the preceding cell, so this cell must run first.
w = 8619465
# .copy() keeps modeling_profits_df itself untouched by the transformation
adj_profits_df = wtf.add_cash_flow_transfers_logic(modeling_profits_df.copy())
adj_profits_df[adj_profits_df['wallet_address']==w].sort_values('date')

In [None]:
metrics_df.sort_values(by='realized_return',ascending=False).head(10)

In [None]:
modeling_training_cohort_performance_features_df.describe()

In [None]:
modeling_modeling_cohort_performance_features_df.describe()

In [None]:
# Record the column names that belong to the 'modeling_cohort_' prefix group.
# NOTE(review): `modeling_cohort_performance_features_df` is not defined in the visible cells; the
# frames created above are `modeling_modeling_cohort_performance_features_df` and
# `modeling_training_cohort_performance_features_df` — confirm which one is intended.
feature_column_names['modeling_cohort_'] = modeling_cohort_performance_features_df.columns
feature_column_names

In [None]:
modeling_wallet_cohort_trading_features_df.describe()

In [None]:
modeling_wallet_cohort_features_df.head()

In [None]:
modeling_wallet_cohort_features_df.head()

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# NOTE(review): `modeling_wallet_cohort_features_df` and `modeling_performance_features_df` are not
# defined in the visible cells (the cells above create `modeling_wallet_cohort_trading_features_df`
# and `modeling_*_cohort_performance_features_df`) — confirm these names are still current.

# Join trading features and in_modeling_cohort boolean
# (left join keeps every wallet already in the index; unmatched wallets get 0 in the joined columns)
modeling_wallet_features_df = modeling_wallet_features_df.join(modeling_wallet_cohort_features_df, how='left')\
    .fillna({col: 0 for col in modeling_wallet_cohort_features_df.columns})

# Join performance features
modeling_wallet_features_df = modeling_wallet_features_df.join(modeling_performance_features_df, how='left')\
    .fillna({col: 0 for col in modeling_performance_features_df.columns})


In [None]:
modeling_profits_df.head()

### select target variable and build model

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Retrieve training data for the full training wallet cohort
training_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet")

# Select the cohort flag and the configured target variable column.
# The filtering of training data to the modeling cohort (inner join to the target variable)
# presumably happens inside run_experiment — confirm.
modeling_cohort_target_var_df = modeling_wallet_features_df[['in_modeling_cohort', wallets_config['modeling']['target_variable']]]

# Run the experiment and get results
wallet_model = wm.WalletModel(wallets_config)
model_results = wallet_model.run_experiment(training_data_df,modeling_cohort_target_var_df)
# Free the training data once the experiment has consumed it
del training_data_df
gc.collect()

# Extract the trained model
# (the 'regressor' step of the pipeline returned in model_results)
model = model_results['pipeline'].named_steps['regressor']

# Generate and save all model artifacts
model_id, evaluator, wallet_scores_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    base_path='../wallet_modeling'
)
# Notify that the cell has finished (presumably plays the alert sound set via ALERT_SOUND_FILEPATH — confirm)
u.notify()

In [None]:
modeling_cohort_target_var_df.describe()

# Post Model Analysis

### assess wallet model performance

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Reload evaluator
# Rebuilt from the stored model_results (replaces the evaluator returned when artifacts were saved),
# so evaluator code changes can be viewed without rerunning the experiment
evaluator = wime.RegressionEvaluator(
    y_train=model_results['y_train'],
    y_true=model_results['y_test'],
    y_pred=model_results['y_pred'],
    training_cohort_pred=model_results['training_cohort_pred'],
    training_cohort_actuals=model_results['training_cohort_actuals'],
    model=model,
    feature_names=model_results['X_train'].columns.tolist()
)

# Print results
print(evaluator.summary_report())
evaluator.plot_evaluation()

# Last expression of the cell, so its return value is displayed as the cell output
evaluator.importance_summary()

In [None]:
pd.DataFrame(model_results['training_cohort_pred']).describe()

### Cluster analysis

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# List of the x features with the highest importance in the model
x_features = 8
top_feature_metrics = list((pd.DataFrame(evaluator.metrics['importances'])
                      .sort_values(by='importance',ascending=False)
                      .head(x_features)['feature']))
# De-duplicate feature names; note that set() does not preserve the importance ordering
all_metrics = list(set(top_feature_metrics))

# Cluster numbers
n_clusters=4

# NOTE(review): `modeling_df` is not defined in the visible cells — confirm it is created earlier in the notebook
styled_df = wime.create_cluster_report(modeling_df, model_results, n_clusters, all_metrics)
styled_df

## Wallet Validation Period Performance

### Retrieve validation datasets

In [None]:
# Reload project modules and configs so recent code/config edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve Validation Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical datasets through the end of the validation period.
# Presumably written to parquet with the 'validation' prefix, since the next cell
# reads validation_*_full.parquet — confirm in wmo.retrieve_period_datasets.
wmo.retrieve_period_datasets(
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    parquet_prefix = 'validation'
)

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Load parquet files
validation_market_data_df_full = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df_full.parquet")
validation_profits_df_full = pd.read_parquet("temp/wallet_modeling_dfs/validation_profits_df_full.parquet")

# Remove pre-validation period prices
validation_market_data_df = validation_market_data_df_full[validation_market_data_df_full['date']
                                                       >=wallets_config['training_data']['validation_period_start']]
# Drop the unfiltered frame to free memory before the next large step
del validation_market_data_df_full
gc.collect()


# Filter to only training wallet cohort
# (columns=[] loads no data columns, so only the saved index of wallet addresses is read)
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
validation_profits_df = validation_profits_df_full[validation_profits_df_full['wallet_address'].isin(training_wallet_cohort)]
del validation_profits_df_full
gc.collect()

# Impute rows for period end
# NOTE(review): n_threads is hard-coded to 24 — confirm this suits the machine running the notebook
validation_profits_df = pri.impute_profits_for_multiple_dates(validation_profits_df,
                                                              validation_market_data_df,
                                                              [wallets_config['training_data']['validation_period_end']],
                                                              n_threads=24)


# Assert period, save files, remove from memory
# (u.assert_period presumably verifies the frame's dates match the configured validation period — confirm)
u.assert_period(wallets_config,validation_profits_df,'validation')
u.assert_period(wallets_config,validation_market_data_df,'validation')
validation_profits_df.to_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet",index=False)
validation_market_data_df.to_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet",index=False)
del validation_profits_df,validation_market_data_df
gc.collect()

### generate wallet_validation_features_df

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Load parquet
# Both frames must come from the validation period files saved by the previous cell. Loading the
# modeling-period profits here would make every "validation" feature describe the modeling period.
validation_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet")
validation_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet")


# Create a DataFrame with all wallets that should exist
# (cohort is loaded here so the cell can run from saved parquet; columns=[] reads only the index)
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Calculate validation period wallet metrics
validation_profits_df = wtf.add_cash_flow_transfers_logic(validation_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(validation_profits_df)
# Left join keeps every training-cohort wallet; wallets with no validation activity get 0 in the joined columns
validation_wallet_features_df = validation_wallet_features_df.join(trading_features_df, how='left')\
    .fillna({col: 0 for col in trading_features_df.columns})

# Performance features (inner join, no fill)
performance_features_df = (wpf.calculate_performance_features(validation_wallet_features_df)
                                .drop(['max_investment', 'total_net_flows'], axis=1))  # already exist as trading features
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')
validation_wallet_features_df.describe()

### wallet validation period trading/performance by score quantile

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create analysis by prediction bands
# Columns of validation_wallet_features_df to include in the report
metrics = [
    'return',
    'realized_return',
    'return_unwinsorized',
    'max_investment',
    'total_net_flows',
    'cash_net_flows',
    'total_volume',
]

# Presumably a minimum USD volume threshold applied to wallets in the report — confirm in wiwv.create_quantile_report
min_wallet_volume_usd = 1000
num_quantiles = 5

wiwv.create_quantile_report(
    validation_wallet_features_df,
    model_results['y_pred'],
    metrics,  # the metric columns listed above
    num_quantiles,  # 5 quantile bands (quintiles)
    min_wallet_volume_usd
)


### coin-aggregated wallet metrics by coin performance

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate coin-level features about wallet behavior, built from the validation period
# profits and market data together with the wallet model scores
coin_wallet_features_df = wicv.calculate_coin_metrics_from_wallet_scores(
    validation_profits_df,
    wallet_scores_df,
    validation_market_data_df
)

# Filter coins by market cap
# (keeps coins whose market_cap_filled lies within the configured min/max bounds, inclusive)
analyze_df = coin_wallet_features_df[
    (coin_wallet_features_df['market_cap_filled'] >= wallets_config['coin_validation_analysis']['min_market_cap'])
    & (coin_wallet_features_df['market_cap_filled'] <= wallets_config['coin_validation_analysis']['max_market_cap'])
].copy()

# Create styled performance analysis
wicv.create_top_coins_wallet_metrics_report(analyze_df,percentile=90,method='median')


## Basic coin model testing

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# NOTE(review): `coin_validation_df` is not defined in the visible cells, and `modeling_market_data_df`
# is deleted after being saved to parquet earlier in the notebook — confirm both are loaded before running.
# NOTE(review): this cell rebinds `model` and `evaluator`, replacing the wallet model objects of the same name.

# 1. Generate model scores (using existing wallet model results)
# (test-set predictions indexed by the test-set wallets)
wallet_scores_df = pd.DataFrame({
    'score': model_results['y_pred']
}, index=model_results['X_test'].index)

# 2. Prepare the modeling dataset using modeling period data
coin_modeling_df = wicm.prepare_features_and_targets(
    coin_validation_df=coin_validation_df,
    modeling_profits_df=modeling_profits_df,
    modeling_market_data_df=modeling_market_data_df,
    wallet_scores_df=wallet_scores_df
)

# 3. Train model and get evaluation
model, evaluator = wicm.train_coin_prediction_model(coin_modeling_df)

# 4. View results
print(evaluator.summary_report())
evaluator.plot_evaluation()

# 5. Optional: Generate feature importance summary
evaluator.importance_summary()

# 6. Optional: Analyze predictions by market cap segment
# NOTE(review): pandas aligns these columns on index; this assumes evaluator.y_true / y_pred carry
# an index compatible with coin_modeling_df — confirm
predictions_df = pd.DataFrame({
    'y_true': evaluator.y_true,
    'y_pred': evaluator.y_pred,
    'market_cap': coin_modeling_df['market_cap_filled']
})

segment_results, summary_df = wicv.analyze_market_cap_segments(predictions_df)
wicv.plot_segment_heatmap(summary_df)

In [None]:
# NOTE(review): this cell repeats the steps of the "generate wallet_validation_features_df" cell but
# indexes by `wallet_cohort`, which is not defined in the visible cells — confirm it is still needed.
# Create a DataFrame with all wallets that should exist
validation_wallet_features_df = pd.DataFrame(index=wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Calculate validation period wallet metrics
# NOTE(review): if validation_profits_df already went through add_cash_flow_transfers_logic in an
# earlier cell, the logic is applied a second time here — confirm that is safe
validation_profits_df = wtf.add_cash_flow_transfers_logic(validation_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(validation_profits_df)
# Left join keeps every wallet in the index; unmatched wallets get 0 in the joined columns
validation_wallet_features_df = validation_wallet_features_df.join(trading_features_df, how='left')\
    .fillna({col: 0 for col in trading_features_df.columns})

# Performance features (inner join, no fill)
performance_features_df = (wpf.calculate_performance_features(validation_wallet_features_df)
                                .drop(['max_investment', 'total_net_flows'], axis=1))  # already exist as trading features
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')
validation_wallet_features_df.describe()

In [None]:
# Create coin_modeling_df
# (copy of the coin-level features without the raw 'market_cap' column; 'market_cap_filled' is kept)
coin_modeling_df = coin_wallet_features_df.copy().drop('market_cap',axis=1)
# Preserve the raw coin return, then winsorize the modeled column with a 0.05 cutoff
coin_modeling_df['coin_return_unwinsorized'] = coin_modeling_df['coin_return']
coin_modeling_df['coin_return'] = u.winsorize(coin_modeling_df['coin_return'],0.05)

# Filter coins by market cap
# (applied after winsorization, so the cutoffs are computed on coins of all market caps)
coin_modeling_df = coin_modeling_df[
    (coin_modeling_df['market_cap_filled'] >= wallets_config['coin_validation_analysis']['min_market_cap'])
    & (coin_modeling_df['market_cap_filled'] <= wallets_config['coin_validation_analysis']['max_market_cap'])
].copy()


In [None]:
coin_modeling_df

In [None]:
# Quick XGBoost baseline on the coin-level modeling frame.
# NOTE(review): this cell rebinds `model` and `evaluator`, replacing earlier objects of the same name.
df = coin_modeling_df.copy()

# 1. Simple feature prep and model
X, y = wicm.prepare_features(df)
# 70/30 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# 2. Train
model.fit(X_train, y_train)

# 3. Predict
y_pred = model.predict(X_test)

# 4. Evaluate with the fancy evaluator
# NOTE(review): feature names are derived from df rather than from X; if wicm.prepare_features drops
# or reorders columns differently (e.g. 'coin_return_unwinsorized'), the names will not line up with
# the model's features — confirm, or derive them from X
feature_names = df.columns.drop(['coin_return', 'market_cap_filled']).tolist()
evaluator = wime.RegressionEvaluator(y_train, y_test, y_pred, model=model, feature_names=feature_names)

# 5. Get the goods
print(evaluator.summary_report())

# 6. Plot everything
evaluator.plot_evaluation()

## Cluster analysis

## experiments beta

In [None]:
# Create modeling dataset using existing pipeline
# (filter the modeling period profits to wallets, then compute performance features as candidate targets)
modeling_wallets_df = wmo.filter_modeling_period_wallets(modeling_profits_df)
target_vars_df = wpf.calculate_performance_features(modeling_wallets_df)


In [None]:
# Run a sequence of wallet model experiments, one per candidate target variable
[importlib.reload(module) for module in modules]
wallets_config.reload()

# 1. Initialize dependencies
# NOTE(review): metrics_config is defined but not passed to anything in this cell
metrics_config = {
    'rmse': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    'r2': r2_score
}

# 2. Define experiment sequence
# (each listed value is presumably substituted for config['modeling']['target_variable'] in turn — confirm)
sequence_config = {
    'run_baseline': True,
    'parameter_variations': {
        'modeling': {
            'target_variable': [
                'max_investment',
                'total_net_flows',
                'return',
                'realized_return',
                'return_unwinsorized',
                'performance_score',
                'size_adjusted_rank'
            ]
        }
    }
}

# 3. Create experiment manager
# NOTE(review): training_data_df is deleted at the end of the model-building cell earlier in the
# notebook, so it must be reloaded from parquet before this cell runs
exp_manager = wem.ExperimentsManager(
    config=wallets_config.config,
    training_data_df=training_data_df,
)

# 4. Run experiments and get results
results_df = exp_manager.run_experiment_sequence(modeling_profits_df, sequence_config)

# 5. View results
print(results_df)

In [None]:
results_df

### Validation period assessments

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Compute validation period performance per wallet and per score bucket
# (10 buckets using the 'ntiles' method, based on the wallet model's test-set predictions)
wallet_performance_df, bucketed_performance_df = wicv.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
    n_buckets=10,
    method='ntiles'
)

bucketed_performance_df

## coin performance analysis

### compare wallet metrics for the top n% of coins vs the others

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Partition coin features for analysis
# (keeps coins whose market_cap_filled lies within the configured min/max bounds, inclusive)
analyze_df = coin_wallet_features_df[
    (coin_wallet_features_df['market_cap_filled'] >= wallets_config['coin_validation_analysis']['min_market_cap'])
    & (coin_wallet_features_df['market_cap_filled'] <= wallets_config['coin_validation_analysis']['max_market_cap'])
].copy()

# Create styled performance analysis
styled_df = wicv.create_top_coins_wallet_metrics_report(analyze_df,percentile=90,method='median')

# Display results
styled_df

### plotting coin feature performance vs market cap

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Get the analysis results
# (uses the unfiltered coin_wallet_features_df, not the market-cap-bounded analyze_df)
segment_results, summary_df = wicv.analyze_market_cap_segments(
    coin_wallet_features_df,
    top_n=10
)

# Visualize the segment summary
wicv.plot_segment_heatmap(summary_df)
wicv.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Run analysis
# (all three parameters come from the 'coin_validation_analysis' config section)
top_n = wallets_config['coin_validation_analysis']['top_n']
max_market_cap = wallets_config['coin_validation_analysis']['max_market_cap']
min_market_cap = wallets_config['coin_validation_analysis']['min_market_cap']

# Note the positional order: max_market_cap is passed before min_market_cap
metric_top_coin_performance_df = wicv.validate_coin_performance(coin_wallet_features_df,top_n,
                                                                max_market_cap, min_market_cap)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
coin_wallet_features_df

In [None]:
# Reload project modules and config so recent edits take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Print the performance analysis for the unfiltered coin-level feature frame
wicv.print_performance_analysis(coin_wallet_features_df)

# Junkyard

# Tests failing

In [None]:

class ProfitsValidator:
    """
    Sanity checks for a profits DataFrame covering a single training period.

    Each check_* method returns a truthy/falsy outcome; validate_all() runs
    every check and reports the outcomes by name.

    Side effect: check_period_boundaries() and check_date_range() convert the
    'date' column of the DataFrame they receive to datetime, in place.
    """
    def validate_all(self, profits_df, training_period_start, training_period_end):
        """
        Run every check against profits_df.

        Params:
        - profits_df (DataFrame): rows with coin_id, wallet_address, date, usd_balance columns
        - training_period_start: first allowed date, anything pd.to_datetime accepts
        - training_period_end: required final date, anything pd.to_datetime accepts

        Returns:
        - dict: check name -> outcome, in the order the checks are executed
        """
        dates = dict(
            training_period_start=pd.to_datetime(training_period_start),
            training_period_end=pd.to_datetime(training_period_end),
        )

        # The duplicate check runs before any check that converts the 'date' column
        results = {}
        results['no_duplicates'] = self.check_no_duplicates(profits_df)
        results['period_boundaries'] = self.check_period_boundaries(profits_df, dates)
        results['no_negatives'] = self.check_no_negative_balances(profits_df)
        results['date_range'] = self.check_date_range(profits_df, dates)
        results['no_missing'] = self.check_no_missing_values(profits_df)
        return results

    def check_no_duplicates(self, profits_df):
        """True when no (coin_id, wallet_address, date) combination appears twice"""
        key_cols = ['coin_id', 'wallet_address', 'date']
        return not profits_df.duplicated(subset=key_cols).any()

    def check_period_boundaries(self, profits_df, dates):
        """True when every coin-wallet pair has a row dated training_period_end"""
        profits_df['date'] = pd.to_datetime(profits_df['date'])
        pair_cols = ['coin_id', 'wallet_address']

        all_pairs = profits_df[pair_cols].drop_duplicates()
        on_period_end = profits_df['date'] == dates['training_period_end']
        period_end_pairs = profits_df.loc[on_period_end, pair_cols].drop_duplicates()

        return len(period_end_pairs) == len(all_pairs)

    def check_no_negative_balances(self, profits_df):
        """True when every usd_balance is at least -0.1 (small negative tolerance)"""
        return profits_df['usd_balance'].ge(-0.1).all()

    def check_date_range(self, profits_df, dates):
        """True when dates start on/after training_period_start and end exactly on training_period_end"""
        profits_df['date'] = pd.to_datetime(profits_df['date'])
        earliest = profits_df['date'].min()
        latest = profits_df['date'].max()

        starts_in_period = earliest >= dates['training_period_start']
        ends_on_boundary = latest == dates['training_period_end']
        return starts_in_period and ends_on_boundary

    def check_no_missing_values(self, profits_df):
        """True when no cell of the DataFrame is null"""
        return not profits_df.isna().to_numpy().any()



In [None]:

# pylint:disable=line-too-long

def test_profits_data():
    """
    Returns raw profits data that can be remapped for many-to-many testing.

    Each wallet_address is named after the scenario it covers. All dates are
    zero-padded YYYY-MM-DD strings between 2024-01-01 and 2024-10-01, and every
    coin-wallet pair has a row on 2024-10-01.

    Returns:
    - DataFrame: columns coin_id, wallet_address, date, usd_balance,
        usd_net_transfers, is_imputed
    """
    profits_data = [
        # w01_multiple_coins - btc & eth (multiple transactions, multiple coins)
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 120, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 180, 'usd_net_transfers': 0, 'is_imputed': True},

        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 200, 'usd_net_transfers': 200, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 300, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 280, 'usd_net_transfers': 0, 'is_imputed': True},

        # w02_net_loss - btc (net loss)
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-05-01', 'usd_balance': 250, 'usd_net_transfers': -100, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-10-01', 'usd_balance': 100, 'usd_net_transfers': 0, 'is_imputed': True},

        # w03_sell_all_and_rebuy
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-03-01', 'usd_balance': 0,  'usd_net_transfers': -50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-08-01', 'usd_balance': 40, 'usd_net_transfers': 40, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-10-01', 'usd_balance': 42, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04_only_period_end - sol (only final row)
        {'coin_id': 'sol', 'wallet_address': 'w04_only_period_end', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 70, 'is_imputed': False},

        # w04a_only_period_end_w_balance - eth
        {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-01-01', 'usd_balance': 30, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-10-01', 'usd_balance': 90, 'usd_net_transfers': 50, 'is_imputed': False},

        # w04b_only_period_start_buy
        {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-10-01', 'usd_balance': 900, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04c_only_period_start_buy_w_existing_balance
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-01-01', 'usd_balance': 350, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-10-01', 'usd_balance': 1050, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04d_only_period_start_sell
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-01-01', 'usd_balance': 0, 'usd_net_transfers': -200, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04e_only_period_start_sell_partial
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': -10, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-10-01', 'usd_balance': 600, 'usd_net_transfers': 0, 'is_imputed': True},

        # w05_only_imputed - sol (only imputed rows at start and end)
        {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},

        # w06_tiny_transactions - very small transactions relative to portfolio size
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-01-01', 'usd_balance': 1250, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': 1, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -350, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w07_tiny_transactions2 - very small transactions relative to portfolio size
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-01-01', 'usd_balance': 400, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': -20, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -150, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w08_offsetting_transactions - large offsetting transactions in the middle of the period
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-01', 'usd_balance': 10400, 'usd_net_transfers': 10000, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-02', 'usd_balance': 400, 'usd_net_transfers': -10000, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-10-01', 'usd_balance': 750, 'usd_net_transfers': 0, 'is_imputed': True},

        # w09_memecoin_winner - Large swings in portfolio value
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': -500, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-05-01', 'usd_balance': 50, 'usd_net_transfers': -100, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-10-01', 'usd_balance': 10, 'usd_net_transfers': 0, 'is_imputed': True},

        # w10_memecoin_loser - Large swings in portfolio value
        {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': -20, 'is_imputed': False},

        # w11_sells_early
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-04-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
        # Date is zero-padded like every other row so string comparisons and de-duplication on 'date' behave consistently
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-05-01', 'usd_balance': 0, 'usd_net_transfers': -300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w12_buys_late
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-09-01', 'usd_balance': 500, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-10-01', 'usd_balance': 550, 'usd_net_transfers': 0, 'is_imputed': True},
    ]

    return pd.DataFrame(profits_data)



# Rebind the name to the returned DataFrame, mirroring how a pytest fixture would be injected.
# After this line the function itself is no longer reachable under this name, so the
# defining cell must be rerun before this statement can be executed again.
test_profits_data = test_profits_data()

def test_profits_df(test_profits_data):
    """
    Build the profits DataFrame used by the wallet trading feature tests.

    Works on a copy of the input, asserts that the copy passes every
    ProfitsValidator check for the 2024-01-01 to 2024-10-01 training period,
    drops the rows whose balance and net transfers are both 0, and returns the
    result of wtf.add_cash_flow_transfers_logic() on the remaining rows.

    Raises an AssertionError if any validation check reports a falsy result.
    """
    period_start = '2024-01-01'
    period_end = '2024-10-01'
    candidate_df = test_profits_data.copy()

    # The fixture has to be well-formed before anything is derived from it
    check_outcomes = ProfitsValidator().validate_all(
        candidate_df,
        period_start,
        period_end
    )
    assert all(check_outcomes.values()), "Test data failed validation checks."

    # Per the original note, wmo.retrieve_datasets() removes rows with a rounded
    # 0 balance and 0 transfers once validation has passed; do the same here
    has_zero_balance = candidate_df['usd_balance'] == 0
    has_zero_transfers = candidate_df['usd_net_transfers'] == 0
    retained_df = candidate_df[~(has_zero_balance & has_zero_transfers)]

    # Layer the cash flow transfers logic on top of the retained rows
    return wtf.add_cash_flow_transfers_logic(retained_df)

# Materialize the profits fixture. After this line the name `test_profits_df`
# refers to the value returned by the function rather than the function itself.
test_profits_df = test_profits_df(test_profits_data)

# Compute wallet trading features with both wtf implementations from the same
# profits input (presumably so the two outputs can be compared).
test_trading_features_df = wtf.calculate_wallet_trading_features(test_profits_df)
test_trading_features_df_new = wtf.calculate_wallet_trading_features_new(test_profits_df)

In [None]:
"""
Test wallet with multiple coins and transactions.

Scenario:
- Two coins (BTC, ETH)
- Multiple transactions per coin
- Mix of real and imputed rows
"""
# Pull the w01 profits rows and the matching row of computed features
wallet = 'w01_multiple_coins'
w01_profits = test_profits_df.loc[test_profits_df['wallet_address'] == wallet]
w01_features = test_trading_features_df.loc[wallet]

# Features that must match exactly, checked in the order listed. If a check
# fails, `feature_name` and `expected_value` still hold the offending pair.
w01_exact_expectations = {
    # Basic metrics
    'transaction_days': 2,       # Jan 1 and May 1
    'unique_coins_traded': 2,    # BTC and ETH
    'cash_buy_inflows': 400,     # Initial: BTC 100 + ETH 200, Add: BTC 50 + ETH 50
    # Volume metrics
    'total_volume': 400,         # Sum of all transfers
    'average_transaction': 100,  # 400 / 4 transactions
    # Imputed metrics
    'total_inflows': 400,        # Initial balances
}
for feature_name, expected_value in w01_exact_expectations.items():
    assert w01_features[feature_name] == expected_value

# Should be profitable given ending balances > deposits, so only the sign is checked
assert w01_features['total_net_flows'] > 0

# Activity metrics: 2 transaction days over the inclusive span of w01's dates
w01_dates = w01_profits['date']
total_days = (w01_dates.max() - w01_dates.min()).days + 1
assert w01_features['activity_density'] == pytest.approx(2 / total_days, rel=1e-10)



In [None]:
# Show the w01 wallet's profits rows for manual inspection
w01_profits

In [None]:
# Show the w01 row produced by wtf.calculate_wallet_trading_features()
w01_features

In [None]:
# Show the same wallet's row produced by wtf.calculate_wallet_trading_features_new()
test_trading_features_df_new.loc[wallet]

In [None]:
# Show the wtf.calculate_wallet_trading_features() row for w01 again, next to the _new output
w01_features