In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Suppress warnings
warnings.filterwarnings("ignore", message="MallocStackLogging")

# silence pygame donation request
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
os.environ['ALERT_SOUND_FILEPATH']="../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav"

# Dark mode charts
plt.rcParams['figure.facecolor'] = '#181818'  # Custom background color (dark gray in this case)
plt.rcParams['axes.facecolor'] = '#181818'
plt.rcParams['text.color'] = '#afc6ba'
plt.rcParams['axes.labelcolor'] = '#afc6ba'
plt.rcParams['xtick.color'] = '#afc6ba'
plt.rcParams['ytick.color'] = '#afc6ba'
plt.rcParams['axes.titlecolor'] = '#afc6ba'

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model as wm
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.features_orchestrator as wfo

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.validation_analysis as wiv
import wallet_insights.coin_forecasting as wicf


# reload all modules
# NOTE: wallet_modeling.wallet_model is imported above as `wm`; the previous
# entry `wme` was never imported and raised NameError when this cell ran.
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
           wmo, wtd, wmr, wm,
           wcl, wmc, wmt, wpf, wtf, wts, wfo,
           wime, wiv, wicf]
# Plain loop: reload is called for its side effect only, so no throwaway list is built
for module in modules:
    importlib.reload(module)

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

u.export_code(code_directories=['wallet_features','data_retrieval','wallet_modeling'])

## Full Training Data Sequence

### retrieve datasets

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve datasets
profits_df,market_data_df = wmo.retrieve_datasets()
profits_df_full = profits_df.copy()

In [None]:
profits_df_full

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


profits_df = profits_df_full.copy()

# Define wallet cohort after cleaning
training_wallet_metrics_df,wallet_cohort = wmo.define_wallet_cohort(profits_df,market_data_df)

# Generate profits_df for all training windows and the modeling period
training_profits_df, training_windows_profits_dfs, modeling_profits_df, validation_profits_df = wmo.split_profits_df(profits_df,
                                                                               market_data_df,wallet_cohort)

# Market data: add indicators
# Remove all market_data records after the training period to ensure no leakage
training_market_data_df = (market_data_df[market_data_df['date']
                                          <= wallets_config['training_data']['training_period_end']])

# Add new columns
# Generate basic indicators
market_indicators_data_df = ind.add_market_data_dualcolumn_indicators(training_market_data_df)
market_indicators_data_df = ind.generate_time_series_indicators(market_indicators_data_df,
                                                        wallets_metrics_config['time_series']['market_data'],
                                                        'coin_id')

# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_cohort
transfers_sequencing_df = wts.retrieve_transfers_sequencing()

### generate features

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate features for the full training dataset
training_wallet_features_df = wfo.calculate_wallet_features(training_profits_df, market_indicators_data_df,
                                                           transfers_sequencing_df, wallet_cohort)

# Define the full feature set by appending a suffix for each window
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# Generate features for each window
for i, window_profits_df in enumerate(training_windows_profits_dfs, 1):
    logger.info("Generating features for window %s...", i)

    # Generate the features
    window_wallet_features_df = wfo.calculate_wallet_features(window_profits_df, market_indicators_data_df,
                                                             transfers_sequencing_df, wallet_cohort)

    # Add column suffix and join to training_data_df
    window_wallet_features_df = window_wallet_features_df.add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')

# Append clustering features based on all numeric features in the base training data
cluster_features_df = wcl.create_basic_cluster_features(training_data_df)
cluster_features_df = cluster_features_df.add_prefix('cluster_')
training_data_df = training_data_df.join(cluster_features_df, how='inner')

logger.info("Feature generation complete.")

training_data_df.describe()

## investigating negative performance values

In [None]:
# result was an issue with winsorization labeling and a bug in cash flows return logic

training_data_df_full = training_data_df.copy()

In [None]:
window_profits_df = training_windows_profits_dfs[0].copy()
profits_df = training_windows_profits_dfs[0].copy()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

window_wallet_features_df = wfo.calculate_wallet_features(window_profits_df, market_indicators_data_df,
                                                            transfers_sequencing_df, wallet_cohort)



In [None]:
window_wallet_features_df.describe()

In [None]:
cols = [
    'trading_total_inflows',
    'trading_total_outflows',
    'trading_total_net_flows',
    'trading_max_investment',
    'trading_transaction_days',
    'trading_unique_coins_traded',
    'trading_cash_buy_inflows',
    'trading_cash_sell_outflows',
    'trading_cash_net_flows',
    'trading_total_volume',
    'trading_average_transaction',
    'trading_activity_density',
    'trading_volume_vs_investment_ratio',
    'performance_return',
    'performance_return_unwinsorized',
    'performance_realized_return',
    'performance_performance_score',
    'performance_size_adjusted_rank',
]

investigate_df = window_wallet_features_df[cols]
investigate_df.describe()

## investigating edge case performance nans

In [None]:
training_data_df_full = training_data_df.copy()

In [None]:
training_data_df = training_data_df_full.copy()
# Get all rows with any NA
na_rows = training_data_df[training_data_df.isna().any(axis=1)]
na_subset = na_rows.loc[:, na_rows.isna().any()]
na_subset

In [None]:
window_wallet_features_df.shape

In [None]:
profits_df = training_windows_profits_dfs[0].copy()

In [None]:
logger.info("here we go")

### calculate all except performance_df

In [None]:
# Create a DataFrame with all wallets that should exist
wallet_features_df = pd.DataFrame(index=wallet_cohort)
wallet_features_df.index.name = 'wallet_address'

# Store feature sets with their prefixes for bulk renaming
feature_column_names = {}

# Trading features (inner join, custom fill)
profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
trading_features_df = wtf.fill_trading_features_data(trading_features_df, wallet_cohort)
feature_column_names['trading_'] = trading_features_df.columns
wallet_features_df = wallet_features_df.join(trading_features_df, how='inner')

# Market timing features (fill zeros)
timing_features_df = wmt.calculate_market_timing_features(profits_df, market_indicators_data_df)
feature_column_names['timing_'] = timing_features_df.columns
wallet_features_df = wallet_features_df.join(timing_features_df, how='left')\
    .fillna({col: 0 for col in timing_features_df.columns})

# Market cap features (fill zeros)
market_features_df = wmc.calculate_market_cap_features(profits_df, market_indicators_data_df)
feature_column_names['mktcap_'] = market_features_df.columns
wallet_features_df = wallet_features_df.join(market_features_df, how='left')\
    .fillna({col: 0 for col in market_features_df.columns})

# Transfers features (fill -1)
transfers_features_df = wts.calculate_transfers_sequencing_features(profits_df, transfers_sequencing_df)
feature_column_names['transfers_'] = transfers_features_df.columns
wallet_features_df = wallet_features_df.join(transfers_features_df, how='left')\
    .fillna({col: -1 for col in transfers_features_df.columns})


### base calc of performance_df

In [None]:
performance_features_df = wpf.calculate_performance_features(wallet_features_df)
performance_features_df.describe()


In [None]:
# Get all rows with any NA
na_rows = performance_features_df[performance_features_df.isna().any(axis=1)]
na_subset = na_rows.loc[:, na_rows.isna().any()]
na_subset

### walk through performance_df steps

In [None]:
wallet_features_df_full = wallet_features_df.copy()

In [None]:
wallet_features_df = wallet_features_df_full.copy()
wallet_features_df

In [None]:
# Step-by-step recomputation of the performance metrics on a rounded copy of
# the two input columns, so intermediate values can be inspected for NaNs.
metrics_df = wallet_features_df[['max_investment','total_net_flows']].copy().round(6)
returns_winsorization = wallets_config['modeling']['returns_winsorization']
# Added to max_investment before log10 so a zero investment yields log10(1e-10) = -10 instead of -inf
epsilon = 1e-10

# Calculate base return, including unrealized price change impacts
# Rows with max_investment == 0 are assigned a return of 0 rather than dividing by zero
metrics_df['return'] = np.where(abs(metrics_df['max_investment']) == 0,0,
                                metrics_df['total_net_flows'] / metrics_df['max_investment'])

# Calculate realized return, based on actual cash flows only
# NOTE(review): this expression is identical to the 'return' calculation above
# (total_net_flows / max_investment); a cash-flow-only numerator was presumably
# intended - confirm against the library implementation.
metrics_df['realized_return'] = np.where(abs(metrics_df['max_investment']) == 0,0,
                                metrics_df['total_net_flows'] / metrics_df['max_investment'])

# Apply winsorization
# Only 'return' is winsorized; 'realized_return' is left as computed
if returns_winsorization > 0:
    metrics_df['return'] = u.winsorize(metrics_df['return'],returns_winsorization)

# Normalize returns
# Min-max scaling: when max == min the denominator is 0, so the result is NaN for every row
metrics_df['norm_return'] = (metrics_df['return'] - metrics_df['return'].min()) / \
    (metrics_df['return'].max() - metrics_df['return'].min())

# Normalize logged investments
# Same min-max scaling (and the same zero-range caveat) applied to log10 of the investment
log_invested = np.log10(metrics_df['max_investment'] + epsilon)
metrics_df['norm_invested'] = (log_invested - log_invested.min()) / \
    (log_invested.max() - log_invested.min())

# # # Performance score
# # metrics_df['performance_score'] = (0.6 * metrics_df['norm_return'] +
# #                                     0.4 * metrics_df['norm_invested'])


# Summary statistics of the intermediate columns (cell output)
metrics_df.describe()



In [None]:
metrics_df = wallet_features_df[['max_investment','total_net_flows']].copy().round(6)

metrics_df.loc[[4157559,28800922]]

In [None]:
base_metrics_df['max_investment']

In [None]:
# Get all rows with any NA
na_rows = metrics_df[metrics_df.isna().any(axis=1)]
na_rows
# na_subset = na_rows.loc[:, na_rows.isna().any()]
# na_subset

In [None]:

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(wallet_features_df)
# feature_column_names['performance_'] = performance_features_df.drop(['max_investment', 'total_net_flows'], axis=1).columns
# wallet_features_df = wallet_features_df.join(
#     performance_features_df.drop(['max_investment', 'total_net_flows'], axis=1),
#     how='inner'
# )

performance_features_df

In [None]:
bad_perf_df = wpf.calculate_performance_features(window_wallet_features_df)
bad_perf_df

In [None]:
profits_df = training_windows_profits_dfs[0].copy()


In [None]:
# Create a DataFrame with all wallets that should exist
wallet_features_df = pd.DataFrame(index=wallet_cohort)
wallet_features_df.index.name = 'wallet_address'

# Store feature sets with their prefixes for bulk renaming
feature_column_names = {}

# Trading features (inner join, custom fill)
profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
trading_features_df = wtf.fill_trading_features_data(trading_features_df, wallet_cohort)
feature_column_names['trading_'] = trading_features_df.columns
wallet_features_df = wallet_features_df.join(trading_features_df, how='inner')

# Market timing features (fill zeros)
timing_features_df = wmt.calculate_market_timing_features(profits_df, market_indicators_data_df)
feature_column_names['timing_'] = timing_features_df.columns
wallet_features_df = wallet_features_df.join(timing_features_df, how='left')\
    .fillna({col: 0 for col in timing_features_df.columns})

# Market cap features (fill zeros)
market_features_df = wmc.calculate_market_cap_features(profits_df, market_indicators_data_df)
feature_column_names['mktcap_'] = market_features_df.columns
wallet_features_df = wallet_features_df.join(market_features_df, how='left')\
    .fillna({col: 0 for col in market_features_df.columns})

# Transfers features (fill -1)
transfers_features_df = wts.calculate_transfers_sequencing_features(profits_df, transfers_sequencing_df)
feature_column_names['transfers_'] = transfers_features_df.columns
wallet_features_df = wallet_features_df.join(transfers_features_df, how='left')\
    .fillna({col: -1 for col in transfers_features_df.columns})

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(wallet_features_df)


In [None]:
len(training_data_df_full)

In [None]:
pd.DataFrame(training_data_df.loc[w]).T.to_csv('baddata.csv')

In [None]:
w = 4157559
# w = 28800922

wallet_features_df.loc[w]
trading_features_df.loc[w]

In [None]:
trading_features_df.loc[w]

In [None]:
profits_df[profits_df['wallet_address']==w]
# profits_df_full[profits_df_full['wallet_address']==w].sort_values

In [None]:
window_profits_df.sample(5)

In [None]:
c = '28846ace-0e04-4cbe-83d8-8390cfe04c3b'
w = 4157559


bad_profits_df = profits_df_full[
    (profits_df_full['coin_id'].isin([c,'3be6bd20-cd71-496f-b963-1e76d6303984','31b3d3aa-cffb-40a1-9971-0d5d9be7fa9a']))
    & (profits_df_full['wallet_address'].isin([w,28800922,15850862,13897369]))
].sort_values(by='date').copy()

# bad_profits_df = u.cw_filter_df(profits_df_full,c,w).sort_values(by='date')
bad_trading_features_df = wtf.add_cash_flow_transfers_logic(bad_profits_df)
bad_trading_features_df = wtf.calculate_wallet_trading_features(bad_trading_features_df)
bad_trading_features_df



In [None]:
bad_performance_features_df = wpf.calculate_performance_features(bad_trading_features_df)
bad_performance_features_df

In [None]:
# Same step-by-step performance calculation, restricted to the trading features
# of the small set of wallets/coins selected into bad_trading_features_df.
metrics_df = bad_trading_features_df[['max_investment','total_net_flows']].copy().round(6)
metrics_df
returns_winsorization = wallets_config['modeling']['returns_winsorization']
# Added to max_investment before log10 so a zero investment yields log10(1e-10) = -10 instead of -inf
epsilon = 1e-10

# Calculate base return, including unrealized price change impacts
# Rows with max_investment == 0 are assigned a return of 0 rather than dividing by zero
metrics_df['return'] = np.where(abs(metrics_df['max_investment']) == 0,0,
                                metrics_df['total_net_flows'] / metrics_df['max_investment'])

# Calculate realized return, based on actual cash flows only
# NOTE(review): identical to the 'return' expression above; a cash-flow-only
# numerator was presumably intended - confirm.
metrics_df['realized_return'] = np.where(abs(metrics_df['max_investment']) == 0,0,
                                metrics_df['total_net_flows'] / metrics_df['max_investment'])

# Apply winsorization
# Only 'return' is winsorized; 'realized_return' is left as computed
if returns_winsorization > 0:
    metrics_df['return'] = u.winsorize(metrics_df['return'],returns_winsorization)

# Normalize returns
# Min-max scaling: when max == min the denominator is 0, so the result is NaN for every row
metrics_df['norm_return'] = (metrics_df['return'] - metrics_df['return'].min()) / \
    (metrics_df['return'].max() - metrics_df['return'].min())

# Normalize logged investments
# Same min-max scaling (and the same zero-range caveat) applied to log10 of the investment
log_invested = np.log10(metrics_df['max_investment'] + epsilon)
metrics_df['norm_invested'] = (log_invested - log_invested.min()) / \
    (log_invested.max() - log_invested.min())

# Performance score
# Weighted blend of the two normalized columns; a NaN in either one propagates to the score
metrics_df['performance_score'] = (0.6 * metrics_df['norm_return'] +
                                    0.4 * metrics_df['norm_invested'])

# # Size-adjusted rank
# # Create mask for zero values
# zero_mask = metrics_df['max_investment'] == 0

# # Create quartiles series initialized with 'q0' for zero values
# quartiles = pd.Series('q0', index=metrics_df.index)

# # Calculate quartiles for non-zero values
# non_zero_quartiles = pd.qcut(metrics_df['max_investment'][~zero_mask],
#                             q=4,
#                             labels=['q1', 'q2', 'q3', 'q4'])

# # Assign the quartiles to non-zero values
# quartiles[~zero_mask] = non_zero_quartiles

# # Calculate size-adjusted rank within each quartile
# metrics_df['size_adjusted_rank'] = metrics_df.groupby(quartiles)['return'].rank(pct=True)


# # Clean up intermediate columns
# cols_to_drop = ['norm_return', 'norm_invested', 'norm_gain']
# metrics_df = metrics_df.drop(columns=[c for c in cols_to_drop
#                                     if c in metrics_df.columns])

metrics_df

In [None]:
bad_trading_features_df = wtf.calculate_wallet_trading_features(bad_trading_features_df)

In [None]:
performance_features_df[performance_features_df['performance_score'].isna()]

In [None]:
len(wallet_features_df)

In [None]:
df = pd.DataFrame(training_data_df.isna().sum())
df.columns = ['nan']
df[df['nan']>0]

## time weighted returns addition

In [None]:
# [importlib.reload(module) for module in modules]
# wallets_config.reload()

# # Create a DataFrame with all wallets that should exist
# wallet_features_df = pd.DataFrame(index=wallet_cohort)
# wallet_features_df.index.name = 'wallet_address'

# # Trading features (inner join, custom fill)
# profits_df = wtf.add_cash_flow_transfers_logic(window_profits_df)
# trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
# trading_features_df = wtf.fill_trading_features_data(trading_features_df, wallet_cohort)
# wallet_features_df = wallet_features_df.join(trading_features_df, how='inner')

# # Time weighted returns (fill zeros)
# # time_weighted_returns_df = wpf.calculate_time_weighted_returns(profits_df)
# # wallet_features_df = wallet_features_df.join(time_weighted_returns_df, how='left')\
# #     .fillna({col: 0 for col in time_weighted_returns_df.columns})

# performance_features_df = wpf.calculate_performance_features(wallet_features_df)
# wallet_features_df = wallet_features_df.join(performance_features_df,how='inner')

In [None]:
# [importlib.reload(module) for module in modules]
# wallets_config.reload()

# profits_df = wtf.add_cash_flow_transfers_logic(window_profits_df)


# trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
# trading_features_df = wtf.fill_trading_features_data(trading_features_df, wallet_cohort)
# trading_features_df.columns


In [None]:
performance_features_df.columns

In [None]:
trading_features_df

In [None]:
performance_features_df.reset_index(drop=True).corr(method='pearson')

In [None]:
def create_corr_matrix_viz(df: pd.DataFrame, figsize=(12, 10)) -> None:
    """
    Creates and plots a Pearson correlation matrix heatmap for numerical features.

    Non-numeric columns are excluded before the correlation is computed, so a
    frame that also carries string/categorical columns can be passed as-is.
    The figure is created on the current matplotlib backend but not shown;
    the caller is responsible for calling plt.show().

    Params:
    - df (DataFrame): input feature dataframe
    - figsize (tuple): figure size for plot, defaults to (12, 10)
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Keep only columns that can take part in a correlation; without this,
    # DataFrame.corr raises on non-numeric columns in pandas 2.x
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    # Calculate correlation matrix
    corr_matrix = numeric_df.corr(method='pearson')

    # Create heatmap
    plt.figure(figsize=figsize)
    sns.heatmap(
        corr_matrix,
        annot=True,  # Show correlation values
        cmap='RdBu',  # Red-Blue diverging colormap
        center=0,     # Center colormap at 0
        fmt='.2f',    # Round to 2 decimal places
        square=True,  # Make cells square
        cbar_kws={'label': 'Correlation Coefficient'}
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()

create_corr_matrix_viz(performance_features_df)
plt.show()


In [None]:
trading_features_df

In [None]:
time_weighted_returns_df = wpf.calculate_time_weighted_returns(profits_df)
time_weighted_returns_df.describe()

In [None]:
w = 39759
training_profits_df[training_profits_df['wallet_address']==w].sort_values(by=['coin_id','date'])

In [None]:
trading_features_df.loc[39759]

In [None]:
time_weighted_performance_df = wpf.calculate_time_weighted_returns(window_profits_df)
time_weighted_performance_df

In [None]:
# NOTE(review): the original line was the truncated identifier `tradi`, which
# raises NameError; completed to the frame inspected in the neighbouring cells.
trading_features_df

## Wallet Modeling

### join target variable to training data

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Clean inactive wallets from modeling period data
modeling_wallets_df = wmo.filter_modeling_period_wallets(modeling_profits_df)

# Generate target variables
target_vars_df = wpf.calculate_performance_features(modeling_wallets_df)

# Merge training data and target variables?
modeling_df = training_data_df.join(target_vars_df[wallets_config['modeling']['target_variable']],
                                    how='inner')


### build model

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create an experiment instance
# wallet_modeling.wallet_model is imported as `wm`; the previously used `wme`
# alias is never defined in this notebook and raised NameError.
# NOTE(review): confirm WalletModel is exposed by wallet_modeling.wallet_model.
experiment = wm.WalletModel(wallets_config)

# Run the experiment and get results
model_results = experiment.run_experiment(modeling_df)

# Extract the trained model (the 'regressor' step of the fitted pipeline)
model = model_results['pipeline'].named_steps['regressor']

### assess model performance

In [None]:
### save model artifacts
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate and save all model artifacts
model_id, evaluator, wallet_scores_df, coin_validation_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    validation_profits_df=validation_profits_df,
    base_path='../wallet_modeling'
)
u.play_notification()

# Print results
print(evaluator.summary_report())
evaluator.plot_evaluation()
evaluator.importance_summary()

In [None]:
cluster_analysis_df

In [None]:
### cluster analysis: profile clusters and assess model performance per cluster
[importlib.reload(module) for module in modules]
wallets_config.reload()



comparison_metrics = [
    'trading_max_investment_all_windows',
    'mktcap_portfolio_wtd_market_cap_all_windows',
    'trading_total_net_flows_all_windows',
    'performance_return_all_windows',
    'performance_performance_score_all_windows',
    'trading_max_investment_w4',
    'mktcap_portfolio_wtd_market_cap_w4',
    'trading_total_net_flows_w4',
    'performance_return_w4',
    'performance_performance_score_w4',
]

# Create df that includes comparison metrics and all cluster feature columns
cluster_cols = [col for col in modeling_df.columns if col.startswith('cluster_')]
cluster_analysis_df = modeling_df[cluster_cols + comparison_metrics].copy()

# Assign wallets to categorical clusters based on the distance values
cluster_analysis_df = wime.assign_clusters_from_distances(cluster_analysis_df, wallets_config['features']['clustering_n_clusters'])

# Generate metrics for clusters
cluster_profiles = wime.analyze_cluster_metrics(
   cluster_analysis_df,
   wallets_config['features']['clustering_n_clusters'],
   comparison_metrics
)

# Assess model performance in the test set of each cluster
cluster_performance = wime.analyze_cluster_performance(
    cluster_analysis_df,
    wallets_config['features']['clustering_n_clusters'],
    model_results['y_test'],  # True values
    model_results['y_pred']   # Predictions
)



In [None]:
n = 2

# Join metrics with performance and display results
cluster_results_df = cluster_profiles[n].join(cluster_performance[n]).T
cluster_results_df = wime.format_numeric_columns(cluster_results_df)
cluster_results_df

In [None]:
n = 4

# Join metrics with performance and display results
cluster_results_df = cluster_profiles[n].join(cluster_performance[n]).T
cluster_results_df = wime.format_numeric_columns(cluster_results_df)
cluster_results_df

In [None]:
def style_rows(df: pd.DataFrame) -> "pd.io.formats.style.Styler":
    """
    Apply row-wise conditional formatting to DataFrame where each row is scaled independently.
    Uses blue gradient with transparency for dark mode compatibility.

    Params:
    - df (DataFrame): input DataFrame to style

    Returns:
    - styled_df (Styler): Styler wrapping df with the row-wise background
        colors registered (they are evaluated when the Styler is rendered)
    """
    def row_style(row):
        """Return one CSS string per cell of a single row."""
        n_cells = len(row)

        # Skip non-numeric rows
        if not np.issubdtype(row.dtype, np.number):
            return [''] * n_cells

        # Handle rows with NaN values: scale using the non-missing cells only
        valid_vals = row.dropna()
        if len(valid_vals) == 0:
            return [''] * n_cells

        # Normalize values between 0 and 1 for each row
        min_val = valid_vals.min()
        max_val = valid_vals.max()
        if min_val == max_val:
            # Constant row: fully transparent, avoids dividing by a zero range
            return ['background-color: rgba(0, 0, 255, 0)'] * n_cells

        norm = (row - min_val) / (max_val - min_val)
        # Convert to rgba colors (transparent to solid blue); missing cells stay unstyled
        return [
            f'background-color: rgba(0, 0, 255, {x:.2f})' if pd.notna(x) else ''
            for x in norm
        ]

    return df.style.apply(row_style, axis=1)

In [None]:
n = 7

# Join metrics with performance and display results
cluster_results_df = cluster_profiles[n].join(cluster_performance[n]).T
cluster_results_df = style_rows(cluster_results_df)
# cluster_results_df = wime.format_numeric_columns(cluster_results_df)

cluster_results_df

In [None]:
def style_row_wise(df: pd.DataFrame) -> "pd.io.formats.style.Styler":
    """
    Apply row-wise background coloring to DataFrame relative to each row's values.

    Each numeric row is min-max scaled on its own and mapped from white (row
    minimum) to solid blue (row maximum). Missing cells and non-numeric rows
    are left unstyled instead of raising while the Styler is rendered.

    Params:
    - df (DataFrame): Input dataframe to style

    Returns:
    - Styler wrapping df with row-wise background colors registered
    """
    # Function to normalize single row to 0-1 scale
    def row_background(row):
        n_cells = len(row)

        # Non-numeric rows cannot be scaled; leave them unstyled
        if not np.issubdtype(row.dtype, np.number):
            return [''] * n_cells

        # Scale using the non-missing cells only; an all-NaN row gets no styling
        present = row.dropna()
        if present.empty:
            return [''] * n_cells

        min_val = present.min()
        max_val = present.max()
        # Avoid division by zero if all values are the same
        if min_val == max_val:
            return ['background-color: transparent'] * n_cells

        # Normalize to 0-1 scale
        normalized = (row - min_val) / (max_val - min_val)

        # Convert to colors (light blue to dark blue); int() cannot take NaN,
        # so missing cells are emitted as an empty style
        styles = []
        for x in normalized:
            if pd.notna(x):
                channel = int(255 * (1 - x))
                styles.append('background-color: #{:02x}{:02x}ff'.format(channel, channel))
            else:
                styles.append('')
        return styles

    return df.style.apply(row_background, axis=1)

style_row_wise(cluster_results_df)

In [None]:
n = 5


# # Access results like:
# k2_profiles = cluster_profiles[2]  # Medians for k=2 clusters
# k5_profiles = cluster_profiles[5]  # Medians for k=5 clusters

In [None]:
# NOTE(review): the original `cluster_profiles[]` was a syntax error (empty
# subscript); indexing by the cluster count `n` matches how cluster_profiles
# is used in the neighbouring cells - confirm this was the intent.
cluster_profiles[n]

In [None]:
type(model_results['y_pred'])

In [None]:
k2_profiles.round(3).T
k5_profiles.round(3).T

In [None]:
(highest_importances_df.sort_values(by='importance', ascending=False).groupby('prefix')).first()

In [None]:
feature_report_df.head()

In [None]:
df = pd.DataFrame(evaluator.metrics['importances']).head(20)
df['prefix'] = df['feature'].str.split('_').str[1]

In [None]:
# Get total importance by prefix
df = feature_importance_df.copy()

prefix_totals = (df
    .assign(prefix=df['feature'].str.split('_').str[1])
    .groupby('prefix')['importance']
    .sum()
    .reset_index()
    .rename(columns={'importance': 'total_importance'}))

# Get best features by prefix
best_features = (df
    .assign(prefix=df['feature'].str.split('_').str[1])
    .sort_values('importance', ascending=False)
    .groupby('prefix')
    .first()
    .reset_index()
    .rename(columns={'feature': 'best_feature', 'importance': 'best_feature_importance'}))

# Join them
result = (prefix_totals
    .merge(best_features[['prefix', 'best_feature', 'best_feature_importance']], on='prefix')
    .sort_values('total_importance', ascending=False))

result

### Validation period assessments

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

wallet_performance_df, bucketed_performance_df = wiv.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
)

bucketed_performance_df

## coin performance predictions

### create coin_validation_df with metrics and returns

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Consolidate wallet scores at the coin level
wallet_scores_df = pd.DataFrame({'score': model_results['y_pred']}, index=model_results['y_test'].index)
coin_wallet_metrics_df = wicf.calculate_coin_metrics_from_wallet_scores(validation_profits_df, wallet_scores_df)

# Calculate coin performance during the validation period
coin_performance_df = wicf.calculate_coin_performance(market_data_df,
                                                     wallets_config['training_data']['validation_period_start'],
                                                     wallets_config['training_data']['validation_period_end'])

# Join aggregated wallet metrics with actual coin performance
coin_validation_df = coin_wallet_metrics_df.join(coin_performance_df, how='inner')

### plotting coin feature performance vs market cap

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Get the analysis results
segment_results, summary_df = wicf.analyze_market_cap_segments(
    coin_validation_df,
    top_n=10
)

# Or create the visualizations
wicf.plot_segment_heatmap(summary_df)
# wicf.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:

# Run analysis
top_n = wallets_config['coin_forecasting']['top_n']
max_market_cap = wallets_config['coin_forecasting']['max_market_cap']
min_market_cap = wallets_config['coin_forecasting']['min_market_cap']

metric_top_coin_performance_df = wicf.validate_coin_performance(coin_validation_df,top_n,
                                                                max_market_cap, min_market_cap)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

wicf.print_performance_analysis(coin_validation_df)

## Junkyard

## Tests failing

In [None]:
test_data = pd.DataFrame([
    # BTC wallet with imputed values
    {'coin_id': 'btc', 'wallet_address': 'wallet_a', 'date': '2024-01-01',
        'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'btc', 'wallet_address': 'wallet_a', 'date': '2024-10-01',
        'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},
    # ETH wallet with transfers
    {'coin_id': 'eth', 'wallet_address': 'wallet_a', 'date': '2024-01-01',
        'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'wallet_a', 'date': '2024-02-01',
        'usd_balance': 250, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'wallet_a', 'date': '2024-10-01',
        'usd_balance': 125, 'usd_net_transfers': 0, 'is_imputed': False}
])
test_data['date'] = pd.to_datetime(test_data['date'])
portfolio_test_data = test_data.copy()
portfolio_test_data.sort_values(['date'])

In [None]:
# @pytest.mark.unit
# def test_calculate_time_weighted_returns_multi_coin_portfolio(portfolio_test_data):
# """Tests TWR calculation for a wallet holding both BTC and ETH with mixed imputed/actual balances."""
result = wpf.calculate_time_weighted_returns(portfolio_test_data)

# Manual calculation combining BTC and ETH positions:
# Jan 1: Total $150 ($50 BTC + $100 ETH, $100 transfer)
# Feb 1: Total $250 ($50 BTC + $250 ETH, $50 transfer)
# Oct 1: Total $195 ($70 BTC + $125 ETH)

expected_twr = -0.157
expected_days = 274  # Jan 1 to Oct 1
expected_annual = ((1 + expected_twr) ** (365/274)) - 1  # ≈ -0.204

# Assertions
assert result.loc['wallet_a', 'days_held'] == expected_days
assert abs(result.loc['wallet_a', 'time_weighted_return'] - expected_twr) < 0.01
assert abs(result.loc['wallet_a', 'annualized_twr'] - expected_annual) < 0.01

In [None]:
result

In [None]:
class ProfitsValidator:
    """
    Sanity checks for a training-period profits DataFrame.

    The frame is expected to carry at least the columns coin_id, wallet_address,
    date and usd_balance. Each check returns a truthy/falsy flag; validate_all
    bundles them into a dict keyed by check name.

    Note: check_period_boundaries and check_date_range convert the 'date'
    column of the frame they are given to datetime IN PLACE.
    """
    def validate_all(self, profits_df, training_period_start, training_period_end):
        """
        Run every check against profits_df.

        Params:
        - profits_df (DataFrame): profits records to validate
        - training_period_start: first allowed date (anything pd.to_datetime accepts)
        - training_period_end: required last date (anything pd.to_datetime accepts)

        Returns:
        - dict mapping check name to its pass/fail flag
        """
        dates = dict(
            training_period_start=pd.to_datetime(training_period_start),
            training_period_end=pd.to_datetime(training_period_end),
        )

        # Checks run in this order; two of them convert 'date' in place
        results = {}
        results['no_duplicates'] = self.check_no_duplicates(profits_df)
        results['period_boundaries'] = self.check_period_boundaries(profits_df, dates)
        results['no_negatives'] = self.check_no_negative_balances(profits_df)
        results['date_range'] = self.check_date_range(profits_df, dates)
        results['no_missing'] = self.check_no_missing_values(profits_df)
        return results

    def check_no_duplicates(self, profits_df):
        """True when no (coin_id, wallet_address, date) combination appears twice"""
        key_columns = ['coin_id', 'wallet_address', 'date']
        return not profits_df.duplicated(subset=key_columns).any()

    def check_period_boundaries(self, profits_df, dates):
        """True when every coin-wallet pair has a record dated training_period_end"""
        profits_df['date'] = pd.to_datetime(profits_df['date'])
        pair_columns = ['coin_id', 'wallet_address']

        all_pairs = profits_df[pair_columns].drop_duplicates()
        on_period_end = profits_df['date'] == dates['training_period_end']
        pairs_on_period_end = profits_df.loc[on_period_end, pair_columns].drop_duplicates()

        return len(pairs_on_period_end) == len(all_pairs)

    def check_no_negative_balances(self, profits_df):
        """True when every usd_balance is at least -0.1 (small negative slack allowed)"""
        return profits_df['usd_balance'].ge(-0.1).all()

    def check_date_range(self, profits_df, dates):
        """True when no date precedes training_period_start and the latest date equals training_period_end"""
        profits_df['date'] = pd.to_datetime(profits_df['date'])
        observed_dates = profits_df['date']

        starts_in_range = observed_dates.min() >= dates['training_period_start']
        return starts_in_range and observed_dates.max() == dates['training_period_end']

    def check_no_missing_values(self, profits_df):
        """True when no cell in the frame is null"""
        has_missing = profits_df.isna().to_numpy().any()
        return not has_missing



In [None]:
# Scenario fixture: one dict per (coin_id, wallet_address, date) record. Wallet
# names describe the trading pattern each group of rows represents. Dates are
# left as strings here; every scenario has a row on 2024-10-01.
profits_data = [
    # w01_multiple_coins - btc & eth (multiple transactions, multiple coins)
    {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 120, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 180, 'usd_net_transfers': 0, 'is_imputed': True},

    {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 200, 'usd_net_transfers': 200, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 300, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 280, 'usd_net_transfers': 0, 'is_imputed': True},

    # w02_net_loss - btc (net loss)
    {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-05-01', 'usd_balance': 250, 'usd_net_transfers': -100, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-10-01', 'usd_balance': 100, 'usd_net_transfers': 0, 'is_imputed': True},

    # w03_sell_all_and_rebuy - eth (balance goes to 0 mid-period, then a new buy)
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-03-01', 'usd_balance': 0,  'usd_net_transfers': -50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-08-01', 'usd_balance': 40, 'usd_net_transfers': 40, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-10-01', 'usd_balance': 42, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04_only_period_end - sol (only final row)
    {'coin_id': 'sol', 'wallet_address': 'w04_only_period_end', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 70, 'is_imputed': False},

    # w04a_only_period_end_w_balance - eth (imputed opening balance, only transfer is on the final date)
    {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-01-01', 'usd_balance': 30, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-10-01', 'usd_balance': 90, 'usd_net_transfers': 50, 'is_imputed': False},

    # w04b_only_period_start_buy - sol
    {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-10-01', 'usd_balance': 900, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04c_only_period_start_buy_w_existing_balance - btc (opening balance exceeds the opening transfer)
    {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-01-01', 'usd_balance': 350, 'usd_net_transfers': 300, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-10-01', 'usd_balance': 1050, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04d_only_period_start_sell - sol (balance is 0 on both rows)
    {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-01-01', 'usd_balance': 0, 'usd_net_transfers': -200, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04e_only_period_start_sell_partial - btc
    {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': -10, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-10-01', 'usd_balance': 600, 'usd_net_transfers': 0, 'is_imputed': True},

    # w05_only_imputed - sol (only imputed rows at start and end)
    {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},

    # w06_tiny_transactions - very small transactions relative to portfolio size
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-01-01', 'usd_balance': 1250, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': 1, 'is_imputed': False},
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -350, 'is_imputed': False},
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w07_tiny_transactions2 - very small transactions relative to portfolio size
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-01-01', 'usd_balance': 400, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': -20, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -150, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w08_offsetting_transactions - large offsetting transactions in the middle of the period
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-01', 'usd_balance': 10400, 'usd_net_transfers': 10000, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-02', 'usd_balance': 400, 'usd_net_transfers': -10000, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-10-01', 'usd_balance': 750, 'usd_net_transfers': 0, 'is_imputed': True},

    # w09_memecoin_winner - Large swings in portfolio value
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': -500, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-05-01', 'usd_balance': 50, 'usd_net_transfers': -100, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-10-01', 'usd_balance': 10, 'usd_net_transfers': 0, 'is_imputed': True},

    # w10_memecoin_loser - Large swings in portfolio value
    {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
    {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': -20, 'is_imputed': False},

    # w11_sells_early - btc (bought and fully sold well before the final date)
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-04-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
    # NOTE(review): the next date is written '2024-5-01' (month not zero-padded), unlike
    # every other row — confirm pd.to_datetime parses it as 2024-05-01 in the pinned pandas version
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-5-01', 'usd_balance': 0, 'usd_net_transfers': -300, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w12_buys_late - sol (first transfer is one month before the final date)
    {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-09-01', 'usd_balance': 500, 'usd_net_transfers': 250, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-10-01', 'usd_balance': 550, 'usd_net_transfers': 0, 'is_imputed': True},
]

# Tabular form of the scenarios; later cells take copies of this frame
test_profits_data = pd.DataFrame(profits_data)

In [None]:
# Training window that the fixture rows are expected to fit inside
training_period_start = '2024-01-01'
training_period_end = '2024-10-01'
profits_df = test_profits_data.copy()

# Refuse to continue unless the fixture passes every ProfitsValidator check
validator = ProfitsValidator()
validation_results = validator.validate_all(profits_df, training_period_start, training_period_end)
assert all(validation_results.values()), "Test data failed validation checks."

# Mirror wmo.retrieve_datasets(): after validation, keep only rows that carry
# either a non-zero balance or a non-zero transfer
profits_df = profits_df[
    (profits_df['usd_balance'] != 0) |
    (profits_df['usd_net_transfers'] != 0)
]

# Apply the cash flow transfers logic, then derive trading features from a copy
cash_flow_profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
test_profits_df = cash_flow_profits_df.copy()
test_trading_features_df = wtf.calculate_wallet_trading_features(test_profits_df)

In [None]:
# Collapse the 17 scenario wallets onto 6 shared addresses so that each
# remapped wallet holds several coins at once
reassign_dict = {
    'w01_multiple_coins': 'w1',
    'w02_net_loss': 'w2',
    'w03_sell_all_and_rebuy': 'w2',
    'w04_only_period_end': 'w3',
    'w04a_only_period_end_w_balance': 'w3',
    'w04b_only_period_start_buy': 'w2',
    'w04c_only_period_start_buy_w_existing_balance': 'w4',
    'w04d_only_period_start_sell': 'w4',
    'w04e_only_period_start_sell_partial': 'w5',
    'w05_only_imputed': 'w5',
    'w06_tiny_transactions': 'w5',
    'w07_tiny_transactions2': 'w2',
    'w08_offsetting_transactions': 'w1',
    'w09_memecoin_winner': 'w3',
    'w10_memecoin_loser': 'w4',
    'w11_sells_early': 'w6',
    'w12_buys_late': 'w6',
}
# Keep the scenario name in a side column and overwrite wallet_address with the shared one
remapped_profits_df = test_profits_data.assign(
    wallet_address_original=test_profits_data['wallet_address'],
    wallet_address=test_profits_data['wallet_address'].map(reassign_dict),
)

# Same validate -> filter -> cash-flow sequence as for the unmapped fixture
training_period_start = '2024-01-01'
training_period_end = '2024-10-01'
profits_df = remapped_profits_df.copy()

# Refuse to continue unless the remapped data passes every ProfitsValidator check
validator = ProfitsValidator()
validation_results = validator.validate_all(profits_df, training_period_start, training_period_end)
assert all(validation_results.values()), "Test data failed validation checks."

# Mirror wmo.retrieve_datasets(): after validation, keep only rows that carry
# either a non-zero balance or a non-zero transfer
profits_df = profits_df[
    (profits_df['usd_balance'] != 0) |
    (profits_df['usd_net_transfers'] != 0)
]

# Apply the cash flow transfers logic
cash_flow_profits_df = wtf.add_cash_flow_transfers_logic(profits_df)

# Every shared address must still be present after filtering, and nothing else
expected_addresses = ['w1', 'w2', 'w3', 'w4', 'w5', 'w6']
assert sorted(cash_flow_profits_df['wallet_address'].unique()) == expected_addresses

test_remapped_profits_df = cash_flow_profits_df.copy()
test_remapped_trading_features_df = wtf.calculate_wallet_trading_features(test_remapped_profits_df)

In [None]:
# import pytest
# import pandas as pd
# import numpy as np

# @pytest.mark.unit
# def test_twr_aggregation_after_remapping(test_remapped_profits_df):
"""
Validates time-weighted return calculations by comparing:
1. TWR calculated on individual coin-wallet pairs then aggregated
2. TWR calculated on wallet-level portfolio values (treating all coins as one portfolio)

Approach:
1. First aggregate portfolio values and cash flows by date for each wallet
2. Calculate TWR on wallet-level portfolio sequences
3. Compare against calculate_time_weighted_returns() results
"""
# Step 1: Collapse coin-level rows into one row per wallet per date
wallet_daily = (
    test_remapped_profits_df
    .groupby(['wallet_address', 'date'])
    .agg(
        usd_balance=('usd_balance', 'sum'),              # combined value across coins
        usd_net_transfers=('usd_net_transfers', 'sum'),  # combined cash flow across coins
        is_imputed=('is_imputed', 'any'),                # True if any coin's row was imputed
    )
    .reset_index()
    .sort_values(['wallet_address', 'date'])
)

# Step 2: Calculate wallet-level TWR metrics
def calculate_portfolio_twr(wallet_data):
    """
    Calculate day-weighted TWR metrics for a single wallet's aggregated portfolio.

    All intermediate values are held in local Series, so the caller's frame is
    left untouched. Writing scratch columns onto it would mutate the argument
    and can raise SettingWithCopyWarning when it is a groupby slice.

    Params:
    - wallet_data (DataFrame): one wallet's rows in chronological order, with
        columns 'date' (datetime), 'usd_balance' and 'usd_net_transfers'

    Returns:
    - pd.Series with keys 'time_weighted_return', 'days_held', 'annualized_twr'
    """
    balances = wallet_data['usd_balance']
    transfers = wallet_data['usd_net_transfers']
    dates = wallet_data['date']

    # Balance before the row's own transfer was applied
    pre_transfer_balance = balances - transfers

    # Previous row's balance and the number of days since that row (0 for the first row)
    prev_balance = balances.shift()
    days_held = dates.diff().dt.days.fillna(0)

    # Growth ratio since the previous row; rows with a transfer compare the
    # pre-transfer balance so the cash flow itself is not counted as return
    period_return = pd.Series(
        np.where(
            transfers != 0,
            pre_transfer_balance / prev_balance,
            balances / prev_balance,
        ),
        index=wallet_data.index,
    )
    # A missing or zero previous balance gives NaN/inf; treat those periods as flat
    period_return = period_return.replace([np.inf, -np.inf], 1).fillna(1)

    # Each period's return weighted by the days it covers
    weighted_return = (period_return - 1) * days_held

    # Calendar days between first and last observation, floored at 1 to avoid dividing by zero
    total_days = max((dates.max() - dates.min()).days, 1)

    twr = weighted_return.sum() / total_days
    ann_twr = ((1 + twr) ** (365 / total_days)) - 1

    return pd.Series({
        'time_weighted_return': twr,
        'days_held': total_days,
        'annualized_twr': ann_twr
    })

# Expected values: run the portfolio-level helper on each wallet's aggregated rows
expected_twr = wallet_daily.groupby('wallet_address').apply(calculate_portfolio_twr).sort_index()

# Actual values: the function under test applied to the same remapped profits
actual_twr = wpf.calculate_time_weighted_returns(test_remapped_profits_df).sort_index()

# The two frames must agree to within a relative tolerance of 1e-5 (0.001%)
pd.testing.assert_frame_equal(
    left=expected_twr,
    right=actual_twr,
    rtol=1e-5,
    check_exact=False,
)


In [None]:
# Cell output: portfolio-level TWR frame built with calculate_portfolio_twr
expected_twr

In [None]:
# Cell output: TWR frame returned by wpf.calculate_time_weighted_returns
actual_twr

In [None]:
# NOTE(review): orig_twr_with_mapping is not assigned in any of the nearby cells;
# presumably left over from an earlier revision — confirm it is still defined or drop this cell
orig_twr_with_mapping

In [None]:
# Cell output: expected_twr shown again (same variable as the earlier display cell)
expected_twr

In [None]:
# Cell output: actual_twr shown again (same variable as the earlier display cell)
actual_twr