In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Show floats with up to 12 significant digits instead of the pandas default
pd.set_option('display.float_format', '{:.12g}'.format)
# To restore the pandas default: pd.reset_option('display.float_format')

# Hide warnings whose message starts with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment for the notification sound: hide the pygame support prompt and
# point ALERT_SOUND_FILEPATH at the alert sound file
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with a light green-gray foreground
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model_experiment as wme
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.features_orchestrator as wfo

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.validation_analysis as wiv
import wallet_insights.coin_forecasting as wicf


# reload all modules
# importlib.reload re-executes each already-imported local module so that source edits are
# picked up without restarting the kernel. The list is bound to `modules` because later
# cells repeat the same reload before they run.
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
           wmo, wtd, wmr, wme,
           wcl, wmc, wmt, wpf, wtf, wts, wfo,
           wime, wiv, wicf]
[importlib.reload(module) for module in modules]

# load all configs
# The four general configs come from a single helper call; the wallet configs are loaded
# three different ways (WalletsConfig class, u.load_config, plain yaml.safe_load).
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
# Re-import the local modules and re-read the wallet configs so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Export the code in the 'wallet_features' directory through the project helper
# NOTE(review): the output location and format are decided inside u.export_code — not visible here
u.export_code(code_directories=['wallet_features'])

## Full Training Data Sequence

### retrieve datasets

In [None]:
# Re-import the local modules and re-read the wallet configs so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve datasets
profits_df,market_data_df = wmo.retrieve_datasets()
# Keep a copy of the retrieved profits data: a later scratch cell rebinds the name profits_df
profits_df_full = profits_df.copy()

In [None]:
# Re-import the local modules and re-read the wallet configs so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))



# Define wallet cohort after cleaning
training_wallet_metrics_df,wallet_cohort = wmo.define_wallet_cohort(profits_df,market_data_df)

# Generate profits_df for all training windows and the modeling period
# Returns four objects: the full training frame, a collection of per-window frames (iterated
# in the feature generation cell), the modeling period frame and the validation period frame.
training_profits_df, training_windows_profits_dfs, modeling_profits_df, validation_profits_df = wmo.split_profits_df(profits_df,
                                                                               market_data_df,wallet_cohort)

# Market data: add indicators
# Indicator definitions are read from the 'time_series' -> 'market_data' section of the
# wallets metrics config; 'coin_id' is passed as the third argument.
# NOTE(review): presumably 'coin_id' is the grouping column for the time series — confirm in ind
market_indicators_data_df = ind.generate_time_series_indicators(market_data_df,
                                                        wallets_metrics_config['time_series']['market_data'],
                                                        'coin_id')


# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_cohort
transfers_sequencing_df = wts.retrieve_transfers_sequencing()

### generate features

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate features for the full training dataset
training_wallet_features_df = wfo.calculate_wallet_features(training_profits_df, market_indicators_data_df,
                                                           transfers_sequencing_df, wallet_cohort)

# Define the full feature set by appending a suffix for each window
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# Generate features for each window
# Windows are numbered from 1, so their columns get the suffixes _w1, _w2, ...
# NOTE: window_profits_df remains bound to the LAST window after this loop and is read
# again by the scratch cells further down.
for i, window_profits_df in enumerate(training_windows_profits_dfs, 1):
    # Generate the features
    window_wallet_features_df = wfo.calculate_wallet_features(window_profits_df, market_indicators_data_df,
                                                             transfers_sequencing_df, wallet_cohort)

    # Add column suffix and join to training_data_df
    # Left join: every row of training_data_df is kept even when a window has no row for it
    window_wallet_features_df = window_wallet_features_df.add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')

# Append clustering features based on all numeric features in the base training data
# Inner join: rows without cluster features are dropped from training_data_df
cluster_features = wcl.create_basic_cluster_features(training_data_df)
training_data_df = training_data_df.join(cluster_features, how='inner')


# Summary statistics of the assembled training features
training_data_df.describe()

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create a DataFrame with all wallets that should exist
wallet_features_df = pd.DataFrame(index=wallet_cohort)
wallet_features_df.index.name = 'wallet_address'

# Trading features (inner join, custom fill)
# NOTE(review): window_profits_df is the loop variable left over from the feature generation
# cell, i.e. the last training window. This assignment also rebinds the notebook-level name
# profits_df (the retrieved data is still available as profits_df_full), so re-running the
# cohort / split cells after this cell would operate on this window-level frame instead.
profits_df = wtf.add_cash_flow_transfers_logic(window_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)
trading_features_df = wtf.fill_trading_features_data(trading_features_df, wallet_cohort)
wallet_features_df = wallet_features_df.join(trading_features_df, how='inner')

# Time weighted returns (fill zeros)
# Only the columns that come from time_weighted_returns_df are zero-filled; other columns
# keep whatever missing values they have.
time_weighted_returns_df = wpf.calculate_time_weighted_returns(profits_df)
wallet_features_df = wallet_features_df.join(time_weighted_returns_df, how='left')\
    .fillna({col: 0 for col in time_weighted_returns_df.columns})
# Performance features are computed but not joined back: the join below is disabled
performance_features_df = wpf.calculate_performance_features(wallet_features_df)
# wallet_features_df = wallet_features_df.join(
#     performance_features_df.drop(['max_investment', 'total_net_flows'], axis=1),
#     how='inner'
# )

In [None]:
# Attach the time weighted return columns to the performance features.
# Copies of those columns left behind by an earlier run of this cell are dropped first so
# the cell can be re-executed: a bare join raises ValueError on overlapping column names.
performance_features_df = (
    performance_features_df
    .drop(columns=time_weighted_returns_df.columns, errors='ignore')
    .join(time_weighted_returns_df)
)

In [None]:
# Display the wallet-level frame built above (cohort index + trading features + time weighted returns)
wallet_features_df

In [None]:

# Add a 'return' column in place: total_net_flows / max_investment, forced to 0 where
# max_investment is 0 so those rows do not end up as inf/NaN.
# NOTE: np.where evaluates the division for every row before selecting, so the zero guard
# only affects the stored value, not whether the division is computed.
trading_features_df['return'] = np.where(abs(trading_features_df['max_investment']) == 0,0,
                                trading_features_df['total_net_flows'] / trading_features_df['max_investment'])
trading_features_df

In [None]:
# Summary statistics for every performance feature column
performance_features_df.describe()

In [None]:
# Pairwise Pearson correlations between the performance feature columns (index discarded first)
performance_features_df.reset_index(drop=True).corr(method='pearson')

In [None]:
def create_corr_matrix_viz(df: pd.DataFrame, figsize=(12, 10)) -> None:
    """
    Creates and plots correlation matrix heatmap for numerical features.

    Only numeric and boolean columns take part in the correlation; any other column
    (identifiers, labels, dates) is left out instead of making DataFrame.corr raise.
    The heatmap is drawn on a new matplotlib figure and nothing is returned, so the
    caller decides whether to show or save it.

    Params:
    - df (DataFrame): input feature dataframe
    - figsize (tuple): figure size for plot, defaults to (12, 10)

    Raises:
    - ValueError: if df has no numeric or boolean columns to correlate
    """
    # Validate before touching the plotting libraries so bad input fails fast
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    if numeric_df.shape[1] == 0:
        raise ValueError("create_corr_matrix_viz requires at least one numeric column")

    # Local imports keep the function usable when copied out of the notebook
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Calculate correlation matrix
    corr_matrix = numeric_df.reset_index(drop=True).corr(method='pearson')

    # Create heatmap
    plt.figure(figsize=figsize)
    sns.heatmap(
        corr_matrix,
        annot=True,  # Show correlation values
        cmap='RdBu',  # Red-Blue diverging colormap
        center=0,     # Center colormap at 0
        fmt='.2f',    # Round to 2 decimal places
        square=True,  # Make cells square
        cbar_kws={'label': 'Correlation Coefficient'}
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()

# Draw the correlation heatmap for the performance features and render the figure
create_corr_matrix_viz(performance_features_df)
plt.show()


In [None]:
# Display the trading features (includes the 'return' column if the cell adding it has run)
trading_features_df

In [None]:
# Recompute the time weighted returns from the current profits_df and summarize them
time_weighted_returns_df = wpf.calculate_time_weighted_returns(profits_df)
time_weighted_returns_df.describe()

In [None]:
# Spot-check one wallet: list its training-period rows ordered by coin and date
w = 39759  # wallet_address value to inspect
training_profits_df[training_profits_df['wallet_address']==w].sort_values(by=['coin_id','date'])

In [None]:
# Trading features for the same wallet inspected above (the id is hard-coded here, not read from `w`)
trading_features_df.loc[39759]

In [None]:
# Time weighted returns computed straight from window_profits_df (the last training window),
# i.e. without the wtf.add_cash_flow_transfers_logic step applied to profits_df above
time_weighted_performance_df = wpf.calculate_time_weighted_returns(window_profits_df)
time_weighted_performance_df

In [None]:
# Preview the trading features. The cell previously held a truncated identifier, which
# raised NameError and stopped Restart & Run All.
trading_features_df.head()

### join target variable to training data

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Clean inactive wallets from modeling period data
modeling_wallets_df = wmo.filter_modeling_period_wallets(modeling_profits_df)

# Generate target variables
# The same performance feature function used on training data is applied to the modeling period
target_vars_df = wpf.calculate_performance_features(modeling_wallets_df)

# Merge training data and target variables?
# Only the column named by the config key modeling -> target_variable is joined; the inner
# join keeps just the rows present in both the training features and the target frame.
modeling_df = training_data_df.join(target_vars_df[wallets_config['modeling']['target_variable']],
                                    how='inner')


## Wallet Modeling

### build model

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create an experiment instance
experiment = wme.WalletModel(wallets_config)

# Run the experiment and get results
# model_results is read as a dict below and in later cells ('pipeline', 'X_test', 'y_test', 'y_pred')
model_results = experiment.run_experiment(modeling_df)

# Extract the trained model
# The estimator is the pipeline step registered under the name 'regressor'
model = model_results['pipeline'].named_steps['regressor']

### assess model performance

In [None]:
### save model artifacts
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate and save all model artifacts
# NOTE: wallet_scores_df and coin_validation_df returned here are reassigned by the
# coin_validation_df cell further down.
model_id, evaluator, wallet_scores_df, coin_validation_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    validation_profits_df=validation_profits_df,
    base_path='../wallet_modeling'
)
# Notify that the artifact step has finished
# NOTE(review): presumably plays the file named by ALERT_SOUND_FILEPATH — confirm in utils
u.play_notification()

# Print results
evaluator.plot_evaluation()

### Validation period assessments

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Score the test-set predictions against the validation period profits
# Returns a per-wallet frame and a bucketed summary; only the bucketed summary is displayed
wallet_performance_df, bucketed_performance_df = wiv.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
)

bucketed_performance_df

## coin performance predictions

### create coin_validation_df with metrics and returns

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Consolidate wallet scores at the coin level
# Scores are the test-set predictions, indexed like y_test. This reassigns the
# wallet_scores_df returned earlier by wmr.generate_and_save_model_artifacts.
wallet_scores_df = pd.DataFrame({'score': model_results['y_pred']}, index=model_results['y_test'].index)
coin_wallet_metrics_df = wicf.calculate_coin_metrics_from_wallet_scores(validation_profits_df, wallet_scores_df)

# Calculate coin performance during the validation period
coin_performance_df = wicf.calculate_coin_performance(market_data_df,
                                                     wallets_config['training_data']['validation_period_start'],
                                                     wallets_config['training_data']['validation_period_end'])

# Join aggregated wallet metrics with actual coin performance
# Inner join: only coins present in both frames are kept. Also reassigns coin_validation_df.
coin_validation_df = coin_wallet_metrics_df.join(coin_performance_df, how='inner')

### plotting coin feature performance vs market cap

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Get the analysis results
# NOTE: top_n is hard-coded to 10 here, while other cells read it from
# wallets_config['coin_forecasting']['top_n'].
segment_results, summary_df = wicf.analyze_market_cap_segments(
    coin_validation_df,
    top_n=10
)

# Or create the visualizations
wicf.plot_segment_heatmap(summary_df)
# wicf.plot_metric_consistency(summary_df)  # Optional secondary visualization
# wicf.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:

# Run analysis
# Selection size and market cap bounds all come from the coin_forecasting config section;
# top_n, max_market_cap and min_market_cap stay bound for the later cells that reuse them.
top_n = wallets_config['coin_forecasting']['top_n']
max_market_cap = wallets_config['coin_forecasting']['max_market_cap']
min_market_cap = wallets_config['coin_forecasting']['min_market_cap']

metric_top_coin_performance_df = wicf.validate_coin_performance(coin_validation_df,top_n,
                                                                max_market_cap, min_market_cap)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Print the project's performance analysis for the coin validation frame
wicf.print_performance_analysis(coin_validation_df)

## Junkyard

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Relate coin-level metrics to realized coin returns
def analyze_coin_metrics(df, metrics=None, min_relative_difference=0.1, significance_level=0.05):
    """
    Analyze relationships between coin metrics and returns

    Params:
    - df (DataFrame): must contain a 'coin_return' column plus one column per metric
    - metrics (list, optional): metric column names to analyze; defaults to the
        standard list of eight coin metrics defined below
    - min_relative_difference (float): a metric counts as a success indicator only if
        the gap between group means exceeds this fraction of the magnitude of the
        negative-return group's mean
    - significance_level (float): p-value a metric must stay below to count as a
        success indicator

    Returns:
    - dict with three keys:
        'correlations': metric -> Pearson correlation with coin_return, ordered by
            absolute strength (undefined correlations last)
        'comparison_stats': metric -> dict with 'positive_mean', 'negative_mean',
            'difference' and 'p_value' (two-sample t-test on the non-missing values)
        'success_indicators': the subset of comparison_stats passing both thresholds
    """
    if metrics is None:
        metrics = [
            'weighted_avg_score',
            'composite_score',
            'score_confidence',
            'top_wallet_balance_pct',
            'top_wallet_count_pct',
            'total_wallets',
            'avg_wallet_balance',
            'market_cap'
        ]

    coin_returns = df['coin_return']

    # Calculate correlations with coin_return
    correlations = {metric: df[metric].corr(coin_returns) for metric in metrics}

    # Sort correlations by absolute value; NaN gets a key below every real |r| so that
    # undefined correlations land at the end instead of scrambling the order
    correlations_sorted = dict(sorted(
        correlations.items(),
        key=lambda item: -1.0 if pd.isna(item[1]) else abs(item[1]),
        reverse=True
    ))

    # Split coins by the sign of their return (rows with a missing return fall in neither group)
    positive_returns = df[coin_returns > 0]
    negative_returns = df[coin_returns <= 0]

    comparison_stats = {}
    for metric in metrics:
        # Drop missing values up front so the means and the t-test use the same
        # observations; with NaNs present the t-test would otherwise return NaN
        positive_values = positive_returns[metric].dropna()
        negative_values = negative_returns[metric].dropna()

        pos_mean = positive_values.mean()
        neg_mean = negative_values.mean()
        # Perform t-test
        _, p_value = stats.ttest_ind(positive_values, negative_values)

        comparison_stats[metric] = {
            'positive_mean': pos_mean,
            'negative_mean': neg_mean,
            'difference': pos_mean - neg_mean,
            'p_value': p_value
        }

    # Identify potential success indicators
    # The threshold uses the magnitude of the negative-group mean: with a signed mean, any
    # metric whose mean is below zero would pass the relative-difference test automatically
    success_indicators = {
        metric: metric_stats for metric, metric_stats in comparison_stats.items()
        if (abs(metric_stats['difference']) > min_relative_difference * abs(metric_stats['negative_mean'])
            and metric_stats['p_value'] < significance_level)
    }

    return {
        'correlations': correlations_sorted,
        'comparison_stats': comparison_stats,
        'success_indicators': success_indicators
    }

# Create summary statistics
def print_analysis_results(results):
    """
    Write the dict produced by analyze_coin_metrics to stdout as a three-part report.

    Params:
    - results (dict): needs the keys 'correlations' (metric -> float),
        'comparison_stats' (metric -> dict with 'positive_mean', 'negative_mean',
        'difference', 'p_value') and 'success_indicators' (metric -> dict with at
        least 'difference' and 'p_value')
    """
    def _section(banner, description):
        # Every section opens with a banner line followed by a one-line description
        print(banner)
        print(description)

    _section("\n=== Correlation Analysis ===",
             "\nCorrelations with coin return (sorted by strength):")
    for name, coefficient in results['correlations'].items():
        print(f"{name:25} : {coefficient:0.4f}")

    _section("\n=== Positive vs Negative Returns Analysis ===",
             "\nMetrics comparison for positive vs negative returns:")
    for name, summary in results['comparison_stats'].items():
        print(f"\n{name}:")
        print(f"  Positive returns mean: {summary['positive_mean']:0.4f}")
        print(f"  Negative returns mean: {summary['negative_mean']:0.4f}")
        print(f"  Difference: {summary['difference']:0.4f}")
        print(f"  P-value: {summary['p_value']:0.4f}")

    _section("\n=== Strong Success Indicators ===",
             "\nMetrics showing significant difference between positive and negative returns:")
    for name, summary in results['success_indicators'].items():
        print(f"\n{name}:")
        print(f"  Mean difference: {summary['difference']:0.4f}")
        print(f"  P-value: {summary['p_value']:0.4f}")


# Run the analysis
def main():
    """
    Run the coin metrics analysis on 'coin_wallet_metrics.csv' (read relative to the
    working directory): analyze, print the report, then draw the visualizations.
    """
    coin_metrics_df = pd.read_csv('coin_wallet_metrics.csv')

    # Analyze and report in one step
    print_analysis_results(analyze_coin_metrics(coin_metrics_df))

    # NOTE(review): create_visualizations is not defined in the visible part of this
    # notebook — confirm it exists before running, otherwise this line raises NameError
    create_visualizations(coin_metrics_df)

# NOTE: inside a notebook kernel __name__ is "__main__", so this guard is true and main()
# runs every time the cell is executed.
if __name__ == "__main__":
    main()

In [None]:
# NOTE(review): `returns`, `predictions` and `winsorization_cutoff` are not assigned in any
# visible cell of this notebook — this cell depends on kernel state from elsewhere.

# Winsorize the returns (apply caps to the top n % of values)
returns_winsorized = u.winsorize(returns, winsorization_cutoff)

# Merge datasets
df = pd.DataFrame({
    'predictions': predictions,
    'returns': returns_winsorized,
})

# Sort by actual returns to obtain optimal performance
# Running total and running mean of returns when rows are taken best-return-first
df_sorted = df.sort_values('returns', ascending=False)
cumulative_best_returns = np.cumsum(df_sorted['returns'])
cumulative_best_avg_returns = df_sorted['returns'].expanding().mean()

# Sort by model score to obtain modeled performance
# Same running statistics when rows are taken highest-prediction-first (df_sorted is rebound)
df_sorted = df.sort_values('predictions', ascending=False)
cumulative_model_returns = np.cumsum(df_sorted['returns'])
cumulative_model_avg_returns = df_sorted['returns'].expanding().mean()

# Calculate average return across all data
average_return = np.mean(returns_winsorized)

In [None]:
# Display the predictions / winsorized returns frame built in the previous cell
df

In [None]:
# Running total of returns with rows ordered by model prediction, highest first
cumulative_model_returns

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Run analysis
# NOTE(review): same statements as the "coin performance of top n for each bucket" cell,
# preceded by a reload — candidate for removal or for a shared helper.
top_n = wallets_config['coin_forecasting']['top_n']
max_market_cap = wallets_config['coin_forecasting']['max_market_cap']
min_market_cap = wallets_config['coin_forecasting']['min_market_cap']

metric_top_coin_performance_df = wicf.validate_coin_performance(coin_validation_df,top_n,
                                                                max_market_cap, min_market_cap)

metric_top_coin_performance_df

In [None]:
# Re-import the local modules and re-read the wallets config so edits on disk take effect
[importlib.reload(module) for module in modules]
wallets_config.reload()

# List the coins that would have been picked at the start of the validation period
# Keeps coins whose market_cap_filled lies within [min_market_cap, max_market_cap] (inclusive).
# NOTE: max_market_cap, min_market_cap and top_n are taken from an earlier cell; they are
# not re-read from the config here even though the config was just reloaded.
top_coins_df = coin_validation_df[
    (coin_validation_df['market_cap_filled']<=max_market_cap)
    & (coin_validation_df['market_cap_filled']>=min_market_cap)
].copy()

# Column to rank by, named by the config key coin_forecasting -> sort_method
sort_column = wallets_config['coin_forecasting']['sort_method']

# Highest values of the sort column first, limited to top_n rows
top_coins_df.sort_values(sort_column,ascending=False).head(top_n)

## Tests failing

In [None]:


class ProfitsValidator:
    """
    Validates profits DataFrame follows expected format and constraints.
    Only validates training period data.

    Every check returns a plain Python bool and leaves the DataFrame it is given
    untouched; date strings are parsed into temporary Series, never written back.
    """
    # Lowest usd_balance accepted by check_no_negative_balances.
    # NOTE(review): slightly below zero, presumably to tolerate rounding noise — confirm
    MIN_USD_BALANCE = -0.1

    def validate_all(self, profits_df, training_period_start, training_period_end):
        """
        Run all validation checks and return dict of results

        Params:
        - profits_df (DataFrame): needs columns coin_id, wallet_address, date, usd_balance
        - training_period_start: first day of the training period (anything pd.to_datetime accepts)
        - training_period_end: last day of the training period (anything pd.to_datetime accepts)

        Returns:
        - dict mapping check name ('no_duplicates', 'period_boundaries', 'no_negatives',
            'date_range', 'no_missing') to True when the check passes
        """
        dates = {
            'training_period_start': pd.to_datetime(training_period_start),
            'training_period_end': pd.to_datetime(training_period_end),
        }

        return {
            'no_duplicates': self.check_no_duplicates(profits_df),
            'period_boundaries': self.check_period_boundaries(profits_df, dates),
            'no_negatives': self.check_no_negative_balances(profits_df),
            'date_range': self.check_date_range(profits_df, dates),
            'no_missing': self.check_no_missing_values(profits_df)
        }

    def check_no_duplicates(self, profits_df):
        """Check that every (coin_id, wallet_address, date) combination appears at most once"""
        deduped_df = profits_df[['coin_id', 'wallet_address', 'date']].drop_duplicates()
        return len(profits_df) == len(deduped_df)

    def check_period_boundaries(self, profits_df, dates):
        """
        Check that every coin-wallet pair has a record on the training period end date.

        Only dates['training_period_end'] is examined; the period start is not checked here.
        """
        pair_columns = ['coin_id', 'wallet_address']
        # Parse into a separate Series so the caller's 'date' column keeps its dtype
        record_dates = pd.to_datetime(profits_df['date'])

        n_pairs = len(profits_df[pair_columns].drop_duplicates())
        end_rows = profits_df.loc[record_dates == dates['training_period_end'], pair_columns]
        return len(end_rows.drop_duplicates()) == n_pairs

    def check_no_negative_balances(self, profits_df):
        """Check that no usd_balance is below MIN_USD_BALANCE"""
        return bool((profits_df['usd_balance'] >= self.MIN_USD_BALANCE).all())

    def check_date_range(self, profits_df, dates):
        """Verify the earliest date is not before the period start and the latest date is the period end"""
        # Parse into a separate Series so the caller's 'date' column keeps its dtype
        record_dates = pd.to_datetime(profits_df['date'])
        return bool(record_dates.min() >= dates['training_period_start']
                    and record_dates.max() == dates['training_period_end'])

    def check_no_missing_values(self, profits_df):
        """Check that no cell in any column is missing"""
        return not profits_df.isna().any().any()


class TestPeriods:
    """Holds the training period start and end dates as 'YYYY-MM-DD' strings."""
    # First day of the training period
    TRAINING_PERIOD_START: str = '2024-01-01'
    # Last day of the training period
    # NOTE(review): the fixture cells in this notebook use '2024-10-01' as the period end;
    # confirm '2024-01-10' here is intentional and not a transposed month/day
    TRAINING_PERIOD_END: str = '2024-01-10'



In [None]:
# Re-import every tracked project module, then refresh the wallets config
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

In [None]:
# Re-import every tracked project module, then refresh the wallets config
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# @pytest.mark.unit
# def test_complex_scenarios():
"""
Test multiple complex business scenarios in a single profits_df.

Scenarios covered:
- Many-to-many relationships between wallets and coins.
- A wallet with only imputed rows at start and end (no real trades).
- A wallet with only 1 row at training period end.
- A wallet ending with a net loss.
- A wallet that sells full balance mid-period and repurchases later.
- Additionally tested: incremental investments, multiple coins, and ensuring
    that all coin-wallet pairs appear at the training period end.
"""

# Training period boundaries used by this scratch fixture
training_period_start, training_period_end = '2024-01-01', '2024-10-01'

# Sample profits records: one dict per (coin_id, wallet_address, date) row
profits_data = [
    # w01_multiple_coins - btc & eth (multiple transactions, multiple coins)
    dict(coin_id='btc', wallet_address='w01_multiple_coins', date='2024-01-01', usd_balance=100, usd_net_transfers=100, is_imputed=False),
    dict(coin_id='btc', wallet_address='w01_multiple_coins', date='2024-05-01', usd_balance=120, usd_net_transfers=50, is_imputed=False),
    dict(coin_id='btc', wallet_address='w01_multiple_coins', date='2024-10-01', usd_balance=180, usd_net_transfers=0, is_imputed=True),

    dict(coin_id='eth', wallet_address='w01_multiple_coins', date='2024-01-01', usd_balance=200, usd_net_transfers=200, is_imputed=False),
    dict(coin_id='eth', wallet_address='w01_multiple_coins', date='2024-05-01', usd_balance=300, usd_net_transfers=50, is_imputed=False),
    dict(coin_id='eth', wallet_address='w01_multiple_coins', date='2024-10-01', usd_balance=280, usd_net_transfers=0, is_imputed=True),

    # w02_net_loss - btc (net loss)
    dict(coin_id='btc', wallet_address='w02_net_loss', date='2024-01-01', usd_balance=300, usd_net_transfers=300, is_imputed=False),
    dict(coin_id='btc', wallet_address='w02_net_loss', date='2024-05-01', usd_balance=250, usd_net_transfers=-100, is_imputed=False),
    dict(coin_id='btc', wallet_address='w02_net_loss', date='2024-10-01', usd_balance=100, usd_net_transfers=0, is_imputed=True),

    # w03_sell_all_and_rebuy - eth (full exit mid-period, later re-entry)
    dict(coin_id='eth', wallet_address='w03_sell_all_and_rebuy', date='2024-01-01', usd_balance=50, usd_net_transfers=50, is_imputed=False),
    dict(coin_id='eth', wallet_address='w03_sell_all_and_rebuy', date='2024-03-01', usd_balance=0, usd_net_transfers=-50, is_imputed=False),
    dict(coin_id='eth', wallet_address='w03_sell_all_and_rebuy', date='2024-08-01', usd_balance=40, usd_net_transfers=40, is_imputed=False),
    dict(coin_id='eth', wallet_address='w03_sell_all_and_rebuy', date='2024-10-01', usd_balance=42, usd_net_transfers=0, is_imputed=True),

    # The remaining scenarios are switched off in this scratch cell:
    #   w04_only_period_end, w04a_only_period_end_w_balance, w04b_only_period_start_buy,
    #   w04c_only_period_start_buy_w_existing_balance, w04d_only_period_start_sell,
    #   w04e_only_period_start_sell_partial, w05_only_imputed, w06_tiny_transactions,
    #   w07_tiny_transactions2, w08_offsetting_transactions, w09_memecoin_winner,
    #   w10_memecoin_loser, w11_sells_early, w12_buys_late
    # Their rows are defined, uncommented, in the full fixture cell that follows this one.
]

profits_df = pd.DataFrame(profits_data)

# # Validate test data format before proceeding
# validator = ProfitsValidator()
# validation_results = validator.validate_all(
#     profits_df,
#     training_period_start,
#     training_period_end
# )
# assert all(validation_results.values()), "Test data failed validation checks."

# # # Remove rows with a rounded 0 balance and 0 transfers which happens in wmo.retrieve_datasets() once validation checks are passed
# # profits_df = profits_df[
# #     ~((profits_df['usd_balance'] == 0) &
# #     (profits_df['usd_net_transfers'] == 0))
# # ]

# # # Add cash flow transfers logic
# # test_profits_df = wtf.add_cash_flow_transfers_logic(profits_df)

# # # # Compute wallet level trading features
# # # test_trading_features_df = wtf.calculate_wallet_trading_features(test_profits_df)

# # test_profits_df['wallet_address'].unique()

In [None]:
# Re-import every tracked project module, then refresh the wallets config
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Sample profits records used to build profits_df: one dict per (coin_id, wallet_address, date) row.
# Every date is a zero-padded 'YYYY-MM-DD' string so the raw values sort chronologically and
# match as strings before any datetime conversion happens.
profits_data = [
    # w01_multiple_coins - btc & eth (multiple transactions, multiple coins)
    {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 120, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 180, 'usd_net_transfers': 0, 'is_imputed': True},

    {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 200, 'usd_net_transfers': 200, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 300, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 280, 'usd_net_transfers': 0, 'is_imputed': True},

    # w02_net_loss - btc (net loss)
    {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-05-01', 'usd_balance': 250, 'usd_net_transfers': -100, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-10-01', 'usd_balance': 100, 'usd_net_transfers': 0, 'is_imputed': True},

    # w03_sell_all_and_rebuy - eth
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': -50, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-08-01', 'usd_balance': 40, 'usd_net_transfers': 40, 'is_imputed': False},
    {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-10-01', 'usd_balance': 42, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04_only_period_end - sol (only final row)
    {'coin_id': 'sol', 'wallet_address': 'w04_only_period_end', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 70, 'is_imputed': False},

    # w04a_only_period_end_w_balance - eth
    {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-01-01', 'usd_balance': 30, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-10-01', 'usd_balance': 90, 'usd_net_transfers': 50, 'is_imputed': False},

    # w04b_only_period_start_buy - sol
    {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-10-01', 'usd_balance': 900, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04c_only_period_start_buy_w_existing_balance - btc
    {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-01-01', 'usd_balance': 350, 'usd_net_transfers': 300, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-10-01', 'usd_balance': 1050, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04d_only_period_start_sell - sol
    {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-01-01', 'usd_balance': 0, 'usd_net_transfers': -200, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w04e_only_period_start_sell_partial - btc
    {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': -10, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-10-01', 'usd_balance': 600, 'usd_net_transfers': 0, 'is_imputed': True},

    # w05_only_imputed - sol (only imputed rows at start and end)
    {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},

    # w06_tiny_transactions - very small transactions relative to portfolio size
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-01-01', 'usd_balance': 1250, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': 1, 'is_imputed': False},
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -350, 'is_imputed': False},
    {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w07_tiny_transactions2 - very small transactions relative to portfolio size
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-01-01', 'usd_balance': 400, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': -20, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -150, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w08_offsetting_transactions - large offsetting transactions in the middle of the period
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-01', 'usd_balance': 10400, 'usd_net_transfers': 10000, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-02', 'usd_balance': 400, 'usd_net_transfers': -10000, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-10-01', 'usd_balance': 750, 'usd_net_transfers': 0, 'is_imputed': True},

    # w09_memecoin_winner - Large swings in portfolio value
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': -500, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-05-01', 'usd_balance': 50, 'usd_net_transfers': -100, 'is_imputed': False},
    {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-10-01', 'usd_balance': 10, 'usd_net_transfers': 0, 'is_imputed': True},

    # w10_memecoin_loser - Large swings in portfolio value
    {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
    {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': -20, 'is_imputed': False},

    # w11_sells_early - btc
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-04-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-05-01', 'usd_balance': 0, 'usd_net_transfers': -300, 'is_imputed': False},
    {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

    # w12_buys_late - sol
    {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-09-01', 'usd_balance': 500, 'usd_net_transfers': 250, 'is_imputed': False},
    {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-10-01', 'usd_balance': 550, 'usd_net_transfers': 0, 'is_imputed': True},
]


In [None]:

# Build a fresh frame from the fixture rows and set the training window
profits_df = pd.DataFrame(profits_data).copy()
training_period_start, training_period_end = '2024-01-01', '2024-10-01'

# Validate test data format before proceeding
validator = ProfitsValidator()
validation_results = validator.validate_all(profits_df, training_period_start, training_period_end)
assert all(validation_results.values()), "Test data failed validation checks."

# Keep only rows that carry a nonzero balance or a nonzero transfer. The original note
# says wmo.retrieve_datasets() removes the all-zero rows once validation has passed.
profits_df = profits_df[
    (profits_df['usd_balance'] != 0) |
    (profits_df['usd_net_transfers'] != 0)
]

# Add cash flow transfers logic, then derive the wallet-level trading features
cash_flow_profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
wallet_features_df = wtf.calculate_wallet_trading_features(cash_flow_profits_df)
# wallet_features_df

In [None]:
# Fold the scenario wallets onto a handful of shared addresses to create heavy overlap
reassign_dict = {
    'w01_multiple_coins': 'w1',
    'w02_net_loss': 'w2',
    'w03_sell_all_and_rebuy': 'w2',
    'w04_only_period_end': 'w3',
    'w04a_only_period_end_w_balance': 'w3',
    'w04b_only_period_start_buy': 'w2',
    'w04c_only_period_start_buy_w_existing_balance': 'w4',
    'w04d_only_period_start_sell': 'w4',
    'w04e_only_period_start_sell_partial': 'w5',
    'w05_only_imputed': 'w5',
    'w06_tiny_transactions': 'w5',
    'w07_tiny_transactions2': 'w2',
    'w08_offsetting_transactions': 'w1',
    'w09_memecoin_winner': 'w3',
    'w10_memecoin_loser': 'w4',
    'w11_sells_early': 'w6',
    'w12_buys_late': 'w6',
}

# Keep the original address in its own column, then overwrite wallet_address with the mapped one
reassigned_profits_df = pd.DataFrame(profits_data).copy().assign(
    wallet_address_original=lambda df: df['wallet_address'],
    wallet_address=lambda df: df['wallet_address'].map(reassign_dict),
)

# Rows whose wallet has no entry in reassign_dict map to NaN and are dropped here.
# training_period_start / training_period_end are not reassigned in this cell; the values
# set by an earlier cell are reused.
profits_df = reassigned_profits_df.dropna().copy()

# Validate test data format before proceeding
validator = ProfitsValidator()
validation_results = validator.validate_all(profits_df, training_period_start, training_period_end)
assert all(validation_results.values()), "Test data failed validation checks."

# Keep only rows that carry a nonzero balance or a nonzero transfer. The original note
# says wmo.retrieve_datasets() removes the all-zero rows once validation has passed.
profits_df = profits_df[
    (profits_df['usd_balance'] != 0) |
    (profits_df['usd_net_transfers'] != 0)
]

# Add cash flow transfers logic and compute trading features for the remapped wallets
test_remapped_profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
test_remapped_trading_features_df = wtf.calculate_wallet_trading_features(test_remapped_profits_df)

# Snapshot the un-remapped results from the earlier cell under the names the checks below use
test_profits_df = cash_flow_profits_df.copy()
test_trading_features_df = wallet_features_df.copy()

In [None]:
# Display the trading features computed for the remapped wallets
test_remapped_trading_features_df

In [None]:
 average_transaction	volume_vs_investment_ratio

In [None]:
# @pytest.mark.unit
# def test_ratio_metrics_after_remapping(test_profits_df, test_trading_features_df,
#                                     test_remapped_profits_df, test_remapped_trading_features_df):
"""
Checks the ratio-based metrics reported for the remapped wallets.

Metrics under test:
1. average_transaction = total_volume / number of transactions
2. volume_vs_investment_ratio = total_volume / max_investment

Worked example:
Suppose w1 and w2 are both remapped to new_w1, with
w1: volume=$300 (3 trades), max_inv=$200
w2: volume=$100 (2 trades), max_inv=$100
new_w1 is then expected to show:
- average_transaction = $400/5 = $80
- volume_vs_investment_ratio = $400/$300 = 1.33
"""
# Lookup from each original wallet address to the address it was remapped to
wallet_mapping = (
    test_remapped_profits_df[['wallet_address', 'wallet_address_original']]
    .drop_duplicates()
    .set_index('wallet_address_original')
    .loc[:, 'wallet_address']
)

# Number of transactions per remapped wallet: non-imputed rows with a nonzero transfer
transaction_counts = (
    test_profits_df
    .loc[lambda df: (~df['is_imputed']) & (df['usd_net_transfers'] != 0)]
    .assign(new_wallet=lambda df: df['wallet_address'].map(wallet_mapping))
    .groupby('new_wallet')
    .size()
)

# Expected average transaction, built from the total_volume feature itself
expected_avg_transaction = test_remapped_trading_features_df['total_volume'].div(transaction_counts)

# Expected volume vs investment ratio; wallets without a positive max_investment get 0
expected_vol_inv_ratio = np.where(
    test_remapped_trading_features_df['max_investment'] > 0,
    test_remapped_trading_features_df['total_volume'].div(
        test_remapped_trading_features_df['max_investment']
    ),
    0
)

# Compare against the reported features (values are compared by position; NaNs match NaNs)
assert np.allclose(
    expected_avg_transaction,
    test_remapped_trading_features_df['average_transaction'],
    equal_nan=True,
), "Average transaction doesn't match after remapping"
assert np.allclose(
    expected_vol_inv_ratio,
    test_remapped_trading_features_df['volume_vs_investment_ratio'],
    equal_nan=True,
), "Volume vs investment ratio doesn't match after remapping"

In [None]:
# NOTE(review): `total_volumes` is not assigned in any of the visible cells; presumably it is
# left over from an earlier session. Confirm it is defined above, or remove this cell.
total_volumes

In [None]:
# NOTE(review): `total_volumes` is not assigned in any of the visible cells; presumably it is
# left over from an earlier session. Confirm it is defined above, or remove this cell.
total_volumes

In [None]:
# Isolate the rows remapped to wallet 'w1', ordered by date, for manual inspection
w1_df = (
    test_remapped_profits_df
    .loc[lambda df: df['wallet_address'] == 'w1']
    .sort_values(by='date')
)
w1_df

In [None]:
# Work on a derived frame so w1_df itself is left untouched.
# Note: this rebinds the notebook-level name profits_df to the w1-only rows.
profits_df = (
    w1_df
    .assign(date=lambda df: pd.to_datetime(df['date']))
    .sort_values(['wallet_address', 'coin_id', 'date'])
    .assign(
        # Absolute size of each transfer
        abs_usd_net_transfers=lambda df: df['usd_net_transfers'].abs(),
        # Running total of cash flows per wallet, accumulated in date order and
        # aligned back onto the rows by index
        cumsum_cash_flow_transfers=lambda df: (
            df.sort_values(by='date')
            .groupby('wallet_address')['cash_flow_transfers']
            .cumsum()
        ),
    )
)
profits_df

In [None]:
# NOTE(review): `actual_max_inv` is not assigned in any of the visible cells; confirm it is
# defined earlier in the notebook, or remove this cell.
actual_max_inv

In [None]:
# NOTE(review): `actual_flows` is not assigned in any of the visible cells; confirm it is
# defined earlier in the notebook, or remove this cell.
actual_flows

In [None]:
# Display the trading features computed for the remapped wallets
test_remapped_trading_features_df

In [None]:
# NOTE(review): `expected_volumes` is not assigned in any of the visible cells; confirm it is
# defined earlier in the notebook, or remove this cell.
expected_volumes

In [None]:
# NOTE(review): `actual_volumes` is not assigned in any of the visible cells; confirm it is
# defined earlier in the notebook, or remove this cell.
actual_volumes

In [None]:
# Rebuild a frame keyed by the ORIGINAL wallet addresses from the remapped results
demapped_profits_df = (
    test_remapped_profits_df[['coin_id', 'wallet_address_original', 'date', 'cash_flow_transfers']]
    .rename(columns={'wallet_address_original': 'wallet_address'})
)

# Pair every remapped row with the matching row from the un-remapped run
merged_df = pd.merge(
    demapped_profits_df,
    cash_flow_profits_df[['coin_id', 'wallet_address', 'date', 'cash_flow_transfers']],
    on=['coin_id', 'wallet_address', 'date'],
    suffixes=('_demapped', '_cash_flow'),
)

# The cash flow transfers of matched rows must agree between the two runs
assert np.allclose(
    merged_df['cash_flow_transfers_demapped'],
    merged_df['cash_flow_transfers_cash_flow'],
)

In [None]:
# Wallet remapping for this experiment. This rebinds reassign_dict, replacing the mapping
# defined in the earlier remapping cell (w11 and w12 map to different targets here).
reassign_dict = {
    'w01_multiple_coins': 'w1',
    'w02_net_loss': 'w2',
    'w03_sell_all_and_rebuy': 'w2',
    'w04_only_period_end': 'w3',
    'w04a_only_period_end_w_balance': 'w3',
    'w04b_only_period_start_buy': 'w2',
    'w04c_only_period_start_buy_w_existing_balance': 'w4',
    'w04d_only_period_start_sell': 'w4',
    'w04e_only_period_start_sell_partial': 'w5',
    'w05_only_imputed': 'w5',
    'w06_tiny_transactions': 'w5',
    'w07_tiny_transactions2': 'w2',
    'w08_offsetting_transactions': 'w1',
    'w09_memecoin_winner': 'w3',
    'w10_memecoin_loser': 'w4',
    'w11_sells_early': 'w1',
    'w12_buys_late': 'w4',
}

# Rows dated on the period end, taken from whatever profits_df currently holds
end_profits_df = profits_df.loc[profits_df['date'] == '2024-10-01']

# Remapped copy of those rows; addresses missing from reassign_dict become NaN
reassigned_profits_df = end_profits_df.assign(
    wallet_address=lambda df: df['wallet_address'].map(reassign_dict)
)

In [None]:
# Groups a copy of profits_df by (wallet_address, coin_id). No aggregation is applied,
# so the cell's value is the GroupBy object itself rather than a result table.
profits_df.copy().groupby(['wallet_address','coin_id'])

In [None]:
# Pull one wallet's rows out of the test profits data for inspection
w = 'w05_only_imputed'
test_df = test_profits_df.loc[test_profits_df['wallet_address'] == w]
test_df

In [None]:
# Re-import every tracked project module, then refresh the wallets config
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Time weighted returns for the wallet selected in test_df
time_weighted_return_df = wpf.calculate_time_weighted_returns(test_df)
time_weighted_return_df

In [None]:
def test_calculate_time_weighted_returns_weighted_periods():
    """Checks TWR for one wallet with two holding periods of unequal length and a mid-period deposit."""
    wallet = 'w03_weighted'

    # (date, usd_balance, usd_net_transfers) for each observation
    observations = [
        ('2024-01-01', 100, 100),   # Initial $100
        ('2024-02-01', 250, 50),    # Added $50, value up
        ('2024-10-01', 125, 0),     # Value dropped
    ]
    test_data = pd.DataFrame([
        {'coin_id': 'btc', 'wallet_address': wallet, 'date': date,
            'usd_balance': balance, 'usd_net_transfers': transfers, 'is_imputed': False}
        for date, balance, transfers in observations
    ])
    test_data['date'] = pd.to_datetime(test_data['date'])

    result = wpf.calculate_time_weighted_returns(test_data)

    # Hand calculation of the expected figures
    #
    # Period 1, Jan 1 -> Feb 1 (31 days):
    #   balance before the deposit = 250 - 50 = 200
    #   growth = 200/100, i.e. a +100% return (1.0)
    #   weighted = 1.0 * 31 = 31
    #
    # Period 2, Feb 1 -> Oct 1 (243 days):
    #   growth = 125/250, i.e. a -50% return (-0.5)
    #   weighted = -0.5 * 243 = -121.5
    #
    # Whole span = 274 days
    # TWR = (31 - 121.5) / 274 = -0.33
    expected_twr = -0.33

    # Annualized = (1 - 0.33)^(365/274) - 1 ≈ -0.41
    expected_annual = ((1 + expected_twr) ** (365/274)) - 1

    # Compare the reported values with the hand calculation
    days_held = result.loc[wallet, 'days_held']
    twr = result.loc[wallet, 'time_weighted_return']
    annualized = result.loc[wallet, 'annualized_twr']

    assert days_held == 274
    assert abs(twr - expected_twr) < 0.01
    assert abs(annualized - expected_annual) < 0.01

    # return result  # Helpful for debugging

In [None]:
# Recompute TWR for the selected wallet, naming the transfers column explicitly
time_weighted_return_df = wpf.calculate_time_weighted_returns(
    test_df,
    transfers_col='usd_net_transfers',
)
time_weighted_return_df

In [None]:
# def test_calculate_time_weighted_returns_imputed_case():
"""Tests TWR calculation for a wallet with only imputed balances."""

# Setup test data: two imputed rows with no transfers, at the period start and end
test_data = pd.DataFrame([
    {'coin_id': 'btc', 'wallet_address': 'w05_only_imputed', 'date': '2024-01-01',
        'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
    {'coin_id': 'btc', 'wallet_address': 'w05_only_imputed', 'date': '2024-10-01',
        'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},
])
test_data['date'] = pd.to_datetime(test_data['date'])

# Calculate TWR through the wpf module, as every other TWR cell in this notebook does.
# A bare calculate_time_weighted_returns is never imported or defined in the visible cells,
# so calling it unqualified fails on a fresh kernel.
result = wpf.calculate_time_weighted_returns(test_data)

# Expected values
expected_twr = 0.40  # (70-50)/50 = 0.4
expected_days = 274  # Jan 1 to Oct 1
expected_annual = ((1 + expected_twr) ** (365/274)) - 1  # ≈ 0.55

# Assertions with tolerance for floating point
assert abs(result.loc['w05_only_imputed', 'time_weighted_return'] - expected_twr) < 0.001
assert result.loc['w05_only_imputed', 'days_held'] == expected_days
assert abs(result.loc['w05_only_imputed', 'annualized_twr'] - expected_annual) < 0.001

In [None]:


def test_calculate_time_weighted_returns_memecoin_loser():
    """Checks TWR for a wallet whose position is almost entirely lost by the period end."""
    wallet = 'w10_memecoin_loser'

    # (date, usd_balance, usd_net_transfers): a $250 buy, then a $20 sale that empties the position
    observations = [
        ('2024-03-01', 250, 250),
        ('2024-10-01', 0, -20),
    ]
    test_data = pd.DataFrame([
        {'coin_id': 'bome', 'wallet_address': wallet, 'date': date,
            'usd_balance': balance, 'usd_net_transfers': transfers, 'is_imputed': False}
        for date, balance, transfers in observations
    ])
    test_data['date'] = pd.to_datetime(test_data['date'])

    result = wpf.calculate_time_weighted_returns(test_data)

    # Expected figures:
    #   Mar 1 -> Oct 1 spans 214 days
    #   value goes 250 -> 0 + 20 withdrawn, i.e. a -92% return over the span

    assert result.loc[wallet, 'days_held'] == 214

    # The return should be strongly negative (about -90%)
    time_weighted_return = result.loc[wallet, 'time_weighted_return']
    assert -1.0 < time_weighted_return < -0.8

    # Annualizing must not soften the heavy loss
    annual_return = result.loc[wallet, 'annualized_twr']
    assert -1.0 < annual_return < -0.9

In [None]:
# NOTE(review): in the visible cells `annual_return` is only assigned inside
# test_calculate_time_weighted_returns_memecoin_loser, so it is not a notebook-level name.
# Confirm it is defined elsewhere, or remove this cell.
annual_return

In [None]:
# NOTE(review): in the visible cells `annual_return` is only assigned inside
# test_calculate_time_weighted_returns_memecoin_loser, so it is not a notebook-level name.
# Confirm it is defined elsewhere, or remove this cell.
annual_return

In [None]:
# Step-by-step, notebook-level walkthrough of a time weighted return calculation on test_df.
# NOTE(review): presumably mirrors wpf.calculate_time_weighted_returns; confirm against that module.
# This cell rebinds the notebook-level names profits_df, total_days and twr_df.
transfers_col = 'usd_net_transfers'
profits_df = test_df.sort_values(['wallet_address', 'coin_id', 'date'])

# Calculate holding period returns
# Balance with the row's own transfer backed out
profits_df['pre_transfer_balance'] = profits_df['usd_balance'] - profits_df[transfers_col]
# Previous row's balance within the same wallet-coin pair (NaN on each pair's first row)
profits_df['prev_balance'] = profits_df.groupby(['wallet_address', 'coin_id'])['usd_balance'].shift()
# Days elapsed since the previous row of the same wallet-coin pair (NaN on each pair's first row)
profits_df['days_held'] = profits_df.groupby(['wallet_address', 'coin_id'])['date'].diff().dt.days

# Calculate period returns and weights
# Rows with a transfer compare the pre-transfer balance to the previous balance;
# rows without one compare the balance itself
profits_df['period_return'] = np.where(
    profits_df[transfers_col] != 0,
    profits_df['pre_transfer_balance'] / profits_df['prev_balance'],
    profits_df['usd_balance'] / profits_df['prev_balance']
)
# Infinite and missing ratios are set to 1, which becomes a return of 0 in the next step
profits_df['period_return'] = profits_df['period_return'].replace([np.inf, -np.inf], 1).fillna(1)

# Weight by holding period duration
profits_df['weighted_return'] = (profits_df['period_return'] - 1) * profits_df['days_held']

# Get total days for each wallet
# Span between the wallet's earliest and latest dates, across all of its coins
total_days = profits_df.groupby('wallet_address')['date'].agg(lambda x: (x.max() - x.min()).days)

# Calculate TWR using total days held
def safe_twr(weighted_returns, wallet):
    """Sum a wallet's weighted returns and divide by its total days; 0 when there is nothing to sum."""
    if len(weighted_returns) == 0 or weighted_returns.isna().all():
        return 0
    days = max(total_days[wallet], 1)  # Get days for this wallet, minimum 1
    return weighted_returns.sum() / days

# Compute TWR and days_held using vectorized operations
# The wallet name handed to safe_twr is read from the first row of each group
twr_df = profits_df.groupby('wallet_address').agg(
    time_weighted_return=('weighted_return',
                            lambda x: safe_twr(x, profits_df.loc[x.index, 'wallet_address'].iloc[0])),
    days_held=('date', lambda x: max((x.max() - x.min()).days, 1))
)

# Annualize returns
# Compounds the return over a 365-day year; infinite results are replaced with NaN
twr_df['annualized_twr'] = ((1 + twr_df['time_weighted_return']) ** (365 / twr_df['days_held'])) - 1
twr_df = twr_df.replace([np.inf, -np.inf], np.nan)


In [None]:
# Inspect the row-level intermediates (prev_balance, days_held, period_return, weighted_return).
profits_df

In [None]:
# Inspect the per-wallet TWR, day span and annualized TWR.
twr_df

In [None]:
# @pytest.mark.unit
# def test_two_row_wallet_returns(test_profits_df, test_trading_features_df):
"""
Verify return calculations for wallets with exactly two rows.

Logic:
1. Find all wallets with exactly 2 rows in profits_df
2. Compute TWR once for the whole profits_df
3. For each wallet, calculate simple return from first to last balance
4. Compare with that wallet's TWR, expecting exact match for these simple cases

Args:
    test_profits_df: DataFrame with daily profits data
    test_trading_features_df: DataFrame with trading metrics
"""
# Find wallets with exactly two rows
two_row_wallets = (
    test_profits_df
    .groupby('wallet_address')
    .filter(lambda x: len(x) == 2)
)['wallet_address'].unique()

# Skip test if no qualifying wallets found
if len(two_row_wallets) == 0:
    pytest.skip("No wallets with exactly two rows found in test data")

# TWR does not depend on the wallet being checked, so compute it once up front
time_weighted_return_df = wpf.calculate_time_weighted_returns(test_profits_df)

for wallet in two_row_wallets:
    # Get wallet data and sort by date
    wallet_data = (
        test_profits_df[test_profits_df['wallet_address'] == wallet]
        .sort_values('date')
    )

    # Calculate simple return from first to last balance
    simple_return = (
        wallet_data.iloc[-1]['usd_balance'] /
        wallet_data.iloc[0]['usd_balance']
    ) - 1

    # Compare against this wallet's scalar TWR (the result is indexed by wallet_address)
    twr = time_weighted_return_df.loc[wallet, 'time_weighted_return']

    assert np.isclose(
        simple_return,
        twr,
        rtol=1e-10
    ), f"TWR mismatch for wallet {wallet}. Expected {simple_return:.4f}, got {twr:.4f}"

In [None]:
# Inspect the profits fixture the checks above run against.
test_profits_df

In [None]:
# Wallet labels that have exactly two rows in test_profits_df.
two_row_wallets

In [None]:
# Output of wpf.calculate_time_weighted_returns for the profits fixture.
time_weighted_return_df

In [None]:
# NOTE(review): `twr` has to come from a previously executed cell (the single-trade
# check further down assigns it); on a fresh top-to-bottom run this may be undefined.
twr

In [None]:
def test_return_ratios_in_reasonable_range(test_trading_features_df):
    """
    Verify return ratios (net_flows/outflows) are within reasonable bounds.

    Logic:
    1. Filter for wallets with positive investments
    2. Calculate return ratio as total_net_flows / total_outflows
    3. Verify ratios don't exceed 1000% (10x)

    Args:
        test_trading_features_df: DataFrame with trading metrics including flows and investments
    """
    # Filter for wallets with actual investment activity
    active_wallets = test_trading_features_df[test_trading_features_df['max_investment'] > 0]

    # Calculate return ratios where outflows exist
    valid_outflows = active_wallets[active_wallets['total_outflows'] > 0]
    return_ratios = valid_outflows['total_net_flows'] / valid_outflows['max_investment']

    assert (return_ratios.abs() <= 10).all(), (
        f"Found extreme return ratios. Max ratio: {return_ratios.abs().max():.2f}"
    )
# valid_outflows

In [None]:
# NOTE(review): in the visible code `return_ratios` is a local of
# test_return_ratios_in_reasonable_range, so this only works if the same name was
# also created at top level in the kernel — TODO confirm.
return_ratios.abs()

In [None]:
# Unfiltered version of the ratio used in the test above: no max_investment > 0 filter
# here, so wallets with a zero max_investment show up as inf/NaN.
test_trading_features_df['total_net_flows'] / test_trading_features_df['max_investment']

In [None]:
# Flow / investment / return columns to eyeball side by side.
# Each label appears once: repeating one in the selection repeats the column in the output.
cols = [
    'total_inflows',
    'total_outflows',
    'total_net_flows',
    'cash_buy_inflows',
    'cash_sell_outflows',
    'cash_net_flows',
    'max_investment',
    'time_weighted_return',
]

test_trading_features_df[cols]

In [None]:
# Pull the three cash-flow columns out as standalone Series
cash_buy_inflows, cash_sell_outflows, cash_net_flows = (
    test_trading_features_df[flow_column]
    for flow_column in ('cash_buy_inflows', 'cash_sell_outflows', 'cash_net_flows')
)

In [None]:
# NOTE(review): `single_trade_wallets` is assigned in the next cell, so on a fresh
# top-to-bottom run this cell executes before the name exists.
single_trade_wallets

In [None]:
# pandas, numpy and pytest are already imported in the notebook's first cell,
# so they are not re-imported here.

# def test_single_trade_twr(test_profits_df, test_trading_features_df):
#     """Verify TWR matches simple return for single-trade wallets"""
# Wallets with exactly two observed (non-imputed) rows, i.e. one buy and one sell
single_trade_wallets = (
    test_profits_df[~test_profits_df['is_imputed']]
    .groupby('wallet_address')
    .filter(lambda x: len(x) == 2)
)['wallet_address'].unique()

# Iterating an empty selection is a no-op, so no explicit length guard is needed
for wallet in single_trade_wallets:
    # NOTE(review): this takes ALL of the wallet's rows (imputed ones included), so the
    # first/last balances can come from imputed rows — confirm that is intended.
    wallet_data = (test_profits_df[test_profits_df['wallet_address'] == wallet]
                    .sort_values('date'))

    # Simple return from the earliest to the latest balance
    simple_return = (
        wallet_data.iloc[-1]['usd_balance'] /
        wallet_data.iloc[0]['usd_balance']
    ) - 1

    twr = test_trading_features_df.loc[wallet, 'time_weighted_return']
    assert np.isclose(simple_return, twr, rtol=1e-10), f"TWR mismatch for wallet {wallet}"

# # ===== Edge Case Tests =====
# def test_zero_balance_periods(test_profits_df, test_trading_features_df):
#     """Test wallets that go to zero balance and back"""
#     zero_balance_wallets = (
#         test_profits_df.groupby('wallet_address')
#         .filter(lambda x: (x['usd_balance'] == 0).any() and
#                         (x['usd_balance'] > 0).any())
#     )['wallet_address'].unique()

#     if len(zero_balance_wallets) > 0:
#         # Verify these wallets have valid metrics
#         for wallet in zero_balance_wallets:
#             features = test_trading_features_df.loc[wallet]
#             assert pd.notnull(features['time_weighted_return']), f"Missing TWR for wallet {wallet}"
#             assert features['total_volume'] > 0, f"Zero volume for active wallet {wallet}"

# def test_high_frequency_trading(test_profits_df, test_trading_features_df):
#     """Test wallets with frequent trading"""
#     busy_wallets = (
#         test_profits_df.groupby('wallet_address')
#         .filter(lambda x: len(x[~x['is_imputed']]) > 10)
#     )['wallet_address'].unique()

#     if len(busy_wallets) > 0:
#         high_freq_features = test_trading_features_df.loc[busy_wallets]
#         assert (high_freq_features['activity_density'] > 0).all(), "Missing activity for busy wallets"

In [None]:
# Rows and feature vector for the multi-coin wallet
wallet = 'w01_multiple_coins'
w01_profits = test_profits_df.loc[test_profits_df['wallet_address'] == wallet]
w01_features = test_trading_features_df.loc[wallet]

# Metrics that must match exactly, checked in this order
exact_expectations = {
    # Basic metrics
    'transaction_days': 2,       # Jan 1 and May 1
    'unique_coins_traded': 2,    # BTC and ETH
    'cash_buy_inflows': 400,     # Initial: BTC 100 + ETH 200, Add: BTC 50 + ETH 50
    # Volume metrics
    'total_volume': 400,         # Sum of all transfers
    'average_transaction': 100,  # 400 / 4 transactions
    # Imputed metrics
    'total_inflows': 400,        # Initial balances
}
for metric, expected_value in exact_expectations.items():
    assert w01_features[metric] == expected_value

# Should be profitable given ending balances > deposits
assert w01_features['total_net_flows'] > 0

# Activity density = transaction days over the inclusive day span of the wallet's rows
w01_dates = w01_profits['date']
total_days = (w01_dates.max() - w01_dates.min()).days + 1
assert w01_features['activity_density'] == pytest.approx(2 / total_days, rel=1e-10)


In [None]:
# Wallet labels to debug; uncomment entries to widen the selection
wallets = [
    'w01_multiple_coins',
    # 'w02_net_loss',
    # 'w04_only_period_end',
    # 'w04a_only_period_end_w_balance',
    # 'w04b_only_period_start_buy',
    # 'w04c_only_period_start_buy_w_existing_balance',
    # 'w04d_only_period_start_sell',
    # 'w04e_only_period_start_sell_partial',
    # 'w05_only_imputed',
    # 'w08_offsetting_transactions',
    # 'w10_memecoin_loser',
    # 'w11_sells_early',
]

# Independent copy of the selected wallets' rows, so later column additions
# do not touch the fixture
is_selected_wallet = test_profits_df['wallet_address'].isin(wallets)
test_df = test_profits_df.loc[is_selected_wallet].copy()

test_df


In [None]:
# Run the trading-features calculation on just the selected wallets' rows.
wtf.calculate_wallet_trading_features(test_df)

In [None]:
profits_df = test_df.sort_values(['wallet_address', 'coin_id', 'date'])

# Calculate holding period returns
profits_df['pre_transfer_balance'] = profits_df['usd_balance'] - profits_df['usd_net_transfers']
profits_df['prev_balance'] = profits_df.groupby(['wallet_address', 'coin_id'])['usd_balance'].shift()
profits_df['days_held'] = profits_df.groupby(['wallet_address', 'coin_id'])['date'].diff().dt.days

# Calculate period returns and weights
profits_df['period_return'] = np.where(
    profits_df['usd_net_transfers'] != 0,
    profits_df['pre_transfer_balance'] / profits_df['prev_balance'],
    profits_df['usd_balance'] / profits_df['prev_balance']
)
# Undefined ratios (first row of a position, zero previous balance) count as a flat period
profits_df['period_return'] = profits_df['period_return'].replace([np.inf, -np.inf], 1).fillna(1)

# Weight by holding period duration
profits_df['weighted_return'] = (profits_df['period_return'] - 1) * profits_df['days_held']

# Aggregate by wallet. The weights are day counts, so the weighted sum is normalized by
# the wallet's day span (floored at 1) rather than by the number of rows.
wallet_groups = profits_df.groupby('wallet_address')
wallet_span_days = (
    wallet_groups['date']
    .agg(lambda x: (x.max() - x.min()).days)
    .replace(0, 1)  # single-day wallets: avoid dividing by zero
)
twr_df = pd.DataFrame({
    # sum() skips NaN, so a wallet with no usable periods ends up with a TWR of 0
    'twr': wallet_groups['weighted_return'].sum() / wallet_span_days,
    'days_held': wallet_span_days,
})

# Annualize returns
twr_df['annualized_twr'] = ((1 + twr_df['twr']) ** (365 / twr_df['days_held'])) - 1
twr_df = twr_df.replace([np.inf, -np.inf], np.nan)


In [None]:
# Inspect the row-level intermediates from the calculation above.
profits_df

In [None]:
# Inspect the per-wallet twr / days_held / annualized_twr table.
twr_df

In [None]:
# Narrow profits_df_test down to the imputed-only wallet
wallets = ['w05_only_imputed']

is_selected_wallet = profits_df_test['wallet_address'].isin(wallets)
test_df = profits_df_test.loc[is_selected_wallet]
test_df

In [None]:
# Split each wallet's cash-flow transfers into money put in (positive values) and
# money taken out (negative values, reported as a positive total)
realized_returns_df = test_df.groupby('wallet_address').agg(
    total_investments=('cash_flow_transfers', lambda x: x[x > 0].sum()),
    total_withdrawals=('cash_flow_transfers', lambda x: -x[x < 0].sum())
)

# Realized return = withdrawals / investments - 1. A wallet with no investments has no
# defined return, so its denominator becomes NaN instead of producing inf.
invested_or_nan = realized_returns_df['total_investments'].replace(0, np.nan)
realized_returns_df['realized_return'] = (
    realized_returns_df['total_withdrawals'] /
    invested_or_nan
) - 1

In [None]:
# Inspect total_investments / total_withdrawals / realized_return per wallet.
realized_returns_df

In [None]:
# NOTE(review): repeats the inspection from the cell directly above; one of the two can likely go.
realized_returns_df

In [None]:
# Pick up local edits to the project modules and config before re-running.
# A plain loop is used because the reload results are not needed.
for module in modules:
    importlib.reload(module)
wallets_config.reload()


tr_df = wtf.calculate_realized_returns(profits_df_test)
tr_df


In [None]:
# Pick up local edits to the project modules and config before re-running.
# A plain loop is used because the reload results are not needed.
for module in modules:
    importlib.reload(module)
wallets_config.reload()


twr_df = wtf.calculate_time_weighted_returns(profits_df_test)
# twr_df.loc[['w02_net_loss', 'w05_only_imputed']]
# p2_df
twr_df


In [None]:
# NOTE(review): other cells in this notebook use zero-padded labels
# ('w01_multiple_coins', 'w02_net_loss'); if profits_df_test uses those too, this
# selection comes back empty — verify the labels.
wallets = ['w1_multiple_coins', 'w2_net_loss']

is_selected_wallet = profits_df_test['wallet_address'].isin(wallets)
profits_df_test.loc[is_selected_wallet]

In [None]:
# Pick up local edits to the project modules and config before re-running.
# A plain loop is used because the reload results are not needed.
for module in modules:
    importlib.reload(module)
wallets_config.reload()

wtf.calculate_time_weighted_returns(profits_df_test)

In [None]:
# Pick up local edits to the project modules and config before re-running.
# A plain loop is used because the reload results are not needed.
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Single wallet / single coin scenario: two observed rows followed by one imputed row
test_data = {
    'coin_id': ['btc'] * 3,
    'wallet_address': ['wallet1'] * 3,
    'date': ['2024-01-01', '2024-01-03', '2024-01-10'],
    'usd_balance': [150, 220, 210],
    'usd_net_transfers': [80, -30, 0],
    'is_imputed': [False, False, True]
}
base_profits_df = pd.DataFrame(test_data)
# Use real datetimes (as the other fixtures in this notebook do) so date arithmetic
# on the resulting frames works
base_profits_df['date'] = pd.to_datetime(base_profits_df['date'])

# Validate test data format
validator = ProfitsValidator()
validation_results = validator.validate_all(
    base_profits_df,
    TestPeriods.TRAINING_PERIOD_START,
    TestPeriods.TRAINING_PERIOD_END
)
assert all(validation_results.values()), "Test data failed validation"

# Create profits_df and trading_features
profits_df = wtf.add_cash_flow_transfers_logic(base_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)

trading_features_df

In [None]:
# Inclusive number of calendar days covered by profits_df: latest minus earliest date,
# taken as whole days, plus one. The dates are coerced first so this also works when
# the column still holds strings.
profit_dates = pd.to_datetime(profits_df['date'])
(profit_dates.max() - profit_dates.min()).days + 1

In [None]:

# Ensure date is in datetime format
profits_df['date'] = pd.to_datetime(profits_df['date'])

# Order rows chronologically within each wallet (coin_id only breaks same-day ties).
# The running total below is grouped by wallet alone, so a coin-major sort would
# accumulate one coin's whole history before the next coin's instead of following time.
profits_df = profits_df.sort_values(['wallet_address', 'date', 'coin_id'])

# Precompute necessary transformations
profits_df['abs_usd_net_transfers'] = profits_df['usd_net_transfers'].abs()

# Running total of cash flow transfers per wallet, in date order
profits_df['cumsum_cash_flow_transfers'] = profits_df.groupby('wallet_address')['cash_flow_transfers'].cumsum()

# Metrics that take into account imputed rows/profits
logger.debug("Calculating wallet metrics based on imputed performance...")
imputed_metrics_df = profits_df.groupby('wallet_address').agg(
    invested=('cumsum_cash_flow_transfers', 'max'),               # peak of the running total
    total_net_flows=('cash_flow_transfers', lambda x: -x.sum()),  # sign-flipped sum of transfers
    unique_coins_traded=('coin_id', 'nunique')
)


imputed_metrics_df

In [None]:
# Wallet metrics computed from observed (non-imputed) rows only
logger.debug("Calculating wallet metrics based on observed behavior...")
observed_rows = profits_df.loc[~profits_df['is_imputed']]
observed_metrics_df = observed_rows.groupby('wallet_address').agg(
    # distinct dates rather than a row count, so multiple trades in one day count once
    transaction_days=pd.NamedAgg(column='date', aggfunc='nunique'),
    total_volume=pd.NamedAgg(column='abs_usd_net_transfers', aggfunc='sum'),
    average_transaction=pd.NamedAgg(column='abs_usd_net_transfers', aggfunc='mean'),
    first_activity=pd.NamedAgg(column='date', aggfunc='min'),
    last_activity=pd.NamedAgg(column='date', aggfunc='max'),
)

observed_metrics_df

In [None]:
# NOTE(review): the bare `profits_df` is not the cell's last expression, so Jupyter
# does not render it; in effect this cell only sets the coin / wallet keys.
profits_df
c = 'btc'
w = 'wallet1'

In [None]:
# Trading features for a single coin-wallet slice (btc / wallet1).
# NOTE(review): the bare `profits_df` below is not the last expression, so it is not shown.
profits_df
c = 'btc'
w = 'wallet1'

# presumably u.cw_filter_df keeps only rows for coin `c` and wallet `w` — verify against the helper
wtf.calculate_wallet_trading_features(u.cw_filter_df(profits_df,c,w).copy())

In [None]:
# Trading features for a single coin-wallet slice (eth / wallet1).
# NOTE(review): the bare `profits_df` below is not the last expression, so it is not shown.
profits_df
c = 'eth'
w = 'wallet1'

# presumably u.cw_filter_df keeps only rows for coin `c` and wallet `w` — verify against the helper
wtf.calculate_wallet_trading_features(u.cw_filter_df(profits_df,c,w).copy())

In [None]:
# NOTE(review): `trad` is not defined anywhere in the visible notebook; this looks like
# an unfinished line and raises NameError unless such a variable exists in the kernel.
trading_features_df = trad

In [None]:
# import pytest
# import numpy as np

# @pytest.mark.unit
# def test_complex_scenarios_correctness(trading_features_df):
"""
Verify that the trading_features_df matches manually calculated results
for multiple complex scenarios. Each assertion is explained with the steps
used to compute the expected values.
"""


def _assert_wallet_features(features, wallet, expected, average_transaction_atol=None):
    """
    Check one wallet's feature row against hand-computed values.

    Args:
        features: the wallet's row of trading_features_df
        wallet: wallet label, used as the prefix of each failure message
        expected: hand-computed values keyed by feature column
        average_transaction_atol: when given, average_transaction is compared with
            np.isclose at this absolute tolerance instead of exact equality
    """
    # Float-valued flow metrics are compared with np.isclose
    assert np.isclose(features['max_investment'], expected['max_investment']), \
        f"{wallet} invested incorrect."
    assert np.isclose(features['total_net_flows'], expected['total_net_flows']), \
        f"{wallet} total_net_flows incorrect."

    # Counts and volumes are compared exactly
    assert features['unique_coins_traded'] == expected['unique_coins_traded'], \
        f"{wallet} unique_coins_traded incorrect."
    assert features['transaction_days'] == expected['transaction_days'], \
        f"{wallet} transaction_days incorrect."
    assert features['total_volume'] == expected['total_volume'], \
        f"{wallet} total_volume incorrect."

    if average_transaction_atol is None:
        assert features['average_transaction'] == expected['average_transaction'], \
            f"{wallet} average_transaction incorrect."
    else:
        assert np.isclose(
            features['average_transaction'],
            expected['average_transaction'],
            atol=average_transaction_atol,
        ), f"{wallet} average_transaction incorrect."

    assert features['activity_days'] == expected['activity_days'], \
        f"{wallet} activity_days incorrect."
    assert np.isclose(features['activity_density'], expected['activity_density']), \
        f"{wallet} activity_density incorrect."


# wallet1, per the original hand calculation:
# - max cumsum was 260 across combined btc & eth transactions
# - sum of all cft for wallet1 was 260
# - coins: btc, eth; observed rows on 01-01 and 01-05
# - volume: btc(100+50) + eth(200+50) = 400 over 4 observed transactions
# - activity window 01-01 to 01-10 -> density 2/10
w1 = trading_features_df.loc['wallet1']
_assert_wallet_features(w1, 'wallet1', {
    'max_investment': 260,
    'total_net_flows': 260,
    'unique_coins_traded': 2,
    'transaction_days': 2,
    'total_volume': 400,
    'average_transaction': 100,
    'activity_days': 10,
    'activity_density': 0.2,
})

# wallet2: one observed transaction day with 300 of volume
w2 = trading_features_df.loc['wallet2']
_assert_wallet_features(w2, 'wallet2', {
    'max_investment': -200,
    'total_net_flows': -200,
    'unique_coins_traded': 1,
    'transaction_days': 1,
    'total_volume': 300,
    'average_transaction': 300,
    'activity_days': 10,
    'activity_density': 0.1,
})

# wallet3: average_transaction = 140/3 ≈ 46.6667, activity_density = 3/10
w3 = trading_features_df.loc['wallet3']
_assert_wallet_features(w3, 'wallet3', {
    'max_investment': -18,
    'total_net_flows': -18,
    'unique_coins_traded': 1,
    'transaction_days': 3,
    'total_volume': 140,
    'average_transaction': 46.6667,
    'activity_days': 10,
    'activity_density': 0.3,
}, average_transaction_atol=1e-4)

# wallet4: no observed transactions, single activity day
w4 = trading_features_df.loc['wallet4']
_assert_wallet_features(w4, 'wallet4', {
    'max_investment': 70,
    'total_net_flows': 70,
    'unique_coins_traded': 1,
    'transaction_days': 0,
    'total_volume': 0,
    'average_transaction': 0,
    'activity_days': 1,
    'activity_density': 0.0,
})

# wallet5: no flows and no observed transactions over a 10-day window
w5 = trading_features_df.loc['wallet5']
_assert_wallet_features(w5, 'wallet5', {
    'max_investment': 0,
    'total_net_flows': 0,
    'unique_coins_traded': 1,
    'transaction_days': 0,
    'total_volume': 0,
    'average_transaction': 0,
    'activity_days': 10,
    'activity_density': 0.0,
})

In [None]:
# NOTE(review): `expected_metrics` is not assigned anywhere in the visible notebook;
# this cell depends on it already existing in the kernel — TODO confirm or remove.
expected_metrics