In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Show floats in DataFrame output with up to 12 significant digits
# (revert with pd.reset_option('display.float_format')).
pd.set_option('display.float_format', lambda value: f'{value:.12g}')

# Ignore warnings whose message begins with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables: hide the pygame support prompt and set the alert sound path
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with a light foreground for text, labels and ticks
plt.rcParams.update({
    **{key: '#181818' for key in ('figure.facecolor', 'axes.facecolor')},
    **{key: '#afc6ba' for key in ('text.color', 'axes.labelcolor', 'xtick.color',
                                  'ytick.color', 'axes.titlecolor')},
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_orchestrator as wo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model_experiment as wme
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wp
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.wallet_features as wf

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.validation_analysis as wiv
import wallet_insights.coin_forecasting as wicf


# Local modules that get re-imported so source edits take effect in the running kernel
modules = [
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,  # general local modules
    wo, wtd, wmr, wme,                                           # wallet modeling
    wcl, wmc, wmt, wp, wtf, wts, wf,                             # wallet features
    wime, wiv, wicf,                                             # wallet insights
]
for module in modules:
    importlib.reload(module)

# Load every config file used by the notebook
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Logger set to INFO level
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
# Refresh local modules and the wallet configs before exporting
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

u.export_code(code_directories=['wallet_features'])

## Full Training Data Sequence

### retrieve datasets

In [None]:
# Refresh local modules and the wallet configs
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Retrieve the profits and market data frames
profits_df, market_data_df = wo.retrieve_datasets()

# # Define wallet cohort after cleaning
# training_wallet_metrics_df,wallet_cohort = wo.define_wallet_cohort(profits_df,market_data_df)

# # Generate profits_df for all training windows and the modeling period
# training_profits_df, training_windows_profits_dfs, modeling_profits_df, validation_profits_df = wo.split_profits_df(profits_df,
#                                                                                market_data_df,wallet_cohort)


In [None]:
# Keep a separate copy of the retrieved profits data under its own name
profits_df_full = profits_df.copy()

In [None]:

# Define wallet cohort after cleaning; returns the wallet metrics frame and the cohort itself
training_wallet_metrics_df,wallet_cohort = wo.define_wallet_cohort(profits_df,market_data_df)

# # Generate profits_df for all training windows and the modeling period
# training_profits_df, training_windows_profits_dfs, modeling_profits_df, validation_profits_df = wo.split_profits_df(profits_df,
#                                                                                market_data_df,wallet_cohort)


## Codespace

In [None]:
# Shape of the training rows that have BOTH a zero USD balance and zero net transfers.
# Each comparison needs its own parentheses: `&` binds tighter than `==`, so the
# unparenthesised form evaluates `((balance == 0) & transfers) == 0` instead.
# NOTE(review): training_profits_df is first assigned in a later cell ("generate features"),
# so this cell fails on a fresh kernel unless that cell has been run first.
training_profits_df[
    (training_profits_df['usd_balance'] == 0)
    & (training_profits_df['usd_net_transfers'] == 0)
].shape

### generate features

In [None]:
# Impute rows at the training period end (the training period start is already
# imputed when profits_df is generated). The date is wrapped in a one-element list.
training_period_end = [wallets_config['training_data']['training_period_end']]
imputed_profits_df = pri.impute_profits_for_multiple_dates(
    profits_df,
    market_data_df,
    training_period_end,
    n_threads=24,
)

# Keep only rows dated on or before the training period end
training_profits_df = imputed_profits_df.loc[
    imputed_profits_df['date'].le(wallets_config['training_data']['training_period_end'])
].copy()



In [None]:
# Add cash flows logic column
# (reassigns training_profits_df, so re-running this cell applies the function to its own output)
training_profits_df = wtf.add_cash_flow_transfers_logic(training_profits_df)

# # Compute wallet level metrics over duration of training period
# training_wallet_metrics_df = wtf.calculate_wallet_trading_features(training_profits_df)

# # Apply filters based on wallet behavior during the training period
# filtered_training_wallet_metrics_df = wtd.apply_wallet_thresholds(training_wallet_metrics_df)

# # Identify cohort
# wallet_cohort = filtered_training_wallet_metrics_df.index.values

# # Upload the cohort to BigQuery for additional complex feature generation
# wtd.upload_wallet_cohort(wallet_cohort)

# logger.info("Cohort defined as %s wallets after %.2f seconds.",
#             len(wallet_cohort), time.time()-start_time)

# return filtered_training_wallet_metrics_df,wallet_cohort



In [None]:
# Same call as in the earlier reload cell.
# NOTE(review): presumably exports the source under 'wallet_features' — confirm against u.export_code.
u.export_code(code_directories=['wallet_features'])

In [None]:
# Summary statistics of the training-period profits data
training_profits_df.describe()

In [None]:
# Build wallet features from the full training-period profits data.
# NOTE(review): market_indicators_data_df and transfers_sequencing_df are not assigned
# anywhere in the visible notebook — confirm where they come from.
training_wallet_features_df = wf.calculate_wallet_features(
    training_profits_df,
    market_indicators_data_df,
    transfers_sequencing_df,
    wallet_cohort,
)

In [None]:
# Summary statistics of the window profits data
# NOTE(review): window_profits_df is not assigned anywhere in the visible notebook — confirm its source.
window_profits_df.describe()

In [None]:
# Build wallet features from the window profits data, using the same inputs as the
# full training-period call.
window_wallet_features_df = wf.calculate_wallet_features(
    window_profits_df,
    market_indicators_data_df,
    transfers_sequencing_df,
    wallet_cohort,
)


In [None]:
# Working copy of the window profits data (the next cell repeats this assignment)
profits_df2 = window_profits_df.copy()

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Fresh working copy of the window profits data
profits_df2 = window_profits_df.copy()

# Start from an empty frame indexed by every wallet in the cohort
wallet_features_df = pd.DataFrame(index=wallet_cohort)
wallet_features_df.index.name = 'wallet_address'

# Trading features: custom fill for the cohort, then inner join
profits_df2 = wtf.add_cash_flow_transfers_logic(profits_df2)
trading_features = wtf.calculate_wallet_trading_features(profits_df2)
trading_features = wtf.fill_trading_features_data(trading_features, wallet_cohort)
wallet_features_df = wallet_features_df.join(trading_features, how='inner')

# Market timing features: left join, missing values in the new columns become 0
timing_features = wmt.calculate_market_timing_features(profits_df2, market_indicators_data_df)
wallet_features_df = (
    wallet_features_df
    .join(timing_features, how='left')
    .fillna(dict.fromkeys(timing_features.columns, 0))
)

# Market cap features: left join, missing values in the new columns become 0
market_features = wmc.calculate_market_cap_features(profits_df2, market_indicators_data_df)
wallet_features_df = (
    wallet_features_df
    .join(market_features, how='left')
    .fillna(dict.fromkeys(market_features.columns, 0))
)

# Transfers features: left join, missing values in the new columns become -1
transfers_features = wts.calculate_transfers_sequencing_features(profits_df2, transfers_sequencing_df)
wallet_features_df = (
    wallet_features_df
    .join(transfers_features, how='left')
    .fillna(dict.fromkeys(transfers_features.columns, -1))
)

# Performance features: inner join with no fill. 'invested' and 'net_gain' are dropped
# from the performance frame before joining.
performance_features = wp.calculate_performance_features(wallet_features_df)
wallet_features_df = wallet_features_df.join(
    performance_features.drop(columns=['invested', 'net_gain']),
    how='inner',
)

In [None]:
# Fail fast if any wallet ended up with a negative invested amount
if trading_features['invested'].lt(0).any():
    raise ValueError(f"Found {len(trading_features[trading_features['invested']<0])} wallets "
                     "with negative invested values.")

In [None]:
# Show every profits row belonging to one hard-coded wallet_address value
w = 33872418
profits_df2[profits_df2['wallet_address']==w]

In [None]:
# Display the trading features frame built in the feature assembly cell
trading_features

In [None]:
wallets_df = wallet_features_df.copy()

# Work on a rounded copy of the two columns the return metrics are derived from
metrics_df = wallets_df[['invested', 'net_gain']].copy().round(6)
returns_winsorization = wallets_config['modeling']['returns_winsorization']
epsilon = 1e-10  # added to invested before taking log10

# Base return: net_gain / invested, forced to 0 where invested is exactly 0
metrics_df['return'] = np.where(
    metrics_df['invested'].abs().eq(0),
    0,
    metrics_df['net_gain'].div(metrics_df['invested']),
)

# Winsorize the returns only when a positive cutoff is configured
if returns_winsorization > 0:
    metrics_df['return'] = u.winsorize(metrics_df['return'], returns_winsorization)

# Risk-adjusted dollar return: net_gain scaled by (1 + log10 of the invested amount)
# NOTE(review): when invested == 0 the multiplier is 1 + log10(1e-10) = -9, which flips
# the sign of net_gain — confirm this is intended.
metrics_df['risk_adj_return'] = metrics_df['net_gain'].mul(
    1 + np.log10(metrics_df['invested'].add(epsilon))
)

# # Normalize returns
# metrics_df['norm_return'] = (metrics_df['return'] - metrics_df['return'].min()) / \
#     (metrics_df['return'].max() - metrics_df['return'].min())

# # Normalize logged investments
# log_invested = np.log10(metrics_df['invested'] + epsilon)
# metrics_df['norm_invested'] = (log_invested - log_invested.min()) / \
#     (log_invested.max() - log_invested.min())

# # Performance score
# metrics_df['performance_score'] = (0.6 * metrics_df['norm_return'] +
#                                     0.4 * metrics_df['norm_invested'])

# # Log-weighted return
# metrics_df['log_weighted_return'] = metrics_df['return'] * \
#     np.log10(metrics_df['invested'] + epsilon)

# # Hybrid score (combining absolute and relative performance)
# max_gain = metrics_df['net_gain'].abs().max()
# metrics_df['norm_gain'] = metrics_df['net_gain'] / max_gain
# metrics_df['hybrid_score'] = (metrics_df['norm_gain'] +
#                             metrics_df['norm_return']) / 2

# # Size-adjusted rank
# # Create mask for zero values
# zero_mask = metrics_df['invested'] == 0

# # Create quartiles series initialized with 'q0' for zero values
# quartiles = pd.Series('q0', index=metrics_df.index)

# # Calculate quartiles for non-zero values
# non_zero_quartiles = pd.qcut(metrics_df['invested'][~zero_mask],
#                             q=4,
#                             labels=['q1', 'q2', 'q3', 'q4'])

# # Assign the quartiles to non-zero values
# quartiles[~zero_mask] = non_zero_quartiles

# # Calculate size-adjusted rank within each quartile
# metrics_df['size_adjusted_rank'] = metrics_df.groupby(quartiles)['return'].rank(pct=True)


# # Clean up intermediate columns
# cols_to_drop = ['norm_return', 'norm_invested', 'norm_gain']
# metrics_df = metrics_df.drop(columns=[c for c in cols_to_drop
#                                     if c in metrics_df.columns])



In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Build cluster features from the base training data and append them with an inner join.
# NOTE(review): base_training_data_df is not assigned anywhere in the visible notebook —
# confirm its source.
cluster_features = wcl.create_basic_cluster_features(base_training_data_df)
training_data_df = base_training_data_df.join(
    cluster_features,
    how='inner',
)



### join target variable to training data

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Filter the modeling period profits down to the wallets kept for modeling.
# NOTE(review): modeling_profits_df is only assigned in commented-out code in the visible
# notebook — confirm its source.
modeling_wallets_df = wo.filter_modeling_period_wallets(modeling_profits_df)

# Performance features of the modeling period serve as the target variables
target_vars_df = wp.calculate_performance_features(modeling_wallets_df)

# Inner-join the configured target variable column onto the training data
modeling_df = training_data_df.join(
    target_vars_df[wallets_config['modeling']['target_variable']],
    how='inner',
)


## Wallet Modeling

### build model

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Instantiate the wallet model experiment from the current config
experiment = wme.WalletModel(wallets_config)

# Run it on the modeling data and keep the results
model_results = experiment.run_experiment(modeling_df)

# Pull the 'regressor' step out of the fitted pipeline
model = model_results['pipeline'].named_steps['regressor']

### assess model performance

In [None]:
### save model artifacts
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Generate and save the model artifacts under ../wallet_modeling
model_id, evaluator, wallet_scores_df, coin_validation_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    validation_profits_df=validation_profits_df,
    base_path='../wallet_modeling',
)
u.play_notification()

# Plot the evaluation results
evaluator.plot_evaluation()

### Validation period assessments

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Validation-period metrics computed from the test features and the model predictions
wallet_performance_df, bucketed_performance_df = wiv.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
)

# Display the bucketed results
bucketed_performance_df

## coin performance predictions

### create coin_validation_df with metrics and returns

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Wallet scores: the model predictions indexed like the test targets,
# then rolled up to coin-level metrics
wallet_scores_df = pd.DataFrame(
    {'score': model_results['y_pred']},
    index=model_results['y_test'].index,
)
coin_wallet_metrics_df = wicf.calculate_coin_metrics_from_wallet_scores(
    validation_profits_df,
    wallet_scores_df,
)

# Coin performance between the configured validation period start and end
coin_performance_df = wicf.calculate_coin_performance(
    market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)

# Inner join of the coin-level wallet metrics with the coin performance
coin_validation_df = coin_wallet_metrics_df.join(coin_performance_df, how='inner')

### plotting coin feature performance vs market cap

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Market cap segment analysis with top_n fixed at 10
segment_results, summary_df = wicf.analyze_market_cap_segments(coin_validation_df, top_n=10)

# Heatmap of the segment summary
wicf.plot_segment_heatmap(summary_df)
# wicf.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:

# Read the three coin forecasting settings from the config
top_n, max_market_cap, min_market_cap = (
    wallets_config['coin_forecasting'][setting]
    for setting in ('top_n', 'max_market_cap', 'min_market_cap')
)

# Run the coin performance validation with those settings
metric_top_coin_performance_df = wicf.validate_coin_performance(
    coin_validation_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Performance analysis of the coin validation data
wicf.print_performance_analysis(coin_validation_df)

## Junkyard

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the data into a pandas DataFrame
def analyze_coin_metrics(df):
    """
    Analyze relationships between coin metrics and returns.

    Params:
    - df (DataFrame): must contain a 'coin_return' column plus every column named in
        metrics_of_interest below.

    Returns:
    - results (dict): with keys
        'correlations': {metric: correlation with coin_return}, ordered from the
            largest to the smallest absolute correlation.
        'comparison_stats': {metric: {'positive_mean', 'negative_mean', 'difference',
            'p_value'}} comparing rows with coin_return > 0 against rows with
            coin_return <= 0; p_value comes from scipy.stats.ttest_ind.
        'success_indicators': the comparison_stats entries whose absolute mean
            difference exceeds 10% of the magnitude of the negative-return mean and
            whose p_value is below 0.05.
    """
    metrics_of_interest = [
        'weighted_avg_score',
        'composite_score',
        'score_confidence',
        'top_wallet_balance_pct',
        'top_wallet_count_pct',
        'total_wallets',
        'avg_wallet_balance',
        'market_cap'
    ]

    # Correlation of every metric with coin_return, strongest (by absolute value) first
    correlations = {
        metric: df[metric].corr(df['coin_return'])
        for metric in metrics_of_interest
    }
    correlations_sorted = dict(
        sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)
    )

    # Split rows by the sign of the return; zero returns count as negative
    positive_returns = df[df['coin_return'] > 0]
    negative_returns = df[df['coin_return'] <= 0]

    comparison_stats = {}
    for metric in metrics_of_interest:
        pos_mean = positive_returns[metric].mean()
        neg_mean = negative_returns[metric].mean()
        # Two-sample t-test between the groups; only the p-value is used
        _, p_value = stats.ttest_ind(positive_returns[metric],
                                     negative_returns[metric])

        comparison_stats[metric] = {
            'positive_mean': pos_mean,
            'negative_mean': neg_mean,
            'difference': pos_mean - neg_mean,
            'p_value': p_value
        }

    # Identify potential success indicators. The threshold uses the MAGNITUDE of the
    # negative-return mean: with the signed mean, a negative mean gives a negative
    # threshold that every difference exceeds.
    success_indicators = {
        metric: metric_stats for metric, metric_stats in comparison_stats.items()
        if (abs(metric_stats['difference']) > 0.1 * abs(metric_stats['negative_mean']) and
            metric_stats['p_value'] < 0.05)
    }

    return {
        'correlations': correlations_sorted,
        'comparison_stats': comparison_stats,
        'success_indicators': success_indicators
    }

# Create summary statistics
def print_analysis_results(results):
    """
    Print the analysis results as three formatted sections.

    Params:
    - results (dict): must provide 'correlations' ({metric: value}), 'comparison_stats'
        and 'success_indicators' ({metric: dict with 'positive_mean', 'negative_mean',
        'difference' and 'p_value' as used below}).
    """
    correlations = results['correlations']
    comparison = results['comparison_stats']
    indicators = results['success_indicators']

    # Section 1: correlations, in the order stored in the dict
    print("\n=== Correlation Analysis ===")
    print("\nCorrelations with coin return (sorted by strength):")
    for metric in correlations:
        corr = correlations[metric]
        print(f"{metric:25} : {corr:0.4f}")

    # Section 2: group means, their difference and the p-value for each metric
    print("\n=== Positive vs Negative Returns Analysis ===")
    print("\nMetrics comparison for positive vs negative returns:")
    for metric in comparison:
        stats = comparison[metric]
        print(f"\n{metric}:")
        print(f"  Positive returns mean: {stats['positive_mean']:0.4f}")
        print(f"  Negative returns mean: {stats['negative_mean']:0.4f}")
        print(f"  Difference: {stats['difference']:0.4f}")
        print(f"  P-value: {stats['p_value']:0.4f}")

    # Section 3: only the metrics listed as success indicators
    print("\n=== Strong Success Indicators ===")
    print("\nMetrics showing significant difference between positive and negative returns:")
    for metric in indicators:
        stats = indicators[metric]
        print(f"\n{metric}:")
        print(f"  Mean difference: {stats['difference']:0.4f}")
        print(f"  P-value: {stats['p_value']:0.4f}")


# Run the analysis
def main():
    """Read coin_wallet_metrics.csv, analyze it, print the results and plot them."""
    df = pd.read_csv('coin_wallet_metrics.csv')

    # Analyze and print in one step
    print_analysis_results(analyze_coin_metrics(df))

    # NOTE(review): create_visualizations is not defined anywhere in the visible
    # notebook — confirm it exists before running this cell.
    create_visualizations(df)


if __name__ == "__main__":
    main()

In [None]:
# Cap the extreme returns at the winsorization cutoff
# NOTE(review): returns, predictions and winsorization_cutoff are not assigned anywhere
# in the visible notebook — confirm their source.
returns_winsorized = u.winsorize(returns, winsorization_cutoff)

# Average return across all data
average_return = np.mean(returns_winsorized)

# Predictions alongside the winsorized returns
df = pd.DataFrame({'predictions': predictions, 'returns': returns_winsorized})

# Best achievable ordering: rows ranked by the actual returns
df_sorted = df.sort_values('returns', ascending=False)
cumulative_best_returns = np.cumsum(df_sorted['returns'])
cumulative_best_avg_returns = df_sorted['returns'].expanding().mean()

# Modeled ordering: rows ranked by the model score
df_sorted = df.sort_values('predictions', ascending=False)
cumulative_model_returns = np.cumsum(df_sorted['returns'])
cumulative_model_avg_returns = df_sorted['returns'].expanding().mean()

In [None]:
# Display the predictions/returns frame built in the previous cell
df

In [None]:
# Display the cumulative returns obtained when rows are ranked by model score
cumulative_model_returns

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Read the three coin forecasting settings from the config
top_n, max_market_cap, min_market_cap = (
    wallets_config['coin_forecasting'][setting]
    for setting in ('top_n', 'max_market_cap', 'min_market_cap')
)

# Run the coin performance validation with those settings
metric_top_coin_performance_df = wicf.validate_coin_performance(
    coin_validation_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

metric_top_coin_performance_df

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Coins whose filled market cap lies within [min_market_cap, max_market_cap], bounds included
top_coins_df = coin_validation_df[
    coin_validation_df['market_cap_filled'].between(min_market_cap, max_market_cap)
].copy()

# Column to rank by, taken from the coin forecasting config
sort_column = wallets_config['coin_forecasting']['sort_method']

# Highest-ranked top_n coins
top_coins_df.sort_values(sort_column, ascending=False).head(top_n)

## Tests failing

In [None]:


class ProfitsValidator:
    """
    Validates profits DataFrame follows expected format and constraints.
    Only validates training period data.

    None of the checks modify the DataFrame they are given: dates are parsed into a
    local Series instead of being written back to the 'date' column.
    """
    def validate_all(self, profits_df, training_period_start, training_period_end):
        """
        Run all validation checks and return dict of results.

        Params:
        - profits_df (DataFrame): needs the columns 'coin_id', 'wallet_address', 'date'
            and 'usd_balance'.
        - training_period_start, training_period_end: values accepted by pd.to_datetime.

        Returns:
        - results (dict): check name -> bool, True when the check passes.
        """
        dates = {
            'training_period_start': pd.to_datetime(training_period_start),
            'training_period_end': pd.to_datetime(training_period_end),
        }

        return {
            'no_duplicates': self.check_no_duplicates(profits_df),
            'period_boundaries': self.check_period_boundaries(profits_df, dates),
            'no_negatives': self.check_no_negative_balances(profits_df),
            'date_range': self.check_date_range(profits_df, dates),
            'no_missing': self.check_no_missing_values(profits_df)
        }

    def check_no_duplicates(self, profits_df):
        """Check that no (coin_id, wallet_address, date) combination appears twice"""
        key_columns = ['coin_id', 'wallet_address', 'date']
        return not bool(profits_df.duplicated(subset=key_columns).any())

    def check_period_boundaries(self, profits_df, dates):
        """Check every coin-wallet pair has a record on the training period end date"""
        pair_columns = ['coin_id', 'wallet_address']
        # Parse into a local Series so the caller's 'date' column keeps its dtype
        row_dates = pd.to_datetime(profits_df['date'])

        all_pairs = profits_df[pair_columns].drop_duplicates()
        end_rows = profits_df[row_dates == dates['training_period_end']]
        end_pairs = end_rows[pair_columns].drop_duplicates()
        return len(end_pairs) == len(all_pairs)

    def check_no_negative_balances(self, profits_df):
        """Check for negative USD balances (values down to -0.1 are tolerated)"""
        return bool((profits_df['usd_balance'] >= -0.1).all())

    def check_date_range(self, profits_df, dates):
        """Verify no date precedes the period start and the latest date is the period end"""
        # Parse into a local Series so the caller's 'date' column keeps its dtype
        row_dates = pd.to_datetime(profits_df['date'])
        return bool(row_dates.min() >= dates['training_period_start'] and
                    row_dates.max() == dates['training_period_end'])

    def check_no_missing_values(self, profits_df):
        """Check for missing values in any column"""
        return not bool(profits_df.isna().any().any())


class TestPeriods:
    """
    Training period boundary dates shared by the test cells.

    Both values are 'YYYY-MM-DD' strings; the cells in this notebook pass them to
    validate_all as training_period_start / training_period_end.
    """
    # First day of the training period
    TRAINING_PERIOD_START: str = '2024-01-01'
    # Last day of the training period
    TRAINING_PERIOD_END: str = '2024-01-10'



In [None]:
# Re-import every project module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

In [None]:
# Draft of a pytest case, kept as a notebook cell while iterating:
#   import pytest, numpy as np, pandas as pd
#   @pytest.mark.unit
#   def test_complex_scenarios():
#
# Test multiple complex business scenarios in a single profits_df.
#
# Scenarios covered:
# - Many-to-many relationships between wallets and coins.
# - A wallet with only imputed rows at start and end (no real trades).
# - A wallet with only 1 row at training period end.
# - A wallet ending with a net loss.
# - A wallet that sells full balance mid-period and repurchases later.
# - Additionally tested: incremental investments, multiple coins, and ensuring
#     that all coin-wallet pairs appear at the training period end.

# Training period
training_period_start = '2024-01-01'
training_period_end = '2024-01-10'

# Fixture rows grouped by (coin_id, wallet_address).
# Each row is (date, usd_balance, usd_net_transfers, is_imputed).
scenario_rows = {
    # w1_multiple_coins - btc & eth (multiple transactions, multiple coins)
    ('btc', 'w1_multiple_coins'): [
        ('2024-01-01', 100, 100, False),
        ('2024-01-05', 150, 50, False),
        ('2024-01-10', 180, 0, True),
    ],
    ('eth', 'w1_multiple_coins'): [
        ('2024-01-01', 200, 200, False),
        ('2024-01-05', 250, 50, False),
        ('2024-01-10', 280, 0, True),
    ],
    # w2_net_loss - btc (net loss)
    ('btc', 'w2_net_loss'): [
        ('2024-01-01', 300, 300, False),
        ('2024-01-05', 250, -100, False),
        ('2024-01-10', 100, 0, True),
    ],
    # w3_sell_all_and_rebuy - ada (sell full balance mid-way and repurchase)
    ('ada', 'w3_sell_all_and_rebuy'): [
        ('2024-01-01', 50, 50, False),
        ('2024-01-03', 0, -50, False),
        ('2024-01-08', 40, 40, False),
        ('2024-01-10', 42, 0, True),
    ],
    # w4_only_period_end - btc (only final row)
    ('btc', 'w4_only_period_end'): [
        ('2024-01-10', 70, 70, False),
    ],
    # w5_only_imputed - btc (only imputed rows at start and end)
    ('btc', 'w5_only_imputed'): [
        ('2024-01-01', 50, 0, True),
        ('2024-01-10', 50, 0, True),
    ],
    # w6 - very small transactions relative to portfolio size
    # NOTE(review): the wallet label says "multiple_coins" but the scenario is
    # tiny transactions on a single coin - label kept as-is; confirm before renaming.
    ('myro', 'w6_multiple_coins'): [
        ('2024-01-01', 1250, 0, True),
        ('2024-01-02', 1220, 1, False),
        ('2024-01-05', 1280, -2, False),
        ('2024-01-10', 0, -1240, False),
    ],
    # w7_memecoin_winner - Large swings in portfolio value
    ('pepe', 'w7_memecoin_winner'): [
        ('2024-01-01', 100, 100, False),
        ('2024-01-03', 250, -500, False),
        ('2024-01-05', 50, -100, False),
        ('2024-01-10', 10, 0, True),
    ],
    # w8_memecoin_loser - Large swings in portfolio value
    ('wojak', 'w8_memecoin_loser'): [
        ('2024-01-03', 250, 250, False),
        ('2024-01-10', 0, -20, False),
    ],
}

# Expand the grouped rows into one record per row, preserving the listed order
profits_data = [
    {
        'coin_id': coin,
        'wallet_address': wallet,
        'date': day,
        'usd_balance': balance,
        'usd_net_transfers': transfers,
        'is_imputed': imputed,
    }
    for (coin, wallet), pair_rows in scenario_rows.items()
    for day, balance, transfers, imputed in pair_rows
]

profits_df = pd.DataFrame(profits_data)

# Validate test data format before proceeding
validator = ProfitsValidator()
validation_results = validator.validate_all(
    profits_df,
    training_period_start=training_period_start,
    training_period_end=training_period_end,
)
assert all(validation_results.values()), "Test data failed validation checks."

# ACT: run the feature calculation functions from the wtf module
profits_df = wtf.add_cash_flow_transfers_logic(profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)

# Snapshot for the exploratory cells that follow
profits_df_test = profits_df.copy()

In [None]:
# Wallets to inspect from the scenario fixture
wallets = ['w1_multiple_coins', 'w2_net_loss']

in_selection = profits_df_test['wallet_address'].isin(wallets)
profits_df_test.loc[in_selection]

In [None]:
# Time-weighted return (TWR) per wallet, computed from the scenario snapshot.
profits_df = profits_df_test.copy()

# days_held below subtracts dates, so make sure the column holds datetimes
profits_df['date'] = pd.to_datetime(profits_df['date'])

# Ensure data is sorted for calculations
profits_df = profits_df.sort_values(['wallet_address', 'coin_id', 'date'])

# Balance immediately before the day's transfers were applied
profits_df['pre_transfer_balance'] = profits_df['usd_balance'] - profits_df['usd_net_transfers']

# Sub-period return between consecutive rows of the same coin-wallet pair:
#   (value just before today's transfer) / (value just after the previous transfer)
# i.e. today's pre-transfer balance over the previous row's usd_balance, so that
# deposits and withdrawals never count as gains or losses.
# Note: Using shift() within groups handles multi-coin wallets
prior_balance = (profits_df.groupby(['wallet_address', 'coin_id'])
                 ['usd_balance'].shift())
profits_df['daily_twr'] = profits_df['pre_transfer_balance'] / prior_balance

# Replace inf/null values with 1 (no return) for first days and zero balances
profits_df['daily_twr'] = profits_df['daily_twr'].replace([np.inf, -np.inf], 1).fillna(1)

# Calculate cumulative TWR per wallet across all coins
twr_df = profits_df.groupby('wallet_address').agg(
    twr=('daily_twr', lambda x: x.prod() - 1),  # -1 to convert to percent return
    days_held=('date', lambda x: (x.max() - x.min()).days)
)

# Add annualized TWR for comparability
twr_df['annualized_twr'] = ((1 + twr_df['twr']) ** (365 / twr_df['days_held'])) - 1

# Clean up any edge cases (e.g. zero days held producing inf)
twr_df = twr_df.replace([np.inf, -np.inf], np.nan)

twr_df

In [None]:
# Pick up any edits to the project modules before running the scenario
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Single btc wallet: two observed rows followed by an imputed period-end row
test_data = dict(
    coin_id=['btc'] * 3,
    wallet_address=['wallet1'] * 3,
    date=['2024-01-01', '2024-01-03', '2024-01-10'],
    usd_balance=[150, 220, 210],
    usd_net_transfers=[80, -30, 0],
    is_imputed=[False, False, True],
)
base_profits_df = pd.DataFrame(test_data)

# Validate test data format
validator = ProfitsValidator()
validation_results = validator.validate_all(
    base_profits_df,
    training_period_start=TestPeriods.TRAINING_PERIOD_START,
    training_period_end=TestPeriods.TRAINING_PERIOD_END,
)
assert all(validation_results.values()), "Test data failed validation"

# Create profits_df and trading_features
profits_df = wtf.add_cash_flow_transfers_logic(base_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)

trading_features_df

In [None]:
# Inclusive number of calendar days covered by profits_df (e.g. 01-01..01-10 -> 10).
# Subtract earliest from latest and take .days before adding 1: an integer cannot
# be added to a Timedelta directly.
date_values = pd.to_datetime(profits_df['date'])
(date_values.max() - date_values.min()).days + 1

In [None]:

# Ensure date is in datetime format
profits_df['date'] = pd.to_datetime(profits_df['date'])

# Sort chronologically within each wallet (coin_id only breaks same-day ties) so the
# wallet-level running total below follows the true order of cash flows. Sorting by
# coin first would accumulate one coin's whole history before the next coin's and
# misstate the peak for multi-coin wallets.
profits_df = profits_df.sort_values(['wallet_address', 'date', 'coin_id'])

# Precompute necessary transformations
profits_df['abs_usd_net_transfers'] = profits_df['usd_net_transfers'].abs()

# Calculate cumsum by wallet, respecting date order
profits_df['cumsum_cash_flow_transfers'] = profits_df.groupby('wallet_address')['cash_flow_transfers'].cumsum()

# Metrics that take into account imputed rows/profits
logger.debug("Calculating wallet metrics based on imputed performance...")
imputed_metrics_df = profits_df.groupby('wallet_address').agg(
    invested=('cumsum_cash_flow_transfers', 'max'),        # peak of the running cash-flow total
    net_gain=('cash_flow_transfers', lambda x: -x.sum()),  # negated sum of cash flows
    unique_coins_traded=('coin_id', 'nunique')
)

imputed_metrics_df

In [None]:
# Wallet metrics restricted to rows that are not imputed
logger.debug("Calculating wallet metrics based on observed behavior...")
observed_rows = profits_df.loc[~profits_df['is_imputed']]
observed_metrics_df = observed_rows.groupby('wallet_address').agg(
    # distinct dates rather than row count, so several rows on one day count once
    transaction_days=pd.NamedAgg(column='date', aggfunc='nunique'),
    total_volume=pd.NamedAgg(column='abs_usd_net_transfers', aggfunc='sum'),
    average_transaction=pd.NamedAgg(column='abs_usd_net_transfers', aggfunc='mean'),
    first_activity=pd.NamedAgg(column='date', aggfunc='min'),
    last_activity=pd.NamedAgg(column='date', aggfunc='max'),
)

observed_metrics_df

In [None]:
# Coin / wallet pair to inspect in the cells below
c = 'btc'
w = 'wallet1'

In [None]:
# Trading features for a single coin / wallet pair
# (u.cw_filter_df presumably narrows profits_df to that pair - confirm against utils)
c = 'btc'
w = 'wallet1'

wtf.calculate_wallet_trading_features(u.cw_filter_df(profits_df, c, w).copy())

In [None]:
# Trading features for a single coin / wallet pair
# (u.cw_filter_df presumably narrows profits_df to that pair - confirm against utils)
c = 'eth'
w = 'wallet1'

wtf.calculate_wallet_trading_features(u.cw_filter_df(profits_df, c, w).copy())

In [None]:
# Recompute the wallet features from the current profits_df.
# NOTE(review): the original right-hand side was the truncated name `trad`
# (a NameError); this call mirrors how trading_features_df is built in the
# other cells - confirm it is the intended source.
trading_features_df = wtf.calculate_wallet_trading_features(profits_df)

In [None]:
# Draft of a pytest case, kept as a notebook cell while iterating:
#   import pytest, numpy as np
#   @pytest.mark.unit
#   def test_complex_scenarios_correctness(trading_features_df):
#
# Verify that the trading_features_df matches manually calculated results
# for multiple complex scenarios. The expected values and how they were
# derived are noted next to each wallet.


def _assert_wallet_features(row, label, invested, net_gain, unique_coins_traded,
                            transaction_days, total_volume, average_transaction,
                            activity_days, activity_density, average_atol=None):
    """
    Assert one wallet's row of trading features against hand-computed values.

    Params:
    - row: the wallet's row from trading_features_df
    - label (str): wallet name used as the prefix of each failure message
    - invested, net_gain, activity_density: compared with np.isclose
    - unique_coins_traded, transaction_days, total_volume, activity_days:
        compared with ==
    - average_transaction: compared with == unless average_atol is given, in
        which case np.isclose(..., atol=average_atol) is used
    """
    assert np.isclose(row['invested'], invested), f"{label} invested incorrect."
    assert np.isclose(row['net_gain'], net_gain), f"{label} net_gain incorrect."
    assert row['unique_coins_traded'] == unique_coins_traded, f"{label} unique_coins_traded incorrect."
    assert row['transaction_days'] == transaction_days, f"{label} transaction_days incorrect."
    assert row['total_volume'] == total_volume, f"{label} total_volume incorrect."
    if average_atol is None:
        assert row['average_transaction'] == average_transaction, f"{label} average_transaction incorrect."
    else:
        assert np.isclose(row['average_transaction'], average_transaction, atol=average_atol), \
            f"{label} average_transaction incorrect."
    assert row['activity_days'] == activity_days, f"{label} activity_days incorrect."
    assert np.isclose(row['activity_density'], activity_density), f"{label} activity_density incorrect."


# wallet1
w1 = trading_features_df.loc['wallet1']
_assert_wallet_features(
    w1, 'wallet1',
    invested=260,               # max cumsum across combined btc & eth transactions
    net_gain=260,               # from the sum of all cft for wallet1
    unique_coins_traded=2,      # btc, eth
    transaction_days=2,         # 01-01 and 01-05 for observed rows
    total_volume=400,           # abs transfers on observed rows: btc(100+50), eth(200+50)
    average_transaction=100,    # 400 total / 4 observed transactions
    activity_days=10,           # 01-01 to 01-10
    activity_density=0.2,       # 2/10
)

# wallet2
w2 = trading_features_df.loc['wallet2']
_assert_wallet_features(
    w2, 'wallet2',
    invested=-200,
    net_gain=-200,
    unique_coins_traded=1,
    transaction_days=1,
    total_volume=300,
    average_transaction=300,
    activity_days=10,
    activity_density=0.1,
)

# wallet3
w3 = trading_features_df.loc['wallet3']
_assert_wallet_features(
    w3, 'wallet3',
    invested=-18,
    net_gain=-18,
    unique_coins_traded=1,
    transaction_days=3,
    total_volume=140,
    average_transaction=46.6667,  # 140/3, compared with an absolute tolerance
    activity_days=10,
    activity_density=0.3,         # 3/10
    average_atol=1e-4,
)

# wallet4
w4 = trading_features_df.loc['wallet4']
_assert_wallet_features(
    w4, 'wallet4',
    invested=70,
    net_gain=70,
    unique_coins_traded=1,
    transaction_days=0,
    total_volume=0,
    average_transaction=0,
    activity_days=1,
    activity_density=0.0,
)

# wallet5
w5 = trading_features_df.loc['wallet5']
_assert_wallet_features(
    w5, 'wallet5',
    invested=0,
    net_gain=0,
    unique_coins_traded=1,
    transaction_days=0,
    total_volume=0,
    average_transaction=0,
    activity_days=10,
    activity_density=0.0,
)

In [None]:
expected_metrics