In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Suppress warnings
warnings.filterwarnings("ignore", message="MallocStackLogging")

# silence pygame donation request
os.environ['PYGAME_HIDE_SUPPORT_PROMPT'] = "hide"
os.environ['ALERT_SOUND_FILEPATH']="../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav"

# Dark mode charts
plt.rcParams['figure.facecolor'] = '#181818'  # Custom background color (dark gray in this case)
plt.rcParams['axes.facecolor'] = '#181818'
plt.rcParams['text.color'] = '#afc6ba'
plt.rcParams['axes.labelcolor'] = '#afc6ba'
plt.rcParams['xtick.color'] = '#afc6ba'
plt.rcParams['ytick.color'] = '#afc6ba'
plt.rcParams['axes.titlecolor'] = '#afc6ba'

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.features_orchestrator as wfo

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.wallet_validation_analysis as wiwv
import wallet_insights.coin_validation_analysis as wicv
import wallet_insights.coin_validation_model as wicm


# reload all modules
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
           wmo, wtd, wmr, wm, wem,
           wcl, wmc, wmt, wpf, wtf, wts, wfo,
           wime, wiwv, wicv, wicm]
[importlib.reload(module) for module in modules]

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
u.export_code(
    code_directories=[
        # 'training_data',
        'wallet_features',
        # 'wallet_modeling',
        # 'wallet_insights'
    ],
    # include_config = True,
    # ipynb_notebook = 'DDA-456 wallet validation performance.ipynb'
)

u.obj_mem()

# Wallet Model Construction

## Training Data Sequence

### retrieve training datasets

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# Complete Pre-Training Profits/Market Data
# -----------------------------------------
# Retrieve training period datasets and save them to temp/wallet_modeling_dfs
_,_,_ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    parquet_prefix = 'training')


### define cohort and clean training datasets (loadable parquet)

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))



# Add Indicators to Market Data
# ----------------------------------------------------------
# Load relevant parquet dfs with pre-training history
training_market_data_df_full = pd.read_parquet("temp/wallet_modeling_dfs/training_market_data_df_full.parquet")

# Generate indicators and save file
wmo.generate_training_indicators_df(training_market_data_df_full,wallets_metrics_config)



# Identify Wallet Cohort
# ----------------------------------------------------------
# Remove market data from prior to the starting balance date
training_market_data_df = training_market_data_df_full[training_market_data_df_full['date']
                                                       >=wallets_config['training_data']['training_starting_balance_date']]
del training_market_data_df_full
gc.collect()

# Retrieve full profits history
training_profits_df_full = pd.read_parquet("temp/wallet_modeling_dfs/training_profits_df_full.parquet")

# Define wallet cohort
training_wallet_cohort = wmo.define_wallet_cohort(training_profits_df_full,training_market_data_df)



# Generate Cohort-Filtered Profits Data for Training Windows
# ----------------------------------------------------------
# Generate wallet_cohort-filtered profits_df for all training windows
training_profits_df, training_windows_profits_dfs = wmo.split_training_window_profits_dfs(training_profits_df_full,
                                                                         training_market_data_df,training_wallet_cohort)
del training_profits_df_full,training_market_data_df
gc.collect()



# Retrieve Transfers Data
# ----------------------------------------------------------
# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_training_cohort
training_transfers_sequencing_df = wts.retrieve_transfers_sequencing()

### generate training features

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Generate Features for the Full Training Period
# ----------------------------------------------------------
logger.info("Generating features for full training period...")
training_market_indicators_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet")
training_wallet_features_df = wfo.calculate_wallet_features(training_profits_df,
                                                            training_market_indicators_data_df,
                                                            training_transfers_sequencing_df,
                                                            training_wallet_cohort,
                                                            wallets_config['training_data']['training_period_start'],
                                                            wallets_config['training_data']['training_period_end'])

# Define the start of training_data_df appending a suffix for the window
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# del training_profits_df,training_wallet_features_df
gc.collect()

In [None]:
# Generate Features for Each Individual Window
# ----------------------------------------------------------
# Generate features for each window
# Each window's features are suffixed with `_w{i}` (1-based) and left-joined onto
# training_data_df, which already holds the `_all_windows` features.
for i, window_profits_df in enumerate(training_windows_profits_dfs, 1):
    logger.info("Generating features for window %s...", i)

    # Generate the features
    # NOTE(review): every window is passed the full training_period_start/end from
    # wallets_config rather than window-specific dates — confirm this is intended.
    window_wallet_features_df = wfo.calculate_wallet_features(window_profits_df, training_market_indicators_data_df,
                                                             training_transfers_sequencing_df, training_wallet_cohort,
                                                             wallets_config['training_data']['training_period_start'],
                                                             wallets_config['training_data']['training_period_end'])

    # Check for NaN values and identify problematic columns
    # Fail fast on the window's own features so the error names the window that produced them
    nan_columns = window_wallet_features_df.columns[window_wallet_features_df.isna().any()].tolist()
    if nan_columns:
        raise ValueError(f"NaN values detected in window {i} in columns: {nan_columns}")

    # Add column suffix and join to training_data_df
    window_wallet_features_df = window_wallet_features_df.add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')

    # Check for NaN values and identify problematic columns
    # A left join introduces NaNs when the window features lack rows for some
    # wallets in training_data_df; this second check catches that case.
    nan_columns = training_data_df.columns[training_data_df.isna().any()].tolist()
    if nan_columns:
        raise ValueError(f"NaN values detected in training_data_df after window {i} in columns: {nan_columns}")


# Release the large intermediate frames now that all windows are joined.
# NOTE(review): the loop variables only exist if at least one window was processed;
# this `del` raises NameError when training_windows_profits_dfs is empty.
del window_profits_df,window_wallet_features_df,training_market_indicators_data_df,training_transfers_sequencing_df
gc.collect()

u.obj_mem()

In [None]:
# Generate Clusters Using All Other Features
# ----------------------------------------------------------
# Append clustering features based on all numeric features in the base training data
training_cluster_features_df = wcl.create_basic_cluster_features(training_data_df)
training_cluster_features_df = training_cluster_features_df.add_prefix('cluster_')
training_data_df = training_data_df.join(training_cluster_features_df, how='inner')



# Save TRAINING_DATA_DF
# ----------------------------------------------------------
# Verify all input wallets exist in final output
missing_wallets = set(training_wallet_cohort) - set(training_data_df.index)
if missing_wallets:
    raise ValueError(f"Lost {len(missing_wallets)} wallets from original cohort during feature generation. First few missing: {list(missing_wallets)[:5]}")

# Capture the shape before the DataFrame is deleted; the summary log below
# previously read training_data_df after `del`, which raised a NameError.
training_data_shape = training_data_df.shape

# Save and clear from memory
training_data_df.to_parquet("temp/wallet_modeling_dfs/training_data_df.parquet",index=True)
del training_data_df,training_cluster_features_df
gc.collect()


logger.info("Feature generation complete. Final training_df shape: %s", training_data_shape)
logger.info(f"Current large object memory usage: {u.obj_mem()['size_mb'].sum():.1f} MB")
u.obj_mem()

## Wallet Modeling

### Retrieve modeling datasets (parquet loadable)

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve Modeling Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical through modeling period datasets
_, _, _ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['modeling_period_start'],
    wallets_config['training_data']['modeling_period_end'],
    parquet_prefix = 'modeling'
)


In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Load parquet files
modeling_market_data_df_full = pd.read_parquet("temp/wallet_modeling_dfs/modeling_market_data_df_full.parquet")
modeling_profits_df_full = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df_full.parquet")


# Remove pre-modeling period prices
modeling_market_data_df = modeling_market_data_df_full[modeling_market_data_df_full['date']
                                                       >=wallets_config['training_data']['modeling_starting_balance_date']]
del modeling_market_data_df_full
gc.collect()


# Filter to only training wallet cohort
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
modeling_profits_df = modeling_profits_df_full[modeling_profits_df_full['wallet_address'].isin(training_wallet_cohort)]
del modeling_profits_df_full
gc.collect()


# Impute rows for period end
modeling_profits_df = pri.impute_profits_for_multiple_dates(modeling_profits_df,
                                                            modeling_market_data_df,
                                                            [wallets_config['training_data']['modeling_period_end']],
                                                            n_threads=1)


# Assert period, save files, remove from memory
u.assert_period(wallets_config,modeling_profits_df,'modeling')
u.assert_period(wallets_config,modeling_market_data_df,'modeling')
modeling_profits_df.to_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet",index=False)
modeling_market_data_df.to_parquet("temp/wallet_modeling_dfs/modeling_market_data_df.parquet",index=False)
del modeling_profits_df,modeling_market_data_df
gc.collect()

### define modeling cohort and features (loadable parquet)

In [None]:

# Create training_cohort-Indexed modeling_wallet_features_df
# -----------------------------------------------------------
# Create a DataFrame with training wallet cohort as the index
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
modeling_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
modeling_wallet_features_df.index.name = 'wallet_address'

modeling_wallet_features_df.shape

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Create training_cohort-Indexed modeling_wallet_features_df
# -----------------------------------------------------------
# Create a DataFrame with training wallet cohort as the index
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
modeling_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
modeling_wallet_features_df.index.name = 'wallet_address'

# Store feature sets with their prefixes for bulk renaming
feature_column_names = {}


# Identify Modeling Period Cohort
# -----------------------------------------------------------
# Retrieve trading features for all wallets in training_cohort with boolean for in_modeling_cohort
modeling_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet")
modeling_trading_features_df = wmo.identify_modeling_cohort(modeling_profits_df)
modeling_wallet_features_df = modeling_wallet_features_df.join(modeling_trading_features_df, how='left')\
    .fillna({col: 0 for col in modeling_trading_features_df.columns})


# Generate Modeling Period Performance Features
# -----------------------------------------------------------
# Calculate performance metrics for the training cohort (wallets with 0 activity still impact rank orders)
modeling_performance_features_df = (wpf.calculate_performance_features(
    modeling_wallet_features_df)
    .copy()
    .drop(['max_investment', 'crypto_net_gain'], axis=1))
modeling_wallet_features_df = modeling_wallet_features_df.join(modeling_performance_features_df, how='left')\
    .fillna({col: 0 for col in modeling_performance_features_df.columns})

### select target variable and build model

In [None]:
modeling_wallet_features_df.head()

In [None]:
model_results.keys()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Retrieve training data for the full training wallet cohort
training_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet")

# Filter training data to only the modeling cohort through inner join to target variable
modeling_cohort_target_var_df = modeling_wallet_features_df[['in_modeling_cohort', wallets_config['modeling']['target_variable']]]

# Run the experiment and get results
wallet_model = wm.WalletModel(wallets_config)
model_results = wallet_model.run_experiment(training_data_df,modeling_cohort_target_var_df)
del training_data_df
gc.collect()

# Extract the trained model
model = model_results['pipeline'].named_steps['regressor']

# Generate and save all model artifacts
model_id, evaluator, wallet_scores_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    base_path='../wallet_modeling'
)
u.notify()

In [None]:
modeling_cohort_target_var_df.describe()

# Post Model Analysis

### assess wallet model performance

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Reload evaluator
evaluator = wime.RegressionEvaluator(
    y_train=model_results['y_train'],
    y_true=model_results['y_test'],
    y_pred=model_results['y_pred'],
    training_cohort_pred=model_results['training_cohort_pred'],
    training_cohort_actuals=model_results['training_cohort_actuals'],
    model=model,
    feature_names=model_results['X_train'].columns.tolist()
)

# Print results
print(evaluator.summary_report())
evaluator.plot_evaluation()

evaluator.importance_summary()

In [None]:
pd.DataFrame(model_results['training_cohort_pred']).describe()

### Cluster analysis

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# List of the x features with the highest importance in the model
x_features = 8
top_feature_metrics = list((pd.DataFrame(evaluator.metrics['importances'])
                      .sort_values(by='importance',ascending=False)
                      .head(x_features)['feature']))
all_metrics = list(set(top_feature_metrics))

# Cluster numbers
n_clusters=4

styled_df = wime.create_cluster_report(modeling_df, model_results, n_clusters, all_metrics)
styled_df

## Wallet Validation Period Performance

### Retrieve validation datasets

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Retrieve Validation Profits and Market Data
# ----------------------------------------------------------
# Retrieve full historical through validation period datasets
wmo.retrieve_period_datasets(
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
    parquet_prefix = 'validation'
)

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Load parquet files
validation_market_data_df_full = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df_full.parquet")
validation_profits_df_full = pd.read_parquet("temp/wallet_modeling_dfs/validation_profits_df_full.parquet")

# Remove pre-validation period prices
validation_market_data_df = validation_market_data_df_full[validation_market_data_df_full['date']
                                                       >=wallets_config['training_data']['validation_period_start']]
del validation_market_data_df_full
gc.collect()


# Filter to only training wallet cohort
training_wallet_cohort = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet", columns=[]).index.values
validation_profits_df = validation_profits_df_full[validation_profits_df_full['wallet_address'].isin(training_wallet_cohort)]
del validation_profits_df_full
gc.collect()

# Impute rows for period end
validation_profits_df = pri.impute_profits_for_multiple_dates(validation_profits_df,
                                                              validation_market_data_df,
                                                              [wallets_config['training_data']['validation_period_end']],
                                                              n_threads=24)


# Assert period, save files, remove from memory
u.assert_period(wallets_config,validation_profits_df,'validation')
u.assert_period(wallets_config,validation_market_data_df,'validation')
validation_profits_df.to_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet",index=False)
validation_market_data_df.to_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet",index=False)
del validation_profits_df,validation_market_data_df
gc.collect()

### generate wallet_validation_features_df

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Load parquet
# Read the validation-period profits written by the validation dataset prep cell.
# This previously loaded modeling_profits_df.parquet, which would compute the
# "validation" features from modeling-period data.
validation_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet")
validation_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet")


# Create a DataFrame with all wallets that should exist
# (relies on training_wallet_cohort already being defined by an earlier cell)
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Calculate validation period wallet metrics
validation_profits_df = wtf.add_cash_flow_transfers_logic(validation_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(validation_profits_df)
# Left join keeps every cohort wallet; wallets without trading rows get 0 for the trading columns
validation_wallet_features_df = validation_wallet_features_df.join(trading_features_df, how='left')\
    .fillna({col: 0 for col in trading_features_df.columns})

# Performance features (inner join, no fill)
performance_features_df = (wpf.calculate_performance_features(validation_wallet_features_df)
                                .drop(['max_investment', 'crypto_net_gain'], axis=1))  # already exist as trading features
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')
validation_wallet_features_df.describe()

### wallet validation period trading/performance by score quantile

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create analysis by prediction bands
metrics = [
    'return',
    'realized_return',
    'return_unwinsorized',
    'max_investment',
    'crypto_net_gain',
    'net_crypto_investment',
    'total_volume',
]

min_wallet_volume_usd = 1000
num_quantiles = 5

wiwv.create_quantile_report(
    validation_wallet_features_df,
    model_results['y_pred'],
    metrics,  # Your existing metrics list
    num_quantiles,  # Split into quintiles
    min_wallet_volume_usd
)


In [None]:
profits_df.isna().sum()

### coin-aggregated wallet metrics by coin performance

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate coin-level features about wallet behavior during the validation period
coin_wallet_features_df = wicv.calculate_coin_metrics_from_wallet_scores(
    validation_profits_df,
    wallet_scores_df,
    validation_market_data_df
)

# Filter coins by market cap
analyze_df = coin_wallet_features_df[
    (coin_wallet_features_df['market_cap_filled'] >= wallets_config['coin_validation_analysis']['min_market_cap'])
    & (coin_wallet_features_df['market_cap_filled'] <= wallets_config['coin_validation_analysis']['max_market_cap'])
].copy()

# Create styled performance analysis
wicv.create_top_coins_wallet_metrics_report(analyze_df,percentile=90,method='median')


## Basic coin model testing

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# 1. Generate model scores (using existing wallet model results)
wallet_scores_df = pd.DataFrame({
    'score': model_results['y_pred']
}, index=model_results['X_test'].index)

# 2. Prepare the modeling dataset using modeling period data
coin_modeling_df = wicm.prepare_features_and_targets(
    coin_validation_df=coin_validation_df,
    modeling_profits_df=modeling_profits_df,
    modeling_market_data_df=modeling_market_data_df,
    wallet_scores_df=wallet_scores_df
)

# 3. Train model and get evaluation
model, evaluator = wicm.train_coin_prediction_model(coin_modeling_df)

# 4. View results
print(evaluator.summary_report())
evaluator.plot_evaluation()

# 5. Optional: Generate feature importance summary
evaluator.importance_summary()

# 6. Optional: Analyze predictions by market cap segment
predictions_df = pd.DataFrame({
    'y_true': evaluator.y_true,
    'y_pred': evaluator.y_pred,
    'market_cap': coin_modeling_df['market_cap_filled']
})

segment_results, summary_df = wicv.analyze_market_cap_segments(predictions_df)
wicv.plot_segment_heatmap(summary_df)

In [None]:
# Create a DataFrame with all wallets that should exist
# `wallet_cohort` is not defined anywhere in this notebook; use training_wallet_cohort,
# matching the equivalent cell in the "generate wallet_validation_features_df" section.
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Calculate validation period wallet metrics
# NOTE(review): the "generate wallet_validation_features_df" cell makes this same call on
# validation_profits_df; confirm add_cash_flow_transfers_logic is safe to apply twice
# if both cells are run in one session.
validation_profits_df = wtf.add_cash_flow_transfers_logic(validation_profits_df)
trading_features_df = wtf.calculate_wallet_trading_features(validation_profits_df)
# Left join keeps every cohort wallet; wallets without trading rows get 0 for the trading columns
validation_wallet_features_df = validation_wallet_features_df.join(trading_features_df, how='left')\
    .fillna({col: 0 for col in trading_features_df.columns})

# Performance features (inner join, no fill)
performance_features_df = (wpf.calculate_performance_features(validation_wallet_features_df)
                                .drop(['max_investment', 'crypto_net_gain'], axis=1))  # already exist as trading features
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')
validation_wallet_features_df.describe()

In [None]:
# Create coin_modeling_df
coin_modeling_df = coin_wallet_features_df.copy().drop('market_cap',axis=1)
coin_modeling_df['coin_return_unwinsorized'] = coin_modeling_df['coin_return']
coin_modeling_df['coin_return'] = u.winsorize(coin_modeling_df['coin_return'],0.05)

# Filter coins by market cap
coin_modeling_df = coin_modeling_df[
    (coin_modeling_df['market_cap_filled'] >= wallets_config['coin_validation_analysis']['min_market_cap'])
    & (coin_modeling_df['market_cap_filled'] <= wallets_config['coin_validation_analysis']['max_market_cap'])
].copy()


In [None]:
coin_modeling_df

In [None]:
df = coin_modeling_df.copy()

# 1. Simple feature prep and model
X, y = wicm.prepare_features(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# 2. Train
model.fit(X_train, y_train)

# 3. Predict
y_pred = model.predict(X_test)

# 4. Evaluate with the fancy evaluator
feature_names = df.columns.drop(['coin_return', 'market_cap_filled']).tolist()
evaluator = wime.RegressionEvaluator(y_train, y_test, y_pred, model=model, feature_names=feature_names)

# 5. Get the goods
print(evaluator.summary_report())

# 6. Plot everything
evaluator.plot_evaluation()

## Cluster analysis

## experiments beta

In [None]:
# Create modeling dataset using existing pipeline
modeling_wallets_df = wmo.filter_modeling_period_wallets(modeling_profits_df)
target_vars_df = wpf.calculate_performance_features(modeling_wallets_df)


In [None]:
### save model artifacts
[importlib.reload(module) for module in modules]
wallets_config.reload()

# 1. Initialize dependencies
metrics_config = {
    'rmse': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    'r2': r2_score
}

# 2. Define experiment sequence
sequence_config = {
    'run_baseline': True,
    'parameter_variations': {
        'modeling': {
            'target_variable': [
                'max_investment',
                'crypto_net_gain',
                'return',
                'realized_return',
                'return_unwinsorized',
                'performance_score',
                'size_adjusted_rank'
            ]
        }
    }
}

# 3. Create experiment manager
exp_manager = wem.ExperimentsManager(
    config=wallets_config.config,
    training_data_df=training_data_df,
)

# 4. Run experiments and get results
results_df = exp_manager.run_experiment_sequence(modeling_profits_df, sequence_config)

# 5. View results
print(results_df)

In [None]:
results_df

### Validation period assessments

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

wallet_performance_df, bucketed_performance_df = wicv.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
    n_buckets=10,
    method='ntiles'
)

bucketed_performance_df

## coin performance analysis

### compare wallet metrics for the top n% of coins vs the others

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Partition coin features for analysis
analyze_df = coin_wallet_features_df[
    (coin_wallet_features_df['market_cap_filled'] >= wallets_config['coin_validation_analysis']['min_market_cap'])
    & (coin_wallet_features_df['market_cap_filled'] <= wallets_config['coin_validation_analysis']['max_market_cap'])
].copy()

# Create styled performance analysis
styled_df = wicv.create_top_coins_wallet_metrics_report(analyze_df,percentile=90,method='median')

# Display results
styled_df

### plotting coin feature performance vs market cap

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Get the analysis results
segment_results, summary_df = wicv.analyze_market_cap_segments(
    coin_wallet_features_df,
    top_n=10
)

# Or create the visualizations
wicv.plot_segment_heatmap(summary_df)
wicv.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Run analysis
top_n = wallets_config['coin_validation_analysis']['top_n']
max_market_cap = wallets_config['coin_validation_analysis']['max_market_cap']
min_market_cap = wallets_config['coin_validation_analysis']['min_market_cap']

metric_top_coin_performance_df = wicv.validate_coin_performance(coin_wallet_features_df,top_n,
                                                                max_market_cap, min_market_cap)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
coin_wallet_features_df

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

wicv.print_performance_analysis(coin_wallet_features_df)

# Junkyard

In [None]:
training_profits_df[training_profits_df['date']==wallets_config['training_data']['training_starting_balance_date']]

In [None]:
training_profits_df[training_profits_df['date']==wallets_config['training_data']['training_period_end']]

### performance benchmarks

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


df = u.cw_sample(training_profits_df_full,500000).copy()

# Define wallet cohort
training_wallet_cohort = wmo.define_wallet_cohort(df,training_market_data_df)



# # Generate Cohort-Filtered Profits Data for Training Windows
# # ----------------------------------------------------------
# # Generate wallet_cohort-filtered profits_df for all training windows
# training_profits_df, training_windows_profits_dfs = wmo.split_training_window_profits_dfs(training_profits_df_full,
#                                                                          training_market_data_df,training_wallet_cohort)
# del training_profits_df_full,training_market_data_df
# gc.collect()



# # Retrieve Transfers Data
# # ----------------------------------------------------------
# # Transfers data retrieval for the wallet_ids in temp.wallet_modeling_training_cohort
# training_transfers_sequencing_df = wts.retrieve_transfers_sequencing()

# Tests failing

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

In [None]:

class ProfitsValidator:
    """
    Structural sanity checks for a training-period profits DataFrame.

    Every check returns a truthy/falsy flag; `validate_all` bundles them into
    a dict keyed by check name.

    Side effect: `check_period_boundaries` and `check_date_range` overwrite
    `profits_df['date']` in place with `pd.to_datetime` parsed values, so the
    frame passed in comes back with a datetime `date` column.
    """
    def validate_all(self, profits_df, training_period_start, training_period_end):
        """
        Run every check against `profits_df`.

        Params:
        - profits_df: DataFrame with coin_id, wallet_address, date and usd_balance columns
        - training_period_start: first day of the training period (anything pd.to_datetime accepts)
        - training_period_end: last day of the training period (anything pd.to_datetime accepts)

        Returns:
        - dict mapping check name to its pass/fail flag
        """
        start_ts = pd.to_datetime(training_period_start)
        dates = {
            # Starting balances are dated the day before the period opens
            'training_starting_balance_date': start_ts - timedelta(days=1),
            'training_period_start': start_ts,
            'training_period_end': pd.to_datetime(training_period_end),
        }

        # Checks run in this order; the boundary check is the first to convert the date column
        results = {}
        results['no_duplicates'] = self.check_no_duplicates(profits_df)
        results['period_boundaries'] = self.check_period_boundaries(profits_df, dates)
        results['no_negatives'] = self.check_no_negative_balances(profits_df)
        results['date_range'] = self.check_date_range(profits_df, dates)
        results['no_missing'] = self.check_no_missing_values(profits_df)
        return results

    def check_no_duplicates(self, profits_df):
        """True when no (coin_id, wallet_address, date) combination appears more than once."""
        key_cols = ['coin_id', 'wallet_address', 'date']
        return not profits_df.duplicated(subset=key_cols).any()

    def check_period_boundaries(self, profits_df, dates):
        """
        True when every coin-wallet pair has a row dated on the training period end.

        Converts `profits_df['date']` to datetime in place.
        """
        profits_df['date'] = pd.to_datetime(profits_df['date'])
        pair_cols = ['coin_id', 'wallet_address']

        on_period_end = profits_df['date'] == dates['training_period_end']
        total_pairs = len(profits_df[pair_cols].drop_duplicates())
        pairs_on_period_end = len(profits_df.loc[on_period_end, pair_cols].drop_duplicates())
        return pairs_on_period_end == total_pairs

    def check_no_negative_balances(self, profits_df):
        """True when every usd_balance is at least -0.1."""
        return profits_df['usd_balance'].ge(-0.1).all()

    def check_date_range(self, profits_df, dates):
        """
        True when the earliest date is not before the starting balance date and
        the latest date is exactly the training period end.

        Converts `profits_df['date']` to datetime in place.
        """
        profits_df['date'] = pd.to_datetime(profits_df['date'])
        earliest = profits_df['date'].min()
        latest = profits_df['date'].max()

        starts_in_range = earliest >= dates['training_starting_balance_date']
        ends_on_boundary = latest == dates['training_period_end']
        return starts_in_range and ends_on_boundary

    def check_no_missing_values(self, profits_df):
        """True when no cell in the DataFrame is null."""
        return not profits_df.isna().to_numpy().any()



### fixtures

In [None]:
# Reload project modules and config; loop instead of a throwaway list
# comprehension because only the side effect matters.
for module in modules:
    importlib.reload(module)
wallets_config.reload()
# pylint:disable=line-too-long

def test_profits_data():
    """
    Returns raw profits data that can be remapped for many-to-many testing.

    Each wallet address is named after the scenario it covers. Rows dated
    2023-12-31 fall on the day before the training period start.

    Returns:
    - profits_df (DataFrame): one row per coin-wallet-date with columns coin_id,
        wallet_address, date (YYYY-MM-DD string), usd_balance, usd_net_transfers, is_imputed
    - training_period_start (str): '2024-01-01'
    - training_period_end (str): '2024-10-01'
    """
    training_period_start = '2024-01-01'
    training_period_end = '2024-10-01'

    profits_data = [
        # w01_multiple_coins - btc & eth (multiple transactions, multiple coins)
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 120, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 180, 'usd_net_transfers': 0, 'is_imputed': True},

        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-01-01', 'usd_balance': 200, 'usd_net_transfers': 200, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-05-01', 'usd_balance': 300, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w01_multiple_coins', 'date': '2024-10-01', 'usd_balance': 280, 'usd_net_transfers': 0, 'is_imputed': True},

        # w02_net_loss - btc (net loss)
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-05-01', 'usd_balance': 250, 'usd_net_transfers': -100, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w02_net_loss', 'date': '2024-10-01', 'usd_balance': 100, 'usd_net_transfers': 0, 'is_imputed': True},

        # w03_sell_all_and_rebuy - eth
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-01-01', 'usd_balance': 50, 'usd_net_transfers': 50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-03-01', 'usd_balance': 0,  'usd_net_transfers': -50, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-08-01', 'usd_balance': 40, 'usd_net_transfers': 40, 'is_imputed': False},
        {'coin_id': 'eth', 'wallet_address': 'w03_sell_all_and_rebuy', 'date': '2024-10-01', 'usd_balance': 42, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04_only_period_end - sol (only final row)
        {'coin_id': 'sol', 'wallet_address': 'w04_only_period_end', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 70, 'is_imputed': False},

        # w04a_only_period_end_w_balance - eth
        {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2023-12-31', 'usd_balance': 30, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'eth', 'wallet_address': 'w04a_only_period_end_w_balance', 'date': '2024-10-01', 'usd_balance': 90, 'usd_net_transfers': 50, 'is_imputed': False},

        # w04b_only_period_start_buy
        {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-01-01', 'usd_balance': 300, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w04b_only_period_start_buy', 'date': '2024-10-01', 'usd_balance': 900, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04c_only_period_start_buy_w_existing_balance
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2023-12-31', 'usd_balance': 40, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-01-01', 'usd_balance': 350, 'usd_net_transfers': 300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w04c_only_period_start_buy_w_existing_balance', 'date': '2024-10-01', 'usd_balance': 1050, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04d_only_period_start_sell
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2023-12-31', 'usd_balance': 200, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-01-01', 'usd_balance': 0, 'usd_net_transfers': -200, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w04d_only_period_start_sell', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w04e_only_period_start_sell_partial
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2023-12-31', 'usd_balance': 510, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-01-01', 'usd_balance': 500, 'usd_net_transfers': -10, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w04e_only_period_start_sell_partial', 'date': '2024-10-01', 'usd_balance': 600, 'usd_net_transfers': 0, 'is_imputed': True},

        # w05_only_imputed - sol (only imputed rows at start and end)
        {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2023-12-31', 'usd_balance': 50, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w05_only_imputed', 'date': '2024-10-01', 'usd_balance': 70, 'usd_net_transfers': 0, 'is_imputed': True},

        # w06_tiny_transactions - very small transactions relative to portfolio size
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2023-12-31', 'usd_balance': 1250, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': 1, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -350, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w06_tiny_transactions', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w07_tiny_transactions2 - very small transactions relative to portfolio size
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2023-12-31', 'usd_balance': 400, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-02-01', 'usd_balance': 1220, 'usd_net_transfers': -20, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-08-01', 'usd_balance': 0, 'usd_net_transfers': -150, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w07_tiny_transactions2', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w08_offsetting_transactions - large offsetting transactions in the middle of the period
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2023-12-31', 'usd_balance': 500, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-01', 'usd_balance': 10400, 'usd_net_transfers': 10000, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-02-02', 'usd_balance': 400, 'usd_net_transfers': -10000, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w08_offsetting_transactions', 'date': '2024-10-01', 'usd_balance': 750, 'usd_net_transfers': 0, 'is_imputed': True},

        # w09_memecoin_winner - Large swings in portfolio value
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-01-01', 'usd_balance': 100, 'usd_net_transfers': 100, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': -500, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-05-01', 'usd_balance': 50, 'usd_net_transfers': -100, 'is_imputed': False},
        {'coin_id': 'floki', 'wallet_address': 'w09_memecoin_winner', 'date': '2024-10-01', 'usd_balance': 10, 'usd_net_transfers': 0, 'is_imputed': True},

        # w10_memecoin_loser - Large swings in portfolio value
        {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-03-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'myro', 'wallet_address': 'w10_memecoin_loser', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': -20, 'is_imputed': False},

        # w11_sells_early
        # The 2024-05-01 row was previously written as '2024-5-01'; it is zero-padded
        # here so every date literal shares one format and parses identically.
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-04-01', 'usd_balance': 250, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-05-01', 'usd_balance': 0, 'usd_net_transfers': -300, 'is_imputed': False},
        {'coin_id': 'btc', 'wallet_address': 'w11_sells_early', 'date': '2024-10-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},

        # w12_buys_late
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-03-01', 'usd_balance': 0, 'usd_net_transfers': 0, 'is_imputed': True},
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-09-01', 'usd_balance': 500, 'usd_net_transfers': 250, 'is_imputed': False},
        {'coin_id': 'sol', 'wallet_address': 'w12_buys_late', 'date': '2024-10-01', 'usd_balance': 550, 'usd_net_transfers': 0, 'is_imputed': True},
    ]

    return pd.DataFrame(profits_data), training_period_start, training_period_end



test_profits_data = test_profits_data()

def test_profits_df(test_profits_data):
    """
    Validates the raw fixture rows and returns them with empty rows dropped.

    Params:
    - test_profits_data (tuple): (profits_df, training_period_start, training_period_end)

    Returns:
    - profits_df (DataFrame): validated copy without rows that have both a 0
        balance and 0 net transfers
    - training_period_start, training_period_end: passed through unchanged
    """
    raw_df, training_period_start, training_period_end = test_profits_data
    profits_df = raw_df.copy()

    # The fixture data must satisfy every format check before it is used
    checks = ProfitsValidator().validate_all(profits_df, training_period_start, training_period_end)
    assert all(checks.values()), "Test data failed validation checks."

    # Mirror wmo.retrieve_datasets(), which drops rows with a rounded 0 balance
    # and 0 transfers once validation has passed: keep a row if either is nonzero
    has_balance = profits_df['usd_balance'] != 0
    has_transfers = profits_df['usd_net_transfers'] != 0
    profits_df = profits_df[has_balance | has_transfers]

    return profits_df,training_period_start, training_period_end

test_profits_df, training_period_start, training_period_end = test_profits_df(test_profits_data)


### remap

In [None]:
# Reload project modules and config; loop instead of a throwaway list
# comprehension because only the side effect matters.
for module in modules:
    importlib.reload(module)
wallets_config.reload()

def test_trading_features_df(test_profits_df):
    """
    Builds wallet trading features from the test profits DataFrame.

    The training period dates are read from the enclosing (notebook) scope
    rather than from the argument.

    Returns:
    - tuple of (trading features DataFrame, training_period_start, training_period_end);
        the dates are included so callers can recalculate values
    """
    features_df = wtf.calculate_wallet_trading_features(
        test_profits_df,
        training_period_start,
        training_period_end,
    )
    return features_df, training_period_start, training_period_end

test_trading_features_df, training_period_start, training_period_end = test_trading_features_df(test_profits_df.copy())

In [None]:
def test_remapped_profits_df(test_profits_data):
    """
    Remaps the base profits data so many wallets hold many of the same coins.

    The scenario wallets are collapsed onto six wallets (w1-w6); the original
    address is kept in `wallet_address_original`.

    Params:
    - test_profits_data (tuple): (profits_df, training_period_start, training_period_end)

    Returns:
    - profits_df (DataFrame): remapped, validated rows without rows that have
        both a 0 balance and 0 net transfers
    - training_period_start, training_period_end: passed through from the fixture
    """
    # Reassign wallets to create a lot of overlap
    reassign_dict = {
        'w01_multiple_coins': 'w1',
        'w02_net_loss': 'w2',
        'w03_sell_all_and_rebuy': 'w2',
        'w04_only_period_end': 'w3',
        'w04a_only_period_end_w_balance': 'w3',
        'w04b_only_period_start_buy': 'w2',
        'w04c_only_period_start_buy_w_existing_balance': 'w4',
        'w04d_only_period_start_sell': 'w4',
        'w04e_only_period_start_sell_partial': 'w5',
        'w05_only_imputed': 'w5',
        'w06_tiny_transactions': 'w5',
        'w07_tiny_transactions2': 'w2',
        'w08_offsetting_transactions': 'w1',
        'w09_memecoin_winner': 'w3',
        'w10_memecoin_loser': 'w4',
        'w11_sells_early': 'w6',
        'w12_buys_late': 'w6'
    }
    # Use the period dates supplied with the data instead of re-hardcoding them
    profits_df, training_period_start, training_period_end = test_profits_data
    profits_df = profits_df.copy()
    profits_df['wallet_address_original'] = profits_df['wallet_address']
    profits_df['wallet_address'] = profits_df['wallet_address'].map(reassign_dict)

    # Series.map leaves NaN for any address absent from reassign_dict; fail here
    # with the offending names rather than as a generic validation failure
    unmapped = profits_df.loc[profits_df['wallet_address'].isna(), 'wallet_address_original'].unique()
    assert len(unmapped) == 0, f"Wallets missing from reassign_dict: {sorted(unmapped)}"

    # Validate test data format before proceeding
    validator = ProfitsValidator()
    validation_results = validator.validate_all(
        profits_df,
        training_period_start,
        training_period_end
    )
    assert all(validation_results.values()), "Test data failed validation checks."

    # Remove rows with a rounded 0 balance and 0 transfers which happens in wmo.retrieve_datasets() once validation checks are passed
    profits_df = profits_df[
        ~((profits_df['usd_balance'] == 0) &
        (profits_df['usd_net_transfers'] == 0))
    ]

    # Confirm that every target wallet is still present after filtering
    expected_addresses = ['w1', 'w2', 'w3', 'w4', 'w5', 'w6']
    assert sorted(profits_df['wallet_address'].unique()) == expected_addresses

    return profits_df, training_period_start, training_period_end


test_remapped_profits_df, training_period_start, training_period_end = test_remapped_profits_df(test_profits_data)

def test_remapped_trading_features_df(test_remapped_profits_df):
    """
    Builds wallet trading features from the remapped profits DataFrame.

    The training period dates are read from the enclosing (notebook) scope.
    Asserts that the result is indexed by exactly the six remapped wallets.
    """
    features_df = wtf.calculate_wallet_trading_features(
        test_remapped_profits_df,
        training_period_start,
        training_period_end,
    )

    # Every remapped wallet must come back as an index entry
    wallets_in_index = sorted(features_df.index)
    assert wallets_in_index == ['w1', 'w2', 'w3', 'w4', 'w5', 'w6']

    return features_df

test_remapped_trading_features_df =test_remapped_trading_features_df(test_remapped_profits_df)

In [None]:
test_remapped_trading_features_df

### codespace

In [None]:
test_trading_features_df.describe()

In [None]:
# Reload project modules and config; loop instead of a throwaway list
# comprehension because only the side effect matters.
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Wallet to inspect
w = 'w09_memecoin_winner'
# w = 'w04c_only_period_start_buy_w_existing_balance'

crypto_cols_profits_df = wtf.calculate_crypto_balance_columns(test_profits_df.copy(),training_period_start)

# Show the wallet's profits rows next to its computed trading features
display(crypto_cols_profits_df[crypto_cols_profits_df['wallet_address']==w])
test_trading_features_df.loc[w]

In [None]:
# Filter to test case wallet
wallet = 'w09_memecoin_winner'
wallet_features = test_trading_features_df.loc[wallet]

# Calculate expected values
# total_crypto_buys: initial 100 buy
expected_buys = 100

# total_crypto_sells: 500 + 100 = 600 in sells
expected_sells = 600

# net_crypto_investment: 100 - 600 = -500
expected_net = -500

# crypto_net_gain: 10 ending - (-500 net investment) = 510
expected_gain = 510

# transaction_days: 3 days with non-imputed transactions (1/1, 3/1 and 5/1)
expected_txn_days = 3

# unique_coins_traded: only traded floki
expected_coins = 1

# total_volume: |100| + |500| + |100| = 700
expected_volume = 700

# average_transaction: 700 total volume / 3 transactions = 233.33
expected_avg_txn = 700/3


# time_weighted_balance, weighted by days held (2024 is a leap year, so 1/1-3/1 is 60 days):
# 100    * 60 days  (1/1-3/1)  = 6000
# 33.333 * 61 days  (3/1-5/1)  = 2033.333
# 11.111 * 153 days (5/1-10/1) = 1700
# NOTE(review): 33.333 and 11.111 are presumably the cost basis left after each
# sale removed 2/3 of the position - confirm against wtf.calculate_crypto_balance_columns.
# The denominator is the same 60 + 61 + 153 = 274 days used for expected_density.
expected_twb = ((100 * 60) + (33.3333 * 61) + (11.1111 * 153)) / (60 + 61 + 153) # 35.52

expected_density = 3 / 274

# Derived from the values above so it cannot drift from expected_twb
expected_ratio = expected_volume / expected_twb

# Assert all values match
assert wallet_features['total_crypto_buys'] == expected_buys
assert wallet_features['total_crypto_sells'] == expected_sells
assert wallet_features['net_crypto_investment'] == expected_net
assert wallet_features['crypto_net_gain'] == expected_gain
assert wallet_features['transaction_days'] == expected_txn_days
assert wallet_features['unique_coins_traded'] == expected_coins
assert wallet_features['total_volume'] == expected_volume
# 700/3 is not exactly representable, so compare with a tolerance instead of ==
assert np.isclose(wallet_features['average_transaction'], expected_avg_txn)
assert np.isclose(wallet_features['time_weighted_balance'], expected_twb, rtol=1e-2)
assert np.isclose(wallet_features['activity_density'], expected_density, rtol=1e-2)
assert np.isclose(wallet_features['volume_vs_twb_ratio'], expected_ratio, rtol=1e-2)

In [None]:
wallet_features['time_weighted_balance']

In [None]:
@pytest.mark.unit
def test_ratio_metrics_after_remapping(test_profits_df,
                                     test_remapped_profits_df,
                                     test_remapped_trading_features_df):
    """
    Verifies ratio-based metrics are calculated correctly after wallet remapping.

    Tests:
    1. average_transaction = total_volume / num_transactions
    2. volume_vs_twb_ratio = total_volume / time_weighted_balance

    Params:
    - test_profits_df (tuple): (profits_df, training_period_start, training_period_end)
    - test_remapped_profits_df (tuple): same shape, with remapped wallet addresses
    - test_remapped_trading_features_df (DataFrame): features indexed by remapped wallet
    """
    # Unpack tuples
    test_profits_df, _, _ = test_profits_df
    test_remapped_profits_df, _, _ = test_remapped_profits_df

    # np.allclose compares by position, not by index label, so every series
    # compared below is taken from this one index-sorted frame
    features_df = test_remapped_trading_features_df.sort_index()

    # Create mapping from original to new wallets
    wallet_mapping = (test_remapped_profits_df[['wallet_address', 'wallet_address_original']]
                        .drop_duplicates()
                        .set_index('wallet_address_original')['wallet_address'])

    # Count transactions and sum volumes for non-imputed rows
    metrics = (test_profits_df[~test_profits_df['is_imputed']]
                .assign(new_wallet=lambda x: x['wallet_address'].map(wallet_mapping))
                .groupby('new_wallet')
                .agg(
                    txn_count=('usd_net_transfers', 'size'),
                    total_volume=('usd_net_transfers', lambda x: abs(x).sum())
                ))

    expected_avg_transaction = (metrics['total_volume'] / metrics['txn_count']).sort_index()

    # Get actual metrics
    actual = features_df[[
        'average_transaction',
        'volume_vs_twb_ratio'
    ]]

    # Compare metrics
    assert np.allclose(expected_avg_transaction, actual['average_transaction'], rtol=1e-2), \
        "Average transaction doesn't match after remapping"

    # Verify volume_vs_twb_ratio calculation; wallets with no time weighted
    # balance are expected to report a ratio of 0
    expected_ratio = np.where(
        features_df['time_weighted_balance'] > 0,
        features_df['total_volume'] /
        features_df['time_weighted_balance'],
        0
    )
    assert np.allclose(expected_ratio, actual['volume_vs_twb_ratio'], rtol=1e-2), \
        "Volume vs TWB ratio doesn't match expected calculation"

In [None]:
test_profits_df

In [None]:
period_days

In [None]:
expected_density

In [None]:
actual

In [None]:
test_profits_df.head()

In [None]:
test_trading_features_df.head()

In [None]:

expected = (
    test_trading_features_df['total_volume'] /
    test_trading_features_df['time_weighted_balance']
)
actual = test_trading_features_df['volume_vs_twb_ratio']
# assert np.allclose(actual, expected, rtol=1e-2)

expected

In [None]:
# Unpack tuples
test_profits_df, _, _ = test_profits_df
test_remapped_profits_df, _, _ = test_remapped_profits_df

# Lookup from each original wallet address to the wallet it was remapped onto
address_pairs = test_remapped_profits_df[['wallet_address', 'wallet_address_original']].drop_duplicates()
wallet_mapping = address_pairs.set_index('wallet_address_original')['wallet_address']

# Non-imputed rows tagged with their remapped wallet, grouped once for both metrics
real_activity = test_profits_df[~test_profits_df['is_imputed']].assign(
    new_wallet=lambda x: x['wallet_address'].map(wallet_mapping)
)
activity_by_wallet = real_activity.groupby('new_wallet')

# Expected transaction days: distinct dates with non-imputed activity
active_dates = activity_by_wallet['date'].nunique().sort_index()

# Expected unique coins: distinct coins with non-imputed activity
unique_coins = activity_by_wallet['coin_id'].nunique().sort_index()

# Expected activity density: active days over the inclusive span of all dates
all_dates = pd.to_datetime(test_profits_df['date'])
period_days = (all_dates.max() - all_dates.min()).days + 1
expected_density = active_dates / period_days

# Actual metrics, sorted by wallet so the positional comparisons line up
metric_cols = ['transaction_days', 'unique_coins_traded', 'activity_density']
actual = test_remapped_trading_features_df[metric_cols].sort_index()

# Compare each metric with specific error messages
assert np.allclose(active_dates, actual['transaction_days'], equal_nan=True), \
    "Transaction days don't match after remapping"
assert np.allclose(unique_coins, actual['unique_coins_traded'], equal_nan=True), \
    "Unique coins traded don't match after remapping"
assert np.allclose(expected_density, actual['activity_density'], equal_nan=True), \
    "Activity density doesn't match after remapping"

In [None]:
((100 * 60) + (33.3333 * 61) + (11.1111 * 153)) / (59 + 61 + 153)

In [None]:
pd.to_datetime(period_start_date)

In [None]:
profits_df.dtypes

In [None]:
# Work on a copy of the inspected wallet's rows so the scratch columns added
# below do not touch crypto_cols_profits_df
profits_df = crypto_cols_profits_df[crypto_cols_profits_df['wallet_address']==w].copy()
period_start_date = training_period_start

# Sort by date and calculate holding period for each balance
profits_df = profits_df.sort_values(['wallet_address', 'coin_id', 'date'])


# Calculate days held for each balance level: time until the next row of the
# same wallet-coin pair, in (possibly fractional) days
profits_df['next_date'] = profits_df.groupby(['wallet_address', 'coin_id'])['date'].shift(-1)
profits_df['days_held'] = (
    profits_df['next_date'] - profits_df['date']
).dt.total_seconds() / (24 * 60 * 60)

# The final row of each coin has no following row, so it contributes 0 days
last_mask = profits_df['next_date'].isna()
profits_df.loc[last_mask, 'days_held'] = 0

# Calculate weighted cost (cost basis * days held)
profits_df['weighted_cost'] = profits_df['crypto_cost_basis'] * profits_df['days_held']

# First group by wallet-coin to get coin-level TWB
coin_level_twb = profits_df.groupby(['wallet_address', 'coin_id']).agg(
    total_days=('days_held', 'sum'),
    sum_weighted_cost=('weighted_cost', 'sum')
)
# A pair with a single row has total_days == 0, which yields NaN here
coin_level_twb['coin_twb'] = (
    coin_level_twb['sum_weighted_cost'] / coin_level_twb['total_days']
)

# Then sum up to wallet level
wallet_twb = coin_level_twb.groupby('wallet_address')['coin_twb'].sum()

profits_df

In [None]:
# Clamp each row's date so it is never earlier than the period start (the
# element-wise equivalent of SQL GREATEST, which is not a Python builtin).
# Assumes profits_df['date'] is already a datetime column - TODO confirm.
profits_df['start_date'] = profits_df['date'].clip(lower=pd.to_datetime(period_start_date))

In [None]:
((500 * 32) + (10500 * 1) + (242 * (10500 * (1 - (10000/10400))))) / (32 + 1 + 242)


In [None]:
(242 * (10500 * (1 - (10000/10400))))

In [None]:
20000/451.7482