In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import gc
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
import scipy
from scipy import stats
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Display floats with up to 12 significant digits
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Suppress warnings whose message starts with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment variables: silence the pygame donation request and set the
# logging file and alert sound locations
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'LOGGING_FILE': "../../../Local/logs/wallet_modeling.log",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with light green-gray text and ticks
plt.rcParams.update({
    **dict.fromkeys(
        ('figure.facecolor', 'axes.facecolor'),
        '#181818',
    ),
    **dict.fromkeys(
        ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color', 'axes.titlecolor'),
        '#afc6ba',
    ),
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.wallet_features_orchestrator as wfo

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.wallet_validation_analysis as wiwv

# Coin features
import coin_wallet_features.wallet_balance_features as cwb

# Coin modeling
import coin_modeling.coin_model as cm

# Coin insights
import coin_insights.coin_model_evaluation as cime
import coin_insights.coin_validation_analysis as civa


# Reload every local module so that code edited on disk is picked up by the running kernel
modules = [
    # core pipeline modules
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
    # wallet modeling
    wmo, wtd, wmr, wm, wem,
    # wallet features
    wcl, wmc, wmt, wpf, wtf, wts, wfo,
    # wallet insights
    wime, wiwv,
    # coin features
    cwb,
    # coin modeling
    cm,
    # coin insights
    cime, civa,
]
for _module in modules:
    importlib.reload(_module)

# Load all configs
# The four base configs come from the config directory in a single call
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# The wallets config is wrapped in a WalletsConfig object and refreshed right away
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_config.reload()

# The remaining wallet configs are read as plain parsed YAML
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)
wallets_coin_config = yaml.safe_load(
    Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8')
)


# Configure the logger at INFO level
logger = dc.setup_logger()
# logger = u.setup_local_logging(logger)
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
# Export project source code for the directories listed below; only
# 'wallet_features' is currently enabled, the other entries are toggled off.
# NOTE(review): the export destination and format are defined inside
# u.export_code(), which is not visible here.
u.export_code(
    code_directories=[
        # 'training_data',
        'wallet_features',
        # 'wallet_modeling',
        # 'wallet_insights'
    ],
    # include_config = True,
    # ipynb_notebook = 'DDA-456 wallet validation performance.ipynb'
)

# presumably reports the memory usage of objects in the session; confirm against u.obj_mem()
u.obj_mem()

# Wallet Model Construction

## Training Data Sequence

### retrieve training datasets

In [None]:
# Refresh local modules and the wallet configs before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Complete Pre-Training Profits/Market Data
# -----------------------------------------
# Retrieve the datasets for the configured training period. The three return
# values are discarded; later cells read the results back from
# temp/wallet_modeling_dfs instead.
# NOTE(review): the files are presumably written inside
# retrieve_period_datasets() using parquet_prefix; confirm in wmo.
_, _, _ = wmo.retrieve_period_datasets(
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
    parquet_prefix='training',
)


### define cohort and clean training datasets (loadable parquet)

In [None]:
# Refresh local modules and the wallet configs before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)



# Add Indicators to Market Data
# ----------------------------------------------------------
# Read the market data parquet that includes pre-training history
training_market_data_df_full = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_data_df_full.parquet"
)

# Generate the indicators; the returned df is not needed in this cell
# NOTE(review): the indicators file read by later cells is presumably saved
# inside generate_training_indicators_df(); confirm in wmo.
_ = wmo.generate_training_indicators_df(training_market_data_df_full, wallets_metrics_config)



# Identify Wallet Cohort
# ----------------------------------------------------------
# Keep only market data dated on or after the starting balance date
training_market_data_df = training_market_data_df_full.loc[
    training_market_data_df_full['date']
    >= wallets_config['training_data']['training_starting_balance_date']
]
u.assert_period(
    training_market_data_df,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
)
del training_market_data_df_full
gc.collect()

# Read the full profits history
training_profits_df_full = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_profits_df_full.parquet"
)

# Define the wallet cohort; returns the cohort-filtered profits df and the cohort itself
training_profits_df, training_wallet_cohort = wmo.define_training_wallet_cohort(
    training_profits_df_full,
    training_market_data_df,
)
u.assert_period(
    training_profits_df,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
)
del training_profits_df_full
gc.collect()


# Generate Cohort-Filtered Profits Data for Training Windows
# ----------------------------------------------------------
# Split the cohort-filtered profits into one df per training window
training_windows_profits_dfs = wmo.split_training_window_profits_dfs(
    training_profits_df,
    training_market_data_df,
    training_wallet_cohort,
)

# Persist the cohort-filtered profits so later cells can reload them, then free memory
training_profits_df.to_parquet(
    "temp/wallet_modeling_dfs/training_profits_df.parquet",
    index=True,
)
del training_profits_df, training_market_data_df
gc.collect()



# Retrieve Transfers Data
# ----------------------------------------------------------
# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_training_cohort
training_transfers_sequencing_df = wts.retrieve_transfers_sequencing()
training_transfers_sequencing_df.to_parquet(
    "temp/wallet_modeling_dfs/training_transfers_sequencing_df.parquet",
    index=True,
)
del training_transfers_sequencing_df
gc.collect()


### generate training features (loadable parquet)

In [None]:
# Refresh local modules and the wallets config before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()


# Load files
# ----------------------------------------------------------
training_profits_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_profits_df.parquet"
)
training_market_indicators_data_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet"
)
training_transfers_sequencing_df = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_transfers_sequencing_df.parquet"
)

# The cohort is rebuilt as the distinct wallet addresses present in the profits data
training_wallet_cohort = list(set(training_profits_df['wallet_address']))


# Generate Features for the Full Training Period
# ----------------------------------------------------------
logger.info("Generating features for full training period...")
training_wallet_features_df = wfo.calculate_wallet_features(
    training_profits_df,
    training_market_indicators_data_df,
    training_transfers_sequencing_df,
    training_wallet_cohort,
    wallets_config['training_data']['training_period_start'],
    wallets_config['training_data']['training_period_end'],
)

# Start training_data_df from the full-period features, tagging every column
# with the "|all_windows" suffix
training_data_df = training_wallet_features_df.add_suffix("|all_windows")

# del training_profits_df,training_wallet_features_df
gc.collect()

In [None]:
# Refresh local modules and the wallets config before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()


# Generate Features for Each Individual Window
# ----------------------------------------------------------
# Windows are numbered from 1; the number becomes the column suffix
for i, window_profits_df in enumerate(training_windows_profits_dfs, start=1):
    logger.info("Generating features for window %s...", i)

    # Derive the window boundaries from the df itself: the earliest date is the
    # opening balance date, so the window starts on the following day
    window_opening_balance_date = window_profits_df['date'].min()
    window_end_date = window_profits_df['date'].max()
    window_start_date = window_opening_balance_date + timedelta(days=1)

    # Generate the features
    window_wallet_features_df = wfo.calculate_wallet_features(
        window_profits_df,                   # profits_df is filtered to the window
        training_market_indicators_data_df,  # full training period df
        training_transfers_sequencing_df,    # full training period df
        training_wallet_cohort,              # full training cohort
        window_start_date.strftime('%Y-%m-%d'),  # window-specific start
        window_end_date.strftime('%Y-%m-%d'),    # window-specific end
    )

    # Fail fast if the window's features contain any NaN values
    nan_columns = window_wallet_features_df.columns[
        window_wallet_features_df.isna().any(axis=0)
    ].tolist()
    if nan_columns:
        raise ValueError(f"NaN values detected in window {i} in columns: {nan_columns}")

    # Tag the window's columns and left join them onto training_data_df
    window_wallet_features_df = window_wallet_features_df.add_suffix(f'|w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')

    # Fail fast if the join introduced NaN values
    nan_columns = training_data_df.columns[
        training_data_df.isna().any(axis=0)
    ].tolist()
    if nan_columns:
        raise ValueError(f"NaN values detected in training_data_df after window {i} in columns: {nan_columns}")


# Release the loop leftovers and the large full-period inputs
del (
    window_profits_df,
    window_wallet_features_df,
    training_market_indicators_data_df,
    training_transfers_sequencing_df,
)
gc.collect()

u.obj_mem()

In [None]:
# Generate Clusters Using All Other Features
# ----------------------------------------------------------
# Build the cluster features from training_data_df and tag them with a 'cluster|' prefix
training_cluster_features_df = (
    wcl.create_basic_cluster_features(training_data_df)
    .add_prefix('cluster|')
)
training_data_df = training_data_df.join(training_cluster_features_df, how='inner')



# Save TRAINING_DATA_DF
# ----------------------------------------------------------
# Every wallet in the input cohort must still be present in the final index
missing_wallets = set(training_wallet_cohort).difference(training_data_df.index)
if missing_wallets:
    raise ValueError(
        f"Lost {len(missing_wallets)} wallets from original cohort during feature generation. "
        f"First few missing: {list(missing_wallets)[:5]}"
    )
logger.info("Feature generation complete. Final training_df shape: %s", training_data_df.shape)


# Write to parquet, then drop the in-memory copies
training_data_df.to_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    index=True,
)
del training_data_df, training_cluster_features_df
gc.collect()
u.obj_mem()

## Wallet Modeling Data

### Retrieve modeling period datasets

In [None]:
# Refresh local modules and the wallet configs before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Retrieve Modeling Profits and Market Data
# ----------------------------------------------------------
# The training coin cohort is the set of distinct coin_ids in the saved
# training indicators file; only that column is read
training_coin_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet",
    columns=['coin_id'],
)['coin_id'].unique()

# Retrieve the modeling period datasets, passing the training coins as the coin cohort
modeling_profits_df_full, modeling_market_data_df_full, modeling_coin_cohort = (
    wmo.retrieve_period_datasets(
        wallets_config['training_data']['modeling_period_start'],
        wallets_config['training_data']['modeling_period_end'],
        coin_cohort=training_coin_cohort,
    )
)

# Drop prices dated before the modeling starting balance date
modeling_market_data_df = modeling_market_data_df_full.loc[
    modeling_market_data_df_full['date']
    >= wallets_config['training_data']['modeling_starting_balance_date']
]
del modeling_market_data_df_full, training_coin_cohort
gc.collect()


# Filter to only training wallet cohort
# Reading with columns=[] loads just the index of the saved training data
training_wallet_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    columns=[],
).index.values
modeling_profits_df = modeling_profits_df_full.loc[
    modeling_profits_df_full['wallet_address'].isin(training_wallet_cohort)
]
del modeling_profits_df_full
gc.collect()


# Assert period, save files, remove from memory
for _period_df in (modeling_market_data_df, modeling_profits_df):
    u.assert_period(
        _period_df,
        wallets_config['training_data']['modeling_period_start'],
        wallets_config['training_data']['modeling_period_end'],
    )
del _period_df
modeling_profits_df.to_parquet(
    "temp/wallet_modeling_dfs/modeling_profits_df.parquet",
    index=False,
)
modeling_market_data_df.to_parquet(
    "temp/wallet_modeling_dfs/modeling_market_data_df.parquet",
    index=False,
)
del modeling_profits_df, modeling_market_data_df
gc.collect()

## Wallet Model Target Variable and Wallet Cohort

### define modeling cohort and features (loadable parquet)

In [None]:
# Refresh local modules and the wallets config before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()


# Create training_cohort-Indexed modeling_wallet_features_df
# -----------------------------------------------------------
# Start from an empty frame whose index is the training wallet cohort
# (columns=[] reads only the index of the saved training data)
training_wallet_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_data_df.parquet",
    columns=[],
).index.values
modeling_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
modeling_wallet_features_df.index.name = 'wallet_address'

# Store feature sets with their prefixes for bulk renaming
feature_column_names = {}


# Identify Modeling Period Cohort
# -----------------------------------------------------------
# Compute modeling period trading features and left join them onto the cohort
# frame; wallets missing from the result get 0 in every joined column
modeling_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet")
modeling_trading_features_df = wmo.identify_modeling_cohort(modeling_profits_df)
modeling_wallet_features_df = (
    modeling_wallet_features_df
    .join(modeling_trading_features_df, how='left')
    .fillna(dict.fromkeys(modeling_trading_features_df.columns, 0))
)


# Generate Modeling Period Performance Features
# -----------------------------------------------------------
# Performance metrics are calculated for the whole training cohort (wallets with
# 0 activity still impact rank orders), then joined with the same zero fill
modeling_performance_features_df = wpf.calculate_performance_features(modeling_wallet_features_df)
modeling_wallet_features_df = (
    modeling_wallet_features_df
    .join(modeling_performance_features_df, how='left')
    .fillna(dict.fromkeys(modeling_performance_features_df.columns, 0))
)

## Model Construction and Analysis

### select target variable and build model

In [None]:
# Refresh local modules and the wallets config before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Create MODELING_DF and Construct Wallet Model
# ----------------------------------------------------------
# Load the saved training features for the full training wallet cohort
training_data_df = pd.read_parquet("temp/wallet_modeling_dfs/training_data_df.parquet")

# Select the cohort flag together with the configured target variable column
modeling_cohort_target_var_df = modeling_wallet_features_df[
    [
        'in_modeling_cohort',
        wallets_config['modeling']['target_variable'],
    ]
]

# Run the experiment and get results
wallet_model = wm.WalletModel(wallets_config)
model_results = wallet_model.run_experiment(
    training_data_df,
    modeling_cohort_target_var_df,
)
# del training_data_df
# gc.collect()

# Pull the fitted regressor step out of the results pipeline
model = model_results['pipeline'].named_steps['regressor']

# Generate and save all model artifacts, then persist the wallet scores
model_id, evaluator, modeling_wallet_scores_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    base_path='../wallet_modeling',
)
modeling_wallet_scores_df.to_parquet(
    "temp/wallet_modeling_dfs/modeling_wallet_scores_df.parquet",
    index=True,
)


u.notify()

### assess wallet model performance

In [None]:
# Refresh local modules and the wallets config before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()

# Rebuild the evaluator from the stored model results so that changes to the
# reloaded evaluation module take effect
evaluator = wime.RegressionEvaluator(
    y_train=model_results['y_train'],
    y_true=model_results['y_test'],
    y_pred=model_results['y_pred'],
    training_cohort_pred=model_results['training_cohort_pred'],
    training_cohort_actuals=model_results['training_cohort_actuals'],
    model=model,
    feature_names=model_results['X_train'].columns.tolist(),
)

# Print the text summary, draw the evaluation plots and show the importance summary
print(evaluator.summary_report())
evaluator.plot_evaluation()
evaluator.importance_summary()

### Cluster analysis

In [None]:
# Refresh local modules and the wallets config before running the cell
for _module in modules:
    importlib.reload(_module)
wallets_config.reload()


# Number of clusters and number of top-importance features to report on
n_clusters = 4
x_features = 8

# Names of the x_features features with the highest importance in the model
top_feature_metrics = (
    pd.DataFrame(evaluator.metrics['importances'])
    .sort_values(by='importance', ascending=False)
    .head(x_features)['feature']
    .tolist()
)
# Deduplicate the feature names (ordering is not preserved)
all_metrics = list(set(top_feature_metrics))


styled_df = wime.create_cluster_report(training_data_df, model_results, n_clusters, all_metrics)
styled_df

# Validation Period Analysis

## Original coin model feature generation

In [None]:
# Reload all modules and all configs
for _module in modules:
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Generate features based on the coin config files; only the first of the three
# returned values is kept
coin_features_training_data_df, _, _ = tw.generate_all_time_windows_model_inputs(
    config,
    metrics_config,
    modeling_config,
)

# Drop the time_window index level since it isn't being used for now
coin_features_training_data_df = coin_features_training_data_df.reset_index(
    level='time_window',
    drop=True,
)

# Save to parquet
coin_features_training_data_df.to_parquet(
    "temp/coin_modeling_dfs/coin_features_training_data_df.parquet",
    index=True,
)

u.notify()

In [None]:
# 1. Retrieve base datasets used by all windows
# ---------------------------------------------
# Returns four dfs (macro trends, market data, profits, prices) that the
# following cells pass to the window-level feature generation
macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(config,
                                                                                        metrics_config)


# FILTER TO COIN COHORT
# NOTE(review): no filtering is performed in this cell; the heading above looks
# like a placeholder for a step that has not been written yet


In [None]:
# Number of distinct coins in market_data_df
coin_ids = market_data_df['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that market_data_df does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:

# 2. Generate flattened features for each dataset in each window
# --------------------------------------------------------------
# Generate time_windows config overrides that will modify each window's config settings
time_windows = tw.generate_time_windows(config)

# Accumulators for the flattened dfs and their filepaths across all windows
all_flattened_dfs = []
all_flattened_filepaths = []

# Iterate the windows directly: the index was never used, and binding it to `_`
# shadows IPython's last-output variable in the notebook namespace
for time_window in time_windows:

    # Prepare time window config files
    window_config, window_metrics_config, window_modeling_config = (
        exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window))

    # Generate flattened feature dfs for all datasets for the window
    window_flattened_dfs, window_flattened_filepaths = tw.generate_window_flattened_dfs(
        market_data_df,
        macro_trends_df,
        profits_df,
        prices_df,
        window_config,
        window_metrics_config,
        window_modeling_config
    )

    # Store window's flattened features
    all_flattened_dfs.extend(window_flattened_dfs)
    all_flattened_filepaths.extend(window_flattened_filepaths)

# 3. Combine features from all datasets in all time windows with target variables
# -------------------------------------------------------------------------------
# Combine all time windows for each dataset, then join the datasets together
concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
# NOTE(review): this rebinds the notebook-level `training_data_df` that the
# wallet model cells also use; rerun those cells' loaders before reusing it there
training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)


In [None]:
# Reload all modules and all configs
for _module in modules:
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# def generate_window_flattened_dfs(

# Accumulators for this window's flattened dfs and filepaths
window_flattened_dfs = []
window_flattened_filepaths = []

# # Market data: generate window-specific flattened metrics
# flattened_market_data_df, flattened_market_data_filepath = fg.generate_window_time_series_features(
#     market_data_df,
#     'time_series-market_data',
#     config,
#     metrics_config['time_series']['market_data'],
#     modeling_config
# )
# window_flattened_dfs.extend([flattened_market_data_df])
# window_flattened_filepaths.extend([flattened_market_data_filepath])

# Macro trends: generate window-specific flattened metrics, skipped when the df
# has no rows or no columns other than 'date'
if not macro_trends_df.drop(columns='date').empty:
    flattened_macro_trends_df, flattened_macro_trends_filepath = (
        fg.generate_window_macro_trends_features(
            macro_trends_df,
            'macro_trends',
            config,
            metrics_config,
            modeling_config,
        )
    )
    window_flattened_dfs.append(flattened_macro_trends_df)
    window_flattened_filepaths.append(flattened_macro_trends_filepath)

# Cohorts: generate window-specific flattened metrics (one df and filepath per cohort)
flattened_cohort_dfs, flattened_cohort_filepaths = fg.generate_window_wallet_cohort_features(
    profits_df,
    prices_df,
    config,
    metrics_config,
    modeling_config,
)
window_flattened_dfs.extend(flattened_cohort_dfs)
window_flattened_filepaths.extend(flattened_cohort_filepaths)



In [None]:
# Macro trends: generate window-specific flattened metrics, skipped when the df
# has no rows or no columns other than 'date'. The results are not appended to
# the window accumulators in this cell.
if not macro_trends_df.drop(columns='date').empty:
    flattened_macro_trends_df, flattened_macro_trends_filepath = (
        fg.generate_window_macro_trends_features(
            macro_trends_df,
            'macro_trends',
            config,
            metrics_config,
            modeling_config,
        )
    )
    # window_flattened_dfs.extend([flattened_macro_trends_df])
    # window_flattened_filepaths.extend([flattened_macro_trends_filepath])


In [None]:
# Display the flattened macro trends df for inspection; it only exists if the
# macro trends branch ran in an earlier cell
flattened_macro_trends_df

In [None]:
# Signal that the preceding cells have finished
# NOTE(review): presumably plays the sound at ALERT_SOUND_FILEPATH; confirm in u.notify()
u.notify()

In [None]:
# Number of distinct coins in the first flattened cohort df
coin_ids = flattened_cohort_dfs[0]['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that the flattened cohort df does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')


In [None]:
# Number of distinct coins in market_data_df
coin_ids = market_data_df['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that market_data_df does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

#### generate_window_wallet_cohort_features()

In [None]:
# def generate_window_wallet_cohort_features(

# 1. Impute all required dates
# ----------------------------
# Identify all required imputation dates from the config
imputation_dates = pri.identify_imputation_dates(config)

# Impute profits rows for every required date using 24 threads
window_profits_df = pri.impute_profits_for_multiple_dates(
    profits_df,
    prices_df,
    imputation_dates,
    n_threads=24,
)

# Keep only rows between the earliest and latest imputation dates, both inclusive
window_profits_df = window_profits_df[
    window_profits_df['date'].between(
        pd.to_datetime(min(imputation_dates)),
        pd.to_datetime(max(imputation_dates)),
    )
]



In [None]:
# Number of distinct coins in window_profits_df
coin_ids = window_profits_df['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that window_profits_df does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:
# Reload all modules and all configs
for _module in modules:
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# 2. Generate metrics and indicators for all cohorts
# --------------------------------------------------
# Set up lists to store flattened cohort data
flattened_cohort_dfs = []
flattened_cohort_filepaths = []

# The cohort loop is replaced by a single hard-coded cohort for debugging
# for cohort_name in metrics_config['wallet_cohorts']:
cohort_name = 'whales'

# Load the cohort's metrics config and dataset config
dataset_metrics_config = metrics_config['wallet_cohorts'][cohort_name]
dataset_config = config['datasets']['wallet_cohorts'][cohort_name]

# Restrict profits to the cohort lookback: lookback_period days before the
# training period start, onwards
training_period_start = config['training_data']['training_period_start']
cohort_lookback = dataset_config['lookback_period']
cohort_lookback_start = pd.to_datetime(training_period_start) - timedelta(days=cohort_lookback)
cohort_profits_df = window_profits_df.loc[window_profits_df['date'] >= cohort_lookback_start]

# Classify wallets over the full lookback period and keep the addresses flagged in_cohort
cohort_summary_df = cwm.classify_wallet_cohort(cohort_profits_df, dataset_config, cohort_name)
cohort_wallets = cohort_summary_df.loc[cohort_summary_df['in_cohort'], 'wallet_address']

# # # If no cohort members were identified, continue
# # if len(cohort_wallets) == 0:
# #     logger.info("No wallets identified as members of cohort '%s'", cohort_name)
# #     continue


In [None]:
# Keep an independent copy of the lookback-filtered profits so later cells can
# rebind cohort_profits_df without losing this version
cohort_profits_df_full = cohort_profits_df.copy()

In [None]:
# Number of distinct coins in cohort_profits_df
coin_ids = cohort_profits_df['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that cohort_profits_df does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:

# Generate cohort buysell_metrics
# Inputs are the lookback-filtered profits, the configured training period end
# and the wallet addresses classified as cohort members
cohort_metrics_df = cwm.generate_buysell_metrics_df(cohort_profits_df,
                                                    config['training_data']['training_period_end'],
                                                    cohort_wallets)

# The remaining steps (indicators, flattening, appending to the accumulators)
# are disabled below, so this cell stops after the buysell metrics
# # Generate cohort indicator metrics
# cohort_metrics_df = ind.generate_time_series_indicators(cohort_metrics_df,
#                                                         metrics_config['wallet_cohorts'][cohort_name],
#                                                         'coin_id')

# # Flatten cohort metrics
# flattened_cohort_df, flattened_cohort_filepath = fg.generate_window_time_series_features(
#     cohort_metrics_df,
#     f'wallet_cohorts-{cohort_name}',
#     config,
#     dataset_metrics_config,
#     modeling_config
# )

# flattened_cohort_dfs.extend([flattened_cohort_df])
# flattened_cohort_filepaths.extend([flattened_cohort_filepath])

In [None]:
# Number of distinct coins in cohort_metrics_df
coin_ids = cohort_metrics_df['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that cohort_metrics_df does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

#### cwm.generate_buysell_metrics_df()

In [None]:
# Number of distinct coins in the saved copy cohort_profits_df_full
coin_ids = cohort_profits_df_full['coin_id']
print(len(set(coin_ids)))

# Coins in the modeling period profits data that cohort_profits_df_full does not contain
wallet_features_only = set(modeling_profits_df['coin_id']).difference(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:
profits_df = cohort_profits_df_full.copy()
training_period_end = config['training_data']['training_period_end']
cohort_wallets = cohort_wallets

# def generate_buysell_metrics_df(profits_df,training_period_end,cohort_wallets):

start_time = time.time()
logger.debug('Preparing buysell_metrics_df...')


# Step 1: Filter profits_df to cohort and conduct data quality checks
# -------------------------------------------------------------------
# Raise an error if either the wallet cohort or coin list is empty
if len(cohort_wallets) == 0:
    raise ValueError("Wallet cohort is empty. Provide at least one wallet address.")

# Create cohort_profits_df by filtering profits_df to only include the cohort coins and wallets
# during the training period
profits_df = profits_df[profits_df['date']<=training_period_end]
cohort_profits_df = profits_df[profits_df['wallet_address'].isin(cohort_wallets)]

logger.info("No wallet cohort activity found for %s coins during the window.",
            len(profits_df['coin_id'].unique()) - len(cohort_profits_df['coin_id'].unique()))

cohort_profits_df = cohort_profits_df[['coin_id','wallet_address','date','usd_balance','usd_net_transfers']]

# Raise an error if the filtered df is empty
if cohort_profits_df.empty:
    raise ValueError("Cohort-filtered profits_df is empty. Please check input parameters")


# Step 2: Add buy_sequence and sell_sequence columns
# --------------------------------------------------
# Initialize the buy and sell sequence columns
cohort_profits_df['buy_sequence'] = np.where(cohort_profits_df['usd_net_transfers'] > 0, 1, np.nan)
cohort_profits_df['sell_sequence'] = np.where(cohort_profits_df['usd_net_transfers'] < 0, 1, np.nan)

# Calculate cumulative sum to simulate transfer sequence, skipping rows where usd_net_transfers == 0
cohort_profits_df['buy_sequence'] = cohort_profits_df.groupby(['coin_id', 'wallet_address'], observed=True)['buy_sequence'].cumsum()
cohort_profits_df['sell_sequence'] = cohort_profits_df.groupby(['coin_id', 'wallet_address'], observed=True)['sell_sequence'].cumsum()

# Set buy_sequence and sell_sequence to null where usd_net_transfers == 0
cohort_profits_df.loc[cohort_profits_df['usd_net_transfers'] == 0, ['buy_sequence', 'sell_sequence']] = np.nan


# Step 3: Calculate coin metrics
# ------------------------------
# Collect one features DataFrame per coin_id
coin_features_list = []

for c in cohort_profits_df['coin_id'].unique():
    # Hand the metrics function an independent copy of this coin's rows
    coin_rows_mask = cohort_profits_df['coin_id'] == c
    coin_cohort_profits_df = cohort_profits_df[coin_rows_mask].copy()
    coin_features_df = cwm.generate_coin_buysell_metrics_df(coin_cohort_profits_df)

    # Tag the result with its coin_id before collecting it
    coin_features_df['coin_id'] = c
    coin_features_list.append(coin_features_df)


# Step 4: Consolidate all metrics into a filled DataFrame
# -------------------------------------------------------
# Stack the per-coin feature frames and pass them to fill_buysell_metrics_df together
# with training_period_end
buysell_metrics_df = pd.concat(coin_features_list, ignore_index=True)
buysell_metrics_df = cwm.fill_buysell_metrics_df(buysell_metrics_df, training_period_end)

elapsed_seconds = time.time() - start_time
logger.info('Generated buysell_metrics_df after %.2f seconds.', elapsed_seconds)


# Diagnostic: how many distinct coins the cohort rows cover, and how many
# modeling_profits_df coins they do not cover
coin_ids = cohort_profits_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:


# Diagnostic: distinct coins in cohort_profits_df versus coins in modeling_profits_df
coin_ids = cohort_profits_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

# Coins present in modeling_profits_df but absent from cohort_profits_df
wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# Stand-ins for the arguments of the function being prototyped below
all_windows_time_series_df = market_data_df.copy()
dataset_name = 'time_series-market_data'
dataset_metrics_config = metrics_config['time_series']['market_data']
training_data_config = config['training_data']


# def generate_window_time_series_features(

# Restrict the input to the training period
window_time_series_df = cwm.apply_period_boundaries(
    all_windows_time_series_df,
    training_data_config['training_period_start'],
    training_data_config['training_period_end'],
)

# Flatten the coin-date metrics so the result is keyed only on coin_id
flattened_metrics_df = flt.flatten_coin_date_df(
    window_time_series_df,
    dataset_metrics_config,
    training_data_config['training_period_end']
)

# Stamp every row with the modeling period start as its time window
flattened_metrics_df.loc[:,'time_window'] = training_data_config['modeling_period_start']

# Prefix every non-key column with the dataset name ('-' swapped for '|') so the
# lineage of each feature is visible in its column name
key_columns = ('coin_id', 'time_window')
column_prefix = dataset_name.replace('-', '|')
flattened_metrics_df = flattened_metrics_df.rename(
    columns={
        col: f"{column_prefix}|{col}"
        for col in flattened_metrics_df.columns
        if col not in key_columns
    }
)

# Save the flattened output and keep the returned file path
flattened_outputs_folder = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs'
)
_, flattened_metrics_filepath = flt.save_flattened_outputs(
    flattened_metrics_df,
    flattened_outputs_folder,
    dataset_name,
    training_data_config['modeling_period_start']
)


flattened_metrics_df.head()

In [None]:

# Diagnostic: distinct coins in flattened_metrics_df versus coins in modeling_profits_df
coin_ids = flattened_metrics_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

# Coins present in modeling_profits_df but absent from flattened_metrics_df
wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')


In [None]:
# Count missing values per column of window_time_series_df
window_time_series_df.isna().sum()

In [None]:
# Summary statistics for window_time_series_df
window_time_series_df.describe()

In [None]:
# Diagnostic: distinct coins in window_time_series_df versus coins in modeling_profits_df
coin_ids = window_time_series_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

# Coins present in modeling_profits_df but absent from window_time_series_df
wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')


In [None]:
# Diagnostic: distinct coins in all_windows_time_series_df versus coins in modeling_profits_df
coin_ids = all_windows_time_series_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

# Coins present in modeling_profits_df but absent from all_windows_time_series_df
wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')


In [None]:
# Latest date per coin, as a one-column frame indexed by coin_id
latest_date_by_coin = all_windows_time_series_df.groupby('coin_id',observed=True)['date'].max()
dates_df = latest_date_by_coin.to_frame()

# Number of coins whose series ends on each date
dates_df.groupby('date').size()

In [None]:
# Count missing values per column of all_windows_time_series_df
all_windows_time_series_df.isna().sum()

In [None]:
# Summary statistics for all_windows_time_series_df
all_windows_time_series_df.describe()

In [None]:
# def split_dataframe_by_coverage(
# Stand-ins for the parameters of the function being prototyped in this cell.
# The copy keeps all_windows_time_series_df untouched by anything done below.
time_series_df = all_windows_time_series_df.copy()
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['training_period_end']
id_column = 'coin_id'
drop_outside_date_range = True  # only read by the commented-out logic further down


# Convert params to datetime
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

# Drop all rows with any NaN values. time_series_df is already an independent copy
# and dropna() returns a new frame, so no second .copy() is needed.
time_series_df = time_series_df.dropna()

# # Define a function to check if a date range has full coverage
# def has_full_coverage(min_date, max_date):
#     return (min_date <= start_date) and (max_date >= end_date)

# if id_column:
#     # Multi-series data
#     series_data_range = time_series_df.groupby(id_column, observed=True)['date'].agg(['min', 'max'])
#     full_duration_series = series_data_range[series_data_range.apply(lambda x: has_full_coverage(x['min'], x['max']), axis=1)].index
# else:
#     # Single-series data
#     series_data_range = time_series_df['date'].agg(['min', 'max'])
#     full_duration_series = [0] if has_full_coverage(series_data_range['min'], series_data_range['max']) else []

# # Calculate coverage statistics
# full_coverage_count = len(full_duration_series)

# # Split the dataframe
# if id_column:
#     # Convert id column to categorical to reduce memory usage
#     time_series_df[id_column] = time_series_df[id_column].astype('category')
#     full_coverage_df = time_series_df[time_series_df[id_column].isin(full_duration_series)]
#     partial_coverage_df = time_series_df[~time_series_df[id_column].isin(full_duration_series)]
# else:
#     full_coverage_df = time_series_df if full_coverage_count else pd.DataFrame(columns=time_series_df.columns)
#     partial_coverage_df = time_series_df if not full_coverage_count else pd.DataFrame(columns=time_series_df.columns)

# logger.debug("Split df with dimensions %s into %s full coverage records and %s partial coverage records.",
#             time_series_df.shape,
#             len(full_coverage_df),
#             len(partial_coverage_df))

# if drop_outside_date_range:
#     # Remove rows outside the date range for both dataframes
#     full_coverage_df = (full_coverage_df[(full_coverage_df['date'] >= start_date) &
#                                             (full_coverage_df['date'] <= end_date)])
#     partial_coverage_df = (partial_coverage_df[(partial_coverage_df['date'] >= start_date) &
#                                                 (partial_coverage_df['date'] <= end_date)])

#     # Log the number of remaining records
#     total_remaining = len(full_coverage_df) + len(partial_coverage_df)
#     logger.debug("After removing records outside the date range, %s records remain.",
#                 total_remaining)

In [None]:
# Diagnostic: distinct coins in time_series_df versus coins in modeling_profits_df
coin_ids = time_series_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

# Coins present in modeling_profits_df but absent from time_series_df
wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')


In [None]:
# coin_ids = window_time_series_df['coin_id']


# print(len(set(coin_ids)))
# wallet_features_only = set(modeling_profits_df['coin_id']) - set(coin_ids)
# print(f'wallet features only: {len(wallet_features_only)}')

In [None]:
# Number of distinct coin_id values in window_time_series_df
len(set(window_time_series_df['coin_id']))

In [None]:
# First and last date available for each coin in window_time_series_df
window_time_series_df.groupby('coin_id',observed=True).agg({
    'date': ['min','max']
})

In [None]:
# First and last date available for each coin in partial_df
# (partial_df is defined outside this part of the notebook)
partial_df.groupby('coin_id',observed=True).agg({
    'date': ['min','max']
})

In [None]:
# Diagnostic: distinct coins in partial_df versus coins in modeling_profits_df
coin_ids = partial_df['coin_id']
distinct_coin_ids = set(coin_ids)
print(len(distinct_coin_ids))

# Coins present in modeling_profits_df but absent from partial_df
wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')


In [None]:
# Show the dimensions and first rows of flattened_macro_trends_df
# (defined outside this part of the notebook)
print(flattened_macro_trends_df.shape)
flattened_macro_trends_df.head()

In [None]:
# Diagnostic: coins present in modeling_profits_df but absent from profits_df
coin_ids = profits_df['coin_id']
distinct_coin_ids = set(coin_ids)

wallet_features_only = set(modeling_profits_df['coin_id']).difference(distinct_coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

### missing coins assessment

In [None]:
# Dimensions of coin_features_training_data_df
coin_features_training_data_df.shape

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Coin ids seen by each pipeline: the coin features frame is indexed by coin id,
# while modeling_profits_df carries it as a column
coin_features_ids = coin_features_training_data_df.index
wallet_features_ids = modeling_profits_df['coin_id']
coin_feature_id_set = set(coin_features_ids)
wallet_feature_id_set = set(wallet_features_ids)

# Set comparisons between the two id collections
both_sets_all_coin_ids = list(coin_feature_id_set | wallet_feature_id_set)
overlap = coin_feature_id_set & wallet_feature_id_set
coin_features_only = coin_feature_id_set - wallet_feature_id_set
wallet_features_only = wallet_feature_id_set - coin_feature_id_set

print(f'all coin ids: {len(both_sets_all_coin_ids)}')
print(f'wallet coin ids: {len(wallet_feature_id_set)}')
print(f'coin coin ids: {len(coin_feature_id_set)}')
print(f'overlapping coin ids: {len(overlap)}')
print(f'coin features only: {len(coin_features_only)}')
print(f'wallet features only: {len(wallet_features_only)}')


# Only coins present in both sources are kept as the working id list
all_coin_ids = list(overlap)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# Retrieve market data for the configured dataset
training_dataset = config['training_data']['dataset']
market_data_df_full = dr.retrieve_market_data(dataset=training_dataset)

# Coins in wallet_features_ids that the retrieved market data does not contain
coin_ids = market_data_df_full['coin_id']
wallet_features_only = set(wallet_features_ids).difference(set(coin_ids))
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:
# Reload project modules and configs once so on-disk edits are picked up.
# (A second, identical reload used to run after clean_market_data; nothing in the
# rest of the cell read its result, so it has been removed.)
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Work on a copy so the unfiltered market_data_df_full stays available for comparison
market_data_df = market_data_df_full.copy()

# Clean the market data between earliest_window_start and training_period_end
market_data_df = dr.clean_market_data(market_data_df, config,
                                        config['training_data']['earliest_window_start'],
                                        config['training_data']['training_period_end'])


# Coins in wallet_features_ids that are absent from the cleaned market data
coin_ids = market_data_df['coin_id']
wallet_features_only = set(wallet_features_ids) - set(coin_ids)
print(f'wallet features only: {len(wallet_features_only)}')

In [None]:
# Display the coin ids left in wallet_features_only by the most recently run comparison
wallet_features_only

In [None]:
# Display low_volume_coins (defined outside this part of the notebook)
low_volume_coins

In [None]:
# Spot check: look up a single coin id in mean_volume
# (mean_volume is defined outside this part of the notebook)
c =  '0bf15fd6-ed3d-4485-b066-c473cfb81f06'

# Wrap mean_volume in a DataFrame and select the row labelled with that coin id
volume_df = pd.DataFrame(mean_volume)
volume_df.loc[c]


## Prepare coin_training_data_df

### Retrieve datasets

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Load the modeling period wallet scores, market data and profits saved as parquet
modeling_wallet_scores_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_wallet_scores_df.parquet")
modeling_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_market_data_df.parquet")
modeling_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_profits_df.parquet")

# Check both time-indexed datasets against the configured modeling period boundaries
modeling_period_start = wallets_config['training_data']['modeling_period_start']
modeling_period_end = wallets_config['training_data']['modeling_period_end']
u.assert_period(modeling_market_data_df, modeling_period_start, modeling_period_end)
u.assert_period(modeling_profits_df, modeling_period_start, modeling_period_end)


### define full coin_id list

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))

# Coin ids seen by each pipeline: the coin features frame is indexed by coin id,
# while modeling_profits_df carries it as a column
coin_features_ids = coin_features_training_data_df.index
wallet_features_ids = modeling_profits_df['coin_id']
coin_feature_id_set = set(coin_features_ids)
wallet_feature_id_set = set(wallet_features_ids)

# Set comparisons between the two id collections
both_sets_all_coin_ids = list(coin_feature_id_set | wallet_feature_id_set)
overlap = coin_feature_id_set & wallet_feature_id_set
coin_features_only = coin_feature_id_set - wallet_feature_id_set
wallet_features_only = wallet_feature_id_set - coin_feature_id_set

print(f'all coin ids: {len(both_sets_all_coin_ids)}')
print(f'overlapping coin ids: {len(overlap)}')
print(f'coin features only: {len(coin_features_only)}')
print(f'wallet features only: {len(wallet_features_only)}')


# Only coins present in both sources are kept as the working id list
all_coin_ids = list(overlap)

### generate coin features as of the end of the modeling period (loadable parquet)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


def _balance_features_as_of(period_key, column_suffix):
    """
    Compute coin-level wallet balance features for the date stored under
    wallets_config['training_data'][period_key], then wrap every column name
    with the 'wallet_balance|' prefix and the given suffix.
    """
    features_df = cwb.calculate_coin_wallet_balance_features(
        modeling_profits_df,
        modeling_wallet_scores_df,
        wallets_config['training_data'][period_key],
        all_coin_ids
    )
    return features_df.add_prefix('wallet_balance|').add_suffix(column_suffix)


# Start from an empty frame indexed by the complete coin list
coin_wallet_features_df = pd.DataFrame(index=all_coin_ids)
coin_wallet_features_df.index.name = 'coin_id'

# Features based on wallet scores and balances as of the modeling period end
coin_wallet_balance_features_modeling_end_df = _balance_features_as_of(
    'modeling_period_end', '|modeling_end')
coin_wallet_features_df = coin_wallet_features_df.join(
    coin_wallet_balance_features_modeling_end_df, how='inner')

# Features based on wallet scores and balances as of the modeling period start
coin_wallet_balance_features_modeling_start_df = _balance_features_as_of(
    'modeling_period_start', '|modeling_start')
coin_wallet_features_df = coin_wallet_features_df.join(
    coin_wallet_balance_features_modeling_start_df, how='inner')

coin_wallet_features_df.describe()

In [None]:
# Load the training data produced by the coin features pipeline from parquet
coin_features_training_data_df = pd.read_parquet("temp/coin_modeling_dfs/coin_features_training_data_df.parquet")


# Crude join: index-on-index inner join, so only coins present in both the coin
# features and coin_wallet_features_df remain
coin_training_data_df = coin_features_training_data_df.join(coin_wallet_features_df,how='inner')

# Show the dimensions of the joined training data
coin_training_data_df.shape

## Prepare coin_modeling_df

### Retrieve validation datasets

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Retrieve Validation Profits and Market Data
# ----------------------------------------------------------
validation_period_start = wallets_config['training_data']['validation_period_start']
validation_period_end = wallets_config['training_data']['validation_period_end']

# The training coin cohort is passed to the retrieval so training period coins are reflected
# TODO: assess whether this cohort filter should be removed
training_coin_cohort = pd.read_parquet(
    "temp/wallet_modeling_dfs/training_market_indicators_data_df.parquet",
    columns=['coin_id']
)['coin_id'].unique()

validation_profits_df, validation_market_data_df_full, validation_coin_cohort = wmo.retrieve_period_datasets(
    validation_period_start,
    validation_period_end,
    training_coin_cohort
)

# Keep only prices dated on or after the validation starting balance date, then
# release the unfiltered frame
starting_balance_date = wallets_config['training_data']['validation_starting_balance_date']
on_or_after_starting_balance = validation_market_data_df_full['date'] >= starting_balance_date
validation_market_data_df = validation_market_data_df_full[on_or_after_starting_balance]
del validation_market_data_df_full
gc.collect()


# Check both frames against the validation period, persist them, then free the memory
u.assert_period(validation_market_data_df, validation_period_start, validation_period_end)
u.assert_period(validation_profits_df, validation_period_start, validation_period_end)
validation_profits_df.to_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet",index=False)
validation_market_data_df.to_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet",index=False)
del validation_profits_df,validation_market_data_df
gc.collect()

### Prepare target variable (parquet loadable)

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))
coin_modeling_config = wallets_coin_config['coin_modeling']

# Load the validation period market data saved as parquet
validation_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet")


# Coin return performance over the validation period, plus a winsorized copy of the
# return using the configured cutoff
validation_coin_performance_df = civa.calculate_coin_performance(
    validation_market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end']
)
validation_coin_performance_df['coin_return_winsorized'] = u.winsorize(
    validation_coin_performance_df['coin_return'],
    coin_modeling_config['returns_winsorization'])

# Every coin in the training data must have a validation period target
coins_with_targets = set(validation_coin_performance_df.index)
missing_coins = set(coin_training_data_df.index).difference(coins_with_targets)
if missing_coins:
    raise ValueError(f"Found {len(missing_coins)} coin_ids in training_data_df without validation period target variables.")


# Attach the configured target variable column to the training data
target_var_column = coin_modeling_config['target_variable']
target_df = validation_coin_performance_df[[target_var_column]]
coin_modeling_df = coin_training_data_df.join(target_df)

coin_modeling_df.describe()

## Build coin model

In [None]:
# Reload project modules and configs so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_coin_config = yaml.safe_load(Path('../config/wallets_coin_config.yaml').read_text(encoding='utf-8'))


# Build the coin model from the coin config and run the experiment on coin_modeling_df
coin_model = cm.CoinModel(wallets_coin_config=wallets_coin_config)
coin_model_results = coin_model.run_experiment(feature_df=coin_modeling_df)


# Evaluate test-set predictions using the pipeline's 'regressor' step and the
# training feature names
fitted_regressor = coin_model.pipeline.named_steps['regressor']
training_feature_names = coin_model_results['X_train'].columns.tolist()
evaluator = cime.CoinRegressionEvaluator(
    y_test=coin_model_results['y_test'],
    y_pred=coin_model_results['y_pred'],
    model=fitted_regressor,
    feature_names=training_feature_names
)

# Print the text summary, then draw the evaluation plots
summary_text = evaluator.summary_report()
print(summary_text)
evaluator.plot_evaluation()

## Wallet aggregated analysis

### generate validation wallet features

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Load the validation period datasets saved as parquet
validation_profits_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_profits_df.parquet")
validation_market_data_df = pd.read_parquet("temp/wallet_modeling_dfs/validation_market_data_df.parquet")


# Start from an empty frame with one row per wallet in the training cohort
validation_wallet_features_df = pd.DataFrame(index=training_wallet_cohort)
validation_wallet_features_df.index.name = 'wallet_address'


# Trading features over the validation period; cohort wallets without a row in the
# result get 0 for every trading feature column
validation_trading_features_df = wtf.calculate_wallet_trading_features(
    validation_profits_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'])
zero_fill_values = dict.fromkeys(validation_trading_features_df.columns, 0)
validation_wallet_features_df = (
    validation_wallet_features_df
    .join(validation_trading_features_df, how='left')
    .fillna(zero_fill_values)
)

# Performance features (inner join, no fill)
performance_features_df = wpf.calculate_performance_features(validation_wallet_features_df)
validation_wallet_features_df = validation_wallet_features_df.join(performance_features_df, how='inner')
validation_wallet_features_df.describe()

### wallet validation period trading/performance by score quantile

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Report settings: quantile count and the wallet volume threshold in USD
num_quantiles = 5
min_wallet_volume_usd = 1000

# Wallet feature columns to summarise for each prediction band
metrics = [
    'crypto_net_gain/max_investment/winsorized',
    'net_crypto_investment/max_investment/winsorized',
    'max_investment',
    'crypto_net_gain',
    'net_crypto_investment',
    'total_volume',
]

# Build the report from the validation wallet features and the model's predictions
# (model_results is defined outside this part of the notebook)
wallet_predictions = model_results['y_pred']
wiwv.create_quantile_report(
    validation_wallet_features_df,
    wallet_predictions,
    metrics,
    num_quantiles,
    min_wallet_volume_usd
)


## Coin aggregated analysis

### coin-aggregated wallet metrics by coin performance

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()

validation_period_start = wallets_config['training_data']['validation_period_start']
validation_period_end = wallets_config['training_data']['validation_period_end']

# Load wallet scores from the modeling period
modeling_wallet_scores_df = pd.read_parquet("temp/wallet_modeling_dfs/modeling_wallet_scores_df.parquet")

# Coin-level end balance features built from the modeling profits and wallet scores
modeling_coin_end_balance_features_df = cwsf.calculate_coin_end_balance_features(
    modeling_profits_df,
    modeling_wallet_scores_df
)


# Coin return performance during the validation period, inner-joined onto the
# coin features so only coins present in both remain
validation_coin_performance_df = civa.calculate_coin_performance(
    validation_market_data_df,
    validation_period_start,
    validation_period_end
)
validation_coin_wallet_features_df = modeling_coin_end_balance_features_df.join(
    validation_coin_performance_df,
    how='inner'
)


# Create styled performance analysis
civa.create_top_coins_wallet_metrics_report(validation_coin_wallet_features_df,percentile=90,method='mean')


## old analysis

### plotting coin feature performance vs market cap

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Run the market cap segment analysis with top_n=10
segment_analysis = civa.analyze_market_cap_segments(
    coin_wallet_features_df,
    top_n=10
)
segment_results, summary_df = segment_analysis

# Visualise the summary: heatmap first, then the optional consistency plot
civa.plot_segment_heatmap(summary_df)
civa.plot_metric_consistency(summary_df)


### coin performance of top n for each bucket

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Read the analysis parameters from the coin_validation_analysis config section
coin_validation_config = wallets_config['coin_validation_analysis']
top_n = coin_validation_config['top_n']
max_market_cap = coin_validation_config['max_market_cap']
min_market_cap = coin_validation_config['min_market_cap']

# Run analysis
metric_top_coin_performance_df = civa.validate_coin_performance(
    coin_wallet_features_df,
    top_n,
    max_market_cap,
    min_market_cap
)

metric_top_coin_performance_df

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Run the performance analysis on the coin wallet features
civa.print_performance_analysis(coin_wallet_features_df)

# Junkyard

# Tests failing

In [None]:
# Reload project modules and the wallets config so on-disk edits are picked up
for module in modules:
    importlib.reload(module)
wallets_config.reload()

In [None]:
def sample_performance_features_df():
    """
    Fixture to provide a sample DataFrame for testing ratio calculations.
    Includes profits and balance metrics for multiple wallets,
    including one with all 0 values and another with losses.

    Returns:
    - pd.DataFrame with six numeric columns (three 'profits_*' and three 'balance_*'),
        indexed by five wallet labels.
    """
    data = {
        'profits_total_gain': [100, 200, 300, 0, -50],
        'profits_realized_gain': [50, 150, 250, 0, -30],
        'profits_unrealized_gain': [50, 50, 50, 0, -20],
        'balance_max_investment': [1000, 2000, 3000, 0, 500],
        'balance_time_weighted_balance': [500, 1500, 2500, 0, 300],
        'balance_active_time_weighted_balance': [400, 1200, 2200, 0, 250],
    }
    index = ['wallet1', 'wallet2', 'wallet3', 'wallet4_zero', 'wallet5_loss']
    return pd.DataFrame(data, index=index)

# Materialise the fixture so later cells can use the name directly as a DataFrame,
# the way pytest would inject it into a test function. The previous assignment
# referenced an undefined name (`sample`) and raised NameError.
sample_performance_features_df = sample_performance_features_df()

In [None]:
# Body of test_transform_performance_ratios, inlined at cell level so it can be
# stepped through interactively; the decorator and def line are kept as comments.
# It expects sample_performance_features_df to be a DataFrame when it runs.
# @pytest.mark.unit
# def test_transform_performance_ratios(sample_performance_features_df):
"""
Test to validate the correctness of transformations applied to performance ratios by
wpf.transform_performance_ratios.
Steps:
1. Generate raw ratios using wpf.calculate_performance_ratios.
2. Apply transformations, including rank, log, winsorization, and ntile rank.
3. Validate the correctness of each transformation step.
"""
# Step 1: Generate raw ratios
ratio_df = wpf.calculate_performance_ratios(sample_performance_features_df)

# Step 2: Define balance metrics for ntile calculation
# (every column whose name contains 'balance_')
balance_features_df = sample_performance_features_df.filter(like='balance_')

# Step 3: Transform the performance ratios
transformed_df = wpf.transform_performance_ratios(ratio_df, balance_features_df)

# Explanation of assertions:
# Each section below recomputes one transformation independently and compares it with
# the matching "<ratio column>/<transformation>" column of transformed_df. NaNs in the
# same position are treated as equal (equal_nan=True).
# 1. Base ratios: Validate that the base ratios remain unchanged after transformation.
for col in ratio_df.columns:
    assert np.allclose(
        transformed_df[f"{col}/base"].values,
        ratio_df[col].values,
        equal_nan=True
    ), f"Base ratio for {col} does not match expected values."

# 2. Rank: Validate that the ranks are correctly calculated as percentiles.
# Ties share the average of their ranks (method="average").
for col in ratio_df.columns:
    expected_rank = ratio_df[col].rank(method="average", pct=True).values
    assert np.allclose(
        transformed_df[f"{col}/rank"].values,
        expected_rank,
        equal_nan=True
    ), f"Rank for {col} does not match expected values."

# 3. Log transformation: Validate signed log calculations.
# Expected value is sign(x) * log(1 + |x|).
for col in ratio_df.columns:
    expected_log = np.sign(ratio_df[col]) * np.log1p(ratio_df[col].abs())
    assert np.allclose(
        transformed_df[f"{col}/log"].values,
        expected_log,
        equal_nan=True
    ), f"Log transformation for {col} does not match expected values."

# 4. Winsorization: Validate winsorized ratios based on config.
# Assume wallets_config['features']['returns_winsorization'] = 0.05
winsorization_threshold = wallets_config['features']['returns_winsorization']
for col in ratio_df.columns:
    series = ratio_df[col]
    expected_winsorized = u.winsorize(series, cutoff=winsorization_threshold)
    assert np.allclose(
        transformed_df[f"{col}/winsorized"].values,
        expected_winsorized.values,
        equal_nan=True
    ), f"Winsorized values for {col} do not match expected values."

# 5. Ntile rank: Validate that ntile ranks are calculated correctly.
# NOTE(review): ntile_count is hard-coded here rather than read from config — confirm it
# matches the value wpf.transform_performance_ratios actually uses.
ntile_count = 10  # Assume config sets this to 10
for col in ratio_df.columns:
    # The text after the first "/" in the ratio name selects the balance column.
    # Assumes ratio columns are named "<numerator>/<denominator>" — TODO confirm
    # against wpf.calculate_performance_ratios.
    denominator = col.split("/")[1]
    balance_col = f"balance_{denominator}"
    # Bucket wallets into quantiles of that balance column (integer bucket labels)
    metric_ntiles = pd.qcut(
        balance_features_df[balance_col],
        q=ntile_count,
        labels=False,
        duplicates="drop"
    )
    # Percentile-rank the ratio within each bucket; missing ranks become 0
    expected_ntile_rank = (
        ratio_df[col]
        .groupby(metric_ntiles)
        .rank(method="average", pct=True)
        .fillna(0)
    )
    assert np.allclose(
        transformed_df[f"{col}/ntile_rank"].values,
        expected_ntile_rank.values,
        equal_nan=True
    ), f"Ntile rank for {col} does not match expected values."






In [None]:
# List the columns of transformed_df
transformed_df.columns