In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    mean_absolute_percentage_error,
    roc_auc_score
)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Render floats in DataFrame output with up to 12 significant digits
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Ignore warnings whose message begins with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment flags: silence the pygame donation request and set the alert sound path
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with a single light foreground color
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_modeling_orchestrator as wmo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.model_reporting as wmr
import wallet_modeling.wallet_model as wm
import wallet_modeling.experiments_manager as wem
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.clustering_features as wcl
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.performance_features as wpf
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.features_orchestrator as wfo

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.coin_validation_analysis as wica
import wallet_insights.coin_validation_model as wicm


# Every locally imported module, grouped the same way as the import statements.
# Later cells reload this list so edits to the source files are picked up.
modules = [
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,  # core pipeline
    wmo, wtd, wmr, wm, wem,                                      # wallet modeling
    wcl, wmc, wmt, wpf, wtf, wts, wfo,                           # wallet features
    wime, wica, wicm,                                            # wallet insights
]
for module in modules:
    importlib.reload(module)

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# configure logger at INFO level
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

logger.info("Good morning, let's get to work")

In [None]:
# Pick up any edits to the local modules and config files
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Export the wallet code directories, the config and this notebook via u.export_code
u.export_code(
    code_directories=['wallet_features', 'wallet_modeling', 'wallet_insights'],
    include_config=True,
    ipynb_notebook='DDA-436 wallet scores vs coin performance.ipynb',
)

## Full Training Data Sequence

### retrieve datasets

In [None]:
# Pick up any edits to the local modules and config files
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Retrieve datasets, and keep an untouched copy of profits_df so later cells can
# start again from it without repeating the retrieval
profits_df, market_data_df = wmo.retrieve_datasets()
profits_df_full = profits_df.copy()

In [None]:
# Pick up any edits to the local modules and config files
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Start again from the untouched copy of the retrieved profits data
profits_df = profits_df_full.copy()

# Define wallet cohort after cleaning
training_wallet_metrics_df, wallet_cohort = wmo.define_wallet_cohort(profits_df, market_data_df)

# Generate profits_df for all training windows and the modeling period
(
    training_profits_df,
    training_windows_profits_dfs,
    modeling_profits_df,
    validation_profits_df,
) = wmo.split_profits_df(profits_df, market_data_df, wallet_cohort)

# Market data: drop every record after the training period end to ensure no leakage
training_market_data_df = market_data_df.loc[
    market_data_df['date'] <= wallets_config['training_data']['training_period_end']
]

# Market data: add indicator columns, first the dual-column indicators and then the
# time series indicators configured under time_series.market_data, keyed on coin_id
market_indicators_data_df = ind.add_market_data_dualcolumn_indicators(training_market_data_df)
market_indicators_data_df = ind.generate_time_series_indicators(
    market_indicators_data_df,
    wallets_metrics_config['time_series']['market_data'],
    'coin_id',
)

# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_cohort
transfers_sequencing_df = wts.retrieve_transfers_sequencing()


### data freshness checks on profits and market data

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Draft body of a transfers-coverage staleness check. It is kept at cell level so the
# intermediate values can be inspected in the cells that follow. Intended signature:
# def check_coin_transfers_staleness(profits_df, data_cleaning_config) -> None:
#     """
#     Warns if coin count changes exceed specified thresholds.
#
#     Params:
#     - profits_df (df): df showing coin-wallet-date records where transfers exist
#     """

data_cleaning_config = wallets_config['data_cleaning']

# Extract thresholds. These lines must not end with a comma: a trailing comma turns the
# value into a 1-tuple and the threshold comparison below then raises TypeError.
count_threshold = data_cleaning_config['transfers_coverage_warning_min_coin_increase']
percent_threshold = data_cleaning_config['transfers_coverage_warning_min_pct_increase']
audit_window = data_cleaning_config['coverage_decrease_audit_window']

# Count the distinct coins with transfers on each date. groupby sorts its keys by
# default, so the last row is the latest date.
daily_counts = profits_df.groupby('date')['coin_id'].nunique()
latest_count = daily_counts.iloc[-1]
latest_date = daily_counts.index[-1]

# Restrict to the audit window ending on the latest date
cutoff_date = latest_date - pd.Timedelta(days=audit_window)
week_data = daily_counts.loc[cutoff_date:]

# Compare the latest count against the peak inside the window. The date reported next
# to the peak must be the date of the maximum, not of the minimum.
peak_count = week_data.max()
peak_date = week_data.idxmax()
count_decrease = peak_count - latest_count
pct_decrease = count_decrease / peak_count * 100

# Warn only when both the absolute and the percentage decrease exceed their thresholds
if count_decrease > count_threshold and pct_decrease > percent_threshold:
    logging.warning(
        f"Transfers data coverage alert on {latest_date.date()}:\n"
        f"- Transfers coverage decreased from {peak_count:.0f} coins ({peak_date.date()}) "
        f"to {latest_count:.0f} coins ({latest_date.date()}), "
        f"a {pct_decrease:.1f}% decrease."
    )


In [None]:
week_data

In [None]:
week_data.min()

In [None]:
latest_count

In [None]:
count_decrease

In [None]:
# print(f"daily_counts: {daily_counts}")
print(f"latest_count: {latest_count}")
print(f"latest_date: {latest_date}")
print(f"cutoff_date: {cutoff_date}")
# print(f"week_data: {week_data}")
print(f"count_decrease: {count_decrease}")

In [None]:
profits_df_full.groupby('date',observed=True)['coin_id'].nunique().tail(15)

In [None]:
# Pick up any edits to the local modules and config files
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

# Run the module-level coverage check on the full profits data with the data_cleaning config
dr.check_coin_coverage(profits_df_full, wallets_config['data_cleaning'])

In [None]:
# Distinct coins per date across the full profits data
daily_counts = profits_df_full.groupby('date')['coin_id'].nunique()

# Latest count versus the highest count among the last seven rows
latest_count = daily_counts.iloc[-1]
week_max = daily_counts.tail(7).max()
pct_change = ((latest_count - week_max) / week_max * 100).round(2)

for summary_line in (
    f"Latest count: {latest_count:,}",
    f"7-day high: {week_max:,}",
    f"Percent change: {pct_change}%",
):
    print(summary_line)

### generate features

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()


def _columns_with_nans(features_df):
    """Return the names of the columns in features_df that contain at least one NaN."""
    has_nan = features_df.isna().any()
    return features_df.columns[has_nan].tolist()


# Features over the full training dataset form the base of training_data_df;
# the suffix marks them as covering all windows
logger.info("Generating features for full training period...")
training_wallet_features_df = wfo.calculate_wallet_features(
    training_profits_df,
    market_indicators_data_df,
    transfers_sequencing_df,
    wallet_cohort,
)
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# Features for each individual window, numbered from 1, joined on with a _w{i} suffix
for i, window_profits_df in enumerate(training_windows_profits_dfs, 1):
    logger.info("Generating features for window %s...", i)

    window_wallet_features_df = wfo.calculate_wallet_features(
        window_profits_df,
        market_indicators_data_df,
        transfers_sequencing_df,
        wallet_cohort,
    )

    # The window's own features must be complete before they are joined
    nan_columns = _columns_with_nans(window_wallet_features_df)
    if nan_columns:
        raise ValueError(f"NaN values detected in window {i} in columns: {nan_columns}")

    window_wallet_features_df = window_wallet_features_df.add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')

    # The left join must not have introduced NaNs either
    nan_columns = _columns_with_nans(training_data_df)
    if nan_columns:
        raise ValueError(f"NaN values detected in training_data_df after window {i} in columns: {nan_columns}")

# Append clustering features, prefixed with cluster_, built from the base training data
cluster_features_df = wcl.create_basic_cluster_features(training_data_df).add_prefix('cluster_')
training_data_df = training_data_df.join(cluster_features_df, how='inner')

# Verify all input wallets exist in final output
missing_wallets = set(wallet_cohort).difference(training_data_df.index)
if missing_wallets:
    raise ValueError(
        f"Lost {len(missing_wallets)} wallets from original cohort during feature generation. "
        f"First few missing: {list(missing_wallets)[:5]}"
    )

logger.info("Feature generation complete.")

training_data_df.describe()

## Wallet Modeling

### join target variable to training data

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Clean inactive wallets from modeling period data
modeling_wallets_df = wmo.filter_modeling_period_wallets(modeling_profits_df)

# Generate target variables from the modeling period wallets
target_vars_df = wpf.calculate_performance_features(modeling_wallets_df)

# Inner-join the configured target variable column onto the training features
modeling_df = training_data_df.join(
    target_vars_df[wallets_config['modeling']['target_variable']],
    how='inner',
)


### build model

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Instantiate the wallet model experiment from the current config and run it on modeling_df
experiment = wm.WalletModel(wallets_config)
model_results = experiment.run_experiment(modeling_df)

# The trained model is the 'regressor' step of the returned pipeline
model = model_results['pipeline'].named_steps['regressor']

### assess wallet model performance

In [None]:
### save model artifacts
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Generate and save all model artifacts, using ../wallet_modeling as the base path
(
    model_id,
    evaluator,
    wallet_scores_df,
    coin_validation_df,
) = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    validation_profits_df=validation_profits_df,
    base_path='../wallet_modeling',
)

# Notify once the artifacts call has returned
u.play_notification()

In [None]:
### rebuild the evaluator and report wallet model performance
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Rebuild the evaluator from the stored model results, so changes to the reloaded
# evaluation module apply without retraining
evaluator = wime.RegressionEvaluator(
    y_train=model_results['y_train'],
    y_true=model_results['y_test'],
    y_pred=model_results['y_pred'],
    model=model,
    feature_names=list(model_results['X_train'].columns),
)

# Text summary, evaluation plots, then the importance summary as the cell output
print(evaluator.summary_report())
evaluator.plot_evaluation()
evaluator.importance_summary()

## Cluster analysis

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()


# List of the x features with the highest importance in the model
x_features = 8
top_feature_metrics = list((pd.DataFrame(evaluator.metrics['importances'])
                      .sort_values(by='importance',ascending=False)
                      .head(x_features)['feature']))

# De-duplicate while keeping the importance ordering. Going through set() would discard
# the ordering and can order string names differently from one kernel to the next.
all_metrics = list(dict.fromkeys(top_feature_metrics))

# Cluster numbers
n_clusters=4

# Build the cluster report for the selected metrics and display it as the cell output
styled_df = wime.create_cluster_report(modeling_df, model_results, n_clusters, all_metrics)
styled_df

## experiments beta

In [None]:
# Create modeling dataset using existing pipeline
# These are the same two calls as in the "join target variable to training data" cell:
# filter the modeling period wallets, then compute their performance features.
modeling_wallets_df = wmo.filter_modeling_period_wallets(modeling_profits_df)
target_vars_df = wpf.calculate_performance_features(modeling_wallets_df)


In [None]:
### run a sequence of wallet model experiments
[importlib.reload(module) for module in modules]
wallets_config.reload()

# 1. Initialize dependencies
# Named experiment_metrics so that it does not overwrite the metrics_config loaded
# from ../config in the setup cell.
# NOTE(review): this dict is not passed to ExperimentsManager or
# run_experiment_sequence below; confirm whether it should be.
experiment_metrics = {
    'rmse': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    'r2': r2_score
}

# 2. Define experiment sequence: a baseline run plus one variation per target variable
sequence_config = {
    'run_baseline': True,
    'parameter_variations': {
        'modeling': {
            'target_variable': [
                'max_investment',
                'total_net_flows',
                'return',
                'realized_return',
                'return_unwinsorized',
                'performance_score',
                'size_adjusted_rank'
            ]
        }
    }
}

# 3. Create experiment manager
exp_manager = wem.ExperimentsManager(
    config=wallets_config.config,
    training_data_df=training_data_df,
)

# 4. Run experiments and get results
results_df = exp_manager.run_experiment_sequence(modeling_profits_df, sequence_config)

# 5. View results
print(results_df)

In [None]:
results_df

### Validation period assessments

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Validation period metrics per wallet and per score bucket (10 buckets, 'ntiles' method),
# computed from the test set predictions
wallet_performance_df, bucketed_performance_df = wica.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
    n_buckets=10,
    method='ntiles',
)

bucketed_performance_df

## Basic coin model testing

### create coin_validation_df with metrics and returns

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Wallet scores: the test set predictions, indexed like y_test
wallet_scores_df = pd.DataFrame(
    {'score': model_results['y_pred']},
    index=model_results['y_test'].index,
)

# Consolidate wallet scores at the coin level
coin_wallet_metrics_df = wica.calculate_coin_metrics_from_wallet_scores(
    validation_profits_df,
    wallet_scores_df,
)

# Calculate coin performance during the validation period
coin_performance_df = wica.calculate_coin_performance(
    market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)

# Join aggregated wallet metrics with actual coin performance
coin_validation_df = coin_wallet_metrics_df.join(coin_performance_df, how='inner')

In [None]:
# Copy the validation frame without market_cap, then winsorize coin_return
# via u.winsorize with a 0.05 cutoff
coin_modeling_df = coin_validation_df.copy().drop(columns='market_cap')
coin_modeling_df['coin_return'] = u.winsorize(coin_modeling_df['coin_return'], 0.05)

coin_modeling_df.describe()

In [None]:
# Every name in this cell carries a coin_ prefix. The wallet model cells above own the
# notebook-level names `model`, `evaluator` and `y_pred`; rebinding them here would make
# a re-run of the "Reload evaluator" or cluster analysis cells silently use the coin model.
coin_model_input_df = coin_modeling_df.copy()

# 1. Simple feature prep and an 80/20 train/test split with a fixed seed
coin_X, coin_y = wicm.prepare_features(coin_model_input_df)
coin_X_train, coin_X_test, coin_y_train, coin_y_test = train_test_split(
    coin_X, coin_y, test_size=0.2, random_state=42
)

coin_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# 2. Train
coin_model.fit(coin_X_train, coin_y_train)

# 3. Predict
coin_y_pred = coin_model.predict(coin_X_test)

# 4. Evaluate with the regression evaluator
# NOTE(review): feature_names assumes prepare_features keeps every column except
# coin_return and market_cap_filled, in the original order - confirm.
feature_names = coin_model_input_df.columns.drop(['coin_return', 'market_cap_filled']).tolist()
coin_evaluator = wime.RegressionEvaluator(
    coin_y_train, coin_y_test, coin_y_pred,
    model=coin_model, feature_names=feature_names
)

# 5. Print the summary report
print(coin_evaluator.summary_report())

# 6. Plot everything
coin_evaluator.plot_evaluation()

## coin performance analysis

### plotting coin feature performance vs market cap

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Market cap segment analysis of the coin validation data, with top_n=10
segment_results, summary_df = wica.analyze_market_cap_segments(coin_validation_df, top_n=10)

# Visualize the summary: the heatmap first, then the optional consistency plot
wica.plot_segment_heatmap(summary_df)
wica.plot_metric_consistency(summary_df)


### coin performance of top n for each bucket

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Read the analysis parameters from the coin_validation_analysis config section
top_n, max_market_cap, min_market_cap = (
    wallets_config['coin_validation_analysis'][key]
    for key in ('top_n', 'max_market_cap', 'min_market_cap')
)

# Run analysis
metric_top_coin_performance_df = wica.validate_coin_performance(
    coin_validation_df,
    top_n,
    max_market_cap,
    min_market_cap,
)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Print the performance analysis for the coin validation data
wica.print_performance_analysis(coin_validation_df)

## Junkyard

In [None]:
df = coin_modeling_df.copy()

# Check target distribution
print("Coin return stats:")
print(df['coin_return'].describe())

# Check for missing and infinite values, each count under its own label
print("\nMissing values in features:")
print(df.isna().sum())
print("\nInfinite values in features:")
print(df.isin([np.inf, -np.inf]).sum())

# Look at feature correlations with target (the five highest are printed)
correlations = df.drop(['coin_return', 'market_cap_filled'], axis=1).corrwith(df['coin_return'])
print("\nTop correlations with return:")
print(correlations.sort_values(ascending=False).head())

## Tests failing

In [None]:
# Notebook reproduction of a failing test: each "fixture" is a plain function that is
# called immediately, and its result is bound to the function's own name (mimicking how
# a test receives a fixture value). After this cell runs, both names refer to data.
def sample_profits_df_for_cleaning():
    """
    Build a sample profits DataFrame with multiple coins per wallet.

    Returns:
    - (df): 7 rows across wallet1 (2 rows), wallet2 (3 rows) and wallet3 (2 rows), with
        coin_id, wallet_address, date, profits_cumulative and usd_inflows_cumulative
    """
    return pd.DataFrame({
        'coin_id': ['BTC', 'ETH', 'BTC', 'ETH', 'LTC', 'BTC', 'ETH'],
        'wallet_address': ['wallet1', 'wallet1', 'wallet2', 'wallet2','wallet2',
                           'wallet3', 'wallet3'],
        'date': pd.date_range(start='2023-01-01', periods=7),
        'profits_cumulative': [5000, 3000, 1000, 500, 500, 100, 50],
        'usd_inflows_cumulative': [10000, 8000, 2000, 1500, 1500, 500, 250]
    })
sample_profits_df_for_cleaning = sample_profits_df_for_cleaning()


def sample_data_cleaning_config():
    """
    Build a sample data cleaning configuration.

    Returns:
    - (dict): wallet profit/inflow limits plus the price and transfers coverage
        warning thresholds
    """
    # NOTE(review): there is no 'coverage_decrease_audit_window' key here, although the
    # transfers coverage draft earlier in this notebook reads it from the data_cleaning
    # config - confirm whether dr.clean_profits_df needs it.
    return {
        'max_wallet_coin_profits': 7500,
        'max_wallet_inflows': 15000,
        'price_coverage_warning_min_coin_increase': 999,
        'price_coverage_warning_min_pct_increase': 1.0,
        'transfers_coverage_warning_min_coin_increase': 999,
        'transfers_coverage_warning_min_pct_increase': 1.0,

    }

sample_data_cleaning_config = sample_data_cleaning_config()

# Call the function under test with the sample data and config
cleaned_df, exclusions_logs_df = dr.clean_profits_df(sample_profits_df_for_cleaning,
                                                        sample_data_cleaning_config)


In [None]:

# Expected results
# Only wallet2 and wallet3 rows are expected to remain, re-indexed from 0
expected_cleaned_df = sample_profits_df_for_cleaning[
    sample_profits_df_for_cleaning['wallet_address'].isin(['wallet2', 'wallet3'])
].reset_index(drop=True)
# wallet1 is expected to be logged as excluded on both profits and inflows
expected_exclusions = pd.DataFrame({
    'wallet_address': ['wallet1'],
    'profits_exclusion': [True],
    'inflows_exclusion': [True]
})

# Assertions
# The comparisons use .values, so index labels and column names are not checked
assert len(cleaned_df) == 5  # wallet2 (3 records) and wallet3 (2 records) should remain
assert np.array_equal(cleaned_df.values, expected_cleaned_df.values)
assert np.array_equal(exclusions_logs_df.values, expected_exclusions.values)

# Check if profits and inflows are approximately correct for the remaining wallets
# 1000 + 500 + 500 + 100 + 50
assert pytest.approx(cleaned_df['profits_cumulative'].sum(), abs=1e-4) == 2150
# 2000 + 1500 + 1500 + 500 + 250
assert pytest.approx(cleaned_df['usd_inflows_cumulative'].sum(), abs=1e-4) == 5750

In [None]:
sample_data_cleaning_config

In [None]:
sample_profits_df_for_cleaning

In [None]:
exclusions_logs_df

In [None]:
expected_exclusions