In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load_dotenv(Path("../../../Local/.env"))

# Render floats with up to 12 significant digits in DataFrame output
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# To restore the pandas default: pd.reset_option('display.float_format')

# Hide warnings whose message begins with "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Environment flags: silence the pygame support prompt and set the alert sound file path
os.environ.update({
    'PYGAME_HIDE_SUPPORT_PROMPT': "hide",
    'ALERT_SOUND_FILEPATH': "../../../Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
})

# Dark mode charts: dark gray backgrounds with a muted green for text, labels, ticks and titles
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp

# Wallet modeling
import wallet_modeling.wallet_orchestrator as wo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.wallet_modeling as wm
import wallet_modeling.model_reporting as wmr
from wallet_modeling.wallets_config_manager import WalletsConfig

# Wallet features
import wallet_features.market_cap_features as wmc
import wallet_features.market_timing_features as wmt
import wallet_features.trading_features as wtf
import wallet_features.transfers_features as wts
import wallet_features.wallet_features as wf

# Wallet insights
import wallet_insights.wallet_model_evaluation as wime
import wallet_insights.validation_analysis as wiv
import wallet_insights.coin_forecasting as wicf


# Local modules to re-import whenever their source changes; later cells reuse this list
modules = [
    u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp,
    wo, wtd, wm, wmr, wmc, wmt, wtf, wts, wf, wime, wiv, wicf,
]
for module in modules:
    importlib.reload(module)

# Load every config used by the notebook from ../config
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Set up the shared logger at INFO level
logger = dc.setup_logger()
logger.setLevel(logging.INFO)
logger.info("Good morning, let's get to work")

In [None]:
# Pick up edits made to the local modules and wallet config files since the last run
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)

## Full Training Data Sequence

### retrieve datasets

In [None]:
# Refresh local modules and wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Retrieve datasets
profits_df, market_data_df = wo.retrieve_datasets()

# Define wallet cohort after cleaning
training_wallet_metrics_df, wallet_cohort = wo.define_wallet_cohort(profits_df, market_data_df)

# Generate profits_df for all training windows and the modeling period
(
    training_profits_df,
    training_windows_profits_dfs,
    modeling_profits_df,
    validation_profits_df,
) = wo.split_profits_df(profits_df, market_data_df, wallet_cohort)


In [None]:
# Refresh local modules and wallet configs before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Market data: add the indicators configured under time_series.market_data, keyed by coin_id
market_indicators_data_df = ind.generate_time_series_indicators(
    market_data_df,
    wallets_metrics_config['time_series']['market_data'],
    'coin_id',
)


# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_cohort
transfers_sequencing_df = wts.retrieve_transfers_sequencing()

### generate features

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Features over the full training dataset; the suffix marks columns that span all windows
training_wallet_features_df = wf.calculate_wallet_features(
    training_profits_df, market_indicators_data_df, transfers_sequencing_df, wallet_cohort
)
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# Same features per training window, suffixed with the 1-based window number (_w1, _w2, ...)
# and left-joined onto the full-period features
for i, window_profits_df in enumerate(training_windows_profits_dfs, start=1):
    window_wallet_features_df = wf.calculate_wallet_features(
        window_profits_df, market_indicators_data_df, transfers_sequencing_df, wallet_cohort
    ).add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')


training_data_df.describe()

### join target variable to training data

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Clean inactive wallets from modeling period data, then derive the target variables
modeling_wallets_df = wo.filter_modeling_period_wallets(modeling_profits_df)
target_vars_df = wm.generate_target_variables(modeling_wallets_df)

# Attach the configured target column; the inner join keeps only index entries present in both frames
modeling_df = training_data_df.join(
    target_vars_df[wallets_config['modeling']['target_variable']],
    how='inner',
)

modeling_df.describe()

## DDA-409 Codespace

In [None]:
# Refresh local modules and the wallets config, then pull the transfers data
for module in modules:
    importlib.reload(module)
wallets_config.reload()

transfers_df = wts.retrieve_transfers()


In [None]:
# Snapshot the retrieved transfers with the index moved into columns; later cells start from this copy.
# reset_index() already returns a new frame, so the extra .copy() is redundant but harmless.
# NOTE(review): later cells rebind `transfers_df` (rounded / multi-indexed), so re-running this cell
# out of order would snapshot that modified frame instead of the retrieved data.
transfers_df_full = transfers_df.reset_index().copy()
print(transfers_df_full.shape)
transfers_df_full.head()

In [None]:
imputation_dates = wtd.generate_imputation_dates()

# Parse the training period end once instead of once per candidate date
training_end = wallets_config['training_data']['training_period_end']
training_end_dt = datetime.strptime(training_end, '%Y-%m-%d')

# Keep only the imputation dates on or before the end of the training period
training_dates = [
    date for date in imputation_dates
    if datetime.strptime(date, '%Y-%m-%d') <= training_end_dt
]


# Only the last expression of a cell is displayed, so show the first training date here
training_dates[0]

In [None]:
# The bare `training_dates` below is not the last expression of the cell, so it displays nothing.
training_dates
# Use the first training date as the imputation target and display it
target_date = training_dates[0]
target_date

In [None]:
# Round numeric columns to whole numbers, starting from the full snapshot.
# NOTE(review): `-0` is the integer literal 0, so this call is replace(0, 0); presumably it is meant to
# normalise negative-zero floats left by rounding - confirm it has the intended effect.
# This rebinds `transfers_df`, replacing the frame retrieved earlier in the notebook.
transfers_df = transfers_df_full.copy().round().replace(-0,0)
transfers_df.describe()

In [None]:
# Scratch cell: runs only the setup and step 1 of the imputation logic at top level and shows the
# shape of the "on or before target_date" slice. Steps 2-4 are kept below as commented-out code
# (the `return` there only makes sense inside a function).
target_date = training_dates[0]
transfers_df = transfers_df_full.copy().round().replace(-0,0)


# Convert dates and set a multi-index
target_date = pd.to_datetime(target_date)
transfers_df['date'] = pd.to_datetime(transfers_df['date'])
transfers_df = transfers_df.set_index(['coin_id', 'wallet_address', 'date']).sort_index()

# 1. Split into before/including target_date and after
# (the slice on the 'date' level is label-based, so target_date itself is included)
df_before = transfers_df.xs(slice(None, target_date), level='date', drop_level=False)

# # 2. Identify pairs needing new rows
# pairs_with_target = df_before.index[df_before.index.get_level_values('date') == target_date].droplevel('date')
# all_pairs_before = df_before.index.droplevel('date').drop_duplicates()
# pairs_need_impute = all_pairs_before.difference(pairs_with_target)
# if len(pairs_need_impute) == 0:
#     # No new rows needed
#     return pd.DataFrame(columns=transfers_df.reset_index().columns)

# df_before = df_before[df_before.index.droplevel('date').isin(pairs_need_impute)]

# # 3. Find the last date for each pair before target_date
# shifted_index = df_before.index.to_frame().shift(-1)
# is_last_date = (
#     (df_before.index.get_level_values('coin_id') != shifted_index['coin_id']) |
#     (df_before.index.get_level_values('wallet_address') != shifted_index['wallet_address'])
# )
# df_last_dates = df_before[is_last_date]

# # 4. Create new rows for the target_date
# new_rows = df_last_dates.index.to_frame()
# new_rows['date'] = target_date
# new_rows = new_rows.set_index(['coin_id', 'wallet_address', 'date'])
# new_rows['net_transfers'] = 0
# new_rows['balance'] = df_last_dates['balance']

df_before.shape

In [None]:
def impute_transfers_rows_for_date(transfers_df, target_date):
    """
    Build the rows needed so that every coin-wallet pair seen on or before the
    target date has a record on the target date.

    This function:
    1. Keeps the records dated on or before the target date
    2. Identifies pairs that have no row on the target date
    3. Finds the latest existing row for each such pair
    4. Creates new rows at the target date with net_transfers=0 and the balance
       carried forward from that latest row
    5. Returns only these newly created rows

    Note:
        The 'date' column of the passed-in transfers_df is converted to datetime
        in place. This is pre-existing behavior and is retained for backward
        compatibility; pass a copy if the original must stay untouched.

    Args:
        transfers_df (pd.DataFrame): Transfer records with at least the columns
            coin_id, wallet_address, date and balance
        target_date (str or datetime): The date for which to impute rows

    Returns:
        pd.DataFrame: Newly imputed rows with columns coin_id, wallet_address,
            date, net_transfers and balance. If no pair needs a row, an empty
            DataFrame carrying the input's columns is returned.
    """
    key_cols = ['coin_id', 'wallet_address', 'date']

    # Convert dates and set a sorted multi-index
    target_date = pd.to_datetime(target_date)
    transfers_df['date'] = pd.to_datetime(transfers_df['date'])
    indexed_df = transfers_df.set_index(key_cols).sort_index()

    # 1. Keep only the records on or before target_date
    on_or_before = indexed_df.index.get_level_values('date') <= target_date
    df_before = indexed_df[on_or_before]

    # 2. Identify pairs needing new rows
    has_target_row = df_before.index.get_level_values('date') == target_date
    pairs_with_target = df_before.index[has_target_row].droplevel('date')
    all_pairs_before = df_before.index.droplevel('date').drop_duplicates()
    pairs_need_impute = all_pairs_before.difference(pairs_with_target)
    if len(pairs_need_impute) == 0:
        # No new rows needed
        return pd.DataFrame(columns=indexed_df.reset_index().columns)

    df_before = df_before[df_before.index.droplevel('date').isin(pairs_need_impute)]

    # 3. Latest row per pair: the index is sorted, so it is the last occurrence of each pair
    pair_index = df_before.index.droplevel('date')
    df_last_dates = df_before[~pair_index.duplicated(keep='last')]

    # 4. Create new rows for the target_date
    new_rows = df_last_dates.index.to_frame(index=False)
    new_rows['date'] = target_date
    new_rows['net_transfers'] = 0
    # Assign positionally: a Series assignment would align on the index, whose dates
    # differ from target_date, and leave every balance as NaN
    new_rows['balance'] = df_last_dates['balance'].to_numpy()

    # 5. Return only the newly imputed rows
    return new_rows


def impute_transfers_for_all_dates(transfers_df, training_dates):
    """
    Impute rows for all coin-wallet pairs in transfers_df on each date in training_dates.

    This function:
    1. Sorts training_dates and iterates through them
    2. For each date, calls impute_transfers_rows_for_date to generate new rows
    3. Appends these new rows to a working copy of transfers_df
    4. Returns the working copy with all imputed rows

    The caller's DataFrame is not modified: the work is done on a copy whose
    'date' column has been converted to datetime up front.

    Args:
        transfers_df (pd.DataFrame): DataFrame containing transfer information
        training_dates (list of str or datetime): Dates for which to impute rows

    Returns:
        pd.DataFrame: DataFrame with imputed rows appended for all given dates. When
            training_dates is empty the input frame is returned unchanged.
    """

    # Ensure training_dates are sorted
    training_dates = sorted(pd.to_datetime(training_dates))
    if not training_dates:
        return transfers_df

    # Work on a copy so the helper's in-place date conversion never touches the caller's frame
    transfers_df = transfers_df.assign(date=pd.to_datetime(transfers_df['date']))

    # Iterate through each date and impute rows; rows added for earlier dates are
    # visible when later dates are processed
    for dt in training_dates:
        new_rows = impute_transfers_rows_for_date(transfers_df, dt)
        if not new_rows.empty:
            # Append new_rows to the working DataFrame
            transfers_df = pd.concat([transfers_df, new_rows], ignore_index=True, sort=False)

    return transfers_df

In [None]:
# Impute rows for the first training date only, on a copy of the full transfers snapshot
target_date = training_dates[0]

result = impute_transfers_rows_for_date(transfers_df_full.copy(), target_date)
# `result.shape` is not the last expression of the cell, so only describe() below is displayed
result.shape
result.describe()

In [None]:
# Display `result` as left by whichever cell last assigned it
result

In [None]:
# Impute rows for every training date on a copy of the full snapshot; this rebinds `result`
result = impute_transfers_for_all_dates(transfers_df_full.copy(), training_dates)
result.shape

In [None]:
# Summary statistics for the current `result` frame
result.describe()

In [None]:
# Hand-built fixture: one coin-wallet pair with seven dated transfers
df = pd.DataFrame({
    'coin_id': ['BTC'] * 7,
    'wallet_address': ['wallet1'] * 7,
    'date': ['2024-01-01', '2024-01-05', '2024-01-10', '2024-01-15', '2024-01-20', '2024-01-29', '2024-01-30'],
    'net_transfers': [100, -50, 75, -25, 0, 0, -100]  # buy, sell, buy, sell, two zero-transfer rows, final sell
})
# Running balance is the cumulative sum of net transfers
df['balance'] = df['net_transfers'].cumsum()
# NOTE: df_base is the same object as df (no copy is made here)
df_base = df
df_base

In [None]:
# Refresh local modules and the wallets config, then run the holding-period calculation on the fixture
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wts.calculate_average_holding_period(df_base)

In [None]:
# Reload local modules and the wallets config (the list built by the comprehension is discarded)
[importlib.reload(module) for module in modules]
wallets_config.reload()


# Example usage: rebuilds the same single-pair fixture that an earlier cell also defines
df = pd.DataFrame({
    'coin_id': ['BTC'] * 7,
    'wallet_address': ['wallet1'] * 7,
    'date': ['2024-01-01', '2024-01-05', '2024-01-10', '2024-01-15', '2024-01-20', '2024-01-29', '2024-01-30'],
    'net_transfers': [100, -50, 75, -25, 0, 0, -100]  # buy, sell, buy, sell, two zero-transfer rows, final sell
})
# Running balance is the cumulative sum of net transfers
df['balance'] = df['net_transfers'].cumsum()
# NOTE: df_base is the same object as df (no copy is made here)
df_base = df

# result = wts.calculate_days_since_last_buy(df)
# result

In [None]:
# Run on a copy so the shared fixture is not modified by the call
df = df_base.copy()
days_since_buy_df = wts.calculate_days_since_last_buy(df)
days_since_buy_df

In [None]:
# Run on a copy so the shared fixture is not modified by the call
df = df_base.copy()
avg_hold_df = wts.calculate_average_holding_period(df)
avg_hold_df

## Wallet Modeling

### build model

In [None]:
# Refresh local modules and the wallets config before training
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Train on the joined features + target frame; the result is indexed by key below
model_results = wm.train_xgb_model(modeling_df)

# Pull the 'regressor' step out of the returned pipeline for evaluation
model = model_results['pipeline'].named_steps['regressor']

### assess model performance

In [None]:
### save model artifacts
# Reload local modules and the wallets config (the list built by the comprehension is discarded)
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate and save all model artifacts under ../wallet_modeling.
# This binds `evaluator`, `wallet_scores_df` and `coin_validation_df`, which later cells reassign.
model_id, evaluator, wallet_scores_df, coin_validation_df = wmr.generate_and_save_model_artifacts(
    model_results=model_results,
    validation_profits_df=validation_profits_df,
    base_path='../wallet_modeling'
)
# Notify once the artifacts have been generated
u.play_notification()

# NOTE(review): the commented lines below reference `evaluation`, which is not defined in the
# visible notebook; they would need updating before being re-enabled.
# # Print results
# print(evaluator['summary_report'])
# print(f"R² Score: {evaluation['r2']:.3f}")



In [None]:
# Refresh local modules and the wallets config before evaluating
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Build an evaluator from the held-out truth/predictions and the fitted regressor
feature_columns = model_results['X'].columns.tolist()
evaluator = wime.RegressionEvaluator(
    y_true=model_results['y_test'],
    y_pred=model_results['y_pred'],
    model=model,
    feature_names=feature_columns
)

# Only the last expression's value is displayed by the notebook
evaluator.get_summary_report()
evaluator.plot_evaluation()

### Validation period assessments

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Score the test-set predictions against the validation period profits
wallet_performance_df, bucketed_performance_df = wiv.calculate_validation_metrics(
    X_test=model_results['X_test'],
    y_pred=model_results['y_pred'],
    validation_profits_df=validation_profits_df,
)

bucketed_performance_df

## coin performance predictions

### create coin_validation_df with metrics and returns

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Consolidate wallet scores at the coin level; scores reuse the index of the test-set targets
wallet_scores_df = pd.DataFrame(
    {'score': model_results['y_pred']},
    index=model_results['y_test'].index,
)
coin_wallet_metrics_df = wicf.calculate_coin_metrics_from_wallet_scores(validation_profits_df, wallet_scores_df)

# Calculate coin performance during the validation period
coin_performance_df = wicf.calculate_coin_performance(
    market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)

# Join aggregated wallet metrics with actual coin performance (inner join: entries present in both)
coin_validation_df = coin_wallet_metrics_df.join(coin_performance_df, how='inner')

### plotting coin feature performance vs market cap

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Get the analysis results
segment_results, summary_df = wicf.analyze_market_cap_segments(coin_validation_df, top_n=10)

# Render the summary as a heatmap
wicf.plot_segment_heatmap(summary_df)
# wicf.plot_metric_consistency(summary_df)  # Optional secondary visualization


### coin performance of top n for each bucket

In [None]:

# Run analysis: read the selection parameters from the coin_forecasting section of the wallets config
top_n = wallets_config['coin_forecasting']['top_n']
max_market_cap = wallets_config['coin_forecasting']['max_market_cap']
min_market_cap = wallets_config['coin_forecasting']['min_market_cap']

# These three names are also read by a later cell that filters coin_validation_df
metric_top_coin_performance_df = wicf.validate_coin_performance(coin_validation_df,top_n,
                                                                max_market_cap, min_market_cap)

metric_top_coin_performance_df

### compare performance of high vs low score coins

In [None]:
# Refresh local modules and the wallets config, then print the performance analysis
for module in modules:
    importlib.reload(module)
wallets_config.reload()

wicf.print_performance_analysis(coin_validation_df)

## Junkyard

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the data into a pandas DataFrame
def analyze_coin_metrics(df):
    """
    Analyze relationships between coin metrics and returns.

    For a fixed list of metrics this computes:
    - the correlation of each metric with coin_return, sorted by absolute value
    - the mean of each metric for coins with positive vs non-positive returns,
      together with a two-sample t-test p-value (missing values are dropped
      before testing, matching how the means skip them)
    - "success indicators": metrics whose mean difference exceeds 10% of the
      magnitude of the non-positive-return mean and whose p-value is below 0.05

    Args:
        df (pd.DataFrame): One row per coin with a 'coin_return' column and every
            column named in metrics_of_interest below

    Returns:
        dict: with keys
            'correlations' (dict of metric -> correlation, strongest first),
            'comparison_stats' (dict of metric -> dict with positive_mean,
                negative_mean, difference and p_value),
            'success_indicators' (the subset of comparison_stats passing both thresholds)
    """
    metrics_of_interest = [
        'weighted_avg_score',
        'composite_score',
        'score_confidence',
        'top_wallet_balance_pct',
        'top_wallet_count_pct',
        'total_wallets',
        'avg_wallet_balance',
        'market_cap'
    ]

    # Correlation of each metric with coin_return, sorted by absolute value
    correlations = {
        metric: df[metric].corr(df['coin_return'])
        for metric in metrics_of_interest
    }
    correlations_sorted = dict(
        sorted(correlations.items(), key=lambda item: abs(item[1]), reverse=True)
    )

    # Split coins by the sign of their return (zero counts as non-positive)
    positive_returns = df[df['coin_return'] > 0]
    negative_returns = df[df['coin_return'] <= 0]

    comparison_stats = {}
    for metric in metrics_of_interest:
        # Drop missing values so the t-test sees the same data the means are based on;
        # otherwise a single NaN turns the p-value into NaN
        pos_values = positive_returns[metric].dropna()
        neg_values = negative_returns[metric].dropna()
        pos_mean = pos_values.mean()
        neg_mean = neg_values.mean()

        _, p_value = stats.ttest_ind(pos_values, neg_values)

        comparison_stats[metric] = {
            'positive_mean': pos_mean,
            'negative_mean': neg_mean,
            'difference': pos_mean - neg_mean,
            'p_value': p_value
        }

    # Identify potential success indicators. The relative threshold uses the magnitude of
    # the baseline mean: with a negative baseline, 0.1 * negative_mean would be negative
    # and every difference would pass.
    success_indicators = {
        metric: metric_stats
        for metric, metric_stats in comparison_stats.items()
        if (abs(metric_stats['difference']) > 0.1 * abs(metric_stats['negative_mean']) and
            metric_stats['p_value'] < 0.05)
    }

    return {
        'correlations': correlations_sorted,
        'comparison_stats': comparison_stats,
        'success_indicators': success_indicators
    }

# Create summary statistics
def print_analysis_results(results):
    """
    Print the analysis results as three plain-text sections.

    Args:
        results (dict): Mapping with the keys 'correlations' (metric -> value),
            'comparison_stats' and 'success_indicators' (metric -> dict holding
            'positive_mean', 'negative_mean', 'difference' and 'p_value')
    """
    # Section 1: correlations, in the order stored in the dict
    ranked_correlations = results['correlations']
    print("\n=== Correlation Analysis ===")
    print("\nCorrelations with coin return (sorted by strength):")
    for metric in ranked_correlations:
        corr = ranked_correlations[metric]
        print(f"{metric:25} : {corr:0.4f}")

    # Section 2: group means, their difference and the p-value for every metric
    print("\n=== Positive vs Negative Returns Analysis ===")
    print("\nMetrics comparison for positive vs negative returns:")
    for metric, group_stats in results['comparison_stats'].items():
        print(f"\n{metric}:")
        print(f"  Positive returns mean: {group_stats['positive_mean']:0.4f}")
        print(f"  Negative returns mean: {group_stats['negative_mean']:0.4f}")
        print(f"  Difference: {group_stats['difference']:0.4f}")
        print(f"  P-value: {group_stats['p_value']:0.4f}")

    # Section 3: only the metrics flagged as success indicators
    print("\n=== Strong Success Indicators ===")
    print("\nMetrics showing significant difference between positive and negative returns:")
    for metric, indicator_stats in results['success_indicators'].items():
        print(f"\n{metric}:")
        print(f"  Mean difference: {indicator_stats['difference']:0.4f}")
        print(f"  P-value: {indicator_stats['p_value']:0.4f}")


# Run the analysis
def main(csv_path='coin_wallet_metrics.csv'):
    """
    Load coin metrics from a CSV file, run the metric analysis and print the results.

    Args:
        csv_path (str): Path of the CSV file to read. Defaults to
            'coin_wallet_metrics.csv' in the current working directory.
    """
    # Read the data
    df = pd.read_csv(csv_path)

    # Run analysis
    results = analyze_coin_metrics(df)

    # Print results
    print_analysis_results(results)

    # Create visualizations
    # NOTE(review): create_visualizations is not defined in the visible part of this
    # notebook; this call raises NameError unless it is defined elsewhere.
    create_visualizations(df)

# In a notebook kernel __name__ is "__main__", so executing this cell calls main()
if __name__ == "__main__":
    main()

In [None]:
# NOTE(review): `returns`, `winsorization_cutoff` and `predictions` are not defined anywhere in the
# visible notebook; this cell only runs if they already exist in the kernel.

# Winsorize the returns (apply caps to the top n % of values)
returns_winsorized = u.winsorize(returns, winsorization_cutoff)

# Merge datasets
df = pd.DataFrame({
    'predictions': predictions,
    'returns': returns_winsorized,
})

# Sort by actual returns to obtain optimal performance
df_sorted = df.sort_values('returns', ascending=False)
cumulative_best_returns = np.cumsum(df_sorted['returns'])
cumulative_best_avg_returns = df_sorted['returns'].expanding().mean()

# Sort by model score to obtain modeled performance (df_sorted is rebound here)
df_sorted = df.sort_values('predictions', ascending=False)
cumulative_model_returns = np.cumsum(df_sorted['returns'])
cumulative_model_avg_returns = df_sorted['returns'].expanding().mean()

# Calculate average return across all data
average_return = np.mean(returns_winsorized)

In [None]:
# Display `df` as left by whichever cell last assigned it (the name is reused across several cells)
df

In [None]:
# Cumulative sum of returns, ordered by descending prediction
cumulative_model_returns

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Run analysis with the selection parameters from the coin_forecasting config section
top_n = wallets_config['coin_forecasting']['top_n']
max_market_cap = wallets_config['coin_forecasting']['max_market_cap']
min_market_cap = wallets_config['coin_forecasting']['min_market_cap']

metric_top_coin_performance_df = wicf.validate_coin_performance(
    coin_validation_df, top_n, max_market_cap, min_market_cap
)

metric_top_coin_performance_df

In [None]:
# Refresh local modules and the wallets config before running
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# List the coins that would have been picked at the start of the validation period:
# keep coins whose filled market cap lies within [min_market_cap, max_market_cap]
in_cap_range = coin_validation_df['market_cap_filled'].between(min_market_cap, max_market_cap)
top_coins_df = coin_validation_df[in_cap_range].copy()

sort_column = wallets_config['coin_forecasting']['sort_method']

top_coins_df.sort_values(sort_column,ascending=False).head(top_n)

## Tests failing

In [None]:

# Fixture for the holding-period check: two wallets across two coins, rows deliberately unsorted
df = pd.DataFrame({
    'coin_id': ['BTC', 'BTC', 'ETH', 'ETH', 'BTC', 'BTC', 'BTC', 'BTC'],
    'wallet_address': [
        'wallet1', 'wallet2', 'wallet1', 'wallet1',
        'wallet2', 'wallet1', 'wallet2', 'wallet1'
    ],
    'date': [
        '2024-01-01', '2024-01-01', '2024-01-02', '2024-01-05',
        '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-10'
    ],
    'net_transfers': [100, 50, 200, -100, 50, -50, -50, 50]
})

# NOTE(review): `expected` is never compared against anything in the visible cells, and its order
# matches the unsorted `df`, not the sorted `df2` - confirm which ordering the test intends.
expected = np.array([0, 0, 0, 3, 2, 5, 4, 4.5])
# Sort by coin, wallet, then date (dates are ISO strings, so they sort chronologically here)
df2 = df.sort_values(by=['coin_id','wallet_address','date'])

In [None]:
# Display the sorted fixture
df2

In [None]:
# Summary statistics of the full transfers snapshot
transfers_df_full.describe()

In [None]:
# Run the holding-period calculation on the sorted fixture; this rebinds `result`, which earlier
# cells use for the imputation output
result = wts.calculate_average_holding_period(df2)
result