In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar
from pyxirr import xirr

load_dotenv()

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Suppress warnings
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Dark mode charts: dark gray figure/axes backgrounds with a muted green
# for text, axis labels, tick marks, and titles
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp
import wallet_modeling.wallet_orchestrator as wo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.wallet_modeling as wm
import wallet_features.wallet_features as wf
import wallet_features.wallet_coin_features as wcf
import wallet_features.wallet_coin_date_features as wcdf
from wallet_modeling.wallets_config_manager import WalletsConfig


# reload all local modules so the notebook runs against their current source
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp, wo, wtd, wm, wf, wcf, wcdf]
for module in modules:
    importlib.reload(module)

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Codespace

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

## Full Sequence

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Retrieve datasets
market_data_df,profits_df = wo.retrieve_datasets()

# Define wallet cohort after cleaning
training_wallet_metrics_df,wallet_cohort = wo.define_wallet_cohort(profits_df,market_data_df)

# Generate profits_df for all training windows and the modeling period
training_windows_profits_dfs, modeling_period_profits_df = wo.split_profits_df(profits_df,
                                                                               market_data_df,wallet_cohort)

# Generate features for all windows and merge them together
training_data_df = wo.generate_wallet_performance_features(training_windows_profits_dfs,
                                                           training_wallet_metrics_df,wallet_cohort)



### Retrieve data

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# 2. Define modeling wallet cohort via data cleaning
# --------------------------------------------------
# Impute profits rows on the training period start and end dates
training_period_boundary_dates = [
    wallets_config['training_data'][boundary]
    for boundary in ('training_period_start', 'training_period_end')
]
imputed_profits_df = pri.impute_profits_for_multiple_dates(
    profits_df,
    market_data_df,
    training_period_boundary_dates,
    n_threads=24,
)

# Keep only rows dated on or before the training period end, as an independent copy
training_profits_df = imputed_profits_df.loc[
    imputed_profits_df['date'].le(wallets_config['training_data']['training_period_end'])
].copy()

# Add the cash flow transfers logic, then compute wallet level metrics
# over the duration of the training period
training_profits_df = wcf.add_cash_flow_transfers_logic(training_profits_df)
training_wallet_metrics_df = wf.calculate_wallet_level_metrics(training_profits_df)

# Apply the wallet threshold filters to the training period metrics
filtered_training_wallet_metrics_df = wtd.apply_wallet_thresholds(training_wallet_metrics_df)

# The index values that survive filtering form the cohort, which is uploaded
# (to BigQuery, per the original notes) for additional complex feature generation
wallet_cohort = filtered_training_wallet_metrics_df.index.values
wtd.upload_wallet_cohort(wallet_cohort)


In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# 3. Impute all required dates for wallet cohort
# ----------------------------------------------
# Restrict the profits data to wallets in the cohort
cohort_profits_df = profits_df[profits_df['wallet_address'].isin(wallet_cohort)]

# Impute every required date, then keep only rows that fall between the
# earliest and latest imputation dates (both ends inclusive)
imputation_dates = wtd.generate_imputation_dates()
windows_profits_df = pri.impute_profits_for_multiple_dates(
    cohort_profits_df,
    market_data_df,
    imputation_dates,
    n_threads=24,
)
windows_profits_df = windows_profits_df[
    windows_profits_df['date'].between(
        pd.to_datetime(min(imputation_dates)),
        pd.to_datetime(max(imputation_dates)),
    )
]


In [None]:
training_profits_df.head()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# 4. Generate features for each training window
# ---------------------------------------------
# Split profits_df into training windows and the modeling period
training_windows_dfs, modeling_period_df =  wtd.split_window_dfs(windows_profits_df)

# Create training data df with full training period metrics
training_data_df = wf.fill_missing_wallet_data(filtered_training_wallet_metrics_df, wallet_cohort)
training_data_df = training_data_df.add_suffix("_all_windows")

# Generate and join dfs for each training window.
# Windows are numbered from 1, so the columns of the first window get the
# suffix "_w1", the second "_w2", and so on.
for window_number, window_df in enumerate(training_windows_dfs, start=1):
    # Add metrics
    window_df = wcf.add_cash_flow_transfers_logic(window_df)
    window_wallets_df = wf.calculate_wallet_level_metrics(window_df)

    # Fill missing values and join to training_data_df
    window_wallets_df = wf.fill_missing_wallet_data(window_wallets_df, wallet_cohort)
    window_wallets_df = window_wallets_df.add_suffix(f'_w{window_number}')
    training_data_df = training_data_df.join(window_wallets_df, how='left')


training_data_df.describe()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate additional features
# ----------------------------
# Retrieve the buy numbers for wallets in the cohort
buyer_numbers_df = wcf.retrieve_buyer_numbers()

# Average each wallet's buyer_number and append it to the training data.
# The column is selected explicitly: the first positional argument of
# GroupBy.mean() is `numeric_only`, not a column name, so the previous
# `.mean('buyer_number')` averaged every numeric column instead.
buyer_averages_df = (
    buyer_numbers_df.groupby('wallet_id')['buyer_number']
    .mean()
    .to_frame('average_buyer_number')
)
# NOTE(review): the join is index-on-index, so this assumes wallet_id values
# match the index of training_data_df - confirm against the cohort upload.
training_data_df = training_data_df.join(buyer_averages_df)
training_data_df.head()

## Target Variables

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Calculate modeling period metrics
# ---------------------------------
# Add the cash flow transfers logic and compute wallet level metrics
# for the modeling period
modeling_period_df = wcf.add_cash_flow_transfers_logic(modeling_period_df)
modeling_wallets_df = wf.calculate_wallet_level_metrics(modeling_period_df)

# Keep only wallets at or above the minimum modeling period investment
base_wallets = len(modeling_wallets_df)
modeling_wallets_df = modeling_wallets_df.loc[
    modeling_wallets_df['invested'].ge(
        wallets_config['data_cleaning']['min_modeling_investment']
    )
]
logger.info(
    "Removed %s/%s wallets with modeling period investments below the threshold.",
    base_wallets - len(modeling_wallets_df),
    base_wallets,
)

# Keep only wallets at or above the minimum modeling period transaction days
base_wallets = len(modeling_wallets_df)
modeling_wallets_df = modeling_wallets_df.loc[
    modeling_wallets_df['transaction_days'].ge(
        wallets_config['data_cleaning']['min_modeling_transaction_days']
    )
]
logger.info(
    "Removed %s/%s wallets with modeling period transaction days below the threshold.",
    base_wallets - len(modeling_wallets_df),
    base_wallets,
)



In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

winsorization=0.01
target_variable = 'performance_score'

# Generate the target variables
target_vars_df = modeling_wallets_df[['invested','net_gain']].copy()
target_vars_df = wm.generate_target_variables(target_vars_df,winsorization=winsorization)

target_vars_df.describe()

In [None]:
# Merge training data and target variables
modeling_df = training_data_df.join(target_vars_df[target_variable],how='inner')
modeling_df.describe()


## Crude Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor

df = modeling_df.copy()

# The target column becomes y; every remaining column is used as a feature
y = df[target_variable]
X = df.drop(columns=target_variable)

# Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing: all feature columns are treated as numeric and standardized
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)],
)

# Pipeline: scaling followed by a gradient boosting regressor
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ],
)

# Fit on the training split, then predict the held-out split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Example usage with your existing code:
evaluation = wm.evaluate_regression_model(
    y_test,
    y_pred,
    model=pipeline.named_steps['regressor'],  # Pass the actual model object
    X_test=X_test,
    feature_names=X.columns.tolist()
)

# Print summary report
print(evaluation['summary_report'])

# Access specific metrics
print(f"R² Score: {evaluation['r2']:.3f}")

# The figure can be displayed or saved
if evaluation['figures'] is not None:
    plt.show()  # or evaluation['figures'].savefig('model_evaluation.png')

In [None]:

# Work on a separate wallet-indexed copy of the training profits. Rebinding
# the notebook-level `profits_df` here would break the cells that filter
# `profits_df` by its `wallet_address` column.
wallet_profits_df = training_profits_df.copy().set_index('wallet_address')

# Precompute necessary transformations
wallet_profits_df['abs_usd_net_transfers'] = wallet_profits_df['usd_net_transfers'].abs()
# NOTE(review): the running total follows the current row order - presumably
# rows are already date-sorted within each wallet; confirm upstream.
wallet_profits_df['cumsum_usd_net_transfers'] = (
    wallet_profits_df.groupby('wallet_address')['usd_net_transfers'].cumsum()
)

# Group and aggregate metrics
# NOTE(review): transaction_days counts rows with a date, so a wallet with
# several rows on one day (e.g. multiple coins) is counted once per row -
# confirm this is the intended definition.
wallet_metrics_df = wallet_profits_df.groupby('wallet_address').agg(
    invested=('cumsum_usd_net_transfers', 'max'),
    total_net_transfers=('usd_net_transfers', 'sum'),
    unique_coins=('coin_id', 'nunique'),
    transaction_days=('date', 'count'),
    total_volume=('abs_usd_net_transfers', 'sum'),
    average_transaction=('abs_usd_net_transfers', 'mean')
)


# Compute additional derived metrics: net gain is the negated sum of net
# transfers, and return is net gain relative to the peak cumulative transfers
wallet_metrics_df['net_gain'] = -wallet_metrics_df['total_net_transfers']
wallet_metrics_df['return'] = wallet_metrics_df['net_gain'] / wallet_metrics_df['invested']



# # Calculate amount invested
# wallet_invested_df = pd.DataFrame(
#     profits_df
#     .groupby(level='wallet_address')['usd_net_transfers'].cumsum()
#     .groupby(level='wallet_address').max()
# )
# wallet_invested_df.columns = ['invested']

# # Calculate net gains
# wallet_gain_df = pd.DataFrame(
#     -profits_df.groupby(level='wallet_address')['usd_net_transfers'].sum()
# )
# wallet_gain_df.columns = ['net_gain']

# # Join dfs
# wallet_performance_df = wallet_invested_df.join(wallet_gain_df)

# # Compute return
# wallet_performance_df['return'] = wallet_performance_df['net_gain']/wallet_performance_df['invested']

# return wallet_performance_df
wallet_metrics_df.describe()

In [None]:
[importlib.reload(module) for module in modules]



# Split out modeling and training records to calculate return separately.
# Each period keeps the rows whose date lies between its configured start
# and end dates, both ends inclusive.
# NOTE(review): `adj_profits_df` is not assigned in the visible part of this
# notebook, and `modeling_df` is also the name of the merged model matrix
# built earlier - confirm the intended cell order before running.
modeling_df = adj_profits_df[
    adj_profits_df['date'].between(
        pd.to_datetime(wallets_config['training_data']['modeling_period_start']),
        pd.to_datetime(wallets_config['training_data']['modeling_period_end']),
    )
]
modeling_performance_df = wf.calculate_wallet_investment_return(modeling_df)


training_df = adj_profits_df[
    adj_profits_df['date'].between(
        pd.to_datetime(wallets_config['training_data']['training_period_start']),
        pd.to_datetime(wallets_config['training_data']['training_period_end']),
    )
]
training_performance_df = wf.calculate_wallet_investment_return(training_df)


In [None]:
adj_profits_df.head()

### Calculations

In [None]:
training_performance_df

In [None]:
min_invested = 10000
filtered_df = training_performance_df[training_performance_df['invested']>=min_invested]
print(training_performance_df.shape)
print(filtered_df.shape)

In [None]:
# Join training and modeling returns side by side (left join on the filtered
# training wallets); the suffixes yield return_training / return_modeling
performance_df = filtered_df[['return']].join(
    modeling_performance_df[['return']],
    lsuffix='_training',
    rsuffix='_modeling',
)
performance_df.shape

# Percentile rank of each wallet's return within each period
performance_df["training_percentile"] = performance_df["return_training"].rank(pct=True, ascending=True)
performance_df["modeling_percentile"] = performance_df["return_modeling"].rank(pct=True, ascending=True)

# Bucket the percentiles.
# NOTE(review): the columns are named "decile" but the multiplier of 5 gives
# five buckets (quintiles); a later cell uses 10 - confirm which is intended.
performance_df['training_decile'] = np.ceil(5 * performance_df['training_percentile'])
performance_df['modeling_decile'] = np.ceil(5 * performance_df['modeling_percentile'])

# Correlation between training and modeling percentile ranks
performance_df['training_percentile'].corr(performance_df['modeling_percentile'])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Create a cross-tabulation of the deciles
heatmap_data = pd.crosstab(
    performance_df['training_decile'],
    performance_df['modeling_decile'],
    normalize='index'
) * 100  # Convert to percentages

# Plot the heatmap
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True)

# Add title and labels
plt.title('Percentage Allocation Heatmap: Training to Modeling Deciles')
plt.xlabel('Modeling Decile')
plt.ylabel('Training Decile')
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def create_correlation_matrix(df, method='pearson'):
    """
    Create and visualize a correlation matrix for the given DataFrame

    Draws the matrix as an annotated heatmap on a new 10x8 figure. The figure
    is not shown or saved here; that is left to the caller.

    Parameters:
    df (pandas.DataFrame): Input DataFrame
    method (str or callable): Correlation method passed through to
        DataFrame.corr. Defaults to 'pearson', the previously hard-coded value.

    Returns:
    pandas.DataFrame: Correlation matrix
    """
    # Calculate the correlation matrix
    corr_matrix = df.corr(method=method)

    # Create a heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix,
                annot=True,  # Show correlation values
                cmap='coolwarm',  # Color scheme from blue (negative) to red (positive)
                vmin=-1, vmax=1,  # Fix the scale to the full correlation range
                center=0,  # Center the colormap at 0
                fmt='.2f')  # Round to 2 decimal places

    plt.title('Correlation Matrix Heatmap')
    plt.tight_layout()

    return corr_matrix

# create_correlation_matrix(performance_df)

In [None]:
performance_df.corr()

## Market data analysis

In [None]:
start_prices.set_index('coin_id')

In [None]:
price_analysis_df = market_data_df.copy().set_index('coin_id')

start_prices = price_analysis_df[price_analysis_df['date']== pd.to_datetime(config['training_data']['modeling_period_start'])]
end_prices = price_analysis_df[price_analysis_df['date']== pd.to_datetime(config['training_data']['modeling_period_end'])]

# coin_modeling_returns_df = start_prices.join(end_prices)

    # (adj_profits_df['date'] >= pd.to_datetime(config['training_data']['modeling_period_start'])) &
    # (adj_profits_df['date'] <= pd.to_datetime(config['training_data']['modeling_period_end']))

In [None]:
# Calculate coin returns during modeling period
coin_modeling_returns_df = start_prices[['price']].join(end_prices[['price']],lsuffix='_start',rsuffix='_end')
coin_modeling_returns_df['coin_modeling_return'] = coin_modeling_returns_df['price_end']/coin_modeling_returns_df['price_start']
coin_modeling_returns_df["coin_modeling_percentile_return"] = coin_modeling_returns_df["coin_modeling_return"].rank(ascending=True, pct=True)

coin_modeling_returns_df.head()

In [None]:
# Calculate wallet ending balances
min_end_balance = 1000

# Calculate period end balance for each coin-wallet pair.
# Rows dated on the training period end with net transfers at or below
# -min_end_balance are kept, and their absolute value is used as the balance.
# NOTE(review): presumably these are the imputed period-closing rows - confirm.
end_balances_df = adj_profits_df[adj_profits_df['date']==pd.to_datetime(config['training_data']['training_period_end'])]
# Take an explicit copy so the new column below is added to an independent
# frame rather than to a filtered slice (avoids pandas' chained-assignment warning)
end_balances_df = end_balances_df[end_balances_df['usd_net_transfers']<=-min_end_balance].copy()
end_balances_df['usd_balance'] = end_balances_df['usd_net_transfers'].abs()
end_balances_df = end_balances_df[['coin_id','wallet_address','usd_balance']]
end_balances_df = end_balances_df.set_index(['coin_id','wallet_address'])

# Add wallet performance metrics, keeping only pairs whose wallet has a training return
end_balances_df = end_balances_df.join(performance_df,on='wallet_address')
end_balances_df = end_balances_df[end_balances_df['return_training'].notna()]

end_balances_df.head()

In [None]:
wallet_return_column = 'return_training'

# Assess average wallet return during training period
coin_wallet_performance = pd.DataFrame(end_balances_df.reset_index().groupby('coin_id',observed=True)[wallet_return_column].mean())
coin_wallet_performance.columns = ['avg_wallet_training_return']

coin_wallet_performance.head()

In [None]:
# coin_return_column = 'coin_modeling_return'
coin_return_column = 'coin_modeling_percentile_return'

wallet_forecast_df = coin_modeling_returns_df[[coin_return_column]].join(coin_wallet_performance)
wallet_forecast_df[coin_return_column].corr(wallet_forecast_df['avg_wallet_training_return'])


In [None]:
wallet_forecast_df.describe()

In [None]:
end_balances_df = end_balances_df[['coin_id','wallet_address','usd_balance']]

In [None]:
start_prices.loc['0037051e-677f-439f-9353-4dc896fe9ecd']

In [None]:
adj_profits_df.head()

In [None]:
adj_profits_df.head()

In [None]:
adj_profits_df

In [None]:
training_performance_df = aggregate_wallet_performance(training_df)
training_performance_df.describe()

In [None]:
modeling_performance_df = aggregate_wallet_performance(modeling_df)
modeling_performance_df.describe()

In [None]:
performance_df = filtered_df[['return']].join(modeling_performance_df[['return']],lsuffix='_training',rsuffix='_modeling')
performance_df.shape

In [None]:
# Join training and modeling data
performance_df = filtered_df[['return']].join(modeling_performance_df[['return']],lsuffix='_training',rsuffix='_modeling')
performance_df.shape

# Calculate percentiles
performance_df["training_percentile"] = performance_df["return_training"].rank(ascending=True, pct=True)
performance_df["modeling_percentile"] = performance_df["return_modeling"].rank(ascending=True, pct=True)

# Calculate decile buckets
performance_df['training_decile'] = np.ceil(performance_df['training_percentile']*10)
performance_df['modeling_decile'] = np.ceil(performance_df['modeling_percentile']*10)

# Check correlation
performance_df['training_percentile'].corr(performance_df['modeling_percentile'])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Create a cross-tabulation of the deciles
heatmap_data = pd.crosstab(
    performance_df['training_decile'],
    performance_df['modeling_decile'],
    normalize='index'
) * 100  # Convert to percentages

# Plot the heatmap
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True)

# Add title and labels
plt.title('Percentage Allocation Heatmap: Training to Modeling Deciles')
plt.xlabel('Modeling Decile')
plt.ylabel('Training Decile')
plt.show()

In [None]:
wallet_performance_df = training_performance_df
wallet_performance_df['return'] = wallet_performance_df['net_gain']/wallet_performance_df['invested']
wallet_performance_df.describe()

In [None]:
# wallet_performance_df.sample(10)

wallet_performance_df[wallet_performance_df['invested']==0]

In [None]:
w = '0xca6cfaa7d61371310d84b63a4ca90cbf7883a9db'

df = wallets_df_filtered.loc[w]

# print(xirr(df.index.get_level_values('date'), df['usd_net_transfers']))
df

In [None]:
# adj_profits_df[adj_profits_df['wallet_address']==w]
profits_df[profits_df['wallet_address']==w].sort_values('date')

In [None]:
adj_profits_df[adj_profits_df['wallet_address']==w].sort_values('date')


In [None]:
def calculate_wallets_xirr(profits_df, min_wallet_volume):
    """
    Calculates the XIRR of each wallet based on their cash flows across all coins they've
    interacted with in profits_df.

    Wallets are dropped before the calculation if their total absolute USD volume is below
    min_wallet_volume, or if their daily cash flows do not include at least one positive
    and one negative value. XIRR values that come back empty are stored as 0.

    Parameters:
    - profits_df (pd.DataFrame): shows daily coin-wallet transfers in USD. Must contain the
        columns wallet_address, date, and usd_net_transfers.
    - min_wallet_volume (int): wallets with less than this total USD volume will be excluded

    Returns:
    - xirr_df (pd.DataFrame): shows the XIRR of each wallet over the provided transactions,
        indexed by wallet_address with a single column named xirr
    """
    logger.info('Beginning XIRR calculation sequence...')

    # 1. Summarize cash flows on a wallet level
    # -----------------------------------------
    # Sum cash flows across coins for each wallet-date pair
    wallets_df = pd.DataFrame(profits_df.groupby(['wallet_address','date'])['usd_net_transfers'].sum())


    # 2. Filter wallets on data quality
    # ---------------------------------
    # Total absolute USD volume per wallet, as one vectorized aggregation
    # rather than a Python-level lambda per wallet
    wallet_volume = wallets_df['usd_net_transfers'].abs().groupby(level='wallet_address').sum()
    low_volume_wallets = wallet_volume[wallet_volume < min_wallet_volume].index

    # Remove wallets below the volume threshold
    wallets_df_filtered = wallets_df[~wallets_df.index.get_level_values('wallet_address').isin(low_volume_wallets)]
    logger.info('Removed %s wallets with volume below $%s.', len(low_volume_wallets), min_wallet_volume)

    # Flag wallets that have both a positive and a negative daily cash flow
    cash_flows = wallets_df_filtered['usd_net_transfers']
    has_positive = (cash_flows > 0).groupby(level='wallet_address').any()
    has_negative = (cash_flows < 0).groupby(level='wallet_address').any()
    wallet_check = has_positive & has_negative
    wallets_missing_both = wallet_check[~wallet_check].index

    # Remove wallets that do not have both positive and negative transfers
    wallets_df_filtered = wallets_df_filtered[~wallets_df_filtered.index.get_level_values('wallet_address').isin(wallets_missing_both)]
    logger.info('Removed %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))


    # 3. Calculate XIRR
    # -----------------
    # Group by wallet_address (level of the MultiIndex) and calculate XIRR
    start_time = time.time()
    logger.info('Calculating XIRR values...')
    xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
        lambda df: xirr(df.index.get_level_values('date'), df['usd_net_transfers'])
    )
    logger.info('XIRR calculations complete after %.2f seconds.', time.time() - start_time)

    # Convert to DataFrame
    xirr_df = pd.DataFrame(xirr_results)
    xirr_df.columns = ['xirr']

    # Fill empty values with 0s
    xirr_df = xirr_df.fillna(0)


    return xirr_df

In [None]:
# Inline, step-by-step run of the same sequence as calculate_wallets_xirr,
# applied to modeling_df. It is kept as flat cell code because it leaves its
# intermediates (wallets_df, wallets_df_filtered, xirr_results, xirr_df) in
# the notebook namespace, where other cells read them.
min_wallet_volume = 1

# 1. Summarize cash flows on a wallet level
# -----------------------------------------
# Sum cash flows across coins for each wallet-date pair
wallets_df = pd.DataFrame(modeling_df.copy().groupby(['wallet_address','date'])['usd_net_transfers'].sum())


# 2. Filter wallets on data quality
# ---------------------------------
# Identify wallets whose total absolute USD volume is below min_wallet_volume
wallets_agg_df = wallets_df.groupby(level='wallet_address')['usd_net_transfers'].apply(lambda x: x.abs().sum())
low_volume_wallets = wallets_agg_df[wallets_agg_df < min_wallet_volume].index

# Remove the low volume wallets
wallets_df_filtered = wallets_df[~wallets_df.index.get_level_values('wallet_address').isin(low_volume_wallets)]
logger.info('Removed %s wallets with volume below $%s.', len(low_volume_wallets), min_wallet_volume)

# Group by wallet_address and check for both positive and negative usd_net_transfers
wallet_check = wallets_df_filtered.groupby('wallet_address')['usd_net_transfers'].apply(
    lambda x: (x > 0).any() and (x < 0).any()
)
# Wallets where the check is False lack a positive flow, a negative flow, or both
wallets_missing_both = wallet_check[~wallet_check].index

# Filter wallet addresses that do not have both positive and negative transfers
wallets_df_filtered = wallets_df_filtered[~wallets_df_filtered.index.get_level_values('wallet_address').isin(wallets_missing_both)]
logger.info('Removed %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))


# 3. Calculate XIRR
# -----------------
# Group by wallet_address (level of the MultiIndex) and calculate XIRR
start_time = time.time()
logger.info('Calculating XIRR values...')
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: xirr(df.index.get_level_values('date'), df['usd_net_transfers'])
)
logger.info('XIRR calculations complete after %.2f seconds.', time.time() - start_time)

# Convert to DataFrame
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']

# Fill empty values with 0s
xirr_df = xirr_df.fillna(0)

In [None]:
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: -df['usd_net_transfers'].sum()/df['usd_net_transfers'].cumsum().max()
)

In [None]:
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']
xirr_df.describe()

In [None]:
xirr_df.loc[w]

In [None]:
def wallet_metrics(group):
    """
    Compute investment metrics for one wallet's cash flows.

    Parameters:
    - group (pd.DataFrame): rows for a single wallet with a usd_net_transfers column,
        in the order the running total should be accumulated

    Returns:
    - pd.Series with three entries:
        invested: the maximum of the running total of usd_net_transfers
        net_gain: the negated sum of usd_net_transfers, matching the sign used by the
            other return calculations in this notebook
        return: net_gain / invested, or NaN when invested is 0
    """
    transfers = group['usd_net_transfers']
    invested = transfers.cumsum().max()

    # Gains are the negative of total net transfers; summing without negating
    # would flip the sign of both net_gain and return
    net_gain = -transfers.sum()

    # Guard the division so a zero investment gives NaN instead of +/-inf
    wallet_return = net_gain / invested if invested != 0 else np.nan

    return pd.Series({
        'invested': invested,
        'net_gain': net_gain,
        'return': wallet_return
    })

# Calculate metrics for all wallets at once
results = wallets_df_filtered.groupby(level='wallet_address').apply(wallet_metrics)

results

In [None]:
min_wallet_volume = 10000

# Calculate XIRR
training_xirr_df = calculate_wallets_xirr(training_df,min_wallet_volume)
modeling_xirr_df = calculate_wallets_xirr(modeling_df,min_wallet_volume=1)

In [None]:

# Calculate percentiles
xirr_df["training_xirr_percentile"] = xirr_df["training_xirr"].rank(ascending=True, pct=True)
xirr_df["modeling_xirr_percentile"] = xirr_df["modeling_xirr"].rank(ascending=True, pct=True)

# Calculate decile buckets
xirr_df['training_xirr_decile'] = np.ceil(xirr_df['training_xirr_percentile']*10)
xirr_df['modeling_xirr_decile'] = np.ceil(xirr_df['modeling_xirr_percentile']*10)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Create a cross-tabulation of the deciles
heatmap_data = pd.crosstab(
    xirr_df['training_xirr_decile'],
    xirr_df['modeling_xirr_decile'],
    normalize='index'
) * 100  # Convert to percentages

# Plot the heatmap
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True)

# Add title and labels
plt.title('Percentage Allocation Heatmap: Training to Modeling Deciles')
plt.xlabel('Modeling Decile')
plt.ylabel('Training Decile')
plt.show()

In [None]:
xirr_df = training_xirr_df.rename(columns={'xirr': 'training_xirr'}).join(
    modeling_xirr_df.rename(columns={'xirr': 'modeling_xirr'}),
    how='inner'
).fillna({'modeling_xirr': 0})



xirr_df.describe()

In [None]:
modeling_df.describe()

In [None]:
xirr_df['training_xirr_percentile'].corr(xirr_df['modeling_xirr_percentile'])

In [None]:
# Calculate year fractions from the first date
start_date = dates.min()  # Use the earliest date as the reference
date_fractions = (dates - start_date).dt.days / 365.0
date_fractions = date_fractions.values

date_fractions

In [None]:

# Sum cash flows on a wallet level
wallets_df = pd.DataFrame(training_df.groupby(['wallet_address','date'])['usd_net_transfers'].sum())

# Identify wallets with no transactions
wallets_agg_df = wallets_df.groupby(level='wallet_address')['usd_net_transfers'].apply(lambda x: x.abs().sum())
low_volume_wallets = wallets_agg_df[wallets_agg_df < min_wallet_volume].index

# Remove transactionless wallets
wallets_df_filtered = wallets_df[~wallets_df.index.get_level_values('wallet_address').isin(low_volume_wallets)]

In [None]:
wallets_df_filtered.shape

In [None]:
# Group by wallet_address and check for both positive and negative usd_net_transfers
wallet_check = wallets_df_filtered.groupby('wallet_address')['usd_net_transfers'].apply(
    lambda x: (x > 0).any() and (x < 0).any()
)

# Filter wallet addresses that do not meet the condition
wallets_missing_both = wallet_check[~wallet_check].index
logger.info('Found %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))





In [None]:
# w = '0x036783df7aec54b5dfca9e1f870577bbcca95481'
# wallets_df.loc[w]

# profits_df[profits_df['wallet_address']==w]


### XIRR sequence

In [None]:
wallets_df_filtered.head()

In [None]:
w = '0x0000000000000000000000000000000000000014'

dates = wallets_df.loc[w].index.values
cash_flows = wallets_df.loc[w]['usd_net_transfers']

xirr(dates,cash_flows)

In [None]:
# Group by wallet_address (level of the MultiIndex) and calculate XIRR
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: xirr(df.index.get_level_values('date'), df['usd_net_transfers'])
)

# Convert to DataFrame
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']

# Display the resulting DataFrame
print(xirr_results.shape)
xirr_results.head()

In [None]:
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']
xirr_df.head()

In [None]:
dates

In [None]:
cash_flows

In [None]:
x = xirr(dates,cash_flows)
x

In [None]:
c = '77e2cf4b-d18a-4026-a2f2-f083f48fe1be'
w = '0xaff2943cfe3e95f66142a1729079418d78e42236'

# u.cw_filter_df(training_df,c,w)

df = u.cw_filter_df(training_df,c,w)
df = df.sort_values('date')
df

In [None]:
dates = df['date']
cash_flows = df['usd_net_transfers']

In [None]:
# xirr is already imported at the top of the notebook; this repeat import is
# redundant once that cell has run
from pyxirr import xirr

# XIRR for the `dates` / `cash_flows` currently in scope
xirr(dates,cash_flows)

In [None]:
# Running total of the cash flows, in their current row order
cash_flows.cumsum()

In [None]:
# Display the raw cash flow values for comparison with the running total
cash_flows

In [None]:
# Express every date as the fraction of a 365-day year elapsed since the
# earliest date in `dates`
start_date = dates.min()
days_elapsed = (dates - start_date).dt.days
date_fractions = (days_elapsed / 365.0).values

date_fractions

In [None]:
# NumPy version of the year-fraction calculation.
# The dates are converted to a day-resolution datetime64 array first, because
# np.datetime64() only accepts a single scalar. The earliest date is taken
# positionally from the array rather than with `dates[0]`, which is a label
# lookup on a Series and fails when label 0 is absent.
date_values = np.asarray(dates).astype('datetime64[D]')
days_elapsed = (date_values - date_values.min()) / np.timedelta64(1, 'D')

# Divide by a fixed 365-day year: NumPy cannot divide a day-based timedelta by
# np.timedelta64(1, 'Y') because year units have no fixed length in days.
date_fractions = days_elapsed / 365.0
date_fractions

## Junkyard

In [None]:
# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     select cwp.wallet_address
#     ,cwp.coin_id
#     ,cwp.date
#     ,round(cwp.usd_net_transfers) as usd_net_transfers
#     ,round(cwp.usd_balance) as usd_balance
#     ,round(cwp.usd_net_transfers/cmd.price) as token_transfers
#     ,round(cwp.usd_balance/cmd.price) as token_balance
#     ,cmd.price
#     from wallets w
#     join wallet_coins wc on wc.wallet_address = w.wallet_address
#     join core.coin_wallet_profits cwp on cwp.wallet_address = wc.wallet_address
#         and cwp.coin_id = wc.coin_id
#     join core.coin_market_data cmd on cmd.coin_id = cwp.coin_id
#         and cmd.date = cwp.date
#     order by 1,2,3
#     '''
# transfers_df = dgc().run_sql(query_sql)

# # Convert wallet_address to categorical, store the mapping, and convert the column to int32
# wallet_address_categorical = transfers_df['wallet_address'].astype('category')
# # wallet_address_mapping = wallet_address_categorical.cat.categories
# # transfers_df['wallet_address'] = wallet_address_categorical.cat.codes.astype('uint32')


# # Convert coin_id to categorical (original strings are preserved)
# transfers_df['coin_id'] = transfers_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# transfers_df = u.safe_downcast(transfers_df, 'usd_net_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'usd_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'price', 'float32')

# print(transfers_df.info())
# print(u.df_mem(transfers_df))
# transfers_df.head()

In [None]:
# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     ,coins as (
#         select wc.coin_id
#         from wallets w
#         join wallet_coins wc on wc.wallet_address = w.wallet_address
#         group by 1
#     )

#     select cmd.coin_id
#     ,cmd.date
#     ,cmd.price
#     ,cmd.market_cap
#     from coins c
#     join core.coin_market_data cmd on cmd.coin_id = c.coin_id
#     order by 1,2
#     '''
# prices_df = dgc().run_sql(query_sql)

# # Convert coin_id to categorical (original strings are preserved)
# prices_df['coin_id'] = prices_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# prices_df = u.safe_downcast(prices_df, 'price', 'float32')
# prices_df = u.safe_downcast(prices_df, 'market_cap', 'int32')

# print(prices_df.info())
# print(u.df_mem(prices_df))
# prices_df.head()

## Tests failing

In [None]:
# def test_multiple_coins_per_wallet():
# NOTE: the `def` line above is commented out so the test body runs inline in
# this notebook cell; the string below is therefore a bare expression rather
# than a function docstring.
"""
Test scenario where wallets own multiple coins, some exceeding thresholds when aggregated.
Checks filtering based on specified date range.
"""
# Create test data
# One row per day from 2023-01-01 to 2023-01-07, so wallet1 owns the Jan 1-2
# rows, wallet2 the Jan 3-5 rows and wallet3 the Jan 6-7 rows.
sample_profits_df = pd.DataFrame({
    'coin_id': ['BTC', 'ETH', 'BTC', 'ETH', 'LTC', 'BTC', 'ETH'],
    'wallet_address': ['wallet1', 'wallet1', 'wallet2', 'wallet2', 'wallet2',
                        'wallet3', 'wallet3'],
    'date': pd.date_range(start='2023-01-01', periods=7),
    'profits_cumulative': [5000, 3000, 1000, 500, 500, 100, 50],
    'usd_inflows_cumulative': [10000, 8000, 2000, 1500, 1500, 500, 250]
})

# Thresholds and date window handed to dr.clean_profits_df
# (presumably wallets at or above a threshold are excluded - confirm in data_retrieval)
config = {
    'profitability_filter': 7500,
    'inflows_filter': 15000,
    'date_range': {
        'start': '2023-01-02',
        'end': '2023-01-05'
    }
}

# Call the function with date range
cleaned_df, exclusions_logs_df = dr.clean_profits_df(
    sample_profits_df,
    config,
    earliest_date=config['date_range']['start'],
    latest_date=config['date_range']['end']
)

# Expected results - wallet1 is expected to be dropped entirely (all of its
# rows, not only those inside the date window), leaving wallet2 and wallet3.
# NOTE(review): wallet1 sums to 8000 profits / 18000 inflows across both of its
# rows, but only its Jan 2 row (3000 / 8000) lies inside the Jan 2-5 window.
# If clean_profits_df aggregates only within the window, wallet1 would stay
# below both thresholds and these expectations would fail - verify against the
# implementation.
expected_cleaned_df = sample_profits_df[
    sample_profits_df['wallet_address'].isin(['wallet2', 'wallet3'])
].reset_index(drop=True)

# wallet1 is expected to be flagged on both the profits and inflows criteria
expected_exclusions = pd.DataFrame({
    'wallet_address': ['wallet1'],
    'profits_exclusion': [True],
    'inflows_exclusion': [True]
})

# Assertions
# The .values comparisons ignore index and column labels, so they depend on the
# returned frames having the same row and column order as the expected frames.
assert len(cleaned_df) == 5  # wallet2 (3 records) and wallet3 (2 records) should remain
assert np.array_equal(cleaned_df.values, expected_cleaned_df.values)
assert np.array_equal(exclusions_logs_df.values, expected_exclusions.values)

# Check if profits and inflows are approximately correct for the remaining wallets
# Should include ALL records for passing wallets (1000 + 500 + 500 + 100 + 50)
assert pytest.approx(cleaned_df['profits_cumulative'].sum(), abs=1e-4) == 2150
# Should include ALL records for passing wallets (2000 + 1500 + 1500 + 500 + 250)
assert pytest.approx(cleaned_df['usd_inflows_cumulative'].sum(), abs=1e-4) == 5750

# Additional date-specific checks
# Select the remaining rows dated inside the configured window (bounds inclusive)
date_mask = ((cleaned_df['date'] >= config['date_range']['start']) &
                (cleaned_df['date'] <= config['date_range']['end']))
date_filtered = cleaned_df[date_mask]

# Verify we have the expected number of records in the date range
# (wallet2's Jan 3, 4 and 5 rows; wallet3's rows fall after the window)
assert len(date_filtered) == 3  # Should only have records between Jan 2-5 for remaining wallets