In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
from pathlib import Path
import datetime
from datetime import datetime,timedelta
import json
import warnings
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor

from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar
from pyxirr import xirr

load_dotenv()

# Show floats with up to 12 significant digits in DataFrame output
# (revert with pd.reset_option('display.float_format'))
pd.set_option('display.float_format', lambda value: f'{value:.12g}')

# Suppress warnings whose message matches "MallocStackLogging"
warnings.filterwarnings("ignore", message="MallocStackLogging")

# Dark mode charts: dark gray backgrounds with muted green text and ticks
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp
import wallet_modeling.wallet_orchestrator as wo
import wallet_modeling.wallet_training_data as wtd
import wallet_modeling.wallet_modeling as wm
import wallet_features.wallet_features as wf
import wallet_features.wallet_coin_features as wcf
import wallet_features.wallet_coin_date_features as wcdf
from wallet_modeling.wallets_config_manager import WalletsConfig


# Keep every local module in one list so later cells can hot-reload them together
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp, wo, wtd, wm, wf, wcf, wcdf]
for _module in modules:
    importlib.reload(_module)

# Load the shared configs first, then the wallet-specific ones
(
    config,
    metrics_config,
    modeling_config,
    experiments_config,
) = u.load_all_configs('../config')
wallets_config = WalletsConfig.load_from_yaml('../config/wallets_config.yaml')
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Notebook logger, emitting INFO and above
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Codespace

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


In [None]:
wallets_config['training_data']['validation_period_start']

## Full Sequence

### training data

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Retrieve datasets
profits_df,market_data_df = wo.retrieve_datasets()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Define wallet cohort after cleaning
training_wallet_metrics_df,wallet_cohort = wo.define_wallet_cohort(profits_df,market_data_df)

# Generate profits_df for all training windows and the modeling period
training_profits_df, training_windows_profits_dfs, modeling_profits_df, validation_profits_df = wo.split_profits_df(profits_df,
                                                                               market_data_df,wallet_cohort)


### generate features

In [None]:
# [importlib.reload(module) for module in modules]
# wallets_config.reload()

# # Generate features for the full training dataset
# training_wallet_features_df = wf.calculate_wallet_features(training_profits_df, market_data_df, wallet_cohort)

# # Define the full feature set by appending a suffix for each window
# training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# # Generate features for each window
# for i, window_profits_df in enumerate(training_windows_profits_dfs, 1):
#     # Generate the features
#     window_wallet_features_df = wf.calculate_wallet_features(window_profits_df, market_data_df, wallet_cohort)

#     # Add column suffix and join to training_data_df
#     window_wallet_features_df = window_wallet_features_df.add_suffix(f'_w{i}')
#     training_data_df = training_data_df.join(window_wallet_features_df, how='left')


# training_data_df.describe()

### Market timing features dev

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


# Market data: add indicators
market_indicators_df = ind.generate_time_series_indicators(market_data_df,
                                                           wallets_metrics_config['time_series']['market_data'],
                                                           'coin_id')



In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))

# add timing offset features
market_timing_df = wcdf.calculate_offsets(market_indicators_df,wallets_features_config)
market_timing_df,relative_change_columns = wcdf.calculate_relative_changes(market_timing_df,wallets_features_config)

relative_change_columns

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


wallet_timing_features_df = wcf.generate_all_timing_features(
    training_profits_df,
    market_timing_df,
    relative_change_columns,
    wallets_config['features']['timing_metrics_min_transaction_size'],
)

wallet_timing_features_df.head()

In [None]:
wallet_timing_features_df = wallet_timing_features_df.fillna(0)

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8'))


wallet_timing_features_df = wcf.generate_all_timing_features(
    window_profits_df,
    market_timing_df,
    relative_change_columns,
    wallets_config['features']['timing_metrics_min_transaction_size'],
)

wallet_timing_features_df.head()

In [None]:
import pandas as pd
import numpy as np
import logging

def _timing_features_for_side(side_df, relative_change_columns, side):
    """Per-wallet simple and USD-weighted means of each metric for one trade side ('buy'/'sell')."""
    wallets = side_df['wallet_address']
    weights = side_df['usd_net_transfers'].abs()

    features = {}
    for col in relative_change_columns:
        values = side_df[col]

        # Rows with no market timing value (NaN after the left merge) are removed
        # from both numerator and denominator, so the weighted mean skips them the
        # same way the simple mean does instead of turning the whole wallet into NaN.
        has_value = values.notna()
        weighted_total = (values * weights).where(has_value).groupby(wallets).sum(min_count=1)
        weight_total = weights.where(has_value).groupby(wallets).sum(min_count=1)

        features[f"{col}_{side}_mean"] = values.groupby(wallets).mean()
        features[f"{col}_{side}_weighted"] = weighted_total / weight_total

    # Built from Series only, so an empty side yields an empty frame with the
    # expected columns rather than an error.
    return pd.DataFrame(features)


def generate_all_timing_features(
    profits_df,
    market_timing_df,
    relative_change_columns,
    min_transaction_size=0
):
    """
    Generate timing features for multiple market metric columns.

    For every metric column, four per-wallet features are produced:
    `{col}_buy_mean`, `{col}_buy_weighted`, `{col}_sell_mean` and `{col}_sell_weighted`,
    where the weighted variants use the absolute USD transfer size as the weight.

    Args:
        profits_df (pd.DataFrame): DataFrame with columns [coin_id, date, wallet_address, usd_net_transfers]
        market_timing_df (pd.DataFrame): DataFrame with columns [coin_id, date] plus the
            market timing metric columns
        relative_change_columns (list): List of column names from market_timing_df to analyze
        min_transaction_size (float): Minimum absolute USD value of transaction to consider

    Returns:
        pd.DataFrame: one row per wallet that has at least one transaction passing the size
            filter, indexed by wallet address. Wallets without buys (or sells), or without
            any matching market data, have NaN in the corresponding columns. If
            relative_change_columns is empty, a column-less frame indexed by every wallet
            in profits_df is returned.
    """
    logger.info("Starting timing feature generation for columns: %s", relative_change_columns)

    if not relative_change_columns:
        return pd.DataFrame(index=profits_df['wallet_address'].unique())

    # Filter out transactions below the size threshold, then attach the market metrics
    filtered_profits = profits_df[
        profits_df['usd_net_transfers'].abs() >= min_transaction_size
    ].copy()

    timing_profits_df = filtered_profits.merge(
        market_timing_df[list(relative_change_columns) + ['coin_id', 'date']],
        on=['coin_id', 'date'],
        how='left'
    )

    # Positive net transfers are buys, negative are sells; zero transfers are ignored
    buys_df = timing_profits_df[timing_profits_df['usd_net_transfers'] > 0]
    sells_df = timing_profits_df[timing_profits_df['usd_net_transfers'] < 0]

    # Buy columns first, then sell columns
    result = pd.concat([
        _timing_features_for_side(buys_df, relative_change_columns, 'buy'),
        _timing_features_for_side(sells_df, relative_change_columns, 'sell'),
    ], axis=1)

    # Ensure every wallet that passed the size filter is included
    all_wallets = pd.DataFrame(index=filtered_profits['wallet_address'].unique())
    result = all_wallets.join(result)

    return result

In [None]:
result = generate_all_timing_features(
    window_profits_df,
    market_timing_df,
    relative_change_columns,
    wallets_config['features']['timing_metrics_min_transaction_size'],
)
result

In [None]:
market_timing_df.head()

In [None]:
# Winsorize the metric: clip values outside the [cutoff, 1 - cutoff] percentile band
data = market_timing_df['volume_sma_7_vs_lead_30']
cutoff = 0.01

# np.percentile returns NaN as soon as the input contains a single NaN, which would
# turn every clipped value into NaN; the nan-aware variant ignores missing values
# and is identical when there are none.
lower_bound = np.nanpercentile(data, cutoff * 100)
upper_bound = np.nanpercentile(data, (1 - cutoff) * 100)

np.clip(data, lower_bound, upper_bound)


In [None]:
market_timing_df['volume_sma_7_vs_lead_30_w']

In [None]:
market_timing_df.describe()

### additional features

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Generate additional features
# ----------------------------
# Retrieve the buy numbers for wallets in the cohort
buyer_numbers_df = wcf.retrieve_buyer_numbers()

# Average buyer number per wallet. The column is selected explicitly: the first
# positional argument of GroupBy.mean() is `numeric_only`, so mean('buyer_number')
# would average every numeric column rather than just this one.
buyer_averages_df = (
    buyer_numbers_df.groupby('wallet_id')['buyer_number']
    .mean()
    .to_frame('average_buyer_number')
)

# Append to the training data; drop any copy left by a previous run so the cell
# can be re-executed without a column-overlap error.
training_data_df = (
    training_data_df
    .drop(columns='average_buyer_number', errors='ignore')
    .join(buyer_averages_df)
)
training_data_df.describe()

### merge to modeling_df

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Clean inactive wallets from modeling period data
modeling_wallets_df = wo.filter_modeling_period_wallets(modeling_profits_df)

# Generate target variables
target_vars_df = wm.generate_target_variables(modeling_wallets_df)

# Merge training data and target variables
modeling_df = training_data_df.join(target_vars_df[wallets_config['modeling']['target_variable']],
                                    how='inner')

modeling_df.describe()

## Codespace

## Crude Model

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# make df
df = modeling_df.copy()

if wallets_config['modeling']['drop_columns']:
    # Get list of columns to drop from config
    columns_to_drop = wallets_config['modeling']['drop_columns']

    # Only drop columns that actually exist in the DataFrame
    existing_columns = [col for col in columns_to_drop if col in df.columns]

    # Drop the columns if any exist
    if existing_columns:
        df = df.drop(columns=existing_columns)

df.shape

In [None]:

# Build the feature matrix and target from the modeling dataframe `df`
# Separate features and target
X = df.drop(wallets_config['modeling']['target_variable'], axis=1)  # drop only the target column; all remaining columns are features
y = df[wallets_config['modeling']['target_variable']]

# Split the data: 80% train / 20% test, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing steps: every feature column is passed through StandardScaler
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ])

# Define the gradient boosting regressor with key parameters
# (float values for min_samples_leaf / min_samples_split are fractions of the training samples)
gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=3,
    max_features=1.0,
    min_samples_leaf=0.005,
    min_samples_split=0.02,
    subsample=0.8,
    random_state=42,
)

# Create pipeline: scaling followed by the regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', gbr)
])

# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

# Make predictions for the held-out test split
y_pred = pipeline.predict(X_test)


In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

model=pipeline.named_steps['regressor']  # Pass the actual model object
# model=grid_search.best_estimator_.named_steps['regressor']

# Example usage with your existing code:
evaluation = wm.evaluate_regression_model(
    y_test,
    y_pred,
    model=model,
    feature_names=X.columns.tolist()
)

# Print summary report
print(evaluation['summary_report'])

# Access specific metrics
print(f"R² Score: {evaluation['r2']:.3f}")

# The figure can be displayed or saved
if evaluation['figures'] is not None:
    plt.show()  # or evaluation['figures'].savefig('model_evaluation.png')

In [None]:
# Calculate validation period wallet metrics
validation_profits_df = wcf.add_cash_flow_transfers_logic(validation_profits_df)
validation_wallets_df = wf.calculate_wallet_level_metrics(validation_profits_df)

# Attach validation period performance to modeling period scores
validation_df = pd.DataFrame()
validation_df['wallet_address'] = X_test.index.values
validation_df['score'] = y_pred
validation_df['score_rounded'] = np.ceil(validation_df['score']*20)/20
validation_df = validation_df.set_index('wallet_address')
validation_df = validation_df.join(validation_wallets_df,how='left')

# Group wallets by score bucket and assess performance
grouped_val = validation_df.groupby('score_rounded').agg(
    wallets=('score','count'),
    total_invested=('invested','sum'),
    total_net_gain=('net_gain','sum'),
    median_invested=('invested','median'),
    median_net_gain=('net_gain','median'),
)
grouped_val['total_return'] = grouped_val['total_net_gain']/grouped_val['total_invested']
grouped_val['median_return'] = grouped_val['median_net_gain']/grouped_val['median_invested']
grouped_val

In [None]:
# Holdings with a positive USD balance on the first day of the validation period
validation_start_date = pd.to_datetime(wallets_config['training_data']['validation_period_start'])
on_start_date = validation_profits_df['date'] == validation_start_date
has_balance = validation_profits_df['usd_balance'] > 0
validation_start_df = validation_profits_df.loc[
    on_start_date & has_balance,
    ['coin_id', 'wallet_address', 'usd_balance'],
].copy()
validation_start_df.describe()

In [None]:
import pandas as pd
import numpy as np

def safe_weighted_average(scores, weights):
    """
    Weighted average of `scores` that tolerates an all-zero weight vector.

    Falls back to the unweighted mean when the weights sum to zero, and to 0 when
    there are no scores at all.
    """
    total_weight = np.sum(weights)
    if total_weight != 0:
        return np.sum(scores * weights) / total_weight

    # Zero total weight: an unweighted mean is the best remaining estimate
    if len(scores) == 0:
        return 0
    return np.mean(scores)

def rank_coins_by_wallet_scores(
    validation_start_df,
    wallet_scores_df,
    min_wallets=5,
    min_balance=10000,
    top_score_quantile=0.8,
):
    """
    Create multiple ranking metrics for coins based on wallet scores and balances.

    Args:
        validation_start_df (pd.DataFrame): one row per coin-wallet holding with columns
            [coin_id, wallet_address, usd_balance]
        wallet_scores_df (pd.DataFrame): has a 'score' column and is indexed by wallet address
        min_wallets (int): coins held by fewer wallets than this are dropped
        min_balance (float): coins with a lower total USD balance than this are dropped
        top_score_quantile (float): quantile of wallet_scores_df['score'] at or above which
            a wallet counts as a "top" wallet

    Returns:
        pd.DataFrame: indexed by coin_id and sorted by composite_score (descending), with
            columns weighted_avg_score, top_wallet_balance, top_wallet_count, total_balance,
            total_wallets, mean_score, score_std, score_count, top_wallet_balance_pct,
            top_wallet_count_pct, composite_score, avg_wallet_balance, score_confidence.
            An empty frame with these columns is returned when there are no holdings.
    """
    ranking_columns = [
        'weighted_avg_score', 'top_wallet_balance', 'top_wallet_count',
        'total_balance', 'total_wallets', 'mean_score', 'score_std', 'score_count',
        'top_wallet_balance_pct', 'top_wallet_count_pct', 'composite_score',
        'avg_wallet_balance', 'score_confidence',
    ]

    # Merge wallet scores with balance data
    analysis_df = validation_start_df.merge(
        wallet_scores_df[['score']],
        left_on='wallet_address',
        right_index=True,
        how='left'
    )

    # Nothing to rank: return the empty schema instead of failing in the groupby below
    if analysis_df.empty:
        return pd.DataFrame(columns=ranking_columns, index=pd.Index([], name='coin_id'))

    # Ensure no negative balances; wallets without a score are treated as score 0
    analysis_df['usd_balance'] = analysis_df['usd_balance'].clip(lower=0)
    analysis_df['score'] = analysis_df['score'].fillna(0)

    # Balance-weighted average score per coin. The value columns are selected
    # explicitly so the applied function never operates on the grouping column.
    weighted_scores = (
        analysis_df.groupby('coin_id')[['score', 'usd_balance']]
        .apply(lambda g: safe_weighted_average(g['score'].values, g['usd_balance'].values))
        .reset_index()
    )
    weighted_scores.columns = ['coin_id', 'weighted_avg_score']

    # Top wallet concentration: balance and count of wallets at/above the score threshold
    high_score_threshold = wallet_scores_df['score'].quantile(top_score_quantile)
    top_wallet_metrics = (
        analysis_df[analysis_df['score'] >= high_score_threshold]
        .groupby('coin_id')
        .agg(
            top_wallet_balance=('usd_balance', 'sum'),
            top_wallet_count=('wallet_address', 'count'),
        )
        .reset_index()
    )

    # Totals across all wallets holding each coin
    total_metrics = (
        analysis_df.groupby('coin_id')
        .agg(
            total_balance=('usd_balance', 'sum'),
            total_wallets=('wallet_address', 'count'),
            mean_score=('score', 'mean'),
            score_std=('score', 'std'),
            score_count=('score', 'count'),
        )
        .reset_index()
    )

    # Combine metrics
    final_rankings = pd.merge(weighted_scores, top_wallet_metrics, on='coin_id', how='left')
    final_rankings = pd.merge(final_rankings, total_metrics, on='coin_id', how='left')

    # Coins without top wallets, or with a single wallet (undefined std), get zeros
    fill_columns = ['top_wallet_balance', 'top_wallet_count', 'score_std']
    final_rankings[fill_columns] = final_rankings[fill_columns].fillna(0)

    # Calculate percentages safely (0 when the denominator is not positive)
    final_rankings['top_wallet_balance_pct'] = np.where(
        final_rankings['total_balance'] > 0,
        final_rankings['top_wallet_balance'] / final_rankings['total_balance'],
        0
    )
    final_rankings['top_wallet_count_pct'] = np.where(
        final_rankings['total_wallets'] > 0,
        final_rankings['top_wallet_count'] / final_rankings['total_wallets'],
        0
    )

    # Composite score: 40% weighted score, 30% top-wallet balance share, 30% top-wallet count share
    final_rankings['composite_score'] = (
        final_rankings['weighted_avg_score'] * 0.4 +
        final_rankings['top_wallet_balance_pct'] * 0.3 +
        final_rankings['top_wallet_count_pct'] * 0.3
    )

    # Additional metrics
    final_rankings['avg_wallet_balance'] = final_rankings['total_balance'] / final_rankings['total_wallets']
    # +1 inside the square root avoids division by zero
    final_rankings['score_confidence'] = 1 - (1 / np.sqrt(final_rankings['score_count'] + 1))

    # Filter for minimum activity
    final_rankings = final_rankings[
        (final_rankings['total_wallets'] >= min_wallets) &
        (final_rankings['total_balance'] >= min_balance)
    ]

    # Index by coin and sort best-first
    final_rankings = final_rankings.set_index('coin_id')
    final_rankings = final_rankings.sort_values('composite_score', ascending=False)

    return final_rankings

# Example usage
wallet_scores_df = pd.DataFrame({'score': y_pred}, index=validation_df.index)
coin_rankings = rank_coins_by_wallet_scores(validation_start_df, wallet_scores_df)

# Display summary statistics to verify calculations
coin_rankings.describe()


In [None]:
validation_start_date = pd.to_datetime(wallets_config['training_data']['validation_period_start'])
validation_end_date = pd.to_datetime(wallets_config['training_data']['validation_period_end'])

# Get prices at start and end dates
start_prices = market_data_df[market_data_df['date'] == validation_start_date].set_index('coin_id')['price']
end_prices = market_data_df[market_data_df['date'] == validation_end_date].set_index('coin_id')['price']

# Combine into a single dataframe
validation_prices_df = pd.DataFrame({
    'starting_price': start_prices,
    'ending_price': end_prices
})

# Calculate returns
validation_prices_df['coin_return'] = (validation_prices_df['ending_price'] / validation_prices_df['starting_price']) - 1

# Clean up any missing values
validation_prices_df = validation_prices_df.dropna()
validation_prices_df


In [None]:
# Get market cap data
start_market_cap = market_data_df[market_data_df['date'] == validation_start_date].set_index('coin_id')['market_cap']
start_market_cap_filled = market_data_df[market_data_df['date'] == validation_start_date].set_index('coin_id')['market_cap_filled']

# Create market cap dataframe
market_cap_df = pd.DataFrame({
    'market_cap': start_market_cap,
    'market_cap_filled': start_market_cap_filled
})

market_cap_df.head()


In [None]:
coin_performance_df = coin_rankings.join(validation_prices_df['coin_return'],how='left')
coin_performance_df = coin_performance_df.join(market_cap_df,how='left')
coin_performance_df.head()

In [None]:
def analyze_metrics_performance(df, top_n=10):
    """
    For each metric in the dataframe, analyze return performance of the top coins
    sorted by that metric.

    Args:
        df (pd.DataFrame): one row per coin; must contain a 'coin_return' column. Every
            column other than coin_return, coin_id, market_cap and market_cap_filled is
            treated as a ranking metric.
        top_n (int): number of highest-ranked coins to evaluate per metric

    Returns:
        pd.DataFrame: one row per metric with columns mean_return, median_return,
            min_return and max_return, sorted by mean_return (descending). Empty, with
            the same columns, when df has no metric columns.
    """
    stat_columns = ['mean_return', 'median_return', 'min_return', 'max_return']

    # Skip these columns as they're not useful ranking metrics
    skip_columns = ['coin_return', 'coin_id', 'market_cap', 'market_cap_filled']
    metric_columns = [column for column in df.columns if column not in skip_columns]

    # Without metrics there is nothing to sort by; avoid a KeyError in sort_values
    if not metric_columns:
        return pd.DataFrame(columns=stat_columns)

    results = {}
    for column in metric_columns:
        # Returns of the top_n coins when ranked by this metric (highest first)
        top_returns = df.sort_values(column, ascending=False).head(top_n)['coin_return']

        results[column] = {
            'mean_return': top_returns.mean(),
            'median_return': top_returns.median(),
            'min_return': top_returns.min(),
            'max_return': top_returns.max(),
        }

    # One row per metric, best mean return first
    results_df = pd.DataFrame(results).T
    results_df = results_df.sort_values('mean_return', ascending=False)

    return results_df



# Run analysis
analyze_df = coin_performance_df[
    (coin_performance_df['market_cap_filled']<=5000000)
    & (coin_performance_df['market_cap_filled']>1000000)
]
performance_by_metric = analyze_metrics_performance(analyze_df)
performance_by_metric

In [None]:
analyze_df.sort_values('top_wallet_count',ascending=False).head(10)

In [None]:
analyze_df.sort_values('top_wallet_count',ascending=False).head(10)

In [None]:
training_data_df.head()

In [None]:
import pygame

def play_notification(sound_file_path=None):
    """
    Play a notification sound from a local audio file using pygame.

    Blocks until playback finishes. Any error (missing file, no audio device, ...)
    is printed rather than raised so a failed notification never interrupts the notebook.

    Parameters:
    sound_file_path (str): Path to the sound file (supports .mp3, .wav, etc.). When
        omitted, the NOTIFICATION_SOUND_PATH environment variable is used if set;
        otherwise the original local default path is used.
    """
    if sound_file_path is None:
        # Allow other machines to configure the sound without editing the notebook
        sound_file_path = os.environ.get(
            "NOTIFICATION_SOUND_PATH",
            "/Users/jeremymeadow/DreamsData/Local/assets/sounds/mixkit-alert-bells-echo-765.wav",
        )

    try:
        pygame.mixer.init()
        pygame.mixer.music.load(sound_file_path)
        pygame.mixer.music.play()

        # Wait for the sound to finish playing, polling ~10 times per second
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)

    except Exception as e:
        print(f"Error playing sound: {e}")
    finally:
        pygame.mixer.quit()


play_notification()

### grid search

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Assuming your dataframe is called 'df'
# Separate features and target
X = df.drop(wallets_config['modeling']['target_variable'], axis=1)
y = df[wallets_config['modeling']['target_variable']]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing steps
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ])

# Define parameter grid for tuning
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [2, 3, 4],
    'regressor__subsample': [0.8, 0.9],
    'regressor__min_samples_split': [0.01, 0.02, 0.03],
    'regressor__min_samples_leaf': [0.005, 0.01, 0.015],
    'regressor__max_features': [0.8, 0.9, 1.0]  # This replaces colsample_bytree
}

# Create grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters:", grid_search.best_params_)
print("Best CV score:", np.sqrt(-grid_search.best_score_))  # RMSE

# Make predictions with best model
y_pred = grid_search.predict(X_test)

# Calculate and print test set RMSE
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test set RMSE: {test_rmse}")

# If you want to examine feature importance
best_model = grid_search.best_estimator_.named_steps['regressor']
feature_importance = pd.DataFrame({
    'feature': numeric_features,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

In [None]:
play_notification()

In [None]:

model=pipeline.named_steps['regressor']
feature_names=X.columns.tolist()


importances = model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=True)

# drop_columns = importance_df[importance_df['Importance']<=0.005]['Feature']
# drop_columns
importance_df

## Junkyard

In [None]:
[importlib.reload(module) for module in modules]



# Split out modeling and training records to calculate return separately
modeling_df = adj_profits_df[
    (adj_profits_df['date'] >= pd.to_datetime(wallets_config['training_data']['modeling_period_start'])) &
    (adj_profits_df['date'] <= pd.to_datetime(wallets_config['training_data']['modeling_period_end']))
]
modeling_performance_df = wf.calculate_wallet_investment_return(modeling_df)


training_df = adj_profits_df[
    (adj_profits_df['date'] >= pd.to_datetime(wallets_config['training_data']['training_period_start'])) &
    (adj_profits_df['date'] <= pd.to_datetime(wallets_config['training_data']['training_period_end']))
]
training_performance_df = wf.calculate_wallet_investment_return(training_df)


In [None]:
adj_profits_df.head()

### Calculations

In [None]:
training_performance_df

In [None]:
min_invested = 10000
filtered_df = training_performance_df[training_performance_df['invested']>=min_invested]
print(training_performance_df.shape)
print(filtered_df.shape)

In [None]:
# Join training and modeling data
performance_df = filtered_df[['return']].join(modeling_performance_df[['return']],lsuffix='_training',rsuffix='_modeling')
performance_df.shape

# Calculate percentiles
performance_df["training_percentile"] = performance_df["return_training"].rank(ascending=True, pct=True)
performance_df["modeling_percentile"] = performance_df["return_modeling"].rank(ascending=True, pct=True)

# Calculate percentile buckets (NOTE: multiplying by 5 yields quintiles, even though the columns are named "decile")
performance_df['training_decile'] = np.ceil(performance_df['training_percentile']*5)
performance_df['modeling_decile'] = np.ceil(performance_df['modeling_percentile']*5)

# Check correlation
performance_df['training_percentile'].corr(performance_df['modeling_percentile'])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Create a cross-tabulation of the deciles
heatmap_data = pd.crosstab(
    performance_df['training_decile'],
    performance_df['modeling_decile'],
    normalize='index'
) * 100  # Convert to percentages

# Plot the heatmap
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True)

# Add title and labels
plt.title('Percentage Allocation Heatmap: Training to Modeling Deciles')
plt.xlabel('Modeling Decile')
plt.ylabel('Training Decile')
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def create_correlation_matrix(df):
    """
    Create and visualize a correlation matrix for the given DataFrame

    Only numeric and boolean columns are correlated; other columns (strings, dates,
    ...) are ignored instead of raising.

    Parameters:
    df (pandas.DataFrame): Input DataFrame

    Returns:
    pandas.DataFrame: Pearson correlation matrix of the numeric columns (empty when
        df has no numeric columns, in which case no figure is drawn)
    """
    # Calculate the correlation matrix on numeric data only; recent pandas versions
    # raise on non-numeric columns rather than silently dropping them
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_df.corr(method='pearson')

    # Nothing to plot without at least one numeric column
    if corr_matrix.empty:
        return corr_matrix

    # Create a heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix,
                annot=True,  # Show correlation values
                cmap='coolwarm',  # Diverging scheme: blue (negative) to red (positive)
                vmin=-1, vmax=1,  # Fix the scale
                center=0,  # Center the colormap at 0
                fmt='.2f')  # Round to 2 decimal places

    plt.title('Correlation Matrix Heatmap')
    plt.tight_layout()

    return corr_matrix

# create_correlation_matrix(performance_df)

In [None]:
performance_df.corr()

### Market data analysis

In [None]:
start_prices.set_index('coin_id')

In [None]:
price_analysis_df = market_data_df.copy().set_index('coin_id')

start_prices = price_analysis_df[price_analysis_df['date']== pd.to_datetime(config['training_data']['modeling_period_start'])]
end_prices = price_analysis_df[price_analysis_df['date']== pd.to_datetime(config['training_data']['modeling_period_end'])]

# coin_modeling_returns_df = start_prices.join(end_prices)

    # (adj_profits_df['date'] >= pd.to_datetime(config['training_data']['modeling_period_start'])) &
    # (adj_profits_df['date'] <= pd.to_datetime(config['training_data']['modeling_period_end']))

In [None]:
# Calculate coin returns during modeling period
coin_modeling_returns_df = start_prices[['price']].join(end_prices[['price']],lsuffix='_start',rsuffix='_end')
coin_modeling_returns_df['coin_modeling_return'] = coin_modeling_returns_df['price_end']/coin_modeling_returns_df['price_start']
coin_modeling_returns_df["coin_modeling_percentile_return"] = coin_modeling_returns_df["coin_modeling_return"].rank(ascending=True, pct=True)

coin_modeling_returns_df.head()

In [None]:
# Calculate wallet ending balances
min_end_balance = 1000

# Calculate period end balance for each coin-wallet pair
end_balances_df = adj_profits_df[adj_profits_df['date']==pd.to_datetime(config['training_data']['training_period_end'])]
end_balances_df = end_balances_df[end_balances_df['usd_net_transfers']<=-min_end_balance]
end_balances_df['usd_balance'] = end_balances_df['usd_net_transfers'].abs()
end_balances_df = end_balances_df[['coin_id','wallet_address','usd_balance']]
end_balances_df = end_balances_df.set_index(['coin_id','wallet_address'])
end_balances_df.head()

# Add wallet performance metrics
end_balances_df = end_balances_df.join(performance_df,on='wallet_address')
end_balances_df = end_balances_df[end_balances_df['return_training'].notna()]

end_balances_df.head()

In [None]:
wallet_return_column = 'return_training'

# Assess average wallet return during training period
coin_wallet_performance = pd.DataFrame(end_balances_df.reset_index().groupby('coin_id',observed=True)[wallet_return_column].mean())
coin_wallet_performance.columns = ['avg_wallet_training_return']

coin_wallet_performance.head()

In [None]:
# coin_return_column = 'coin_modeling_return'
coin_return_column = 'coin_modeling_percentile_return'

wallet_forecast_df = coin_modeling_returns_df[[coin_return_column]].join(coin_wallet_performance)
wallet_forecast_df[coin_return_column].corr(wallet_forecast_df['avg_wallet_training_return'])


In [None]:
performance_df = filtered_df[['return']].join(modeling_performance_df[['return']],lsuffix='_training',rsuffix='_modeling')
performance_df.shape

In [None]:
# Join training and modeling data
performance_df = filtered_df[['return']].join(modeling_performance_df[['return']],lsuffix='_training',rsuffix='_modeling')
performance_df.shape

# Calculate percentiles
performance_df["training_percentile"] = performance_df["return_training"].rank(ascending=True, pct=True)
performance_df["modeling_percentile"] = performance_df["return_modeling"].rank(ascending=True, pct=True)

# Calculate decile buckets
performance_df['training_decile'] = np.ceil(performance_df['training_percentile']*10)
performance_df['modeling_decile'] = np.ceil(performance_df['modeling_percentile']*10)

# Check correlation
performance_df['training_percentile'].corr(performance_df['modeling_percentile'])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Share (in percent) of each training decile's wallets that land in each modeling decile;
# normalize='index' makes every row sum to 100
heatmap_data = pd.crosstab(
    index=performance_df['training_decile'],
    columns=performance_df['modeling_decile'],
    normalize='index',
).mul(100)

# Draw the annotated heatmap and label it through the returned Axes
heatmap_ax = sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True)
heatmap_ax.set_title('Percentage Allocation Heatmap: Training to Modeling Deciles')
heatmap_ax.set_xlabel('Modeling Decile')
heatmap_ax.set_ylabel('Training Decile')
plt.show()

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made), so the
# 'return' column added below also appears on training_performance_df.
wallet_performance_df = training_performance_df
# Return = net gain per unit invested. Rows with invested == 0 are not guarded against
# here, so they presumably yield inf/NaN (the next cell inspects them) — TODO confirm
wallet_performance_df['return'] = wallet_performance_df['net_gain']/wallet_performance_df['invested']
wallet_performance_df.describe()

In [None]:
# wallet_performance_df.sample(10)

wallet_performance_df[wallet_performance_df['invested']==0]

In [None]:
w = '0xca6cfaa7d61371310d84b63a4ca90cbf7883a9db'

df = wallets_df_filtered.loc[w]

# print(xirr(df.index.get_level_values('date'), df['usd_net_transfers']))
df

In [None]:
# adj_profits_df[adj_profits_df['wallet_address']==w]
profits_df[profits_df['wallet_address']==w].sort_values('date')

In [None]:
adj_profits_df[adj_profits_df['wallet_address']==w].sort_values('date')


In [None]:
def calculate_wallets_xirr(profits_df, min_wallet_volume):
    """
    Calculates the XIRR of each wallet based on their cash flows across all coins they've
    interacted with in profits_df.

    Before the calculation, wallets are dropped if their total absolute USD volume is below
    min_wallet_volume or if their daily net transfers do not include both a positive and a
    negative value (XIRR is undefined without cash flows of both signs).

    Parameters:
    - profits_df (pd.DataFrame): shows daily coin-wallet transfers in USD. Must contain the
        columns 'wallet_address', 'date' and 'usd_net_transfers'.
    - min_wallet_volume (int): wallets with less than this total USD volume will be excluded

    Returns:
    - xirr_df (pd.DataFrame): indexed by wallet_address with a single float column 'xirr'
        showing the XIRR of each wallet over the provided transactions. Missing XIRR
        results are filled with 0. The frame is empty if no wallet survives the filters.
    """
    logger.info('Beginning XIRR calculation sequence...')

    # 1. Summarize cash flows on a wallet level
    # -----------------------------------------
    # Net each wallet's transfers across all coins into one cash flow per date
    wallet_flows = profits_df.groupby(['wallet_address','date'])['usd_net_transfers'].sum()


    # 2. Filter wallets on data quality
    # ---------------------------------
    # Identify wallets whose total absolute USD volume is below the threshold
    # (vectorized: abs() once, then a grouped sum, instead of a Python lambda per wallet)
    wallet_volume = wallet_flows.abs().groupby(level='wallet_address').sum()
    low_volume_wallets = wallet_volume[wallet_volume < min_wallet_volume].index

    # Remove low volume wallets
    wallet_flows = wallet_flows[
        ~wallet_flows.index.get_level_values('wallet_address').isin(low_volume_wallets)
    ]
    logger.info('Removed %s wallets with volume below $%s.', len(low_volume_wallets), min_wallet_volume)

    # Flag wallets that have both a positive and a negative daily net transfer
    has_positive = (wallet_flows > 0).groupby(level='wallet_address').any()
    has_negative = (wallet_flows < 0).groupby(level='wallet_address').any()
    wallet_check = has_positive & has_negative
    wallets_missing_both = wallet_check[~wallet_check].index

    # Remove wallets that do not have both positive and negative transfers
    wallet_flows = wallet_flows[
        ~wallet_flows.index.get_level_values('wallet_address').isin(wallets_missing_both)
    ]
    logger.info('Removed %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))

    # Nothing left to calculate: return an empty, correctly shaped result instead of
    # relying on how groupby().apply() behaves on an empty input
    if wallet_flows.empty:
        logger.info('No wallets remain after filtering; returning an empty XIRR DataFrame.')
        return pd.DataFrame(
            columns=['xirr'],
            index=pd.Index([], name='wallet_address'),
            dtype='float64'
        )


    # 3. Calculate XIRR
    # -----------------
    # Group by wallet_address (level of the MultiIndex) and calculate XIRR
    start_time = time.time()
    logger.info('Calculating XIRR values...')
    xirr_results = wallet_flows.groupby(level='wallet_address').apply(
        lambda flows: xirr(flows.index.get_level_values('date'), flows)
    )
    logger.info('XIRR calculations complete after %.2f seconds.', time.time() - start_time)

    # Convert to a single-column DataFrame; casting to float first turns any None results
    # into NaN so the column is numeric even when every calculation came back empty
    xirr_df = xirr_results.astype('float64').to_frame(name='xirr')

    # Fill empty values with 0s
    xirr_df = xirr_df.fillna(0)


    return xirr_df

In [None]:
min_wallet_volume = 1

# 1. Summarize cash flows on a wallet level
# -----------------------------------------
# Sum cash flows on a wallet level
wallets_df = pd.DataFrame(modeling_df.copy().groupby(['wallet_address','date'])['usd_net_transfers'].sum())


# 2. Filter wallets on data quality
# ---------------------------------
# Identify wallets whose total absolute USD volume is below min_wallet_volume
wallets_agg_df = wallets_df.groupby(level='wallet_address')['usd_net_transfers'].apply(lambda x: x.abs().sum())
low_volume_wallets = wallets_agg_df[wallets_agg_df < min_wallet_volume].index

# Remove low volume wallets
wallets_df_filtered = wallets_df[~wallets_df.index.get_level_values('wallet_address').isin(low_volume_wallets)]
logger.info('Removed %s wallets with volume below $%s.', len(low_volume_wallets), min_wallet_volume)

# Group by wallet_address and check for both positive and negative usd_net_transfers
wallet_check = wallets_df_filtered.groupby('wallet_address')['usd_net_transfers'].apply(
    lambda x: (x > 0).any() and (x < 0).any()
)
wallets_missing_both = wallet_check[~wallet_check].index

# Filter wallet addresses that do not have both positive and negative transfers
wallets_df_filtered = wallets_df_filtered[~wallets_df_filtered.index.get_level_values('wallet_address').isin(wallets_missing_both)]
logger.info('Removed %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))


# 3. Calculate XIRR
# -----------------
# Group by wallet_address (level of the MultiIndex) and calculate XIRR
start_time = time.time()
logger.info('Calculating XIRR values...')
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: xirr(df.index.get_level_values('date'), df['usd_net_transfers'])
)
logger.info('XIRR calculations complete after %.2f seconds.', time.time() - start_time)

# Convert to DataFrame
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']

# Fill empty values with 0s
xirr_df = xirr_df.fillna(0)

In [None]:
# NOTE: despite the variable name this is not an XIRR. For each wallet it computes
# -(sum of net transfers) / (maximum of the running cumulative net transfers).
# A cumulative maximum of 0 is not guarded against here.
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: -df['usd_net_transfers'].sum()/df['usd_net_transfers'].cumsum().max()
)

In [None]:
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']
xirr_df.describe()

In [None]:
xirr_df.loc[w]

In [None]:
def wallet_metrics(group):
    """
    Summarizes one wallet's cash flows into investment and return figures.

    Parameters:
    - group (pd.DataFrame): the rows for a single wallet; must contain 'usd_net_transfers'

    Returns:
    - pd.Series with three entries:
        'invested': the maximum of the running cumulative sum of usd_net_transfers
        'net_gain': the plain sum of usd_net_transfers
        'return': net_gain divided by invested, or NaN when invested equals 0
    """
    transfers = group['usd_net_transfers']
    net_gain = transfers.sum()
    invested = transfers.cumsum().max()

    # Avoid dividing by zero when the running total never rises above 0
    if invested != 0:
        wallet_return = net_gain/invested
    else:
        wallet_return = np.nan

    metrics = {'invested': invested, 'net_gain': net_gain, 'return': wallet_return}
    return pd.Series(metrics)

# Calculate metrics for all wallets at once
results = wallets_df_filtered.groupby(level='wallet_address').apply(wallet_metrics)

results

In [None]:
min_wallet_volume = 10000

# Calculate XIRR
training_xirr_df = calculate_wallets_xirr(training_df,min_wallet_volume)
modeling_xirr_df = calculate_wallets_xirr(modeling_df,min_wallet_volume=1)

In [None]:

# NOTE(review): this cell needs xirr_df to already carry 'training_xirr' and 'modeling_xirr'
# columns. Within this part of the notebook those columns are only created by the join cell
# that appears further down, so running the cells top to bottom appears to fail here.

# Calculate percentiles
xirr_df["training_xirr_percentile"] = xirr_df["training_xirr"].rank(ascending=True, pct=True)
xirr_df["modeling_xirr_percentile"] = xirr_df["modeling_xirr"].rank(ascending=True, pct=True)

# Calculate decile buckets (percentile*10 rounded upwards)
xirr_df['training_xirr_decile'] = np.ceil(xirr_df['training_xirr_percentile']*10)
xirr_df['modeling_xirr_decile'] = np.ceil(xirr_df['modeling_xirr_percentile']*10)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Share (in percent) of each training XIRR decile's wallets that land in each modeling
# XIRR decile; normalize='index' makes every row sum to 100
heatmap_data = pd.crosstab(
    index=xirr_df['training_xirr_decile'],
    columns=xirr_df['modeling_xirr_decile'],
    normalize='index',
).mul(100)

# Draw the annotated heatmap and label it through the returned Axes
heatmap_ax = sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True)
heatmap_ax.set_title('Percentage Allocation Heatmap: Training to Modeling Deciles')
heatmap_ax.set_xlabel('Modeling Decile')
heatmap_ax.set_ylabel('Training Decile')
plt.show()

In [None]:
# Pair each wallet's training XIRR with its modeling XIRR in one frame.
# how='inner' keeps only wallets present in both inputs.
# NOTE(review): an inner join introduces no missing 'modeling_xirr' values, so the fillna
# below only has an effect if modeling_xirr_df itself contains NaN — confirm whether a
# left join was intended.
xirr_df = training_xirr_df.rename(columns={'xirr': 'training_xirr'}).join(
    modeling_xirr_df.rename(columns={'xirr': 'modeling_xirr'}),
    how='inner'
).fillna({'modeling_xirr': 0})



xirr_df.describe()

In [None]:
modeling_df.describe()

In [None]:
xirr_df['training_xirr_percentile'].corr(xirr_df['modeling_xirr_percentile'])

In [None]:
# Calculate year fractions from the first date
start_date = dates.min()  # Use the earliest date as the reference
date_fractions = (dates - start_date).dt.days / 365.0
date_fractions = date_fractions.values

date_fractions

In [None]:

# Sum cash flows on a wallet level
wallets_df = pd.DataFrame(training_df.groupby(['wallet_address','date'])['usd_net_transfers'].sum())

# Identify wallets whose total absolute USD volume is below min_wallet_volume
wallets_agg_df = wallets_df.groupby(level='wallet_address')['usd_net_transfers'].apply(lambda x: x.abs().sum())
low_volume_wallets = wallets_agg_df[wallets_agg_df < min_wallet_volume].index

# Remove low volume wallets
wallets_df_filtered = wallets_df[~wallets_df.index.get_level_values('wallet_address').isin(low_volume_wallets)]

In [None]:
wallets_df_filtered.shape

In [None]:
# Group by wallet_address and check for both positive and negative usd_net_transfers
wallet_check = wallets_df_filtered.groupby('wallet_address')['usd_net_transfers'].apply(
    lambda x: (x > 0).any() and (x < 0).any()
)

# Filter wallet addresses that do not meet the condition
wallets_missing_both = wallet_check[~wallet_check].index
logger.info('Found %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))





In [None]:
# w = '0x036783df7aec54b5dfca9e1f870577bbcca95481'
# wallets_df.loc[w]

# profits_df[profits_df['wallet_address']==w]


### XIRR sequence

In [None]:
wallets_df_filtered.head()

In [None]:
w = '0x0000000000000000000000000000000000000014'

dates = wallets_df.loc[w].index.values
cash_flows = wallets_df.loc[w]['usd_net_transfers']

xirr(dates,cash_flows)

In [None]:
# Group by wallet_address (level of the MultiIndex) and calculate XIRR
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: xirr(df.index.get_level_values('date'), df['usd_net_transfers'])
)

# Convert to DataFrame
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']

# Display the resulting DataFrame
print(xirr_results.shape)
xirr_results.head()

In [None]:
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']
xirr_df.head()

In [None]:
dates

In [None]:
cash_flows

In [None]:
x = xirr(dates,cash_flows)
x

In [None]:
c = '77e2cf4b-d18a-4026-a2f2-f083f48fe1be'
w = '0xaff2943cfe3e95f66142a1729079418d78e42236'

# u.cw_filter_df(training_df,c,w)

df = u.cw_filter_df(training_df,c,w)
df = df.sort_values('date')
df

In [None]:
dates = df['date']
cash_flows = df['usd_net_transfers']

In [None]:
from pyxirr import xirr

xirr(dates,cash_flows)

In [None]:
cash_flows.cumsum()

In [None]:
cash_flows

In [None]:
# Calculate year fractions from the first date
start_date = dates.min()  # Use the earliest date as the reference
date_fractions = (dates - start_date).dt.days / 365.0
date_fractions = date_fractions.values

date_fractions

In [None]:
# Year fractions elapsed since the first date, using a 365-day year.
# The dates are converted to a datetime64 array first because np.datetime64() cannot wrap a
# whole Series and dates[0] is a label lookup rather than "first element". The division uses
# a 365-day timedelta because numpy refuses to divide day-resolution timedeltas by
# np.timedelta64(1, 'Y') (year and day units are not convertible).
date_values = pd.to_datetime(dates).to_numpy()
date_fractions = (date_values - date_values[0]).astype('timedelta64[D]') / np.timedelta64(365, 'D')
date_fractions

In [None]:
# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     select cwp.wallet_address
#     ,cwp.coin_id
#     ,cwp.date
#     ,round(cwp.usd_net_transfers) as usd_net_transfers
#     ,round(cwp.usd_balance) as usd_balance
#     ,round(cwp.usd_net_transfers/cmd.price) as token_transfers
#     ,round(cwp.usd_balance/cmd.price) as token_balance
#     ,cmd.price
#     from wallets w
#     join wallet_coins wc on wc.wallet_address = w.wallet_address
#     join core.coin_wallet_profits cwp on cwp.wallet_address = wc.wallet_address
#         and cwp.coin_id = wc.coin_id
#     join core.coin_market_data cmd on cmd.coin_id = cwp.coin_id
#         and cmd.date = cwp.date
#     order by 1,2,3
#     '''
# transfers_df = dgc().run_sql(query_sql)

# # Convert wallet_address to categorical, store the mapping, and convert the column to int32
# wallet_address_categorical = transfers_df['wallet_address'].astype('category')
# # wallet_address_mapping = wallet_address_categorical.cat.categories
# # transfers_df['wallet_address'] = wallet_address_categorical.cat.codes.astype('uint32')


# # Convert coin_id to categorical (original strings are preserved)
# transfers_df['coin_id'] = transfers_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# transfers_df = u.safe_downcast(transfers_df, 'usd_net_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'usd_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'price', 'float32')

# print(transfers_df.info())
# print(u.df_mem(transfers_df))
# transfers_df.head()

In [None]:
# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     ,coins as (
#         select wc.coin_id
#         from wallets w
#         join wallet_coins wc on wc.wallet_address = w.wallet_address
#         group by 1
#     )

#     select cmd.coin_id
#     ,cmd.date
#     ,cmd.price
#     ,cmd.market_cap
#     from coins c
#     join core.coin_market_data cmd on cmd.coin_id = c.coin_id
#     order by 1,2
#     '''
# prices_df = dgc().run_sql(query_sql)

# # Convert coin_id to categorical (original strings are preserved)
# prices_df['coin_id'] = prices_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# prices_df = u.safe_downcast(prices_df, 'price', 'float32')
# prices_df = u.safe_downcast(prices_df, 'market_cap', 'int32')

# print(prices_df.info())
# print(u.df_mem(prices_df))
# prices_df.head()

## Tests failing

In [None]:
# Reload every module listed in `modules`, then refresh the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# Create sample data
test_df = pd.DataFrame({
    'wallet_address': ['wallet_a'] * 3,
    'usd_net_transfers': [100, 200, -150],
    'test_metric': [0.5, 1.0, -0.5]
})

# Calculate features
result = wcf.calculate_timing_features_for_column(test_df, 'test_metric')

# Expected values - calculating each component:
# Buy weighted: (100 * 0.5 + 200 * 1.0) / (100 + 200) = 0.833333
# Buy mean: (0.5 + 1.0) / 2 = 0.75
# Sell weighted: (-0.5 * 150) / 150 = -0.5
# Sell mean: Single value = -0.5
expected = pd.DataFrame(
    {
        'test_metric_buy_mean': [0.75],
        'test_metric_buy_weighted': [0.833333],
        'test_metric_sell_mean': [-0.5],
        'test_metric_sell_weighted': [-0.5],
    },
    index=pd.Index(['wallet_a'], name='wallet_address')
)

# Verify all values match expected
assert np.allclose(
    result,
    expected,
    equal_nan=True
), f"Expected {expected}\nGot {result}"


In [None]:
test_df

In [None]:
result

In [None]:
expected

In [None]:
result

In [None]:
expected

In [None]:
result = u.winsorize(input_data, cutoff=0.1)
result

In [None]:
result

In [None]:
expected

In [None]:
@pytest.mark.unit
def test_calculate_mfi_scenario1():
    """
    Checks ind.calculate_mfi against a reference computation on a 10-point series.

    Inputs: prices [10, 12, 15, 14, 13, 16, 17, 18, 19, 20] paired with volumes
    [100, 150, 200, 250, 300, 350, 400, 450, 500, 550], evaluated with window=3.

    The reference is built inside the test: money flow is price * volume, split into
    positive and negative flow by comparing each price with the previous one, summed over
    the rolling window and converted with 100 - 100 / (1 + ratio). Missing values are
    forward filled and anything still missing becomes 0.5. The function output must match
    this reference within 1e-4, contain no NaN, and start with 0.5.
    """
    # Inputs
    prices = pd.Series([10, 12, 15, 14, 13, 16, 17, 18, 19, 20])
    volumes = pd.Series([100, 150, 200, 250, 300, 350, 400, 450, 500, 550])

    # Output of the function under test
    result_mfi = ind.calculate_mfi(prices, volumes, window=3)

    # Reference step 1: raw money flow
    raw_flow = prices * volumes

    # Reference step 2: keep the flow on up days / down days, 0 elsewhere
    previous_prices = prices.shift(1)
    up_flow = raw_flow.where(prices > previous_prices, 0)
    down_flow = raw_flow.where(prices < previous_prices, 0)

    # Reference step 3: rolling ratio of up flow to down flow, converted to MFI
    rolling_up = up_flow.rolling(window=3).sum()
    rolling_down = down_flow.rolling(window=3).sum()
    flow_ratio = rolling_up / rolling_down
    expected_mfi = 100 - (100 / (1 + flow_ratio))

    # The first complete window holds only positive flow, so pin it to 100
    expected_mfi.iloc[2] = 100

    # Apply the same NaN handling the function is expected to perform
    expected_mfi = expected_mfi.ffill()
    expected_mfi = expected_mfi.fillna(0.5)

    # Values must agree, including the positions that started out as NaN
    assert np.allclose(result_mfi, expected_mfi, atol=1e-4), (
        f"Expected MFI values: {expected_mfi.values}, but got {result_mfi.values}"
    )

    # No NaN may survive in the result
    assert not result_mfi.isna().any(), "Expected no NaN values in the result due to forward fill and 0.5 filling"

    # The first value has nothing to forward fill from, so it must be 0.5
    assert result_mfi[0] == 0.5, "Expected first value to be 0.5 since it cannot be forward filled"

In [None]:
result_mfi

In [None]:
"""
Test that when a coin's 'market_cap' is missing at the start of its time series
but meets the 'min_coverage' threshold, the function imputes the missing values correctly.
"""
# Define the input DataFrame with 'market_cap' missing at the start for BTC and DOGE
# NOTE(review): if coverage is the share of non-null market_cap rows per coin, BTC has 2 of 3
# (0.667) and DOGE 1 of 2 (0.5), both below min_coverage=0.7. The expected values below
# leave those rows missing, which matches "below threshold" rather than the "meets the
# threshold" wording of the description above — confirm which scenario is intended.
input_data = {
    'coin_id': ['BTC', 'BTC', 'BTC', 'ETH', 'ETH', 'DOGE', 'DOGE'],
    'date': ['2023-01-01', '2023-01-02', '2023-01-03',
                '2023-01-01', '2023-01-02', '2023-01-01', '2023-01-02'],
    'price': [30000, 31000, 32000, 2000, 2100, 0.05, 0.06],
    'market_cap': [np.nan, 620000000, 640000000, 400000000, 420000000, np.nan, 6000000]
}
input_df = pd.DataFrame(input_data)

# Define the expected 'market_cap_imputed' values
expected_market_cap_imputed = pd.Series(
    [np.nan, 620000000, 640000000, 400000000, 420000000, np.nan, 6000000],
    dtype='Int64'
)
expected_df = input_df.copy()
expected_df['market_cap_imputed'] = expected_market_cap_imputed

# Invoke the impute_market_cap function with min_coverage=0.7
result_df = dr.impute_market_cap(input_df, min_coverage=0.7)

# np.allclose cannot compare nullable Int64 data that contains pd.NA (it becomes an object
# array), so convert both sides to plain float arrays with NaN for the missing entries
result_values = result_df.sort_values(['coin_id','date'])['market_cap_imputed'].to_numpy(
    dtype='float64', na_value=np.nan
)
expected_values = expected_df.sort_values(['coin_id','date'])['market_cap_imputed'].to_numpy(
    dtype='float64', na_value=np.nan
)

# Assert that 'market_cap_imputed' matches the expected values
assert np.allclose(
    result_values,
    expected_values,
    equal_nan=True
), "The 'market_cap_imputed' does not correctly handle missing values at the start of the time series."

# Assert that the 'market_cap_imputed' column is of type Int64 where applicable
assert result_df['market_cap_imputed'].dtype == 'Int64', (
    "'market_cap_imputed' column is not of type Int64."
)

In [None]:
result_df.sort_values(['coin_id','date'])

In [None]:
expected_df.sort_values(['coin_id','date'])

In [None]:
# Make a copy of input data
df_copy = input_df.copy()
df_copy = df_copy.sort_values(['coin_id','date'])

# Calculate coverage and historical maximums per coin
coverage = df_copy.groupby('coin_id').agg(
    records=('price', 'count'),
    has_cap=('market_cap', 'count'),
    max_cap=('market_cap', 'max')
)
coverage['coverage'] = coverage['has_cap'] / coverage['records']

# Get eligible coins
eligible_coins = coverage[
    (coverage['coverage'] >= min_coverage) &
    (coverage['coverage'] < 1)
].index

# Initialize imputed column with original values as int64
df_copy['market_cap_imputed'] = df_copy['market_cap'].astype('Int64')

# Process only eligible coins
mask_eligible = df_copy['coin_id'].isin(eligible_coins)

# Calculate ratio for all valid records of eligible coins
df_copy.loc[mask_eligible, 'ratio'] = (
    df_copy.loc[mask_eligible, 'market_cap'] /
    df_copy.loc[mask_eligible, 'price']
)

# # Backfill and forward fill ratios within each coin group
# df_copy['ratio'] = df_copy.groupby('coin_id')['ratio'].bfill()
# df_copy['ratio'] = df_copy.groupby('coin_id')['ratio'].ffill()

# # Calculate imputed market caps using the filled ratios
# mask_missing = df_copy['market_cap_imputed'].isna() & mask_eligible
# df_copy.loc[mask_missing, 'market_cap_imputed'] = (
#     (df_copy.loc[mask_missing, 'price'] *
#         df_copy.loc[mask_missing, 'ratio']).round().astype('Int64')
# )

# # Join max historical values and apply max_multiple check vectorized
# df_copy = df_copy.merge(
#     coverage[['max_cap']],
#     left_on='coin_id',
#     right_index=True,
#     how='left'
# )

# # Set imputed values exceeding max_multiple * historical max to np.nan
# mask_exceeds_max = (
#     df_copy['market_cap_imputed'] >
#     (df_copy['max_cap'] * max_multiple)
# )
# df_copy.loc[mask_exceeds_max, 'market_cap_imputed'] = pd.NA

# # Drop temporary columns
# df_copy = df_copy.drop(['ratio', 'max_cap'], axis=1)


df_copy

In [None]:
# Make a copy of input data
df_copy = input_df.copy()

# Calculate coverage and historical maximums per coin
coverage = df_copy.groupby('coin_id').agg(
    records=('price', 'count'),
    has_cap=('market_cap', 'count'),
    max_cap=('market_cap', 'max')
)
coverage['coverage'] = coverage['has_cap'] / coverage['records']

# Get eligible coins
eligible_coins = coverage[
    (coverage['coverage'] >= min_coverage) &
    (coverage['coverage'] < 1)
].index

# Initialize imputed column with original values as int64
df_copy['market_cap_imputed'] = df_copy['market_cap'].astype('Int64')

# Process only eligible coins
mask_eligible = df_copy['coin_id'].isin(eligible_coins)

# Calculate ratio for all valid records of eligible coins
df_copy.loc[mask_eligible, 'ratio'] = (
    df_copy.loc[mask_eligible, 'market_cap'] /
    df_copy.loc[mask_eligible, 'price']
)

# Backfill and forward fill ratios within each coin group
df_copy['ratio'] = df_copy.groupby('coin_id')['ratio'].bfill()
df_copy['ratio'] = df_copy.groupby('coin_id')['ratio'].ffill()

# Calculate imputed market caps using the filled ratios
mask_missing = df_copy['market_cap_imputed'].isna() & mask_eligible
df_copy.loc[mask_missing, 'market_cap_imputed'] = (
    (df_copy.loc[mask_missing, 'price'] *
        df_copy.loc[mask_missing, 'ratio']).round().astype('Int64')
)

# Check for imputed values exceeding historical maximums
for coin_id in eligible_coins:
    max_historical = coverage.loc[coin_id, 'max_cap']
    coin_mask = (df_copy['coin_id'] == coin_id) & df_copy['market_cap_imputed'].notna()
    max_imputed = df_copy.loc[coin_mask, 'market_cap_imputed'].max()

    if max_imputed > max_historical:
        logger.warning(
            f"Coin {coin_id}: Imputed market cap ({max_imputed:.0f}) "
            f"exceeds historical maximum ({max_historical:.0f})"
        )

# Drop the temporary ratio column
df_copy = df_copy.drop('ratio', axis=1)


In [None]:
df_copy

In [None]:
[importlib.reload(module) for module in modules]
wallets_config.reload()

# def test_multiple_coins_per_wallet():
"""
Test scenario where wallets own multiple coins, some exceeding thresholds when aggregated.
Checks filtering based on specified date range.
"""
# Create test data
sample_profits_df = pd.DataFrame({
    'coin_id': ['BTC', 'ETH', 'BTC', 'ETH', 'LTC', 'BTC', 'ETH'],
    'wallet_address': ['wallet1', 'wallet1', 'wallet2', 'wallet2', 'wallet2',
                        'wallet3', 'wallet3'],
    'date': pd.date_range(start='2023-01-01', periods=7),
    'profits_cumulative': [5000, 3000, 1000, 500, 500, 100, 50],
    'usd_inflows_cumulative': [10000, 8000, 2000, 1500, 1500, 500, 250]
})

config = {
    'profitability_filter': 7500,
    'inflows_filter': 15000,
    'date_range': {
        'start': '2023-01-02',
        'end': '2023-01-05'
    }
}

# Call the function with date range
cleaned_df, exclusions_logs_df = dr.clean_profits_df(
    sample_profits_df,
    config,
    earliest_date=config['date_range']['start'],
    latest_date=config['date_range']['end']
)

# Expected results - only checking within date window but removing all records
expected_cleaned_df = sample_profits_df[
    sample_profits_df['wallet_address'].isin(['wallet2', 'wallet3'])
].reset_index(drop=True)

expected_exclusions = pd.DataFrame({
    'wallet_address': ['wallet1'],
    'profits_exclusion': [True],
    'inflows_exclusion': [True]
})

# Assertions
assert len(cleaned_df) == 5  # wallet2 (3 records) and wallet3 (2 records) should remain
assert np.array_equal(cleaned_df.values, expected_cleaned_df.values)
assert np.array_equal(exclusions_logs_df.values, expected_exclusions.values)

# Check if profits and inflows are approximately correct for the remaining wallets
# Should include ALL records for passing wallets (1000 + 500 + 500 + 100 + 50)
assert pytest.approx(cleaned_df['profits_cumulative'].sum(), abs=1e-4) == 2150
# Should include ALL records for passing wallets (2000 + 1500 + 1500 + 500 + 250)
assert pytest.approx(cleaned_df['usd_inflows_cumulative'].sum(), abs=1e-4) == 5750

# Additional date-specific checks
date_mask = ((cleaned_df['date'] >= config['date_range']['start']) &
                (cleaned_df['date'] <= config['date_range']['end']))
date_filtered = cleaned_df[date_mask]

# Verify we have the expected number of records in the date range
assert len(date_filtered) == 3  # Should only have records between Jan 2-5 for remaining wallets