In [1]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()


# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

#### Load the datasets

In [2]:
importlib.reload(td)
config = load_config()


# retrieve prices data
prices_df = td.retrieve_prices_data()

# fill gaps in prices data
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )
logger.info(f"Transfers data shape: {transfers_df.shape}")

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])
logger.info(f"Profits data shape: {profits_df.shape}")


# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])


# assess shark performance
shark_agg_performance_df,shark_wallets_performance_df = td.calculate_shark_performance(transfers_df, prices_df, shark_wallets_df, config)
shark_agg_performance_df

[10/Sep/2024 11:44:05] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[10/Sep/2024 11:44:05] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[10/Sep/2024 11:44:05] INFO [dreams_core.core.<module>:10] Prices data shape: (110929, 3)
[10/Sep/2024 11:44:30] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (21636858, 5) after 24.9 seconds.
[10/Sep/2024 11:44:30] INFO [dreams_core.core.<module>:18] Transfers data shape: (21636858, 5)
[10/Sep/2024 11:44:30] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[10/Sep/2024 11:45:21] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 19.83 seconds
[10/Sep/2024 11:45:33] INFO [dreams_core.core.clean_profits_df:686] Finished cleaning profits_df after 12.29 seconds.
[10/Sep/2024 11:45:33] INFO [dreams_core.core.<modul

Unnamed: 0_level_0,count_wallets,median_inflows,median_profits,mean_inflows,min_inflows,max_inflows,percentile_25_inflows,percentile_75_inflows,mean_profits,min_profits,...,median_return,return_aggregate,nonzero_count_wallets,nonzero_median_inflows,nonzero_median_profits,nonzero_percentile_25_inflows,nonzero_percentile_75_inflows,nonzero_percentile_25_profits,nonzero_percentile_75_profits,nonzero_median_return
is_shark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,101982,0.0,0,15243.8295517,0,25503660.8657,0,0.0,-12057.740522,-545140012.694,...,0,-0.790991560297,25060,9037.12519559,-2168.74764284,2455.18951417,28994.2497224,-9225.05429859,-106.86000325,-0.23998202923
True,608,8679.89287175,0,128760.043852,0,40567932.8443,0,49333.5117674,-118139.602783,-53839795.8331,...,0,-0.917517571823,345,36747.4716483,-11980.012919,13002.171937,95991.5681558,-38924.5672152,-2195.87737448,-0.326009175097


In [25]:
logger.setLevel(logging.DEBUG)
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,logs_df = td.clean_profits_df(profits_df, config['data_cleaning'])
logs_df

[10/Sep/2024 12:14:02] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[10/Sep/2024 12:14:09] DEBUG [dreams_core.core.prepare_profits_data:475] <Step 1> merge transfers and prices: 6.92 seconds
[10/Sep/2024 12:14:14] DEBUG [dreams_core.core.prepare_profits_data:490] <Step 2> identify first prices of coins: 5.10 seconds
[10/Sep/2024 12:14:15] DEBUG [dreams_core.core.prepare_profits_data:514] <Step 3> created new records as of the first_price_date: 1.00 seconds
[10/Sep/2024 12:14:34] DEBUG [dreams_core.core.prepare_profits_data:528] <Step 4> merge new records into profits_df: 18.58 seconds
[10/Sep/2024 12:14:41] DEBUG [dreams_core.core.prepare_profits_data:544] <Step 5> removed records prior to each wallet's first token inflows: 6.91 seconds
[10/Sep/2024 12:14:41] DEBUG [dreams_core.core.prepare_profits_data:556] generated profits_df after 39.24 total seconds
[10/Sep/2024 12:14:42] DEBUG [dreams_core.core.calculate_wallet_profitability:599] Starting generati

Unnamed: 0,coin_id,wallet_address


## Testing clean_profits_df

In [112]:
profits_df_full.head()

Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_inflows,usd_inflows_cumulative,total_return
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-03-23,5.2e-08,5.2e-08,1.88983550773,0.0,0.0,9.82714464018e-08,9.82714464018e-08,9.82714464018e-08,9.82714464018e-08,0.0
1,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-03-28,0.002920722,0.002920774,7.47303212902,2.90326224307e-07,2.90326224307e-07,0.0218270379436,0.0218266493459,0.0218266493459,0.0218267476174,1.33013964974e-05
2,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-03-31,0.0,0.002920774,27.5991647724,0.0587838849452,0.0587841752714,0.0806109228888,0.0,0.0,0.0218267476174,2.69321734515
3,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-04-01,0.0,0.002920774,30.3304629657,0.00797750474928,0.0667616800207,0.0885884276381,0.0,0.0,0.0218267476174,3.0587094876
4,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-04-02,1.75,1.752920774,34.7437187048,0.0128901226181,0.0796518026389,60.9029862836,60.8015077334,60.8015077334,60.823334481,0.00130955994634


In [113]:
def sample_clean_profits_profits_df():
    """
    Fixture to create a sample DataFrame for testing td.clean_profits_df function.
    """
    data = {
        'coin_id': ['coin_1', 'coin_2', 'coin_3', 'coin_1', 'coin_2', 'coin_3', 'coin_4', 'coin_1', 'coin_2', 'coin_3', 'coin_1', 'coin_2'],
        'wallet_address': ['wallet_1', 'wallet_1', 'wallet_2', 'wallet_2', 'wallet_3', 'wallet_3', 'wallet_4', 'wallet_4', 'wallet_5', 'wallet_5', 'wallet_6', 'wallet_6'],
        'profits_change': [2000, 500, 18000, -7000, 5000, 12000, 16000, 1000, 18000, -7000, -10000, -8000],  # Updated profits for Wallet 4, 5, and added Wallet 6
        'usd_inflows': [5000, 4000, 7000, 6000, 8000, 9000, 4000, 3000, 5000, 3000, 6000, 4000]  # Updated inflows and Wallet 6 with 10,000 inflows
    }
    return pd.DataFrame(data)

def sample_clean_profits_data_cleaning_config():
    """
    Fixture for the data cleaning configuration.
    """
    return {
        'profitability_filter': 15000,  # Updated profitability filter
        'inflows_filter': 10000  # Updated inflows filter
    }


sample_clean_profits_profits_df = sample_clean_profits_profits_df()
sample_clean_profits_data_cleaning_config = sample_clean_profits_data_cleaning_config()
sample_clean_profits_profits_df.sort_values('wallet_address')

Unnamed: 0,coin_id,wallet_address,profits_change,usd_inflows_cumulative
0,coin_1,wallet_1,2000,5000
1,coin_2,wallet_1,500,4000
2,coin_3,wallet_2,18000,7000
3,coin_1,wallet_2,-7000,6000
4,coin_2,wallet_3,5000,8000
5,coin_3,wallet_3,12000,9000
6,coin_4,wallet_4,16000,4000
7,coin_1,wallet_4,1000,3000
8,coin_2,wallet_5,18000,5000
9,coin_3,wallet_5,-7000,3000


In [118]:
summary_df = sample_clean_profits_profits_df.groupby('wallet_address').agg({'profits_change': 'sum', 'usd_inflows_cumulative': 'max'})
summary_df['excess_profits'] = abs(summary_df['profits_change'])>15000
summary_df['excess_inflows'] = abs(summary_df['usd_inflows_cumulative'])>15000

In [115]:
cleaned_df

Unnamed: 0,coin_id,wallet_address,profits_change,usd_inflows_cumulative
0,coin_1,wallet_1,2000,5000
1,coin_2,wallet_1,500,4000
2,coin_3,wallet_2,18000,7000
3,coin_1,wallet_2,-7000,6000
4,coin_2,wallet_5,18000,5000
5,coin_3,wallet_5,-7000,3000


In [114]:
importlib.reload(td)
config = load_config()

# # Test Case 1: Basic functionality test
# @pytest.mark.unit
# def test_clean_profits_basic_functionality(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test basic functionality where some wallets exceed the thresholds and others do not.
#     """
cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
assert cleaned_df['wallet_address'].nunique() == 4  # 2 wallets should be excluded: one for profits, one for inflows
assert 'wallet_1' in cleaned_df['wallet_address'].values  # wallet_1 stays within limits
assert 'wallet_3' not in cleaned_df['wallet_address'].values  # wallet_3 exceeds both thresholds

# # Test Case 2: Wallet with exactly the threshold profitability and inflows
# @pytest.mark.unit
# def test_clean_profits_exact_threshold(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test a wallet with exactly the threshold values for profitability and inflows.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_5', 'profits_change'] = 15000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_5' in cleaned_df['wallet_address'].values  # wallet_5 should not be excluded as it's exactly at the threshold

# # Test Case 3: Wallet with negative profitability exceeding the threshold
# @pytest.mark.unit
# def test_clean_profits_negative_profitability(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallets with negative profitability exceeding the threshold.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_6' not in cleaned_df['wallet_address'].values  # wallet_6 exceeds the negative profitability threshold

# # Test Case 4: Multiple wallets with profits and inflows exceeding thresholds
# @pytest.mark.unit
# def test_clean_profits_multiple_exceeding_thresholds(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallets where either profits or inflows exceed thresholds.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert len(exclusions_df) == 3  # 3 wallets should be excluded based on either profits or inflows

# # Test Case 5: Wallet with multiple transactions but total profits below thresholds
# @pytest.mark.unit
# def test_clean_profits_multiple_transactions_below_threshold(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test wallet with multiple transactions where total profits remain below thresholds.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_1' in cleaned_df['wallet_address'].values  # wallet_1 remains below threshold and should not be excluded

# # Test Case 8: Wallet with extreme profits but inflows below the threshold
# @pytest.mark.unit
# def test_clean_profits_extreme_profits_but_low_inflows(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet with extreme profits but inflows below the threshold.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_4', 'profits_change'] = 16000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_4' not in cleaned_df['wallet_address'].values  # wallet_4 should be excluded based on profits

# # Test Case 9: Wallet with extreme inflows but zero profits
# @pytest.mark.unit
# def test_clean_profits_extreme_inflows_but_zero_profits(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet with extreme inflows but no significant profits.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_3', 'profits_change'] = 0
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_3', 'usd_inflows_cumulative'] = 17000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_3' not in cleaned_df['wallet_address'].values  # wallet_3 should be excluded based on inflows

# # Test Case 11: Wallet with inflows/profits across multiple coins but aggregate exceeds threshold
# @pytest.mark.unit
# def test_clean_profits_aggregate_inflows_profits_across_coins(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet where aggregated inflows/profits across multiple coins exceed thresholds.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_5', 'profits_change'] = 18000
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_5', 'usd_inflows_cumulative'] = 8000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_5' not in cleaned_df['wallet_address'].values  # wallet_5 should be excluded based on aggregated profits


[10/Sep/2024 13:47:28] DEBUG [dreams_core.core.clean_profits_df:658] Starting generation of profits_cleaned_df...
[10/Sep/2024 13:47:28] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 0.05 seconds.
[10/Sep/2024 13:47:28] DEBUG [dreams_core.core.clean_profits_df:707] Removed 3 coin-wallet pairs beyond profit threshold of $15.0k and 0 pairs beyond inflows filter of 10.0k.


AssertionError: 

Unnamed: 0_level_0,profits_change,usd_inflows_cumulative
wallet_address,Unnamed: 1_level_1,Unnamed: 2_level_1
wallet_1,10000,12000
wallet_2,5000,5000
wallet_3,2000,20000
wallet_4,30000,9000
wallet_5,800,6000
wallet_6,-2500,500
wallet_7,12000,15000
wallet_8,1000,8000
wallet_9,4000,4000


In [107]:
exclusions_df

Unnamed: 0,wallet_address,profits_exclusion,inflows_exclusion
0,wallet_1,True,True
1,wallet_3,True,True
2,wallet_4,True,True
3,wallet_7,True,True


In [108]:

# # Test Case 1: Basic functionality test
# def test_clean_profits_basic_functionality(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test basic functionality where some wallets exceed the thresholds and others do not.
#     """
cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
assert len(cleaned_df) == 6  # 2 wallets should be excluded: one for profits, one for inflows
assert 'wallet_2' not in cleaned_df['wallet_address'].values  # wallet_2 exceeds profitability filter
assert 'wallet_6' not in cleaned_df['wallet_address'].values  # wallet_6 exceeds inflows filter

# # Test Case 2: Wallet with exactly the threshold profitability and inflows
# def test_clean_profits_exact_threshold(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test a wallet with exactly the threshold values for profitability and inflows.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_1', 'profits_change'] = 1000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_1' in cleaned_df['wallet_address'].values  # wallet_1 should not be excluded

# # Test Case 3: Wallet with negative profitability exceeding the threshold
# def test_clean_profits_negative_profitability(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallets with negative profitability exceeding the threshold.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_6' not in cleaned_df['wallet_address'].values  # wallet_6 exceeds inflows filter

# # Test Case 4: Multiple wallets with profits and inflows exceeding thresholds
# def test_clean_profits_multiple_exceeding_thresholds(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallets where either profits or inflows exceed thresholds.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert len(exclusions_df) == 2  # 2 wallets should be excluded based on either profits or inflows

# # Test Case 5: Wallet with multiple transactions but total profits below thresholds
# def test_clean_profits_multiple_transactions_below_threshold(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test wallet with multiple transactions where total profits remain below thresholds.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_1' in cleaned_df['wallet_address'].values  # wallet_1 remains below threshold and should not be excluded

# # Test Case 8: Wallet with extreme profits but inflows below the threshold
# def test_clean_profits_extreme_profits_but_low_inflows(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet with extreme profits but inflows below the threshold.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_4', 'profits_change'] = 5000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_4' not in cleaned_df['wallet_address'].values  # wallet_4 should be excluded based on profits

# # Test Case 9: Wallet with extreme inflows but zero profits
# def test_clean_profits_extreme_inflows_but_zero_profits(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet with extreme inflows but no significant profits.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_7', 'profits_change'] = 0
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_7', 'usd_inflows_cumulative'] = 6000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_7' not in cleaned_df['wallet_address'].values  # wallet_7 should be excluded based on inflows

# # Test Case 11: Wallet with inflows/profits across multiple coins but aggregate exceeds threshold
# def test_clean_profits_aggregate_inflows_profits_across_coins(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet where aggregated inflows/profits across multiple coins exceed thresholds.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_3', 'profits_change'] = 500
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_3', 'usd_inflows_cumulative'] = 6000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_3' not in cleaned_df['wallet_address'].values  # wallet_3 should be excluded based on aggregated inflows

[10/Sep/2024 13:22:53] DEBUG [dreams_core.core.clean_profits_df:658] Starting generation of profits_cleaned_df...
[10/Sep/2024 13:22:53] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 0.01 seconds.


[10/Sep/2024 13:22:53] DEBUG [dreams_core.core.clean_profits_df:707] Removed 1 coin-wallet pairs beyond profit threshold of $15.0k and 3 pairs beyond inflows filter of 10.0k.


AssertionError: 

In [109]:
exclusions_df

Unnamed: 0,wallet_address,profits_exclusion,inflows_exclusion
0,wallet_1,True,True
1,wallet_3,True,True
2,wallet_4,True,True
3,wallet_7,True,True


In [57]:
profits_df = sample_clean_profits_profits_df

In [92]:
importlib.reload(td)
cleaned_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
cleaned_df

[10/Sep/2024 13:01:36] DEBUG [dreams_core.core.clean_profits_df:658] Starting generation of profits_cleaned_df...
[10/Sep/2024 13:01:36] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 0.01 seconds.
[10/Sep/2024 13:01:36] DEBUG [dreams_core.core.clean_profits_df:707] Removed 4 coin-wallet pairs beyond profit threshold of $1.0k and 0 pairs beyond inflows filter of 5.0k.


(  coin_id wallet_address  profits_change  usd_inflows_cumulative
 0  coin_1       wallet_1             500                    1000
 1  coin_1       wallet_1            -300                    1500
 2  coin_3       wallet_3               0                       0
 3  coin_3       wallet_7            -500                    2500,
   wallet_address  profits_exclusion  inflows_exclusion
 0       wallet_2               True               True
 1       wallet_4               True               True
 2       wallet_5               True               True
 3       wallet_6               True               True)

In [89]:
cleaned_df

Unnamed: 0,coin_id,wallet_address,profits_change,usd_inflows_cumulative
0,coin_1,wallet_1,500,1000
1,coin_1,wallet_1,-300,1500
2,coin_3,wallet_3,0,0
3,coin_3,wallet_7,-500,2500


In [88]:
len(cleaned_df['wallet_address'].unique())

3

In [85]:
importlib.reload(td)
config = load_config()
logger.setLevel(logging.DEBUG)

# # Test Case 1: Basic functionality test
# def test_clean_profits_basic_functionality(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test basic functionality where some wallets exceed the thresholds and others do not.
#     """
cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
assert len(cleaned_df) == 5  # 3 wallets should be excluded
assert 'wallet_2' not in cleaned_df['wallet_address'].values  # wallet_2 exceeds profitability filter
assert 'wallet_5' not in cleaned_df['wallet_address'].values  # wallet_5 exceeds inflows filter

# # Test Case 2: Wallet with exactly the threshold profitability and inflows
# def test_clean_profits_exact_threshold(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test a wallet with exactly the threshold values for profitability and inflows.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_1', 'profits_change'] = 1000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_1' in cleaned_df['wallet_address'].values  # wallet_1 should not be excluded

# # Test Case 3: Wallet with negative profitability exceeding the threshold
# def test_clean_profits_negative_profitability(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallets with negative profitability exceeding the threshold.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_5' not in cleaned_df['wallet_address'].values  # wallet_5 exceeds negative profitability threshold

# # Test Case 4: Multiple wallets with profits and inflows exceeding thresholds
# def test_clean_profits_multiple_exceeding_thresholds(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallets where either profits or inflows exceed thresholds.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert len(exclusions_df) == 3  # 3 wallets should be excluded based on either profits or inflows

# # Test Case 5: Wallet with multiple transactions but total profits below thresholds
# def test_clean_profits_multiple_transactions_below_threshold(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test wallet with multiple transactions where total profits remain below thresholds.
#     """
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_1' in cleaned_df['wallet_address'].values  # wallet_1 remains below threshold and should not be excluded

# # Test Case 8: Wallet with extreme profits but inflows below the threshold
# def test_clean_profits_extreme_profits_but_low_inflows(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet with extreme profits but inflows below the threshold.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_4', 'profits_change'] = 5000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_4' not in cleaned_df['wallet_address'].values  # wallet_4 should be excluded based on profits

# # Test Case 9: Wallet with extreme inflows but zero profits
# def test_clean_profits_extreme_inflows_but_zero_profits(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet with extreme inflows but no significant profits.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_6', 'profits_change'] = 0
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_6', 'usd_inflows_cumulative'] = 10000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_6' not in cleaned_df['wallet_address'].values  # wallet_6 should be excluded based on inflows

# # Test Case 11: Wallet with inflows/profits across multiple coins but aggregate exceeds threshold
# def test_clean_profits_aggregate_inflows_profits_across_coins(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config):
#     """
#     Test exclusion of wallet where aggregated inflows/profits across multiple coins exceed thresholds.
#     """
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_3', 'profits_change'] = 500
#     sample_clean_profits_profits_df.loc[sample_clean_profits_profits_df['wallet_address'] == 'wallet_3', 'usd_inflows_cumulative'] = 6000
#     cleaned_df, exclusions_df = td.clean_profits_df(sample_clean_profits_profits_df, sample_clean_profits_data_cleaning_config)
#     assert 'wallet_3' not in cleaned_df['wallet_address'].values  # wallet_3 should be excluded based on aggregated inflows




[10/Sep/2024 12:59:27] DEBUG [dreams_core.core.clean_profits_df:658] Starting generation of profits_cleaned_df...
[10/Sep/2024 12:59:27] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 0.01 seconds.
[10/Sep/2024 12:59:27] DEBUG [dreams_core.core.clean_profits_df:707] Removed 4 coin-wallet pairs beyond profit threshold of $1.0k and 0 pairs beyond inflows filter of 5.0k.


AssertionError: 

In [43]:
print(profits_df_full.shape)
print(profits_cleaned_df.shape)


profits_cleaned_df[profits_cleaned_df['wallet_address']=='0x000000000000000000000000000000000000dead']

(16992723, 13)
(16980347, 13)


Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_inflows,usd_inflows_cumulative,total_return


In [18]:
# Pivot the shark_agg_performance_df to separate is_shark into columns
shark_agg_performance_pivot_df = shark_agg_performance_df.pivot_table(
    index=None,  # If there's a specific identifier column, replace None with it
    columns='is_shark',  # Pivot based on the 'is_shark' column
    aggfunc='first'  # No aggregation needed if there’s only one row per value
)

shark_agg_performance_pivot_df

is_shark,False,True
count_wallets,101982.0,608.0
max_inflows,25503660.8657,40567932.8443
max_profits,2673859.66268,305773.243016
mean_inflows,15243.8295517,128760.043852
mean_profits,-12057.740522,-118139.602783
median_inflows,0.0,8679.89287175
median_profits,0.0,0.0
median_return,0.0,0.0
min_inflows,0.0,0.0
min_profits,-545140012.694,-53839795.8331


In [63]:
print(len(shark_coins_df.groupby(['coin_id','wallet_address']).size()))

len(shark_coins_df.groupby(['coin_id','wallet_address']).size().drop_duplicates())

95568


1

In [74]:
duplicates = shark_coins_df.duplicated(subset=['coin_id', 'wallet_address'], keep=False)
set(duplicates)

{False}

In [48]:
def assess_coin_shark_metrics_df(shark_coins_df):
    """
    creates a series of coin-keyed metrics based on shark behavior
    """
    # Step 1: Coin-Level Metrics - Counting the number of sharks per coin
    coin_shark_count = shark_coins_df.groupby('coin_id')['is_shark'].sum().reset_index()
    coin_shark_count.columns = ['coin_id', 'num_sharks']

    # Step 2: Total inflows by sharks for each coin
    coin_shark_inflows = shark_coins_df[shark_coins_df['is_shark']].groupby('coin_id')['usd_inflows_cumulative'].sum().reset_index()
    coin_shark_inflows.columns = ['coin_id', 'total_shark_inflows']

    # Step 3: Merge the coin-level shark metrics
    coin_shark_metrics_df = pd.merge(coin_shark_count, coin_shark_inflows, on='coin_id', how='left')

    return coin_shark_metrics_df

Unnamed: 0,coin_id,num_sharks,total_shark_inflows
0,0b9d343d-4e25-4d22-b49c-fa17509a0333,428,40882358.1574
1,0db96a94-082b-4e13-a315-860850e9ff4f,186,27178344.7173
2,0e1c102e-2e7d-4aed-af2d-1526c2e0720a,15,16064740.7561
3,0eedc336-a78e-4b25-957e-57117227ef78,576,73903604.6467
4,0f96fb26-1ee9-4232-ae0e-c768f38070b3,50,3704418.38065
...,...,...,...
101,eeccf0b6-aaaa-464c-a23e-f2fc9e73a350,89,8368216.99833
102,f0420cea-5dc1-42ac-b1bc-f6e48b7804f1,236,83918538.5148
103,f64ac466-300d-43d4-8c36-ef26a7a48977,0,
104,f68b64ae-61d5-4dd6-b448-4ae9c754bd07,260,56928923.2692
