In [20]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()


# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

#### Load the datasets

In [21]:
importlib.reload(td)
config = load_config()


# retrieve prices data
prices_df = td.retrieve_prices_data()

# fill gaps in prices data
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])


# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])


# # assess shark performance
# shark_agg_performance_df,shark_wallets_performance_df = td.calculate_shark_performance(transfers_df, prices_df, shark_wallets_df, config)
# metrics = ['count_wallets', 'return_aggregate', 'nonzero_count_wallets', 'nonzero_median_return', 'midrange_count_wallets', 'midrange_median_return', 'midrange_return_aggregate']
# shark_agg_performance_df[shark_agg_performance_df['metric'].isin(metrics)]

[11/Sep/2024 21:10:59] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[11/Sep/2024 21:10:59] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[11/Sep/2024 21:10:59] INFO [dreams_core.core.<module>:10] Prices data shape: (110929, 3)
[11/Sep/2024 21:11:26] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (23823401, 5) after 27.0 seconds.
[11/Sep/2024 21:11:27] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[11/Sep/2024 21:12:19] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 19.30 seconds
[11/Sep/2024 21:12:42] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 23.04 seconds.
[11/Sep/2024 21:12:50] INFO [dreams_core.core.classify_shark_coins:772] creation of shark_coins_df complete.


In [23]:
importlib.reload(td)
importlib.reload(cwm)
config = load_config()

# generate inputs for generate_buysell_metrics_df()
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)

[11/Sep/2024 21:15:57] INFO [dreams_core.core.generate_buysell_metrics_df:29] Preparing buysell_metrics_df...


999
139


[11/Sep/2024 21:16:01] INFO [dreams_core.core.generate_buysell_metrics_df:76] Generated buysell_metrics_df after 4.13 seconds.


In [29]:
buysell_metrics_df.head()

Unnamed: 0,date,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers,total_bought,total_sold,total_net_transfers,total_volume,total_holders,total_balance,coin_id
0,2024-03-13,20,2,22,0,0,0,73649.83617,0.0,73649.83617,73649.83617,20,83658.541775,04f6120a-f0dd-4260-bb2b-b8f827fdba61
1,2024-03-14,2,0,2,2,0,2,5420.2621442,3279.53765231,2140.72449189,8699.79979651,4,13148.8171354,04f6120a-f0dd-4260-bb2b-b8f827fdba61
2,2024-03-15,3,0,3,1,0,1,6435.64314486,1876.35561669,4559.28752817,8311.99876155,4,6731.13464433,04f6120a-f0dd-4260-bb2b-b8f827fdba61
3,2024-03-16,0,2,2,1,1,2,1547.12713762,5418.07204041,-3870.94490279,6965.19917802,4,6735.78147101,04f6120a-f0dd-4260-bb2b-b8f827fdba61
4,2024-03-17,0,2,2,0,1,1,579.271114799,1537.95538848,-958.684273681,2117.22650328,3,7395.7597263,04f6120a-f0dd-4260-bb2b-b8f827fdba61


In [41]:
cohort_profits_df.head()

Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_inflows,usd_inflows_cumulative,total_return
483,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x032d363570b67a6f0e6851a2574a0e2da7e4297b,2024-04-05,241.367405987,241.367405987,26.8227283754,0.0,0.0,6474.13236947,6474.13236947,6474.13236947,6474.13236947,0.0
484,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x032d363570b67a6f0e6851a2574a0e2da7e4297b,2024-04-12,-229.299035688,12.068370299,17.6881586962,-2204.78738828,-2204.78738828,213.467249053,-4055.87773214,0.0,6474.13236947,-0.340553337877
485,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x032d363570b67a6f0e6851a2574a0e2da7e4297b,2024-04-30,0.0,12.068370299,8.27418014541,-113.611379138,-2318.39876742,99.8558699155,0.0,0.0,6474.13236947,-0.358101848265
486,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x032d363570b67a6f0e6851a2574a0e2da7e4297b,2024-05-01,0.0,12.068370299,9.476061117,14.5047446205,-2303.8940228,114.360614536,0.0,0.0,6474.13236947,-0.355861433056
487,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x032d363570b67a6f0e6851a2574a0e2da7e4297b,2024-05-07,0.0,12.068370299,9.75893447752,3.41382046237,-2300.48020233,117.774434998,0.0,0.0,6474.13236947,-0.35533413144


In [40]:
cleaned_profits_df = profits_df

# """
# Integration test for the buysell_metrics_df fixture.
# Validates the structure and key calculations in the final DataFrame.
# """

# # 1. Validate Structure: Check for expected columns in buysell_metrics_df
# expected_columns = [
#     'date', 'buyers_new', 'buyers_repeat', 'total_buyers', 'sellers_new', 'sellers_repeat',
#     'total_sellers', 'total_bought', 'total_sold', 'total_net_transfers', 'total_volume',
#     'total_holders', 'total_balance', 'coin_id'
# ]
# assert set(expected_columns).issubset(buysell_metrics_df.columns), "Missing expected columns in buysell_metrics_df"

# # 2. Validate Key Feature Calculations

# # Filter the cleaned_profits_df to only include cohort wallets and coins
# cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark'] == True]['wallet_address'].unique()
# cohort_coins = shark_coins_df['coin_id'].unique()

cohort_profits_df = cleaned_profits_df[
    (cleaned_profits_df['wallet_address'].isin(cohort_wallets)) &
    (cleaned_profits_df['coin_id'].isin(cohort_coins)) &
    (cleaned_profits_df['date'] <= training_period_end)  # Add date filtering
]


# Check that total_bought matches the sum of positive net_transfers in cohort_profits_df
total_bought_mock = cohort_profits_df[cohort_profits_df['net_transfers'] > 0]['net_transfers'].sum()
total_bought_result = buysell_metrics_df['total_bought'].sum()
assert total_bought_mock == total_bought_result, f"Total bought mismatch: {total_bought_mock} != {total_bought_result}"

# Check that total_sold matches the absolute sum of negative net_transfers in cohort_profits_df
total_sold_mock = abs(cohort_profits_df[cohort_profits_df['net_transfers'] < 0]['net_transfers'].sum())
total_sold_result = buysell_metrics_df['total_sold'].sum()
assert total_sold_mock == total_sold_result, f"Total sold mismatch: {total_sold_mock} != {total_sold_result}"

# Check that total_net_transfers matches net of all net_transfers in cohort_profits_df
total_net_transfers_mock = cohort_profits_df['net_transfers'].sum()
total_net_transfers_result = buysell_metrics_df['total_net_transfers'].sum()
assert total_net_transfers_mock == total_net_transfers_result, f"Total net transfers mismatch: {total_net_transfers_mock} != {total_net_transfers_result}"

# 3. Data Quality Checks

# Ensure there are no NaN values in critical columns
critical_columns = ['total_bought', 'total_sold', 'total_net_transfers', 'total_balance']
for col in critical_columns:
    assert buysell_metrics_df[col].isnull().sum() == 0, f"Found NaN values in {col}"

# Ensure non-cohort wallets and coins are excluded
assert set(buysell_metrics_df['coin_id']).issubset(cohort_coins), "Non-cohort coins found in buysell_metrics_df"

# 4. Sequence Logic Checks

# buyers_new and buyers_repeat logic validation (example dates provided)
# Check that buyers_new for coin1 on a specific date matches expected value
assert buysell_metrics_df[(buysell_metrics_df['coin_id'] == 'coin1') & (buysell_metrics_df['date'] == '2024-01-01')]['buyers_new'].iloc[0] == 2

# buyers_repeat for coin2 on a specific date
assert buysell_metrics_df[(buysell_metrics_df['coin_id'] == 'coin2') & (buysell_metrics_df['date'] == '2024-01-03')]['buyers_repeat'].iloc[0] == 2

# sellers_new validation (similar to buyers)
# sellers_new for coin1 on 1/1/24 should match the expected value (adjust this based on your actual data)
assert buysell_metrics_df[(buysell_metrics_df['coin_id'] == 'coin1') & (buysell_metrics_df['date'] == '2024-01-01')]['sellers_new'].iloc[0] == expected_sellers_new_value




AssertionError: Total bought mismatch: 19832765367313.01 != 19495313129806.188

In [46]:
total_bought_mock

np.float64(19832765367313.01)

In [44]:
buysell_metrics_df

Unnamed: 0,date,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers,total_bought,total_sold,total_net_transfers,total_volume,total_holders,total_balance,coin_id
0,2024-03-13,20,2,22,0,0,0,73649.83617,0,73649.83617,73649.83617,20,83658.541775,04f6120a-f0dd-4260-bb2b-b8f827fdba61
1,2024-03-14,2,0,2,2,0,2,5420.2621442,3279.53765231,2140.72449189,8699.79979651,4,13148.8171354,04f6120a-f0dd-4260-bb2b-b8f827fdba61
2,2024-03-15,3,0,3,1,0,1,6435.64314486,1876.35561669,4559.28752817,8311.99876155,4,6731.13464433,04f6120a-f0dd-4260-bb2b-b8f827fdba61
3,2024-03-16,0,2,2,1,1,2,1547.12713762,5418.07204041,-3870.94490279,6965.19917802,4,6735.78147101,04f6120a-f0dd-4260-bb2b-b8f827fdba61
4,2024-03-17,0,2,2,0,1,1,579.271114799,1537.95538848,-958.684273681,2117.22650328,3,7395.7597263,04f6120a-f0dd-4260-bb2b-b8f827fdba61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6723,2024-04-25,0,0,0,0,3,3,0,1675480.54129,-1675480.54129,1675480.54129,3,404187,fd1f38d9-5c05-4809-80fe-b67a07fd345c
6724,2024-04-26,0,0,0,0,1,1,0,404187,-404187,404187,1,-7.27595761418e-11,fd1f38d9-5c05-4809-80fe-b67a07fd345c
6725,2024-04-27,0,3,3,0,0,0,433498.485203,0,433498.485203,433498.485203,3,1496896.2854,fd1f38d9-5c05-4809-80fe-b67a07fd345c
6726,2024-04-28,0,0,0,1,0,1,0,717928.578161,-717928.578161,717928.578161,1,4.36557456851e-11,fd1f38d9-5c05-4809-80fe-b67a07fd345c


In [56]:
# Group cohort_profits_df by coin_id and sum net_transfers where net_transfers > 0 (buys)
cohort_grouped = cohort_profits_df[cohort_profits_df['net_transfers'] > 0].groupby('coin_id').agg({
    'net_transfers': 'sum'
}).reset_index().rename(columns={'net_transfers': 'cohort_total_bought'})

# Group buysell_metrics_df by coin_id and sum total_bought
buysell_grouped = buysell_metrics_df.groupby('coin_id').agg({
    'total_bought': 'sum'
}).reset_index().rename(columns={'total_bought': 'metrics_total_bought'})

# Merge the two grouped DataFrames on coin_id
comparison_df = pd.merge(cohort_grouped, buysell_grouped, on='coin_id', how='left')

# Add a column to show the difference between cohort_total_bought and metrics_total_bought
comparison_df['bought_diff'] = comparison_df['cohort_total_bought'] - comparison_df['metrics_total_bought']
comparison_df['percent_off'] = comparison_df['bought_diff'] / comparison_df['metrics_total_bought']

comparison_df.sort_values('percent_off')

Unnamed: 0,coin_id,cohort_total_bought,metrics_total_bought,bought_diff,percent_off
31,3c1400bd-ffcb-4189-bace-eb3399b13901,135340640.346,135340640.346,-2.98023223877e-08,-2.20202315517e-16
98,d5782037-4e63-477e-9714-b5f546aac472,38221214386.1,38221214386.1,-7.62939453125e-06,-1.99611515589e-16
115,f87b6a04-49f1-475c-8a0d-e65ddea3129c,37397890.5923,37397890.5923,-7.45058059692e-09,-1.99224621467e-16
108,eeccf0b6-aaaa-464c-a23e-f2fc9e73a350,2423485027.38,2423485027.38,-4.76837158203e-07,-1.96756799739e-16
55,71165a06-a733-4b57-bb77-4ab0b6d3a925,7034423473.31,7034423473.31,-9.53674316406e-07,-1.35572491481e-16
...,...,...,...,...,...
18,2a354338-a862-4c51-b197-5b3863202aac,85745.3471686,76566.6280641,9178.71910454,0.119878847177
116,f9d19e1d-8637-4e7c-b808-4b4bfe18e316,676348.485588,595194.735938,81153.7496495,0.13634823151
39,4a50ffaf-4a82-4263-9ff6-b2afc08a8aa3,656.837953479,556.633317909,100.20463557,0.180019111946
46,5baae6b5-8f5b-48d6-8826-2d1a51653a63,780833.461379,530493.243865,250340.217513,0.471900859075


In [58]:
c = '8fee3eb7-7a52-41c3-a92d-a64f5d6ee871'

profits = cohort_profits_df[cohort_profits_df['coin_id']==c]
buysell = buysell_metrics_df[buysell_metrics_df['coin_id']==c]
buysell

Unnamed: 0,date,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers,total_bought,total_sold,total_net_transfers,total_volume,total_holders,total_balance,coin_id
4269,2024-04-29,1,0,1,0,0,0,25784.9109921,0,25784.9109921,25784.9109921,1,25784.9109921,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871
4270,2024-04-30,0,0,0,0,0,0,0.0,0,0.0,0.0,1,25784.9109921,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871


In [59]:
profits

Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_inflows,usd_inflows_cumulative,total_return
12420742,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0871deb34bfd2052b1c10dc4f6c0912a2a47e927,2024-05-01,775185.954243,775185.954243,0.120321527954,0.0,0.0,93271.558463,93271.558463,93271.558463,93271.558463,0.0
12420743,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0871deb34bfd2052b1c10dc4f6c0912a2a47e927,2024-05-04,-775185.954243,-2.32830643654e-10,0.123970213645,2828.40989924,2828.40989924,-2.88640646369e-11,-96099.9683622,0.0,93271.558463,0.0303244627392
12420744,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0871deb34bfd2052b1c10dc4f6c0912a2a47e927,2024-05-07,0.0,-2.32830643654e-10,0.100434303479,5.47988111306e-12,2828.40989924,-2.33841835238e-11,0.0,0.0,93271.558463,0.0303244627392
12420811,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0c6306c1ff8f0ca8432b761085d5ce74160a499a,2024-04-29,25784.9109921,25784.9109921,0.115954252196,0.0,0.0,2989.87007203,2989.87007203,2989.87007203,2989.87007203,0.0
12420812,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0c6306c1ff8f0ca8432b761085d5ce74160a499a,2024-04-30,0.0,25784.9109921,0.115954252196,0.0,0.0,2989.87007203,0.0,0.0,2989.87007203,0.0
12420813,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0c6306c1ff8f0ca8432b761085d5ce74160a499a,2024-05-01,0.0,25784.9109921,0.120321527954,112.609816695,112.609816695,3102.47988872,0.0,0.0,2989.87007203,0.0376637827002
12420814,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0x0c6306c1ff8f0ca8432b761085d5ce74160a499a,2024-05-07,-25784.9109921,-3.6379788070900004e-12,0.100434303479,-512.790312977,-400.180496282,-3.6537786756000005e-13,-2589.68957574,0.0,2989.87007203,-0.133845447007
12424634,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0xc33c030f8b228aa006f6b7e73e18f64fd72633c8,2024-05-01,18469.4139004,18469.4139004,0.120321527954,0.0,0.0,2222.26810091,2222.26810091,2222.26810091,2222.26810091,0.0
12424635,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0xc33c030f8b228aa006f6b7e73e18f64fd72633c8,2024-05-03,-12525.2222397,5944.19166067,0.14579132153,470.412159505,470.412159505,866.611557634,-1826.06870278,0.0,2222.26810091,0.211681101534
12424636,8fee3eb7-7a52-41c3-a92d-a64f5d6ee871,0xc33c030f8b228aa006f6b7e73e18f64fd72633c8,2024-05-07,0.0,5944.19166067,0.100434303479,-269.610808451,200.801351053,597.000749183,0.0,0.0,2222.26810091,0.0903587424808
