In [20]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()


# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

#### Load the datasets

In [21]:
importlib.reload(td)
config = load_config()


# retrieve prices data
prices_df = td.retrieve_prices_data()

# fill gaps in prices data
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])


# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])


# # assess shark performance
# shark_agg_performance_df,shark_wallets_performance_df = td.calculate_shark_performance(transfers_df, prices_df, shark_wallets_df, config)
# metrics = ['count_wallets', 'return_aggregate', 'nonzero_count_wallets', 'nonzero_median_return', 'midrange_count_wallets', 'midrange_median_return', 'midrange_return_aggregate']
# shark_agg_performance_df[shark_agg_performance_df['metric'].isin(metrics)]

[11/Sep/2024 21:10:59] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[11/Sep/2024 21:10:59] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[11/Sep/2024 21:10:59] INFO [dreams_core.core.<module>:10] Prices data shape: (110929, 3)
[11/Sep/2024 21:11:26] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (23823401, 5) after 27.0 seconds.
[11/Sep/2024 21:11:27] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[11/Sep/2024 21:12:19] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 19.30 seconds
[11/Sep/2024 21:12:42] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 23.04 seconds.
[11/Sep/2024 21:12:50] INFO [dreams_core.core.classify_shark_coins:772] creation of shark_coins_df complete.


In [125]:
importlib.reload(td)
importlib.reload(cwm)
config = load_config()

# generate inputs for generate_buysell_metrics_df()
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)

[12/Sep/2024 12:52:37] INFO [dreams_core.core.generate_buysell_metrics_df:29] Preparing buysell_metrics_df...
[12/Sep/2024 12:52:46] INFO [dreams_core.core.generate_buysell_metrics_df:76] Generated buysell_metrics_df after 9.50 seconds.


In [127]:
config = load_config('config.yaml')
metrics_config = load_config('config_metrics.yaml')
importlib.reload(fe)

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config)
flattened_buysell_metrics_df.head()

Unnamed: 0,coin_id,buyers_new_sum,buyers_new_mean,buyers_new_median,buyers_new_std,sum_7d_period_1,max_7d_period_1,change_7d_period_1,pct_change_7d_period_1,sum_7d_period_2,...,change_7d_period_6,pct_change_7d_period_6,sum_7d_period_7,max_7d_period_7,change_7d_period_7,pct_change_7d_period_7,sum_7d_period_8,max_7d_period_8,change_7d_period_8,pct_change_7d_period_8
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,63,1.5,0,3.2403703492,0,0,0,0,1.0,...,-18.0,-90.0,,,,,,,,
1,0b9d343d-4e25-4d22-b49c-fa17509a0333,111,1.0,0,4.49646325663,3,1,-1,-100,4.0,...,0.0,0.0,2.0,1.0,-1.0,-100.0,4.0,1.0,-1.0,-100.0
2,0db96a94-082b-4e13-a315-860850e9ff4f,98,0.98,0,5.80139307547,2,1,0,0,0.0,...,1.0,0.0,2.0,1.0,0.0,0.0,3.0,2.0,0.0,0.0
3,0e1c102e-2e7d-4aed-af2d-1526c2e0720a,10,0.833333333333,0,1.99240983979,0,0,0,0,,...,,,,,,,,,,
4,0eedc336-a78e-4b25-957e-57117227ef78,126,1.125,0,8.96753403488,0,0,0,0,1.0,...,0.0,0.0,1.0,1.0,-1.0,-100.0,3.0,2.0,0.0,0.0


In [131]:
rolling_features_partial_window

{'sum_3d_period_1': np.int64(21),
 'max_3d_period_1': np.int64(8),
 'min_3d_period_1': np.int64(6),
 'median_3d_period_1': np.float64(7.0),
 'std_3d_period_1': np.float64(1.0),
 'change_3d_period_1': np.int64(2),
 'pct_change_3d_period_1': np.float64(33.33333333333333),
 'sum_3d_period_2': np.int64(12),
 'max_3d_period_2': np.int64(5),
 'min_3d_period_2': np.int64(3),
 'median_3d_period_2': np.float64(4.0),
 'std_3d_period_2': np.float64(1.0),
 'change_3d_period_2': np.int64(2),
 'pct_change_3d_period_2': np.float64(66.66666666666667)}

In [133]:
# Sample data for testing
ts = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
ts_with_8_records = pd.Series([1, 2, 3, 4, 5, 6, 7, 8])
small_ts = pd.Series([1, 2])

# Configuration for window and lookback periods
window_duration = 3
lookback_periods = 3
rolling_stats = ['sum', 'max', 'min', 'median', 'std']
comparisons = ['change', 'pct_change']


# Test Case 1: Multiple periods with complete windows (10 records, 3 periods, window_duration=3)
rolling_features = fe.calculate_rolling_window_features(ts, window_duration, lookback_periods, rolling_stats, comparisons)

assert rolling_features['sum_3d_period_1'] == 27  # Last 3 records: 9+10+8 = 27
assert rolling_features['max_3d_period_1'] == 10
assert rolling_features['min_3d_period_1'] == 8
assert rolling_features['median_3d_period_1'] == 9
assert round(rolling_features['std_3d_period_1'], 5) == round(ts.iloc[-3:].std(), 5)

assert 'change_3d_period_1' in rolling_features
assert 'pct_change_3d_period_1' in rolling_features


# Test Case 2: Non-divisible records (8 records, window_duration=3)
rolling_features_partial_window = fe.calculate_rolling_window_features(ts_with_8_records, 3, 3, ['sum', 'max'], ['change', 'pct_change'])

# Only two full periods (6-8 and 3-5), so period 3 should not exist
assert rolling_features_partial_window['sum_3d_period_1'] == 21  # Last 3 records: 6+7+8
assert rolling_features_partial_window['sum_3d_period_2'] == 12  # Next 3 records: 3+4+5

# Ensure no period 3 is calculated
assert 'sum_3d_period_3' not in rolling_features_partial_window
assert 'change_3d_period_3' not in rolling_features_partial_window


# Test Case 3: Small dataset (2 records)
rolling_features_small_ts = fe.calculate_rolling_window_features(small_ts, window_duration, lookback_periods, rolling_stats, comparisons)

# No valid 3-period windows exist, so the function should handle it gracefully
assert rolling_features_small_ts == {}  # Expect empty dict since window is larger than available data


# Test Case 4: Check std and median specifically with window of 3 and valid lookback periods
rolling_features_std_median = fe.calculate_rolling_window_features(ts, window_duration, lookback_periods, ['std', 'median'], comparisons)

# Check for standard deviation and median over the last 3 periods
assert round(rolling_features_std_median['std_3d_period_1'], 5) == round(ts.iloc[-3:].std(), 5)
assert rolling_features_std_median['median_3d_period_1'] == 9
assert round(rolling_features_std_median['std_3d_period_2'], 5) == round(ts.iloc[-6:-3].std(), 5)
assert rolling_features_std_median['median_3d_period_2'] == 6


# Test Case 5: Handle pct_change with impute_value logic (start_value=0)
ts_with_zeros = pd.Series([0, 0, 5, 10, 15, 20])
rolling_features_zeros = fe.calculate_rolling_window_features(ts_with_zeros, window_duration, lookback_periods, ['sum'], comparisons)

assert 'pct_change_3d_period_1' in rolling_features_zeros
assert rolling_features_zeros['pct_change_3d_period_2'] <= 1000  # Ensure capping at 1000%

AssertionError: 