In [20]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# Load variables from a local .env file (python-dotenv)
load_dotenv()


# Put the project's src directory on the import path, then import the local
# modules and immediately reload each one so that edits made on disk since the
# last run of this cell are picked up
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
td = importlib.reload(td)
import feature_engineering as fe
fe = importlib.reload(fe)
import coin_wallet_metrics as cwm
cwm = importlib.reload(cwm)


# Notebook-wide logger, reporting at INFO and above
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


def _format_float(x):
    """Render a float with at most 12 significant digits."""
    return f'{x:.12g}'


# Use the formatter above whenever pandas displays a float
pd.set_option('display.float_format', _format_float)
# To go back to pandas' default rendering: pd.reset_option('display.float_format')

#### Load the datasets

In [21]:
# Re-import the local modules so on-disk edits take effect before the pipeline runs
for _mod in (td, cwm, fe):
    importlib.reload(_mod)

config = load_config('config.yaml')
metrics_config = load_config('config_metrics.yaml')
training_cfg = config['training_data']
cleaning_cfg = config['data_cleaning']

# Prices: retrieve, then fill gaps using the configured max_gap_days
# (the second return value of fill_prices_gaps is not used here)
prices_df = td.retrieve_prices_data()
prices_df, _ = td.fill_prices_gaps(prices_df, cleaning_cfg['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# Transfers for the configured training/modeling period boundaries
transfers_df = td.retrieve_transfers_data(
    training_cfg['training_period_start'],
    training_cfg['modeling_period_start'],
    training_cfg['modeling_period_end'],
)

# Profits: built from transfers and prices, then profitability added, then cleaned.
# The name is reassigned at every stage, so only the final frame stays referenced.
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df, _ = td.clean_profits_df(profits_df, cleaning_cfg)

# Shark classification at coin level, then at wallet level
shark_coins_df = td.classify_shark_coins(profits_df, training_cfg)
shark_wallets_df = td.classify_shark_wallets(shark_coins_df, training_cfg)

# Cohort = wallets flagged is_shark, and every coin_id present in shark_coins_df
shark_mask = shark_wallets_df['is_shark'] == True
cohort_wallets = shark_wallets_df.loc[shark_mask, 'wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

# Buy/sell metrics for the cohort, then flattened using metrics_config
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    training_cfg['training_period_end'],
    cohort_wallets,
    cohort_coins,
)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    training_cfg['training_period_end'],
)


[11/Sep/2024 21:10:59] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[11/Sep/2024 21:10:59] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[11/Sep/2024 21:10:59] INFO [dreams_core.core.<module>:10] Prices data shape: (110929, 3)
[11/Sep/2024 21:11:26] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (23823401, 5) after 27.0 seconds.
[11/Sep/2024 21:11:27] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[11/Sep/2024 21:12:19] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 19.30 seconds
[11/Sep/2024 21:12:42] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 23.04 seconds.
[11/Sep/2024 21:12:50] INFO [dreams_core.core.classify_shark_coins:772] creation of shark_coins_df complete.


In [144]:
# Display buysell_metrics_df as it currently exists in the kernel
buysell_metrics_df

Unnamed: 0,date,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers,total_bought,total_sold,total_net_transfers,total_volume,total_holders,total_balance,coin_id
0,2024-03-13,20,2,22,0,0,0,73649.83617,0,73649.83617,73649.83617,20,83658.541775,04f6120a-f0dd-4260-bb2b-b8f827fdba61
1,2024-03-14,2,0,2,2,0,2,5420.2621442,3279.53765231,2140.72449189,8699.79979651,4,13148.8171354,04f6120a-f0dd-4260-bb2b-b8f827fdba61
2,2024-03-15,3,0,3,1,0,1,6435.64314486,1876.35561669,4559.28752817,8311.99876155,4,6731.13464433,04f6120a-f0dd-4260-bb2b-b8f827fdba61
3,2024-03-16,0,2,2,1,1,2,1547.12713762,5418.07204041,-3870.94490279,6965.19917802,4,6735.78147101,04f6120a-f0dd-4260-bb2b-b8f827fdba61
4,2024-03-17,0,2,2,0,1,1,579.271114799,1537.95538848,-958.684273681,2117.22650328,3,7395.7597263,04f6120a-f0dd-4260-bb2b-b8f827fdba61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6723,2024-04-25,0,0,0,0,3,3,0,1675480.54129,-1675480.54129,1675480.54129,3,404187,fd1f38d9-5c05-4809-80fe-b67a07fd345c
6724,2024-04-26,0,0,0,0,1,1,0,404187,-404187,404187,1,-7.27595761418e-11,fd1f38d9-5c05-4809-80fe-b67a07fd345c
6725,2024-04-27,0,3,3,0,0,0,433498.485203,0,433498.485203,433498.485203,3,1496896.2854,fd1f38d9-5c05-4809-80fe-b67a07fd345c
6726,2024-04-28,0,0,0,1,0,1,0,717928.578161,-717928.578161,717928.578161,1,4.36557456851e-11,fd1f38d9-5c05-4809-80fe-b67a07fd345c


In [154]:
# Small two-coin fixture: (coin_id, date, total_balance, total_bought) per row.
# The last coin2 row has no metric values, so both metric columns hold a missing value there.
# NOTE: this reassigns buysell_metrics_df, replacing whatever frame the name held before.
_fixture_rows = [
    ('coin1', '2024-01-01', 100, 50),
    ('coin1', '2024-01-04', 110, 20),
    ('coin2', '2024-01-01', 200, 75),
    ('coin2', '2024-01-03', None, None),
]
_coin_ids, _dates, _balances, _bought = (list(col) for col in zip(*_fixture_rows))

buysell_metrics_df = pd.DataFrame({
    'coin_id': _coin_ids,
    'date': [pd.Timestamp(day) for day in _dates],
    'total_balance': _balances,
    'total_bought': _bought,
})
buysell_metrics_df

Unnamed: 0,coin_id,date,total_balance,total_bought
0,coin1,2024-01-01,100.0,50.0
1,coin1,2024-01-04,110.0,20.0
2,coin2,2024-01-01,200.0,75.0
3,coin2,2024-01-03,,


In [162]:
# Re-import the local modules so on-disk edits take effect
for _mod in (td, cwm, fe):
    importlib.reload(_mod)


# Cohort = wallets flagged is_shark, and every coin_id present in shark_coins_df
shark_mask = shark_wallets_df['is_shark'] == True
cohort_wallets = shark_wallets_df.loc[shark_mask, 'wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

# Rebuild the buy/sell metrics for the cohort
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    config['training_data']['training_period_end'],
    cohort_wallets,
    cohort_coins,
)


# The flattening step is commented out in this cell:
# flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])

[12/Sep/2024 15:30:18] INFO [dreams_core.core.generate_buysell_metrics_df:33] Preparing buysell_metrics_df...
[12/Sep/2024 15:30:22] INFO [dreams_core.core.generate_buysell_metrics_df:99] Generated buysell_metrics_df after 4.08 seconds.


In [161]:
# Prototype: give every coin one row per calendar day and fill the gaps.
#
# Every coin gets a row for every day from the earliest date in the frame up to
# training_period_end (or the latest date present, if that is later). Adding rows
# only after the global latest date would leave per-coin gaps inside the range
# unfilled, and coins whose own last record is earlier would be skipped.
training_period_end = config['training_data']['training_period_end']

# Level-type metrics carry their last known value forward; activity-type
# metrics are zero on any day without a record.
level_metrics = ['total_balance', 'total_holders']
activity_metrics = [
    'total_bought', 'total_sold', 'total_net_transfers', 'total_volume',
    'buyers_new', 'buyers_repeat', 'total_buyers',
    'sellers_new', 'sellers_repeat', 'total_sellers',
]

earliest_date = buysell_metrics_df['date'].min()
latest_date = buysell_metrics_df['date'].max()
# Never truncate existing data: the grid ends at whichever is later
grid_end = max(latest_date, pd.to_datetime(training_period_end))

# The fill logic assumes at most one row per coin per day
assert not buysell_metrics_df.duplicated(['coin_id', 'date']).any(), \
    "buysell_metrics_df must have at most one row per coin_id-date pair"

# Every (coin_id, date) pair that should exist ...
full_grid = pd.MultiIndex.from_product(
    [buysell_metrics_df['coin_id'].unique(), pd.date_range(start=earliest_date, end=grid_end, freq='D')],
    names=['coin_id', 'date']
)
# ... minus the pairs that already exist = the rows that have to be added
existing_pairs = pd.MultiIndex.from_frame(buysell_metrics_df[['coin_id', 'date']])
missing_index = full_grid.difference(existing_pairs).set_names(['coin_id', 'date'])

# Append the missing pairs as rows whose metric columns are all NaN. Existing
# rows are never dropped, and nothing is concatenated when there are no gaps.
if len(missing_index) > 0:
    missing_rows = pd.DataFrame(index=missing_index).reset_index()
    buysell_metrics_df = pd.concat([buysell_metrics_df, missing_rows], ignore_index=True)

# Forward-filling is order dependent, so put each coin's rows in date order first;
# otherwise appended rows pick up values from whichever row happens to precede them.
buysell_metrics_df = buysell_metrics_df.sort_values(['coin_id', 'date'], ignore_index=True)

# Level metrics: forward-fill within each coin, then use 0 for days before the
# coin's first record (there is no earlier value to carry forward)
buysell_metrics_df[level_metrics] = (
    buysell_metrics_df.groupby('coin_id')[level_metrics].ffill().fillna(0)
)

# Activity metrics: no record for a day means no activity, i.e. 0
buysell_metrics_df[activity_metrics] = buysell_metrics_df[activity_metrics].fillna(0)

In [166]:
# Display the frame returned by cwm.fill_buysell_metrics_df for the current
# buysell_metrics_df and training_period_end (the result is not stored)
cwm.fill_buysell_metrics_df(buysell_metrics_df, training_period_end)

Unnamed: 0,coin_id,date,total_balance,total_bought,total_sold,total_net_transfers,total_volume,total_holders,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers
0,coin1,2024-01-01,100,50,10,40,100,10,1,0,1,0,1,1
1,coin1,2024-01-04,110,20,5,15,35,11,0,1,1,1,0,1
2,coin2,2024-01-01,200,75,15,60,150,20,2,0,2,1,1,2
3,coin2,2024-01-03,200,0,0,0,0,20,0,0,0,0,0,0
4,coin1,2024-01-05,110,0,0,0,0,11,0,0,0,0,0,0
5,coin2,2024-01-05,200,0,0,0,0,20,0,0,0,0,0,0


In [174]:
# Check cwm.fill_buysell_metrics_df against a three-coin fixture.
# coin1 has a gap (01-02, 01-03), coin2 has a row with no values (01-03), and
# coin3 only starts on 01-03, so it exercises the "before first record" case.
# NOTE: this reassigns buysell_metrics_df and training_period_end.
buysell_metrics_df = pd.DataFrame({
    'coin_id': ['coin1', 'coin1', 'coin2', 'coin2', 'coin3'],
    'date': [pd.Timestamp('2024-01-01'), pd.Timestamp('2024-01-04'), pd.Timestamp('2024-01-01'), pd.Timestamp('2024-01-03'), pd.Timestamp('2024-01-03')],
    'total_balance': [100, 110, 200, None, 300],  # Added coin3 with balance 300 on 2024-01-03
    'total_bought': [50, 20, 75, None, 60],
    'total_sold': [10, 5, 15, None, 20],
    'total_net_transfers': [40, 15, 60, None, 40],
    'total_volume': [100, 35, 150, None, 80],
    'total_holders': [10, 11, 20, None, 30],  # Added coin3 with holders 30 on 2024-01-03
    'buyers_new': [1, 0, 2, None, 3],  # Added coin3 with 3 new buyers on 2024-01-03
    'buyers_repeat': [0, 1, 0, None, 1],
    'total_buyers': [1, 1, 2, None, 4],
    'sellers_new': [0, 1, 1, None, 2],
    'sellers_repeat': [1, 0, 1, None, 1],
    'total_sellers': [1, 1, 2, None, 3]
})

training_period_end = pd.Timestamp('2024-01-05')

# Call the function to fill missing dates and values
result = cwm.fill_buysell_metrics_df(buysell_metrics_df, training_period_end)


def _values_by_date(filled_df, coin_id, metric):
    """Return `metric` for one coin as a list ordered by date.

    The expected lists below are written in date order (2024-01-01 .. 2024-01-05),
    so the rows are sorted by date before comparing; the check then does not
    depend on the row order of the frame being inspected.
    """
    coin_rows = filled_df[filled_df['coin_id'] == coin_id].sort_values('date')
    return coin_rows[metric].tolist()


# Assert total_balance for coin1 is filled correctly (carried forward across the gap)
expected_total_balance_coin1 = [100, 100, 100, 110, 110]
result_total_balance_coin1 = _values_by_date(result, 'coin1', 'total_balance')
assert result_total_balance_coin1 == expected_total_balance_coin1, f"Expected {expected_total_balance_coin1}, but got {result_total_balance_coin1}"

# Assert buyers_new for coin1 is filled correctly (0 on days without a record)
expected_buyers_new_coin1 = [1, 0, 0, 0, 0]
result_buyers_new_coin1 = _values_by_date(result, 'coin1', 'buyers_new')
assert result_buyers_new_coin1 == expected_buyers_new_coin1, f"Expected {expected_buyers_new_coin1}, but got {result_buyers_new_coin1}"

# Assert total_balance for coin3 is filled correctly (0 before its first record)
expected_total_balance_coin3 = [0, 0, 300, 300, 300]
result_total_balance_coin3 = _values_by_date(result, 'coin3', 'total_balance')
assert result_total_balance_coin3 == expected_total_balance_coin3, f"Expected {expected_total_balance_coin3}, but got {result_total_balance_coin3}"

# Assert buyers_new for coin3 is filled correctly
expected_buyers_new_coin3 = [0, 0, 3, 0, 0]
result_buyers_new_coin3 = _values_by_date(result, 'coin3', 'buyers_new')
assert result_buyers_new_coin3 == expected_buyers_new_coin3, f"Expected {expected_buyers_new_coin3}, but got {result_buyers_new_coin3}"


AssertionError: Expected [0, 0, 300, 300, 300], but got [nan, nan, 300.0, 300.0, 300.0]

In [170]:
# Display buysell_metrics_df as it currently exists in the kernel
buysell_metrics_df

Unnamed: 0,coin_id,date,total_balance,total_bought,total_sold,total_net_transfers,total_volume,total_holders,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers
0,coin1,2024-01-01,100.0,50.0,10.0,40.0,100.0,10.0,1.0,0.0,1.0,0.0,1.0,1.0
1,coin1,2024-01-04,110.0,20.0,5.0,15.0,35.0,11.0,0.0,1.0,1.0,1.0,0.0,1.0
2,coin2,2024-01-01,200.0,75.0,15.0,60.0,150.0,20.0,2.0,0.0,2.0,1.0,1.0,2.0
3,coin2,2024-01-03,,,,,,,,,,,,


Unnamed: 0,coin_id,date,total_balance,total_bought,total_sold,total_net_transfers,total_volume,total_holders,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers
0,coin1,2024-01-01,100,50,10,40,100,10,1,0,1,0,1,1
1,coin1,2024-01-04,110,20,5,15,35,11,0,1,1,1,0,1
2,coin2,2024-01-01,200,75,15,60,150,20,2,0,2,1,1,2
3,coin2,2024-01-03,200,0,0,0,0,20,0,0,0,0,0,0
4,coin1,2024-01-02,110,0,0,0,0,11,0,0,0,0,0,0
5,coin1,2024-01-03,110,0,0,0,0,11,0,0,0,0,0,0
6,coin1,2024-01-05,110,0,0,0,0,11,0,0,0,0,0,0
7,coin2,2024-01-02,200,0,0,0,0,20,0,0,0,0,0,0
8,coin2,2024-01-04,200,0,0,0,0,20,0,0,0,0,0,0
9,coin2,2024-01-05,200,0,0,0,0,20,0,0,0,0,0,0


In [125]:
# Rebuild the cohort buy/sell metrics and flatten them.
importlib.reload(td)
importlib.reload(cwm)
# Name the config file explicitly, as the other cells that load it do
config = load_config('config.yaml')

# generate inputs for generate_buysell_metrics_df()
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)

# Pass training_period_end as the third argument so this call matches the other
# flatten_coin_date_df calls in this notebook
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end'],
)

[12/Sep/2024 12:52:37] INFO [dreams_core.core.generate_buysell_metrics_df:29] Preparing buysell_metrics_df...
[12/Sep/2024 12:52:46] INFO [dreams_core.core.generate_buysell_metrics_df:76] Generated buysell_metrics_df after 9.50 seconds.


In [135]:
# Reload both configs and the feature-engineering module, then flatten the
# current buysell_metrics_df and preview the first rows.
config = load_config('config.yaml')
metrics_config = load_config('config_metrics.yaml')
importlib.reload(fe)

# Pass training_period_end as the third argument so this call matches the other
# flatten_coin_date_df calls in this notebook
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end'],
)
flattened_buysell_metrics_df.head()

Unnamed: 0,coin_id,buyers_new_sum,buyers_new_mean,buyers_new_median,buyers_new_std,sum_7d_period_1,max_7d_period_1,change_7d_period_1,pct_change_7d_period_1,sum_7d_period_2,...,change_7d_period_6,pct_change_7d_period_6,sum_7d_period_7,max_7d_period_7,change_7d_period_7,pct_change_7d_period_7,sum_7d_period_8,max_7d_period_8,change_7d_period_8,pct_change_7d_period_8
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,63,1.5,0,3.2403703492,0,0,0,0,1.0,...,-18.0,-90.0,,,,,,,,
1,0b9d343d-4e25-4d22-b49c-fa17509a0333,111,1.0,0,4.49646325663,3,1,-1,-100,4.0,...,0.0,0.0,2.0,1.0,-1.0,-100.0,4.0,1.0,-1.0,-100.0
2,0db96a94-082b-4e13-a315-860850e9ff4f,98,0.98,0,5.80139307547,2,1,0,0,0.0,...,1.0,0.0,2.0,1.0,0.0,0.0,3.0,2.0,0.0,0.0
3,0e1c102e-2e7d-4aed-af2d-1526c2e0720a,10,0.833333333333,0,1.99240983979,0,0,0,0,,...,,,,,,,,,,
4,0eedc336-a78e-4b25-957e-57117227ef78,126,1.125,0,8.96753403488,0,0,0,0,1.0,...,0.0,0.0,1.0,1.0,-1.0,-100.0,3.0,2.0,0.0,0.0


In [131]:
# Display rolling_features_partial_window.
# NOTE(review): this name is not assigned in any cell visible here - confirm where
# it is defined, otherwise this cell fails on a fresh kernel.
rolling_features_partial_window

{'sum_3d_period_1': np.int64(21),
 'max_3d_period_1': np.int64(8),
 'min_3d_period_1': np.int64(6),
 'median_3d_period_1': np.float64(7.0),
 'std_3d_period_1': np.float64(1.0),
 'change_3d_period_1': np.int64(2),
 'pct_change_3d_period_1': np.float64(33.33333333333333),
 'sum_3d_period_2': np.int64(12),
 'max_3d_period_2': np.int64(5),
 'min_3d_period_2': np.int64(3),
 'median_3d_period_2': np.float64(4.0),
 'std_3d_period_2': np.float64(1.0),
 'change_3d_period_2': np.int64(2),
 'pct_change_3d_period_2': np.float64(66.66666666666667)}

In [136]:
# Checks for fe.flatten_coin_features: plain aggregations, input validation
# errors, and rolling-window feature names.
# NOTE: this reassigns metrics_config to the sample dict defined below.

# pytest.raises is used below; imported here because the notebook's import cell
# does not provide it
import pytest

# Sample DataFrame for testing
sample_coin_df = pd.DataFrame({
    'coin_id': [1] * 6,
    'buyers_new': [10, 20, 30, 40, 50, 60],
    'sellers_new': [5, 10, 15, 20, 25, 30]
})

# Sample configuration for metrics
metrics_config = {
    'metrics': {
        'buyers_new': {
            'aggregations': ['sum', 'mean', 'max', 'min', 'median', 'std'],
            'rolling': {
                'stats': ['sum', 'max'],
                'comparisons': ['change', 'pct_change'],
                'window_duration': 3,
                'lookback_periods': 2
            }
        },
        'sellers_new': {
            'aggregations': ['sum', 'mean', 'max']
        }
    }
}

# Test Case 1: Basic functionality with all metrics present
flat_features = fe.flatten_coin_features(sample_coin_df, metrics_config)

expected_aggregations = {
    'buyers_new_sum': 210,      # Sum of buyers_new column
    'buyers_new_mean': 35,      # Mean of buyers_new column
    'buyers_new_max': 60,       # Max of buyers_new column
    'buyers_new_min': 10,       # Min of buyers_new column
    'buyers_new_median': 35,    # Median of buyers_new column
    'sellers_new_sum': 105,     # Sum of sellers_new column
    'sellers_new_mean': 17.5,   # Mean of sellers_new column
    'sellers_new_max': 30,      # Max of sellers_new column
}
for feature_name, expected_value in expected_aggregations.items():
    # Each assert carries a message so a failure identifies the feature involved
    assert feature_name in flat_features, \
        f"Feature '{feature_name}' is missing; available keys: {sorted(flat_features)}"
    assert flat_features[feature_name] == expected_value, \
        f"{feature_name}: expected {expected_value}, but got {flat_features[feature_name]}"

expected_std = sample_coin_df['buyers_new'].std()
assert round(flat_features['buyers_new_std'], 5) == round(expected_std, 5), \
    f"buyers_new_std: expected {expected_std}, but got {flat_features['buyers_new_std']}"

# Test Case 2: Missing metric column in DataFrame
# Fixtures are built outside the raises block so that only the call under test
# can satisfy the expected exception.
sample_coin_df_invalid = sample_coin_df.drop(columns=['buyers_new'])
metrics_config_invalid = {'metrics': {'nonexistent_metric': {'aggregations': ['sum']}}}
with pytest.raises(ValueError, match="Metric 'nonexistent_metric' is missing from the input DataFrame"):
    fe.flatten_coin_features(sample_coin_df_invalid, metrics_config_invalid)

# Test Case 3: Missing 'coin_id' column in DataFrame
sample_coin_df_no_id = sample_coin_df.drop(columns=['coin_id'])
with pytest.raises(ValueError, match="The input DataFrame is missing the required 'coin_id' column."):
    fe.flatten_coin_features(sample_coin_df_no_id, metrics_config)

# Test Case 4: Invalid aggregation function
metrics_config_invalid_agg = {
    'metrics': {
        'buyers_new': {
            'aggregations': ['invalid_agg']
        }
    }
}
with pytest.raises(KeyError, match="Aggregation 'invalid_agg' for metric 'buyers_new' is not recognized"):
    fe.flatten_coin_features(sample_coin_df, metrics_config_invalid_agg)

# Test Case 5: Rolling window metrics
rolling_features = fe.flatten_coin_features(sample_coin_df, metrics_config)

# Rolling stats must exist for both lookback periods, prefixed with the metric name
expected_rolling_keys = [
    'buyers_new_sum_3d_period_1',
    'buyers_new_max_3d_period_1',
    'buyers_new_sum_3d_period_2',
    'buyers_new_max_3d_period_2',
]
missing_rolling_keys = [key for key in expected_rolling_keys if key not in rolling_features]
assert not missing_rolling_keys, \
    f"Missing rolling features {missing_rolling_keys}; available keys: {sorted(rolling_features)}"

# Ensure no extra periods beyond lookback_periods
assert 'buyers_new_sum_3d_period_3' not in rolling_features, \
    "Unexpected rolling feature 'buyers_new_sum_3d_period_3' (lookback_periods is 2)"



AssertionError: 

In [137]:
# Display the rolling_features dict as it currently exists in the kernel
rolling_features

{'coin_id': np.int64(1),
 'buyers_new_sum': np.int64(210),
 'buyers_new_mean': np.float64(35.0),
 'buyers_new_max': np.int64(60),
 'buyers_new_min': np.int64(10),
 'buyers_new_median': np.float64(35.0),
 'buyers_new_std': np.float64(18.708286933869708),
 'sum_3d_period_1': np.int64(150),
 'max_3d_period_1': np.int64(60),
 'change_3d_period_1': np.int64(20),
 'pct_change_3d_period_1': np.float64(50.0),
 'sum_3d_period_2': np.int64(60),
 'max_3d_period_2': np.int64(30),
 'change_3d_period_2': np.int64(20),
 'pct_change_3d_period_2': np.float64(200.0),
 'sellers_new_sum': np.int64(105),
 'sellers_new_mean': np.float64(17.5),
 'sellers_new_max': np.int64(30)}