In [28]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()

# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)

# load configs
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [29]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


### Training Data Generation

In [3]:
logger.setLevel(logging.INFO)

# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])



[14/Sep/2024 12:25:53] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[14/Sep/2024 12:25:53] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[14/Sep/2024 12:27:08] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (35657183, 5) after 74.9 seconds.
[14/Sep/2024 12:27:08] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[14/Sep/2024 12:28:31] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 36.06 seconds
[14/Sep/2024 12:29:09] INFO [dreams_core.core.clean_profits_df:625] Finished cleaning profits_df after 38.51 seconds.


### Metrics and Feature Engineering

In [24]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
importlib.reload(m) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')



# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])

# generate and flatten buysell_metrics
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)

# flatten, save, and preprocess the flattened df
output_directory = '..//modeling/outputs/flattened_outputs/'
metric_description = 'buysell_metrics'
modeling_period_start = config['training_data']['modeling_period_start']
version = '0.1'

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])
flattened_df, flattened_filepath = fe.save_flattened_outputs(flattened_buysell_metrics_df, output_directory, metric_description, modeling_period_start, version)
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(flattened_filepath, modeling_config, metrics_config)

# create the training data df
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [
    preprocessed_filepath.split('preprocessed_outputs/')[1]
]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# create the target variable df
target_variable_df,_ = fe.create_target_variables_mooncrater(prices_df, config['training_data'], modeling_config)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

[14/Sep/2024 12:50:11] INFO [dreams_core.core.classify_shark_coins:691] creation of shark_coins_df complete.
[14/Sep/2024 12:50:11] INFO [dreams_core.core.generate_buysell_metrics_df:33] Preparing buysell_metrics_df...
[14/Sep/2024 12:50:15] INFO [dreams_core.core.generate_buysell_metrics_df:99] Generated buysell_metrics_df after 3.49 seconds.
[14/Sep/2024 12:50:15] INFO [dreams_core.core.flatten_coin_date_df:84] Flattening columns ['buyers_new', 'total_bought', 'total_sold', 'total_buyers'] into coin-level features...
[14/Sep/2024 12:50:15] INFO [dreams_core.core.flatten_coin_date_df:100] Flattened input df into coin-level features with shape (89, 57) after 0.10 seconds.
[14/Sep/2024 12:50:15] INFO [dreams_core.core.preprocess_coin_df:423] Preprocessed file saved at: ..//modeling/outputs/preprocessed_outputs/buysell_metrics_0.1_2024-09-14_12-50_model_period_2024-03-01_preprocessed.csv
[14/Sep/2024 12:50:15] INFO [dreams_core.core.preprocess_coin_df:424] Dropped 1 columns: buyers_new_m

In [107]:
profits_df.head()

Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_inflows,usd_inflows_cumulative,total_return
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x000000004685666c7653cc148f566f0511901b37,2024-03-13,2.38776413,2.38776413,2.20442838263,0,0,5.26365501921,5.26365501921,5.26365501921,5.26365501921,0
1,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x000000004685666c7653cc148f566f0511901b37,2024-03-14,0.0,2.38776413,2.20442838263,0,0,5.26365501921,0.0,0.0,5.26365501921,0
2,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000a991c429ee2ec6df19d40fe0c80088b8,2024-03-13,216.04104,216.04104,2.20442838263,0,0,476.24700039,476.24700039,476.24700039,476.24700039,0
3,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000a991c429ee2ec6df19d40fe0c80088b8,2024-03-14,0.0,216.04104,2.20442838263,0,0,476.24700039,0.0,0.0,476.24700039,0
4,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x0096672e6b20840f2ca0e8025fd4957e5d6e7153,2024-03-13,691.380214155,691.380214155,2.20442838263,0,0,1524.09816728,1524.09816728,1524.09816728,1524.09816728,0


In [165]:
profits_df_full=profits_df
profits_df=sample_wallet_cohort_profits_df
profits_df

Unnamed: 0,coin_id,wallet_address,date,usd_inflows,profits_cumulative,total_return
0,coin_1,wallet_1,2024-02-15,5000,3000,0.6
1,coin_1,wallet_2,2024-02-20,15000,8000,0.7
2,coin_2,wallet_1,2024-02-18,8000,6000,0.4
3,coin_2,wallet_3,2024-02-25,20000,9000,0.8
4,coin_3,wallet_2,2024-02-22,5000,4000,0.45


In [170]:
wallet_cohort_config = sample_wallet_cohort_config
sample_wallet_cohort_config

{'wallet_minimum_inflows': 10000,
 'wallet_maximum_inflows': 50000,
 'coin_profits_win_threshold': 5000,
 'coin_return_win_threshold': 0.5,
 'wallet_min_coin_wins': 2}

In [171]:
eligible_wallets_coins_df['profits_cumulative']

0    3000
1    6000
2    8000
3    4000
4    9000
Name: profits_cumulative, dtype: int64

In [173]:
eligible_wallets_coins_df['profits_cumulative'] >= wallet_cohort_config['coin_profits_win_threshold']

0    False
1     True
2     True
3    False
4     True
Name: profits_cumulative, dtype: bool

In [172]:
wallet_cohort_config['coin_profits_win_threshold']

5000

In [176]:
logger.debug('Classifying wallet cohort based on coin-level thresholds...')

# Step 1: Aggregate wallet-level inflows and filter eligible wallets
wallet_inflows_df = profits_df.groupby('wallet_address')['usd_inflows'].sum().reset_index()
eligible_wallets_df = wallet_inflows_df[
    (wallet_inflows_df['usd_inflows'] >= wallet_cohort_config['wallet_minimum_inflows']) &
    (wallet_inflows_df['usd_inflows'] <= wallet_cohort_config['wallet_maximum_inflows'])
]

# Step 2: Group by wallet and coin to aggregate profits and return
# filter profits_df for only eligible wallets
eligible_wallets_profits_df = profits_df[profits_df['wallet_address'].isin(eligible_wallets_df['wallet_address'])].copy()
eligible_wallets_profits_df = eligible_wallets_profits_df.sort_values(by=['wallet_address', 'coin_id', 'date'])

# compute wallet-coin level metrics
eligible_wallets_coins_df = eligible_wallets_profits_df.groupby(['wallet_address', 'coin_id']).agg({
    'profits_cumulative': 'last',  # Use the last record for cumulative profits
    'total_return': 'last'  # Use the last record for total return
}).reset_index()

# Step 3: Apply wallet-coin-level thresholds (profits AND return) to each wallet and classify "win"s
eligible_wallets_coins_df['is_profits_win'] = eligible_wallets_coins_df['profits_cumulative'] >= wallet_cohort_config['coin_profits_win_threshold']
eligible_wallets_coins_df['is_returns_win'] = eligible_wallets_coins_df['total_return'] >= wallet_cohort_config['coin_return_win_threshold']

# A coin is classified as a "win" if it meets both the profits AND returns threshold
eligible_wallets_coins_df['is_coin_win'] = eligible_wallets_coins_df['is_profits_win'] & eligible_wallets_coins_df['is_returns_win']

# Step 4: Aggregate at the wallet level to count the number of "winning" coins and total coins
wallet_wins_df = eligible_wallets_coins_df.groupby('wallet_address')['is_coin_win'].sum().reset_index()
wallet_wins_df.columns = ['wallet_address', 'winning_coins']

wallet_coins_df = eligible_wallets_coins_df.groupby('wallet_address')['coin_id'].nunique().reset_index()
wallet_coins_df.columns = ['wallet_address', 'total_coins']

# Step 5: Classify wallets into the cohort based on minimum number of coin wins
wallet_wins_df['in_cohort'] = wallet_wins_df['winning_coins'] >= wallet_cohort_config['wallet_min_coin_wins']


# Step 6: Compute wallet-level metrics for output
# Merge wallet inflows, wins, total coins, profits, and return rate
cohort_summary_df = wallet_inflows_df.merge(wallet_coins_df, on='wallet_address')
cohort_summary_df = cohort_summary_df.merge(wallet_wins_df, on='wallet_address', how='left')
cohort_summary_df['winning_coins'] = cohort_summary_df['winning_coins'].fillna(0)

# Calculate total profits (USD value)
wallet_profits_df = profits_df.groupby('wallet_address')['profits_cumulative'].sum().reset_index()
wallet_profits_df.columns = ['wallet_address', 'total_profits']

# Calculate return rate: total profits / total inflows
wallet_return_rate_df = wallet_profits_df.merge(wallet_inflows_df, on='wallet_address')
wallet_return_rate_df['return_rate'] = wallet_return_rate_df['total_profits'] / wallet_return_rate_df['usd_inflows']

# Merge profits and return rate into the cohort summary
cohort_summary_df = cohort_summary_df.merge(wallet_profits_df, on='wallet_address', how='left')
cohort_summary_df = cohort_summary_df.merge(wallet_return_rate_df[['wallet_address', 'return_rate']], on='wallet_address', how='left')

logger.info('Wallet cohort classification complete.')

# return cohort_summary_df

print(f'profits_df {profits_df.shape}')
print(f'wallet_inflows_df {wallet_inflows_df.shape}')
print(f'eligible_wallets_df {eligible_wallets_df.shape}')
print(f'eligible_wallets_profits_df {eligible_wallets_profits_df.shape}')
print(f'eligible_wallets_coins_df {eligible_wallets_coins_df.shape}')
print(f'wallet_wins_df {wallet_wins_df.shape}')
print(f'wallet_coins_df {wallet_coins_df.shape}')


[14/Sep/2024 21:42:45] INFO [dreams_core.core.<module>:57] Wallet cohort classification complete.


profits_df (5, 6)
wallet_inflows_df (3, 2)
eligible_wallets_df (3, 2)
eligible_wallets_profits_df (5, 6)
eligible_wallets_coins_df (5, 7)
wallet_wins_df (3, 3)
wallet_coins_df (3, 2)


In [180]:
wallet_wins_df

Unnamed: 0,wallet_address,winning_coins,in_cohort
0,wallet_1,0,False
1,wallet_2,1,False
2,wallet_3,1,False


In [177]:
eligible_wallets_coins_df

Unnamed: 0,wallet_address,coin_id,profits_cumulative,total_return,is_profits_win,is_returns_win,is_coin_win
0,wallet_1,coin_1,3000,0.6,False,True,False
1,wallet_1,coin_2,6000,0.4,True,False,False
2,wallet_2,coin_1,8000,0.7,True,True,True
3,wallet_2,coin_3,4000,0.45,False,False,False
4,wallet_3,coin_2,9000,0.8,True,True,True


In [179]:
cohort_summary_df

Unnamed: 0,wallet_address,usd_inflows,total_coins,winning_coins,in_cohort,total_profits,return_rate
0,wallet_1,13000,2,0,False,9000,0.692307692308
1,wallet_2,20000,2,1,False,12000,0.6
2,wallet_3,20000,1,1,False,9000,0.45


In [178]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)

config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')

def sample_wallet_cohort_profits_df():
    data = {
        'coin_id': ['coin_1', 'coin_1', 'coin_2', 'coin_2', 'coin_3'],
        'wallet_address': ['wallet_1', 'wallet_2', 'wallet_1', 'wallet_3', 'wallet_2'],
        'date': ['2024-02-15', '2024-02-20', '2024-02-18', '2024-02-25', '2024-02-22'],
        'usd_inflows': [5000, 15000, 8000, 20000, 5000],
        'profits_cumulative': [3000, 8000, 6000, 9000, 4000],
        'total_return': [0.6, 0.7, 0.4, 0.8, 0.45]
    }
    return pd.DataFrame(data)

sample_wallet_cohort_profits_df = sample_wallet_cohort_profits_df()

def sample_wallet_cohort_config():
    return {
        'wallet_minimum_inflows': 10000,
        'wallet_maximum_inflows': 50000,
        'coin_profits_win_threshold': 5000,
        'coin_return_win_threshold': 0.5,
        'wallet_min_coin_wins': 1
    }
sample_wallet_cohort_config = sample_wallet_cohort_config()

# Test 1: Ensure wallets are filtered correctly based on inflows eligibility criteria.
cohort_wallets_df = td.classify_wallet_cohort(sample_wallet_cohort_profits_df, sample_wallet_cohort_config)
eligible_wallets = cohort_wallets_df['wallet_address'].unique()
assert 'wallet_1' not in eligible_wallets, "Wallet_1 should be excluded due to insufficient inflows."
assert 'wallet_2' in eligible_wallets, "Wallet_2 should be included."
assert 'wallet_3' in eligible_wallets, "Wallet_3 should be included."

# Test 2: Verify that wallets are classified based on profits win threshold.
wallet_2_wins = cohort_wallets_df[cohort_wallets_df['wallet_address'] == 'wallet_2']['winning_coins'].values[0]
assert wallet_2_wins == 2, "Wallet_2 should have 2 winning coins based on profits."

# Test 3: Verify that wallets are classified based on returns win threshold.
wallet_3_wins = cohort_wallets_df[cohort_wallets_df['wallet_address'] == 'wallet_3']['winning_coins'].values[0]
assert wallet_3_wins == 1, "Wallet_3 should have 1 winning coin based on returns."

# Test 4: Ensure wallets are classified based on combined profits and return thresholds.
wallet_2_is_cohort = cohort_wallets_df[cohort_wallets_df['wallet_address'] == 'wallet_2']['in_cohort'].values[0]
assert wallet_2_is_cohort, "Wallet_2 should be classified as a cohort member."

# Test 5: Check summary metrics for wallet_2
wallet_2_metrics = cohort_wallets_df[cohort_wallets_df['wallet_address'] == 'wallet_2']
assert wallet_2_metrics['total_profits'].values[0] == 8000, "Wallet_2 total profits should be 8000."
assert wallet_2_metrics['return_rate'].values[0] == 8000 / 15000, "Wallet_2 return rate should be 0.533."

[14/Sep/2024 21:42:47] INFO [dreams_core.core.classify_wallet_cohort:709] Wallet cohort classification complete.


AssertionError: Wallet_1 should be excluded due to insufficient inflows.

In [161]:
sample_wallet_cohort_profits_df

Unnamed: 0,coin_id,wallet_address,date,usd_inflows,profits_cumulative,total_return
0,coin_1,wallet_1,2024-02-15,5000,3000,0.6
1,coin_1,wallet_2,2024-02-20,15000,8000,0.7
2,coin_2,wallet_1,2024-02-18,8000,6000,0.4
3,coin_2,wallet_3,2024-02-25,20000,9000,0.8
4,coin_3,wallet_2,2024-02-22,5000,4000,0.45


In [95]:
sample_wallet_cohort_config

{'wallet_minimum_inflows': 10000,
 'wallet_maximum_inflows': 50000,
 'coin_profits_win_threshold': 5000,
 'coin_return_win_threshold': 0.5,
 'wallet_min_coin_wins': 2}

In [162]:
cohort_wallets_df

Unnamed: 0,wallet_address,usd_inflows,total_coins,winning_coins,in_cohort,total_profits,return_rate
0,wallet_1,13000,2,0,False,9000,0.692307692308
1,wallet_2,20000,2,1,False,12000,0.6
2,wallet_3,20000,1,1,False,9000,0.45


In [None]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
importlib.reload(m) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')


# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])

# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])

# generate and flatten buysell_metrics
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)

# flatten, save, and preprocess the flattened df
output_directory = '..//modeling/outputs/flattened_outputs/'
metric_description = 'buysell_metrics'
modeling_period_start = config['training_data']['modeling_period_start']
version = '0.1'

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])
flattened_df, flattened_filepath = fe.save_flattened_outputs(flattened_buysell_metrics_df, output_directory, metric_description, modeling_period_start, version)
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(flattened_filepath, modeling_config, metrics_config)

# create the training data df
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [
    preprocessed_filepath.split('preprocessed_outputs/')[1]
]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# create the target variable df
target_variable_df,_ = fe.create_target_variables_mooncrater(prices_df, config['training_data'], modeling_config)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

# split train/test sets
X_train,X_test,y_train,y_test = m.split_model_input(model_input_df, modeling_config['modeling']['target_column'], test_size=0.2, random_state=42)

