In [1]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# Read the .env file so its values are available as environment variables
load_dotenv()


# Put the src directory (one level up) on the import path, then load the project modules
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)  # re-execute training_data so an already-imported copy is refreshed


# Project logger, emitting INFO and above
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


def _format_float(value):
    """Render a float with up to 12 significant digits."""
    return f'{value:.12g}'


# Use the formatter above whenever pandas displays floats
pd.set_option('display.float_format', _format_float)
# pd.reset_option('display.float_format')

#### Load the datasets

In [2]:
# Refresh the training_data module and read the notebook configuration
importlib.reload(td)
config = load_config()
training_data_config = config['training_data']
max_gap_days = config['data_cleaning']['max_gap_days']


# Prices: retrieve, then fill gaps (the second return value is not needed here)
prices_df = td.retrieve_prices_data()
prices_df, _ = td.fill_prices_gaps(prices_df, max_gap_days)
logger.info(f"Prices data shape: {prices_df.shape}")

# Transfers: retrieved for the training/modeling period boundaries set in the config
transfers_df = td.retrieve_transfers_data(
    training_data_config['training_period_start'],
    training_data_config['modeling_period_start'],
    training_data_config['modeling_period_end'],
)
logger.info(f"Transfers data shape: {transfers_df.shape}")

# Profits: built from transfers and prices, then passed through the profitability step.
# profits_df is rebound in two steps (rather than nesting the calls) so that a frame
# left over from a previous run can be released as early as possible.
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
# profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])
# logger.info(f"Profits data shape: {profits_df.shape}")


# # identify sharks
# shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
# shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])


# # assess shark performance
# shark_agg_performance_df,shark_wallets_performance_df = td.calculate_shark_performance(transfers_df, prices_df, shark_wallets_df, config)
# shark_agg_performance_df

[10/Sep/2024 14:23:34] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[10/Sep/2024 14:23:35] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[10/Sep/2024 14:23:35] INFO [dreams_core.core.<module>:10] Prices data shape: (110929, 3)
[10/Sep/2024 14:24:23] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (21636374, 5) after 48.3 seconds.
[10/Sep/2024 14:24:23] INFO [dreams_core.core.<module>:18] Transfers data shape: (21636374, 5)
[10/Sep/2024 14:24:23] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[10/Sep/2024 14:25:10] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 18.38 seconds


In [3]:
# Run the cleaning step on profits_df with the data_cleaning section of the config;
# the call returns two objects, unpacked here as cleaned_df and exclusions_df
data_cleaning_config = config['data_cleaning']
cleaned_df, exclusions_df = td.clean_profits_df(profits_df, data_cleaning_config)

[10/Sep/2024 14:25:33] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 22.98 seconds.


In [6]:
# Scratch versions of three integration tests for td.clean_profits_df. The commented
# pytest headers show the test functions these checks are meant to become.


def _sum_wallet_totals(df):
    """
    Sum profits_change and usd_inflows for each wallet_address.

    Params:
    - df (DataFrame): must contain wallet_address, profits_change and usd_inflows columns

    Returns:
    - DataFrame: one row per wallet_address with the summed profits_change and usd_inflows
    """
    return df.groupby('wallet_address').agg({
        'profits_change': 'sum',
        'usd_inflows': 'sum'
    }).reset_index()


def _select_threshold_breaches(wallet_totals_df, profitability_filter, inflows_filter):
    """
    Keep the wallets whose totals reach at least one threshold.

    A wallet is kept when its summed profits_change is >= profitability_filter or
    <= -profitability_filter, or when its summed usd_inflows is >= inflows_filter.

    Params:
    - wallet_totals_df (DataFrame): per-wallet totals as returned by _sum_wallet_totals
    - profitability_filter (number): threshold applied to profits_change in both directions
    - inflows_filter (number): threshold applied to usd_inflows

    Returns:
    - DataFrame: the rows of wallet_totals_df that reach a threshold
    """
    return wallet_totals_df[
        (wallet_totals_df['profits_change'] >= profitability_filter) |
        (wallet_totals_df['profits_change'] <= -profitability_filter) |
        (wallet_totals_df['usd_inflows'] >= inflows_filter)
    ]


# Thresholds from the config, shared by the checks below
profitability_filter = config['data_cleaning']['profitability_filter']
inflows_filter = config['data_cleaning']['inflows_filter']

# @pytest.mark.integration
# def test_clean_profits_exclusions(cleaned_profits_df, profits_df, config):
#     """
#     Test that all excluded wallets breach either the profitability or inflows threshold.
#     Uses thresholds from the config file.
#     """
#     cleaned_df, exclusions_df = cleaned_profits_df

# A duplicated wallet would be matched twice by the merge below, inflating its totals
assert exclusions_df['wallet_address'].is_unique, \
    "exclusions_df contains duplicate wallet addresses."

# Attach the profits rows of every excluded wallet and total them per wallet
exclusions_with_breaches = exclusions_df.merge(profits_df, on='wallet_address', how='inner')
wallet_agg_df = _sum_wallet_totals(exclusions_with_breaches)
breaches_df = _select_threshold_breaches(wallet_agg_df, profitability_filter, inflows_filter)

# Compare addresses rather than row counts so a failure names the offending wallets
unbreached_wallets = set(exclusions_df['wallet_address']) - set(breaches_df['wallet_address'])
assert not unbreached_wallets, (
    "Some excluded wallets do not breach a threshold."
    f" {len(unbreached_wallets)} wallet(s), e.g. {sorted(unbreached_wallets)[:5]}"
)

# @pytest.mark.integration
# def test_clean_profits_remaining_count(cleaned_profits_df, profits_df):
#     """
#     Test that the count of remaining records in the cleaned DataFrame matches the expected count.
#     """
#     cleaned_df, exclusions_df = cleaned_profits_df

# Unique wallets before cleaning, after cleaning, and in the exclusions
input_wallet_count = profits_df['wallet_address'].nunique()
cleaned_wallet_count = cleaned_df['wallet_address'].nunique()
excluded_wallet_count = exclusions_df['wallet_address'].nunique()

# Every input wallet must be either kept or excluded
assert input_wallet_count == cleaned_wallet_count + excluded_wallet_count, (
    "The count of remaining wallets does not match the input minus excluded records."
    f" input={input_wallet_count}, cleaned={cleaned_wallet_count},"
    f" excluded={excluded_wallet_count}"
)

# @pytest.mark.integration
# def test_clean_profits_aggregate_sums(cleaned_profits_df, config):
#     """
#     Test that the aggregation of profits and inflows for the remaining wallets stays within the configured thresholds.
#     Uses thresholds from the config file.
#     """
#     cleaned_df, exclusions_df = cleaned_profits_df

# Total the remaining wallets and look for any that still reach a threshold
remaining_wallets_agg_df = _sum_wallet_totals(cleaned_df)
over_threshold_wallets = _select_threshold_breaches(
    remaining_wallets_agg_df, profitability_filter, inflows_filter
)

# NOTE(review): the saved run of this notebook failed on this assertion. The per-wallet
# sums used here may not mirror the rule td.clean_profits_df applies — confirm against
# its implementation before treating the failure as a data problem.
assert over_threshold_wallets.empty, (
    "Some remaining wallets exceed the thresholds."
    f" {len(over_threshold_wallets)} wallet(s); first rows:\n{over_threshold_wallets.head()}"
)




AssertionError: Some remaining wallets exceed the thresholds.

In [9]:
# Preview the first five rows of the cleaned profits data
cleaned_df.head(n=5)

Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_inflows,usd_inflows_cumulative,total_return
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-03-23,5.2e-08,5.2e-08,1.88983550773,0.0,0.0,9.82714464018e-08,9.82714464018e-08,9.82714464018e-08,9.82714464018e-08,0.0
1,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-03-28,0.002920722,0.002920774,7.47303212902,2.90326224307e-07,2.90326224307e-07,0.0218270379436,0.0218266493459,0.0218266493459,0.0218267476174,1.33013964974e-05
2,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-03-31,0.0,0.002920774,27.5991647724,0.0587838849452,0.0587841752714,0.0806109228888,0.0,0.0,0.0218267476174,2.69321734515
3,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-04-01,0.0,0.002920774,30.3304629657,0.00797750474928,0.0667616800207,0.0885884276381,0.0,0.0,0.0218267476174,3.0587094876
4,04f6120a-f0dd-4260-bb2b-b8f827fdba61,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,2024-04-02,1.75,1.752920774,34.7437187048,0.0128901226181,0.0796518026389,60.9029862836,60.8015077334,60.8015077334,60.823334481,0.00130955994634


In [8]:
# Display the per-wallet totals computed for the wallets that remain after cleaning.
# NOTE(review): the saved output of this cell lists a `usd_inflows_cumulative` column,
# while the aggregation that builds this frame sums `usd_inflows`, and the execution
# counts around this cell are out of order — the output looks stale; restart the kernel
# and run all cells to confirm.
remaining_wallets_agg_df

Unnamed: 0,wallet_address,profits_change,usd_inflows_cumulative
0,0x0000000000000000000000000000000000000057,1.08341889994,0.9193270152
1,0x0000000000000000000000000000000000000069,13339.8000135,11319.387663
2,0x000000000000000000000000000000000000d3ad,-38.7666481102,1539.13886306
3,0x0000000000000000000000000000000000309c53,791.816034706,1793.36638408
4,0x00000000000000000000000000000000deaddead,-2.51448035026e-17,1.97249930768e-16
...,...,...,...
3021339,zzWrBSzwRuhZZjciBsAGe7EN3j6rGcn7jPUf1VmtBSq,-132.298523839,7912.81349005
3021340,zzd15Aw3QDokGGxYUGN7rBXe32uQ8CgXZZpe25yLNvA,0.383043515803,1.27395311438
3021341,zzpbk1H74zNrNUttDwgyoYFC1Adyz8rxwAY52eqBtTJ,-0.150065377454,35.5904743716
3021342,zzv3gCngGzvjsH5ACW83RvuoWbxKyeHs5nb8T2a3tMz,-1906.53564314,40897.0724873


In [25]:
# Switch the logger to DEBUG so the pipeline functions also emit their per-step messages.
# NOTE(review): the level is not restored afterwards, so later cells keep logging at DEBUG.
logger.setLevel(logging.DEBUG)

# Rebuild profits_df from the transfers and prices data
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)

# NOTE(review): this rebinds profits_df to the first value returned by clean_profits_df,
# so cells that expect the uncleaned profits_df will see different data if they are
# re-run after this one — confirm that is intended.
data_cleaning_config = config['data_cleaning']
profits_df, logs_df = td.clean_profits_df(profits_df, data_cleaning_config)
logs_df

[10/Sep/2024 12:14:02] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[10/Sep/2024 12:14:09] DEBUG [dreams_core.core.prepare_profits_data:475] <Step 1> merge transfers and prices: 6.92 seconds
[10/Sep/2024 12:14:14] DEBUG [dreams_core.core.prepare_profits_data:490] <Step 2> identify first prices of coins: 5.10 seconds
[10/Sep/2024 12:14:15] DEBUG [dreams_core.core.prepare_profits_data:514] <Step 3> created new records as of the first_price_date: 1.00 seconds
[10/Sep/2024 12:14:34] DEBUG [dreams_core.core.prepare_profits_data:528] <Step 4> merge new records into profits_df: 18.58 seconds
[10/Sep/2024 12:14:41] DEBUG [dreams_core.core.prepare_profits_data:544] <Step 5> removed records prior to each wallet's first token inflows: 6.91 seconds
[10/Sep/2024 12:14:41] DEBUG [dreams_core.core.prepare_profits_data:556] generated profits_df after 39.24 total seconds
[10/Sep/2024 12:14:42] DEBUG [dreams_core.core.calculate_wallet_profitability:599] Starting generati

Unnamed: 0,coin_id,wallet_address
