In [1]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema


# import local files if necessary
sys.path.append('..//src')
import training_data as td
importlib.reload(td)

# load dotenv
load_dotenv()

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


def load_config(file_path='config.yaml'):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file)
config = load_config()


#### Load the datasets

In [2]:
importlib.reload(td)
config = load_config()


# retrieve prices data
if 'prices_filled_df' not in locals():
    prices_df = td.retrieve_prices_data()

    # fill gaps in prices data
    prices_df,prices_outcomes_df = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")


# Retrieve or load transfers data
# if 'transfers_df' not in locals():
transfers_df = td.retrieve_transfers_data(
    config['modeling']['training_period_start'],
    config['modeling']['modeling_period_start'],
    config['modeling']['modeling_period_end']
    )
logger.info(f"Transfers data shape: {transfers_df.shape}")


# Calculate and clean profits data
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning']['profitability_filter'])
logger.info(f"Profits data shape: {profits_df.shape}")

[09/Sep/2024 13:36:28] DEBUG [dreams_core.core.retrieve_prices_data:34] retrieving prices data...
[09/Sep/2024 13:36:30] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[09/Sep/2024 13:36:31] DEBUG [dreams_core.core.fill_prices_gaps:125] retained 401 coins.
[09/Sep/2024 13:36:31] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[09/Sep/2024 13:36:31] INFO [dreams_core.core.<module>:11] Prices data shape: (110929, 3)
[09/Sep/2024 13:36:31] DEBUG [dreams_core.core.retrieve_transfers_data:373] retrieving transfers data...
[09/Sep/2024 13:36:56] INFO [dreams_core.core.retrieve_transfers_data:380] retrieved transfers_df with shape (15527424, 5) after 25.3 seconds.
[09/Sep/2024 13:36:56] INFO [dreams_core.core.<module>:21] Transfers data shape: (15527424, 5)
[09/Sep/2024 13:36:56] DEBUG [dreams_core.core.prepare_profits_data:421] Preparing profits_df data...


### Sharkwork

In [3]:
importlib.reload(td)
config = load_config()

sharks_coins_df = td.classify_shark_coins(profits_df, config['modeling'])
shark_wallets_df = td.classify_shark_wallets(sharks_coins_df,config['modeling'])

[09/Sep/2024 13:37:37] DEBUG [dreams_core.core.classify_shark_coins:672] identifying shark wallets...
[09/Sep/2024 13:37:42] INFO [dreams_core.core.classify_shark_coins:703] creation of sharks_df complete.


In [4]:
shark_wallets_df.head()

Unnamed: 0,wallet_address,total_coins,shark_coins,shark_rate,is_shark
0,0x000000000000c35e4364deffa9059dbadaefd4f8,3,0,0,False
1,0x00000000000a78c8727b6ae386f004e7e37a4875,1,0,0,False
2,0x000000000077cdff30a1b5d7c12f3587f921e519,1,1,1,False
3,0x000000000085cd7bd617419ce5ff21c722ab2d38,1,1,1,False
4,0x0000000000a6f0986c92cf1ec4d2e77afbe1466d,1,0,0,False


In [58]:
importlib.reload(td)
config = load_config()

def calculate_modeling_period_profitability(profits_df, training_period_end, modeling_period_end):
    """
    Calculate profitability during the modeling period by finding the difference in total_profits between
    the end of the training period and the end of the modeling period.
    
    Parameters:
        profits_df (DataFrame): DataFrame containing wallet profitability data.
        training_period_end (str): End date of the training period.
        modeling_period_end (str): End date of the modeling period.
        
    Returns:
        modeling_period_profits_df (DataFrame): DataFrame with wallet_address, coin_id, and profitability change.
    """
    # Step 1: Filter for the end of the training period and the modeling period
    training_profits_df = profits_df[profits_df['date'] == training_period_end][['wallet_address', 'coin_id', 'profits_cumulative']]
    modeling_profits_df = profits_df[profits_df['date'] == modeling_period_end][['wallet_address', 'coin_id', 'profits_cumulative']]

    # Step 2: Merge the two DataFrames on wallet_address and coin_id
    merged_profits_df = pd.merge(
        training_profits_df,
        modeling_profits_df,
        on=['wallet_address', 'coin_id'],
        suffixes=('_training', '_modeling')
    )

    # Step 3: Calculate the difference in total profits between the two periods
    merged_profits_df['profit_during_modeling'] = merged_profits_df['profits_cumulative_modeling'] - merged_profits_df['profits_cumulative_training']

    return merged_profits_df[['wallet_address', 'coin_id', 'profit_during_modeling']]


training_period_end = config['modeling']['training_period_end']
modeling_period_end = config['modeling']['modeling_period_end']
modeling_period_profits_df = calculate_modeling_period_profitability(profits_df, training_period_end, modeling_period_end)
modeling_period_profits_df

Unnamed: 0,wallet_address,coin_id,profit_during_modeling
0,0x000000000005af2ddc1a93a03e9b7014064d3b8d,0b9d343d-4e25-4d22-b49c-fa17509a0333,-5.17081809181e-08
1,0x000000000035b5e5ad9019092c665357240f594e,0b9d343d-4e25-4d22-b49c-fa17509a0333,-6.72206351936e-07
2,0x00000000003b3cc22af3ae1eac0440bcee416b40,0b9d343d-4e25-4d22-b49c-fa17509a0333,-0.00262284576889
3,0x00000000009726632680fb29d3f7a9734e3010e2,0b9d343d-4e25-4d22-b49c-fa17509a0333,-234.011423626
4,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,0b9d343d-4e25-4d22-b49c-fa17509a0333,-0.233809693934
...,...,...,...
1888161,0xffe91fda27c3d39663d3adc16d3ac4bce17a1f0a,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,534.286148136
1888162,0xffed43322e064fce09bef0e949701da17f067569,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,0
1888163,0xfff07d6cb3d1e67563f3bfa335c94db34f59c0a4,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,82.2364102505
1888164,0xfff2246f89868eb0e06e5a28a84ff53d2652266a,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,75.0893117248


In [6]:
def assess_megashark_modeling_period_performance(modeling_period_profits_df, shark_wallets_df):
    """
    Assess the performance of megasharks during the modeling period by comparing their profitability changes
    to non-megasharks.
    
    Parameters:
        modeling_period_profits_df (DataFrame): DataFrame containing the change in profitability during the modeling period.
        shark_wallets_df (DataFrame): DataFrame with wallets classified as megasharks.
        
    Returns:
        megasharks_performance_df (DataFrame): DataFrame comparing megashark performance vs. non-megasharks.
    """
    # Step 1: Filter for megasharks and non-megasharks
    megasharks_df = shark_wallets_df[shark_wallets_df['is_megashark']]
    non_megasharks_df = shark_wallets_df[~shark_wallets_df['is_megashark']]

    # Step 2: Merge megasharks with modeling period profits data
    megasharks_performance_df = pd.merge(
        megasharks_df[['wallet_address']],
        modeling_period_profits_df,
        on='wallet_address',
        how='inner'
    )

    # Step 3: Merge non-megasharks with modeling period profits data
    non_megasharks_performance_df = pd.merge(
        non_megasharks_df[['wallet_address']],
        modeling_period_profits_df,
        on='wallet_address',
        how='inner'
    )

    # Step 4: Calculate average performance for megasharks and non-megasharks
    megasharks_avg_profit_change = megasharks_performance_df['profit_during_modeling'].mean()
    non_megasharks_avg_profit_change = non_megasharks_performance_df['profit_during_modeling'].mean()

    # Step 5: Create a comparison DataFrame
    performance_comparison_df = pd.DataFrame({
        'group': ['megasharks', 'non-megasharks'],
        'avg_profit_change': [megasharks_avg_profit_change, non_megasharks_avg_profit_change]
    })

    return performance_comparison_df

performance_comparison_df = assess_megashark_modeling_period_performance(modeling_period_profits_df, shark_wallets_df)
performance_comparison_df

NameError: name 'modeling_period_profits_df' is not defined

In [None]:
def calculate_modeling_period_rate_of_return(profits_df, training_period_end, modeling_period_end):
    """
    Calculate the rate of return (ROR) during the modeling period for each wallet-coin pair.

    Parameters:
        profits_df (DataFrame): DataFrame containing wallet balances, inflows, and profits data.
        training_period_end (str): End date of the training period.
        modeling_period_end (str): End date of the modeling period.

    Returns:
        ror_df (DataFrame): DataFrame with wallet_address, coin_id, and rate of return (ROR).
    """
    # Step 1: Filter for the end of the training period and the end of the modeling period
    training_profits_df = profits_df[profits_df['date'] == training_period_end][['wallet_address', 'coin_id', 'balance', 'usd_inflows_cumulative']]
    modeling_profits_df = profits_df[profits_df['date'] == modeling_period_end][['wallet_address', 'coin_id', 'balance', 'price']]

    # Step 2: Merge the two DataFrames on wallet_address and coin_id
    merged_profits_df = pd.merge(
        training_profits_df,
        modeling_profits_df,
        on=['wallet_address', 'coin_id'],
        suffixes=('_training', '_modeling')
    )

    # Step 3: Calculate the total inflows, outflows, and price changes during the modeling period
    merged_profits_df['net_inflows'] = merged_profits_df['usd_inflows_cumulative']  # Assumed to be cumulative inflows
    merged_profits_df['final_value'] = merged_profits_df['balance_modeling'] * merged_profits_df['price']

    # Step 4: Calculate net investment (initial balance + inflows)
    merged_profits_df['net_investment'] = merged_profits_df['balance_training'] + merged_profits_df['net_inflows']

    # Step 5: Calculate the rate of return (ROR)
    merged_profits_df['rate_of_return'] = ((merged_profits_df['final_value'] - merged_profits_df['net_investment']) / merged_profits_df['net_investment']) * 100

    return merged_profits_df[['wallet_address', 'coin_id', 'rate_of_return']]


Unnamed: 0,wallet_address,coin_id,rate_of_return
0,0x000000000005af2ddc1a93a03e9b7014064d3b8d,0b9d343d-4e25-4d22-b49c-fa17509a0333,173.483384026
1,0x000000000035b5e5ad9019092c665357240f594e,0b9d343d-4e25-4d22-b49c-fa17509a0333,173.483384026
2,0x00000000003b3cc22af3ae1eac0440bcee416b40,0b9d343d-4e25-4d22-b49c-fa17509a0333,-71.3555514489
3,0x00000000009726632680fb29d3f7a9734e3010e2,0b9d343d-4e25-4d22-b49c-fa17509a0333,-100
4,0x00000000009e50a7ddb7a7b0e2ee6604fd120e49,0b9d343d-4e25-4d22-b49c-fa17509a0333,-21.8428128321
...,...,...,...
1888161,0xffe91fda27c3d39663d3adc16d3ac4bce17a1f0a,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,-98.9627594542
1888162,0xffed43322e064fce09bef0e949701da17f067569,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,
1888163,0xfff07d6cb3d1e67563f3bfa335c94db34f59c0a4,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,-98.9632455914
1888164,0xfff2246f89868eb0e06e5a28a84ff53d2652266a,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,-98.9651619607


In [20]:
importlib.reload(td)
config = load_config()


def sample_transfers_df():
    """
    Create a sample transfers DataFrame for testing.
    """
    data = {
        'coin_id': ['BTC', 'BTC', 'ETH', 'ETH', 'BTC', 'ETH', 'MYRO', 'MYRO', 'MYRO', 
                    'BTC', 'ETH', 'BTC', 'ETH', 'MYRO'],
        'wallet_address': ['wallet1', 'wallet1', 'wallet1', 'wallet2', 'wallet2', 'wallet2', 'wallet3', 'wallet3', 'wallet3',
                           'wallet1', 'wallet1', 'wallet2', 'wallet2', 'wallet3'],
        'date': [
            '2023-01-01', '2023-02-01', '2023-01-01', '2023-01-01', '2023-01-01', '2023-02-01', 
            '2023-01-01', '2023-02-01', '2023-03-01',
            '2023-04-01', '2023-04-01', '2023-04-01', '2023-04-01', '2023-04-01'
        ],
        'net_transfers': [10.0, 5, 100, 50, 20, 25, 1000, 500, -750,
                          0, 0, 0, -10, 0],
        'balance': [10.0, 15, 100, 50, 20, 75, 1000, 1500, 750,
                    15, 100, 20, 65, 750]
    }
    df = pd.DataFrame(data)

    # Convert coin_id to categorical and date to datetime
    df['coin_id'] = df['coin_id'].astype('category')
    df['date'] = pd.to_datetime(df['date'])

    return df


def sample_prices_df():
    """
    Create a sample prices DataFrame for testing.
    """
    data = {
        'date': [
            '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
            '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
            '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01'
        ],
        'coin_id': ['BTC', 'BTC', 'BTC', 'BTC', 'ETH', 'ETH', 'ETH', 'ETH', 'MYRO', 'MYRO', 'MYRO', 'MYRO'],
        'price': [20000.0, 21000, 22000, 23000, 1500, 1600, 1700, 1800, 10, 15, 12, 8]
    }
    df = pd.DataFrame(data)

    # Convert coin_id to categorical and date to datetime
    df['coin_id'] = df['coin_id'].astype('category')
    df['date'] = pd.to_datetime(df['date'])

    return df

sample_transfers_df = sample_transfers_df()
sample_prices_df =sample_prices_df()

import pytest

profits_df = td.prepare_profits_data(sample_transfers_df, sample_prices_df)
result = td.calculate_wallet_profitability(profits_df)

# Check profitability for wallet1, BTC
wallet1_btc = result[(result['wallet_address'] == 'wallet1') & (result['coin_id'] == 'BTC')]
assert wallet1_btc.loc[wallet1_btc['date'] == '2023-02-01', 'profits_change'].values[0] == 10000  # (21000 - 20000) * 10
assert wallet1_btc.loc[wallet1_btc['date'] == '2023-02-01', 'profits_cumulative'].values[0] == 10000
assert wallet1_btc.loc[wallet1_btc['date'] == '2023-04-01', 'profits_change'].values[0] == 30000  # (23000 - 21000) * 15
assert wallet1_btc.loc[wallet1_btc['date'] == '2023-04-01', 'profits_cumulative'].values[0] == 40000  # 10000 + 15000 + 15000

# Check net transfers and cumulative net transfers for wallet1, BTC
assert wallet1_btc.loc[wallet1_btc['date'] == '2023-02-01', 'usd_net_transfers'].values[0] == 5 * 21000  # 5 BTC * $21,000
assert wallet1_btc.loc[wallet1_btc['date'] == '2023-04-01', 'usd_net_transfers_cumulative'].values[0] == (10*20000) + (5*21000) + (0*23000)

# Check profitability for wallet2, ETH
wallet2_eth = result[(result['wallet_address'] == 'wallet2') & (result['coin_id'] == 'ETH')]
assert wallet2_eth.loc[wallet2_eth['date'] == '2023-02-01', 'profits_change'].values[0] == 5000  # (1600 - 1500) * 50
assert wallet2_eth.loc[wallet2_eth['date'] == '2023-02-01', 'profits_cumulative'].values[0] == 5000
assert wallet2_eth.loc[wallet2_eth['date'] == '2023-04-01', 'profits_change'].values[0] == 15000  # (1800 - 1600) * 75
assert wallet2_eth.loc[wallet2_eth['date'] == '2023-04-01', 'profits_cumulative'].values[0] == 20000  # 5000 + 15000

# Check net transfers and cumulative net transfers for wallet2, ETH
assert wallet2_eth.loc[wallet2_eth['date'] == '2023-02-01', 'usd_net_transfers'].values[0] == 25 * 1600  # 25 ETH * $1,600
assert wallet2_eth.loc[wallet2_eth['date'] == '2023-04-01', 'usd_net_transfers_cumulative'].values[0] == (50*1500) + (25*1600) + (-10*1800)

# Check profitability for wallet3, MYRO
wallet3_myro = result[(result['wallet_address'] == 'wallet3') & (result['coin_id'] == 'MYRO')]
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-02-01', 'profits_change'].values[0] == 5000  # (15 - 10) * 1000
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-03-01', 'profits_change'].values[0] == -4500  # (12 - 15) * 1500
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-03-01', 'profits_cumulative'].values[0] == 500
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-04-01', 'profits_change'].values[0] == -3000  # (8 - 12) * 750
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-04-01', 'profits_cumulative'].values[0] == -2500  # 500 - 3000

# Check net transfers and cumulative net transfers for wallet3, MYRO
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-03-01', 'usd_net_transfers'].values[0] == -750 * 12  # Sold 750 MYRO * $12
assert wallet3_myro.loc[wallet3_myro['date'] == '2023-03-01', 'usd_net_transfers_cumulative'].values[0] == 1000 * 10 + 500 * 15 - 750 * 12


[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:421] Preparing profits_df data...
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:441] <Step 1> merge transfers and prices: 0.00 seconds
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:456] <Step 2> identify first prices of coins: 0.00 seconds
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:480] <Step 3> created new records as of the first_price_date: 0.00 seconds
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:494] <Step 4> merge new records into profits_df: 0.00 seconds
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:510] <Step 5> removed records prior to each wallet's first token inflows: 0.00 seconds
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.prepare_profits_data:522] generated profits_df after 0.01 total seconds
[09/Sep/2024 14:05:24] DEBUG [dreams_core.core.calculate_wallet_profitability:565] Starting generatio

In [21]:
wallet3_myro

Unnamed: 0,coin_id,wallet_address,date,net_transfers,balance,price,profits_change,profits_cumulative,usd_balance,usd_net_transfers,usd_net_transfers_cumulative,total_return
10,MYRO,wallet3,2023-01-01,1000,1000,10,0,0,10000,10000,10000,0.0
11,MYRO,wallet3,2023-02-01,500,1500,15,5000,5000,22500,7500,17500,0.285714285714
12,MYRO,wallet3,2023-03-01,-750,750,12,-4500,500,9000,-9000,8500,0.0588235294118
13,MYRO,wallet3,2023-04-01,0,750,8,-3000,-2500,6000,0,8500,-0.294117647059
