In [2]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()


# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

ModuleNotFoundError: No module named 'utils'

#### Load the datasets

In [None]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)

config = load_config('config.yaml')
metrics_config = load_config('metrics_config.yaml')

# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])

# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])

# generate and flatten buysell_metrics
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])


[12/Sep/2024 20:04:57] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[12/Sep/2024 20:04:57] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[12/Sep/2024 20:04:57] INFO [dreams_core.core.<module>:11] Prices data shape: (110929, 3)


KeyboardInterrupt: 

In [1]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)


# generate and flatten buysell_metrics
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])

buysell_metrics_df.dtypes

NameError: name 'importlib' is not defined

In [269]:
buysell_metrics_df_test = pd.read_csv('../tests/fixtures/buysell_metrics_df.csv')
buysell_metrics_df_test['date'] = pd.to_datetime(buysell_metrics_df_test['date']).astype('datetime64[ns]')

buysell_metrics_df_test.head()

buysell_metrics_df = buysell_metrics_df_test

In [283]:
zero_activity_rows

Unnamed: 0,date,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers,total_bought,total_sold,total_net_transfers,total_volume,total_holders,total_balance,coin_id
0,2024-01-01,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
1,2024-01-02,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
2,2024-01-03,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
3,2024-01-04,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
4,2024-01-05,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14226,2024-03-10,0,0,0,0,0,0,0,0,0,0,0,0,fd1f38d9-5c05-4809-80fe-b67a07fd345c
14227,2024-03-11,0,0,0,0,0,0,0,0,0,0,0,0,fd1f38d9-5c05-4809-80fe-b67a07fd345c
14228,2024-03-12,0,0,0,0,0,0,0,0,0,0,0,0,fd1f38d9-5c05-4809-80fe-b67a07fd345c
14229,2024-03-13,0,0,0,0,0,0,0,0,0,0,0,0,fd1f38d9-5c05-4809-80fe-b67a07fd345c


In [280]:
buysell_metrics_df.head()

Unnamed: 0,date,buyers_new,buyers_repeat,total_buyers,sellers_new,sellers_repeat,total_sellers,total_bought,total_sold,total_net_transfers,total_volume,total_holders,total_balance,coin_id
0,2024-01-01,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
1,2024-01-02,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
2,2024-01-03,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
3,2024-01-04,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61
4,2024-01-05,0,0,0,0,0,0,0,0,0,0,0,0,04f6120a-f0dd-4260-bb2b-b8f827fdba61


In [277]:
flattened_buysell_metrics_df['total_bought_sum']

0     137230.345434
1     483641.730393
2      26585152.255
3     1438347.66117
4     417155151.125
           ...     
113   1291632.51301
114   57614240.7022
115   5815451.34039
116   366444.257715
117   87582514.3141
Name: total_bought_sum, Length: 118, dtype: float64

In [268]:

# Introduce an outlier in the buysell_metrics_df for total_bought
outlier_df = buysell_metrics_df.copy()
outlier_df.loc[0, 'total_bought'] = 1e12  # Extreme value

# Flatten the modified DataFrame
flattened_buysell_metrics_df = fe.flatten_coin_date_df(outlier_df, metrics_config, config['training_data']['training_period_end'])

# Ensure the extreme value is handled and aggregated correctly
assert flattened_buysell_metrics_df['total_bought_sum'].max() >= 1e12, "Outlier in total_bought not handled correctly"

[12/Sep/2024 20:32:26] INFO [dreams_core.core.flatten_coin_date_df:63] Flattening columns ['buyers_new', 'total_bought'] into coin-level features...


[12/Sep/2024 20:32:26] INFO [dreams_core.core.flatten_coin_date_df:79] Flattened input df into coin-level features with shape (118, 47) after 0.14 seconds.


In [241]:
full_date_range

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10',
               ...
               '2024-04-21', '2024-04-22', '2024-04-23', '2024-04-24',
               '2024-04-25', '2024-04-26', '2024-04-27', '2024-04-28',
               '2024-04-29', '2024-04-30'],
              dtype='datetime64[ns]', length=121, freq='D')

In [242]:
missing_dates

{Timestamp('2024-01-01 00:00:00'),
 Timestamp('2024-01-02 00:00:00'),
 Timestamp('2024-01-03 00:00:00'),
 Timestamp('2024-01-04 00:00:00'),
 Timestamp('2024-01-05 00:00:00'),
 Timestamp('2024-01-06 00:00:00'),
 Timestamp('2024-01-07 00:00:00'),
 Timestamp('2024-01-08 00:00:00'),
 Timestamp('2024-01-09 00:00:00'),
 Timestamp('2024-01-10 00:00:00'),
 Timestamp('2024-01-11 00:00:00'),
 Timestamp('2024-01-12 00:00:00'),
 Timestamp('2024-01-13 00:00:00'),
 Timestamp('2024-01-14 00:00:00'),
 Timestamp('2024-01-15 00:00:00'),
 Timestamp('2024-01-16 00:00:00'),
 Timestamp('2024-01-17 00:00:00'),
 Timestamp('2024-01-18 00:00:00'),
 Timestamp('2024-01-19 00:00:00'),
 Timestamp('2024-01-20 00:00:00'),
 Timestamp('2024-01-21 00:00:00'),
 Timestamp('2024-01-22 00:00:00'),
 Timestamp('2024-01-23 00:00:00'),
 Timestamp('2024-01-24 00:00:00'),
 Timestamp('2024-01-25 00:00:00'),
 Timestamp('2024-01-26 00:00:00'),
 Timestamp('2024-01-27 00:00:00'),
 Timestamp('2024-01-28 00:00:00'),
 Timestamp('2024-01-

In [244]:
# Initialize a dictionary to store missing dates
missing_dates_dict = {}

# Iterate over unique coin_ids
for coin_id in df['coin_id'].unique():
    # Filter data for the current coin_id
    coin_df = df[df['coin_id'] == coin_id]
    
    # Create the full date range for the coin, explicitly cast to datetime
    full_date_range = pd.to_datetime(pd.date_range(start=coin_df['date'].min(), end=training_period_end))
    
    # Get the existing dates for the coin, explicitly cast to datetime
    existing_dates = set(pd.to_datetime(coin_df['date'].unique()))
    
    # Find the missing dates by subtracting existing from full date range
    missing_dates = set(full_date_range) - existing_dates
    
    # Store the missing dates for the current coin_id
    missing_dates_dict[coin_id] = sorted(missing_dates)

# Convert to DataFrame for easier display
missing_dates_df = pd.DataFrame(list(missing_dates_dict.items()), columns=['coin_id', 'missing_dates'])
missing_dates_df

Unnamed: 0,coin_id,missing_dates
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,[]


In [245]:
training_period_end = '2024-04-30'

df = buysell_metrics_df
missing_dates = df.groupby('coin_id')['date'].apply(
    lambda x: pd.date_range(start=x.min(), end=training_period_end).difference(x.unique())
)
# missing_dates_df = missing_dates.reset_index(level=0)
pd.DataFrame(missing_dates)

Unnamed: 0_level_0,date
coin_id,Unnamed: 1_level_1
04f6120a-f0dd-4260-bb2b-b8f827fdba61,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
0b9d343d-4e25-4d22-b49c-fa17509a0333,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
0db96a94-082b-4e13-a315-860850e9ff4f,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
0e1c102e-2e7d-4aed-af2d-1526c2e0720a,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
0eedc336-a78e-4b25-957e-57117227ef78,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
...,...
f68b64ae-61d5-4dd6-b448-4ae9c754bd07,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
f87b6a04-49f1-475c-8a0d-e65ddea3129c,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."
f9d19e1d-8637-4e7c-b808-4b4bfe18e316,"DatetimeIndex(['2024-01-01', '2024-01-02', '20..."


In [246]:
# Initialize a dictionary to store missing dates
missing_dates_dict = {}

# Iterate over unique coin_ids
for coin_id in df['coin_id'].unique():
    # Filter data for the current coin_id
    coin_df = df[df['coin_id'] == coin_id]
    
    # Create the full date range for the coin, explicitly cast to pd.Timestamp
    full_date_range = pd.to_datetime(pd.date_range(start=coin_df['date'].min(), end=training_period_end)).to_pydatetime()

    # Get the existing dates for the coin, explicitly cast to pd.Timestamp
    existing_dates = set(pd.to_datetime(coin_df['date'].unique()).to_pydatetime())
    
    # Find the missing dates by subtracting existing from full date range
    missing_dates = set(full_date_range) - existing_dates
    
    # Store the missing dates for the current coin_id
    missing_dates_dict[coin_id] = sorted(missing_dates)

# Convert to DataFrame for easier display
missing_dates_df = pd.DataFrame(list(missing_dates_dict.items()), columns=['coin_id', 'missing_dates'])

In [235]:
# Initialize a dictionary to store missing dates
missing_dates_dict = {}

# Iterate over unique coin_ids
for coin_id in df['coin_id'].unique():
    # Filter data for the current coin_id
    coin_df = df[df['coin_id'] == coin_id]
    
    # Create the full date range for the coin
    full_date_range = pd.date_range(start=coin_df['date'].min(), end=training_period_end)
    
    # Get the existing dates for the coin
    existing_dates = set(coin_df['date'].unique())
    
    # Find the missing dates by subtracting existing from full date range
    missing_dates = set(full_date_range) - existing_dates
    
    # Store the missing dates for the current coin_id
    missing_dates_dict[coin_id] = sorted(missing_dates)

# Convert to DataFrame for easier display
missing_dates_df = pd.DataFrame(list(missing_dates_dict.items()), columns=['coin_id', 'missing_dates'])
missing_dates_df

Unnamed: 0,coin_id,missing_dates
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
1,0b9d343d-4e25-4d22-b49c-fa17509a0333,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
2,0db96a94-082b-4e13-a315-860850e9ff4f,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
3,0e1c102e-2e7d-4aed-af2d-1526c2e0720a,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
4,0eedc336-a78e-4b25-957e-57117227ef78,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
...,...,...
113,f68b64ae-61d5-4dd6-b448-4ae9c754bd07,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
114,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
115,f87b6a04-49f1-475c-8a0d-e65ddea3129c,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."
116,f9d19e1d-8637-4e7c-b808-4b4bfe18e316,"[2024-01-01 00:00:00, 2024-01-02 00:00:00, 202..."


In [234]:
df.groupby('coin_id')['date'].count()

coin_id
04f6120a-f0dd-4260-bb2b-b8f827fdba61    121
0b9d343d-4e25-4d22-b49c-fa17509a0333    121
0db96a94-082b-4e13-a315-860850e9ff4f    121
0e1c102e-2e7d-4aed-af2d-1526c2e0720a    121
0eedc336-a78e-4b25-957e-57117227ef78    121
                                       ... 
f68b64ae-61d5-4dd6-b448-4ae9c754bd07    121
f7b278de-7fa7-4f87-ba5d-3eb2e57d933a    121
f87b6a04-49f1-475c-8a0d-e65ddea3129c    121
f9d19e1d-8637-4e7c-b808-4b4bfe18e316    121
fd1f38d9-5c05-4809-80fe-b67a07fd345c    121
Name: date, Length: 118, dtype: int64

In [219]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df, metrics_config, config['training_data']['training_period_end'])


ValueError: ('Timeseries contains missing dates. Ensure all dates are filled up to the training_period_end for all coins. Missing dates found: %s', coin_id
04f6120a-f0dd-4260-bb2b-b8f827fdba61    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
0b9d343d-4e25-4d22-b49c-fa17509a0333    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
0db96a94-082b-4e13-a315-860850e9ff4f    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
0e1c102e-2e7d-4aed-af2d-1526c2e0720a    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
0eedc336-a78e-4b25-957e-57117227ef78    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
                                                              ...                        
f68b64ae-61d5-4dd6-b448-4ae9c754bd07    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
f7b278de-7fa7-4f87-ba5d-3eb2e57d933a    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
f87b6a04-49f1-475c-8a0d-e65ddea3129c    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
f9d19e1d-8637-4e7c-b808-4b4bfe18e316    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
fd1f38d9-5c05-4809-80fe-b67a07fd345c    DatetimeIndex(['2024-01-01', '2024-01-02', '20...
Name: date, Length: 118, dtype: object)

In [216]:
# Flatten the buysell metrics DataFrame
flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df, metrics_config, config['training_data']['training_period_end'])

# Example: Verify that total_bought is aggregated as a sum
# Group original by coin_id and date for manual verification
expected_total_bought = buysell_metrics_df.groupby(['coin_id', 'date'])['total_bought'].sum().reset_index()

# Compare to the flattened result
result_total_bought = flattened_buysell_metrics_df[['coin_id', 'date', 'total_bought']]


ValueError: Timeseries contains missing dates. Ensure all dates are filled up to the training_period_end before calling flatten_coin_date_df().

In [218]:
config['training_data']['training_period_end']

'2024-04-30'