In [2]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()

# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)

# load configs
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [3]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

#### Load the datasets

In [4]:
# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])

# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])

# generate and flatten buysell_metrics
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])


[13/Sep/2024 14:32:25] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[13/Sep/2024 14:32:25] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[13/Sep/2024 14:32:25] INFO [dreams_core.core.<module>:4] Prices data shape: (110929, 3)
[13/Sep/2024 14:32:54] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (23823401, 5) after 28.8 seconds.
[13/Sep/2024 14:32:54] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[13/Sep/2024 14:33:44] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 18.85 seconds
[13/Sep/2024 14:34:06] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 21.97 seconds.
[13/Sep/2024 14:34:14] INFO [dreams_core.core.classify_shark_coins:772] creation of shark_coins_df complete.
[13/Sep/2024 14:34:14] INFO [dreams_cor

In [11]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')


# save the flattened df
output_dir = '..//modeling/outputs/flattened_outputs/'
metric_description = 'buysell_metrics'
modeling_period_start = config['training_data']['modeling_period_start']
version = '0.1'

file_location = fe.save_flattened_outputs(flattened_buysell_metrics_df, output_dir, metric_description, modeling_period_start, version)
file_location

# input_path = '..//modeling/outputs/flattened_outputs/flattened_output_2024-09-13_00-36_trainingstart_2024-05-01.csv'

# flattened_df = pd.read_csv(input_path)
# flattened_df.columns


# input_directory = '..//modeling/outputs/preprocessed_outputs/'
# input_filenames = ['flattened_output_2024-09-13_00-36_trainingstart_2024-05-01_preprocessed.csv']

'..//modeling/outputs/flattened_outputs/buysell_metrics_2024-09-13_14-45_model_period_2024-05-01_v0.1.csv'

In [19]:
input_path = '..//modeling/outputs/flattened_outputs/buysell_metrics_2024-09-13_14-44_model_period_2024-05-01_v0.1.csv'
pd.read_csv(input_path)

Unnamed: 0,coin_id,buyers_new_sum,buyers_new_mean,buyers_new_std,buyers_new_sum_7d_period_1,buyers_new_max_7d_period_1,buyers_new_change_7d_period_1,buyers_new_pct_change_7d_period_1,buyers_new_sum_7d_period_2,buyers_new_max_7d_period_2,...,total_bought_sum_7d_period_5,total_bought_change_7d_period_5,total_bought_sum_7d_period_6,total_bought_change_7d_period_6,total_bought_sum_7d_period_7,total_bought_change_7d_period_7,total_bought_sum_7d_period_8,total_bought_change_7d_period_8,total_sold_sum,total_buyers_sum
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,63,0.520661157025,2.02525378473,0,0,0,0,0,0,...,20089.086065,-2784.22774725,21977.7540765,-917.303515846,92588.3805934,-69123.0908494,0,0,75218.833379,117
1,0b9d343d-4e25-4d22-b49c-fa17509a0333,111,0.917355371901,4.31390537846,0,0,0,0,2,1,...,6080.67566472,-2014.5095628,8294.61265217,-2767.13632441,10090.9473855,-4302.86646049,8540.47910557,1528.59955508,913533.443143,239
2,0db96a94-082b-4e13-a315-860850e9ff4f,98,0.809917355372,5.28254050243,2,1,0,0,0,0,...,0,0,434891.74702,-206107.395938,225835.317112,225835.317112,1715870.17579,177008.783497,17754163.8538,183
3,0e1c102e-2e7d-4aed-af2d-1526c2e0720a,10,0.0826446280992,0.653028545312,0,0,0,0,0,0,...,0,0,0,0,0,0,287271.526847,0,2692156.0024,12
4,0eedc336-a78e-4b25-957e-57117227ef78,126,1.04132231405,8.62979016181,0,0,0,0,1,1,...,6949864.05293,36.8934881473,13002874.2547,-466411.972073,9007196.22715,-3761439.33482,15315039.5427,-4862882.41559,253189019.099,265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,f68b64ae-61d5-4dd6-b448-4ae9c754bd07,82,0.677685950413,3.16547752067,0,0,0,0,0,0,...,9973.36080645,995.08,21100.7794067,925.495241594,57989.0352859,21836.3581739,132762.409817,-9264.55315782,1378554.83495,250
114,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,30,0.247933884298,2.09873371241,0,0,0,0,0,0,...,177258.566632,0,7901.40510732,0,1977568.59051,-795814.488734,753591.387226,511721.417861,10466146.0714,48
115,f87b6a04-49f1-475c-8a0d-e65ddea3129c,2,0.0165289256198,0.128028061662,0,0,0,0,0,0,...,4422888.75896,0,0,0,0,0,0,0,4422888.75896,3
116,f9d19e1d-8637-4e7c-b808-4b4bfe18e316,59,0.487603305785,4.29168906624,1,1,0,0,1,1,...,338581.560072,33381.863181,0,0,0,0,0,0,189417.368226,129


In [23]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

import re

input_dir = '..//modeling/outputs/flattened_outputs/'
input_filenames = [
    'buysell_metrics_2024-09-13_14-44_model_period_2024-05-01_v0.1.csv'
    ,'buysell_metrics_2024-09-13_14-45_model_period_2024-05-01_v0.1.csv'
]
input_filenames

df = fe.create_training_data_df(input_dir,input_filenames)
df

IndexError: Boolean index has wrong length: 56 instead of 112