In [None]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# Load environment variables through python-dotenv before anything reads them
load_dotenv()


# Put the sibling src directory on the import path, then import the local
# modules. Each aliased module is reloaded immediately after import so that
# edits made to its source are picked up when this cell is re-run.
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)


# Create the project logger and set its level to INFO
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Display floats in general format with up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# To undo the custom float format: pd.reset_option('display.float_format')

#### Load the datasets

In [None]:
# Pick up any source edits to the local modules before running the pipeline
for _module in (td, cwm, fe):
    importlib.reload(_module)

# Load both configuration files and keep handles on the sections used below
config = load_config('config.yaml')
metrics_config = load_config('metrics_config.yaml')
training_cfg = config['training_data']
cleaning_cfg = config['data_cleaning']

# Prices: retrieve, then fill gaps up to the configured maximum gap length
prices_df = td.retrieve_prices_data()
prices_df, _ = td.fill_prices_gaps(prices_df, cleaning_cfg['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# Transfers: retrieved using the configured training/modeling period boundaries
transfers_df = td.retrieve_transfers_data(
    training_cfg['training_period_start'],
    training_cfg['modeling_period_start'],
    training_cfg['modeling_period_end'],
)

# Profits: prepare from transfers and prices, add profitability, then clean
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df, _ = td.clean_profits_df(profits_df, cleaning_cfg)

# Shark classification at the coin level, then at the wallet level
shark_coins_df = td.classify_shark_coins(profits_df, training_cfg)
shark_wallets_df = td.classify_shark_wallets(shark_coins_df, training_cfg)

# Cohort = wallets flagged is_shark, and every coin present in shark_coins_df
is_shark_mask = shark_wallets_df['is_shark'] == True
cohort_wallets = shark_wallets_df.loc[is_shark_mask, 'wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

# Buy/sell metrics for the cohort, followed by the flattened version
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    training_cfg['training_period_end'],
    cohort_wallets,
    cohort_coins,
)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    training_cfg['training_period_end'],
)


In [None]:
# Reload the local modules so recent source edits take effect
for _mod in (td, cwm, fe):
    importlib.reload(_mod)


# Rebuild the cohort: wallets flagged is_shark and all coins in shark_coins_df
shark_rows = shark_wallets_df.loc[shark_wallets_df['is_shark'] == True]
cohort_wallets = shark_rows['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

# Regenerate the buy/sell metrics and their flattened form
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    config['training_data']['training_period_end'],
    cohort_wallets,
    cohort_coins,
)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end'],
)

# Show the column dtypes of the regenerated metrics
buysell_metrics_df.dtypes

In [None]:
# Load the buysell metrics test fixture and make sure `date` is datetime64[ns]
buysell_metrics_df_test = pd.read_csv('../tests/fixtures/buysell_metrics_df.csv')
buysell_metrics_df_test['date'] = pd.to_datetime(buysell_metrics_df_test['date']).astype('datetime64[ns]')

# NOTE: from this cell onward `buysell_metrics_df` refers to the fixture, not
# to the metrics generated earlier in the notebook. Both names point at the
# same DataFrame object (no copy is made).
buysell_metrics_df = buysell_metrics_df_test

# Preview goes last: only the final expression of a cell is rendered, so a
# bare .head() in the middle of the cell would be silently discarded.
buysell_metrics_df_test.head()

In [None]:
# NOTE(review): `zero_activity_rows` is not assigned in any cell visible in this
# notebook; on a fresh kernel this cell presumably raises NameError. Confirm
# where it is defined, or remove the cell.
zero_activity_rows

In [None]:
# Preview the first rows of whatever `buysell_metrics_df` currently refers to
buysell_metrics_df.head()

In [None]:
# Inspect the `total_bought_sum` column of the flattened metrics frame
flattened_buysell_metrics_df['total_bought_sum']

In [None]:

# Introduce an outlier in total_bought on a copy, leaving buysell_metrics_df untouched
outlier_df = buysell_metrics_df.copy()
# Positional write: always overwrites the first row regardless of index labels.
# A label-based `.loc[0, ...]` would instead append a brand-new, mostly-NaN row
# whenever label 0 is absent from the index.
outlier_df.iloc[0, outlier_df.columns.get_loc('total_bought')] = 1e12  # Extreme value

# Flatten into a dedicated variable so the shared `flattened_buysell_metrics_df`
# is not replaced by outlier-polluted results for the cells that follow
flattened_outlier_df = fe.flatten_coin_date_df(outlier_df, metrics_config, config['training_data']['training_period_end'])

# The injected value must survive aggregation into total_bought_sum
assert flattened_outlier_df['total_bought_sum'].max() >= 1e12, "Outlier in total_bought not handled correctly"

In [None]:
# NOTE(review): `full_date_range` is only assigned inside the per-coin loops in
# later cells of this notebook, so this cell works only after one of them has
# run; it then shows the range left over from the last coin iterated.
full_date_range

In [None]:
# NOTE(review): `missing_dates` is only assigned in later cells of this notebook
# (as a per-coin set inside the loops and as a groupby result), so what is
# shown here depends on which of those cells ran most recently.
missing_dates

In [None]:
# Inputs for the gap check. They are assigned here, to the same values the
# following cell uses, so this cell no longer depends on a later cell having
# been executed first (which fails on Restart & Run All).
df = buysell_metrics_df
training_period_end = '2024-04-30'

# Maps each coin_id to the sorted list of dates that are absent between the
# coin's earliest `date` and training_period_end
missing_dates_dict = {}

# Iterate over unique coin_ids
for coin_id in df['coin_id'].unique():
    # Filter data for the current coin_id
    coin_df = df[df['coin_id'] == coin_id]

    # Full daily range for the coin; pd.date_range already returns a
    # DatetimeIndex, so no additional datetime cast is required
    full_date_range = pd.date_range(start=coin_df['date'].min(), end=training_period_end)

    # Existing dates for the coin, explicitly cast to datetime so they compare
    # equal to the Timestamps produced by the range above
    existing_dates = set(pd.to_datetime(coin_df['date'].unique()))

    # Missing dates = full range minus the dates that are present
    missing_dates = set(full_date_range) - existing_dates

    # Store the missing dates for the current coin_id
    missing_dates_dict[coin_id] = sorted(missing_dates)

# Convert to DataFrame for easier display
missing_dates_df = pd.DataFrame(list(missing_dates_dict.items()), columns=['coin_id', 'missing_dates'])
missing_dates_df

In [None]:
# Short alias for the current buysell metrics frame
df = buysell_metrics_df

# Upper bound of the date range each coin is checked against
training_period_end = '2024-04-30'

# Per coin: the daily range from its earliest date through training_period_end,
# minus the dates that actually occur for that coin
missing_dates = (
    df.groupby('coin_id')['date']
    .apply(
        lambda coin_dates: pd.date_range(
            start=coin_dates.min(),
            end=training_period_end,
        ).difference(coin_dates.unique())
    )
)

# Wrap the result in a DataFrame for display
pd.DataFrame(missing_dates)

In [None]:
# Per-coin gap detection where both sides of the set difference are converted
# with to_pydatetime() before being compared
missing_dates_dict = {}

for coin_id in df['coin_id'].unique():
    # Rows belonging to this coin only
    coin_df = df.loc[df['coin_id'] == coin_id]

    # Daily range from the coin's earliest date through training_period_end
    full_date_range = pd.to_datetime(
        pd.date_range(start=coin_df['date'].min(), end=training_period_end)
    ).to_pydatetime()

    # Dates present for the coin, converted the same way as the full range
    existing_dates = set(
        pd.to_datetime(coin_df['date'].unique()).to_pydatetime()
    )

    # Dates in the full range that the coin has no row for
    missing_dates = set(full_date_range).difference(existing_dates)

    # Record them in chronological order
    missing_dates_dict[coin_id] = sorted(missing_dates)

# One row per coin_id, with its list of missing dates in the second column
missing_dates_df = pd.DataFrame(
    list(missing_dates_dict.items()),
    columns=['coin_id', 'missing_dates'],
)

In [None]:
# Per-coin gap detection without any explicit datetime conversion
missing_dates_dict = {}

for coin_id in df['coin_id'].unique():
    # Rows belonging to this coin only
    coin_df = df.loc[df['coin_id'] == coin_id]

    # Daily range from the coin's earliest date through training_period_end
    full_date_range = pd.date_range(
        start=coin_df['date'].min(),
        end=training_period_end,
    )

    # Distinct dates present for the coin
    existing_dates = set(coin_df['date'].unique())

    # Dates in the full range that the coin has no row for
    missing_dates = set(full_date_range).difference(existing_dates)

    # Record them in sorted order
    missing_dates_dict[coin_id] = sorted(missing_dates)

# One row per coin_id, with its list of missing dates in the second column
missing_dates_df = pd.DataFrame(
    list(missing_dates_dict.items()),
    columns=['coin_id', 'missing_dates'],
)
missing_dates_df

In [None]:
# Number of non-null `date` values per coin_id in `df`
df.groupby('coin_id')['date'].count()

In [None]:
# Refresh the local modules so the flatten step runs the latest source
for _mod in (td, cwm, fe):
    importlib.reload(_mod)

# Re-flatten the current buysell metrics with the configured training period end
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end'],
)


In [None]:
# Flatten the buysell metrics DataFrame
flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df, metrics_config, config['training_data']['training_period_end'])

# Example: Verify that total_bought is aggregated as a sum
# Group original by coin_id and date for manual verification
# (sums total_bought per (coin_id, date) pair and turns the group keys back into columns)
expected_total_bought = buysell_metrics_df.groupby(['coin_id', 'date'])['total_bought'].sum().reset_index()

# Compare to the flattened result
# NOTE(review): other cells in this notebook read `total_bought_sum` from the
# flattened frame, so the `date` and `total_bought` columns selected here may
# not exist in it — confirm against fe.flatten_coin_date_df's output schema.
# NOTE(review): no comparison between expected_total_bought and
# result_total_bought is actually performed in this cell.
result_total_bought = flattened_buysell_metrics_df[['coin_id', 'date', 'total_bought']]


In [None]:
# Show the training period end value currently held in the loaded config
config['training_data']['training_period_end']