In [1]:
# --- standard library / third-party imports used across the notebook ---
import sys
import os
import time
import logging
import datetime
# NOTE(review): the next line rebinds the name `datetime` from the module
# (imported just above) to the `datetime.datetime` class, so
# `datetime.timedelta` / `datetime.date` are no longer reachable via `datetime.`.
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()

# import local files if necessary
# The path is relative, so it resolves against the kernel's current working
# directory; it must be appended BEFORE the project imports below.
sys.path.append('..//src')
from utils import load_config, cw_filter_df
# Each project module is imported and then immediately reloaded so that edits
# made on disk are picked up when this cell is re-run in a live kernel.
# The import/reload pairs are order-sensitive; keep them interleaved as-is.
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)

# load configs
# Three separate config files: general settings, metric definitions, and
# modeling settings. Later cells index `config` by 'training_data' and
# 'data_cleaning'.
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

# configure logger
# `logger` is a notebook-level global that later cells (including
# prepare_model_input_df) rely on.
# NOTE(review): presumably setup_logger() returns a logging.Logger — setLevel
# is called on it directly; confirm against dreams_core.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
# '.12g' renders floats in general format with up to 12 significant digits.
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# To restore the pandas default float display, run:
# pd.reset_option('display.float_format')

In [2]:
# Refresh the project modules (same order as before: td, cwm, fe) so that
# on-disk edits are picked up without restarting the kernel.
for _module in (td, cwm, fe):
    importlib.reload(_module)

# Re-read every config file so changes to the YAML files take effect.
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

### Training Data and Metrics Generation

In [3]:
logger.setLevel(logging.INFO)

# Shorthand for the two config sections this cell reads.
_training_cfg = config['training_data']
_cleaning_cfg = config['data_cleaning']

# Prices: retrieve, then fill gaps up to the configured maximum.
# The second return value of fill_prices_gaps is not used here.
prices_df, _ = td.fill_prices_gaps(
    td.retrieve_prices_data(),
    _cleaning_cfg['max_gap_days'],
)

# Transfers: bounded by the training/modeling period dates from the config.
transfers_df = td.retrieve_transfers_data(
    _training_cfg['training_period_start'],
    _training_cfg['modeling_period_start'],
    _training_cfg['modeling_period_end'],
)

# Profits: prepare -> wallet profitability -> cleaning.
# The second return value of clean_profits_df is not used here.
profits_df, _ = td.clean_profits_df(
    td.calculate_wallet_profitability(
        td.prepare_profits_data(transfers_df, prices_df)
    ),
    _cleaning_cfg,
)

# Shark classification: first per coin, then per wallet.
shark_coins_df = td.classify_shark_coins(profits_df, _training_cfg)
shark_wallets_df = td.classify_shark_wallets(shark_coins_df, _training_cfg)

# Cohort = wallets flagged is_shark, plus every coin present in shark_coins_df.
# The explicit `== True` comparison is kept so the mask behaves the same
# regardless of the column's dtype.
_is_shark = shark_wallets_df['is_shark'] == True  # noqa: E712
cohort_wallets = shark_wallets_df.loc[_is_shark, 'wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()

# Buy/sell metrics for the cohort up to the end of the training period.
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    _training_cfg['training_period_end'],
    cohort_wallets,
    cohort_coins,
)


[13/Sep/2024 21:54:12] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[13/Sep/2024 21:54:13] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[13/Sep/2024 21:54:43] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (23187686, 5) after 30.7 seconds.
[13/Sep/2024 21:54:43] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[13/Sep/2024 21:55:33] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 18.75 seconds
[13/Sep/2024 21:55:54] INFO [dreams_core.core.clean_profits_df:625] Finished cleaning profits_df after 21.20 seconds.
[13/Sep/2024 21:56:01] INFO [dreams_core.core.classify_shark_coins:691] creation of shark_coins_df complete.
[13/Sep/2024 21:56:01] INFO [dreams_core.core.generate_buysell_metrics_df:33] Preparing buysell_metrics_df...
[13/Sep/2024 21:56

### Feature Engineering

In [5]:
# Refresh project modules and configs so this cell can be iterated on in place.
for _module in (td, cwm, fe):
    importlib.reload(_module)

config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

_training_cfg = config['training_data']

# Settings passed to save_flattened_outputs for the flattened metrics file.
output_directory = '..//modeling/outputs/flattened_outputs/'
metric_description = 'buysell_metrics'
modeling_period_start = _training_cfg['modeling_period_start']
version = '0.1'

# Flatten the coin-date metrics, save them, then preprocess the saved file.
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    _training_cfg['training_period_end'],
)
flattened_output_filepath = fe.save_flattened_outputs(
    flattened_buysell_metrics_df,
    output_directory,
    metric_description,
    modeling_period_start,
    version,
)
preprocessed_output_filepath = fe.preprocess_coin_df(flattened_output_filepath, modeling_config)

# Split the preprocessed file path into its directory (everything up to and
# including 'preprocessed_outputs/') and the filename that follows it.
_path_parts = preprocessed_output_filepath.split('preprocessed_outputs/')
input_directory = f"{_path_parts[0]}preprocessed_outputs/"
input_filenames = [_path_parts[1]]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# Target variables; the second return value is not used here.
target_variable_df, _ = fe.create_target_variables_mooncrater(
    prices_df,
    _training_cfg,
    modeling_config,
)



[13/Sep/2024 21:56:28] INFO [dreams_core.core.flatten_coin_date_df:82] Flattening columns ['buyers_new', 'total_bought', 'total_sold', 'total_buyers'] into coin-level features...
[13/Sep/2024 21:56:28] INFO [dreams_core.core.flatten_coin_date_df:98] Flattened input df into coin-level features with shape (114, 56) after 0.09 seconds.
[13/Sep/2024 21:56:28] INFO [dreams_core.core.create_training_data_df:500] 1 files were successfully merged.
[13/Sep/2024 21:56:28] INFO [dreams_core.core.create_training_data_df:504] All specified files were found and merged successfully.
[13/Sep/2024 21:56:28] INFO [dreams_core.core.create_target_variables_mooncrater:582] Target variables created for 334 coins with 29/334 (8.7%) moons and 91/334 (27.2%) craters.


In [7]:
import pandas as pd
import logging

def prepare_model_input_df(training_data_df: pd.DataFrame, target_variable_df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the final model input DataFrame by inner-joining the training features with the
    target variables on 'coin_id'.

    Data quality handling:
    - raises if either input lacks a 'coin_id' column or contains duplicate 'coin_id' values
    - warns (via the notebook-level `logger`) when non-key columns exist in both inputs, since
      the merge will rename them with '_x'/'_y' suffixes
    - warns when coin_ids in training_data_df have no target row; these rows are dropped by the
      inner join. The ids are listed in sorted order so the message is reproducible across runs.
    - warns when the merged result contains any NaN values

    Parameters:
    - training_data_df: DataFrame containing the features for training the model.
    - target_variable_df: DataFrame containing target variables for each coin_id.

    Returns:
    - model_input_df: Merged DataFrame with both features and target variables, one row per
      coin_id present in both inputs.

    Raises:
    - ValueError: if 'coin_id' is missing from, or duplicated within, either input.

    Note: this function uses the global `logger` defined in the notebook's setup cell.
    """
    named_inputs = (
        ('training_data_df', training_data_df),
        ('target_variable_df', target_variable_df),
    )

    # Step 1: both frames must expose the join key
    for frame_name, frame in named_inputs:
        if 'coin_id' not in frame.columns:
            raise ValueError(f"The 'coin_id' column is missing in {frame_name}")

    # Step 2: the join key must be unique on both sides so the merge stays one-to-one
    for frame_name, frame in named_inputs:
        if frame['coin_id'].duplicated().any():
            raise ValueError(f"Duplicate 'coin_id' found in {frame_name}")

    # Step 3: surface column-name collisions; pandas would otherwise silently suffix them
    shared_columns = (set(training_data_df.columns) & set(target_variable_df.columns)) - {'coin_id'}
    if shared_columns:
        logger.warning(
            "Non-key columns present in both DataFrames will receive merge suffixes: %s",
            ', '.join(sorted(map(str, shared_columns)))
        )

    # Step 4: report training coins that have no target row (they are dropped by the inner join).
    # Sorting makes the log line deterministic; raw set iteration order is not.
    missing_targets = sorted(
        map(str, set(training_data_df['coin_id']) - set(target_variable_df['coin_id']))
    )
    if missing_targets:
        logger.warning("Some 'coin_id's are missing target variables: %s", ', '.join(missing_targets))

    # Step 5: merge features with targets on the key
    model_input_df = pd.merge(training_data_df, target_variable_df, on='coin_id', how='inner')

    # Step 6: final quality check on the merged result
    if model_input_df.isnull().any().any():
        logger.warning("NaN values found in the merged DataFrame")

    logger.info("Model input DataFrame created with shape: %s", model_input_df.shape)

    return model_input_df

In [8]:
# Join the engineered features with the moon/crater targets into one modeling frame.
model_input_df = prepare_model_input_df(
    training_data_df=training_data_df,
    target_variable_df=target_variable_df,
)

In [9]:
# Display the merged features + targets frame as this cell's output
# (a bare last expression is rendered with the DataFrame's rich repr).
model_input_df

Unnamed: 0,coin_id,buyers_new_sum_buysell_metrics,buyers_new_mean_buysell_metrics,buyers_new_std_buysell_metrics,buyers_new_sum_7d_period_1_buysell_metrics,buyers_new_max_7d_period_1_buysell_metrics,buyers_new_change_7d_period_1_buysell_metrics,buyers_new_pct_change_7d_period_1_buysell_metrics,buyers_new_sum_7d_period_2_buysell_metrics,buyers_new_max_7d_period_2_buysell_metrics,...,total_bought_sum_7d_period_6_buysell_metrics,total_bought_change_7d_period_6_buysell_metrics,total_bought_sum_7d_period_7_buysell_metrics,total_bought_change_7d_period_7_buysell_metrics,total_bought_sum_7d_period_8_buysell_metrics,total_bought_change_7d_period_8_buysell_metrics,total_sold_sum_buysell_metrics,total_buyers_sum_buysell_metrics,is_moon,is_crater
0,04f6120a-f0dd-4260-bb2b-b8f827fdba61,34,0.566666666667,1.62987642941,0,0,0,0,0,0,...,14856.8662266,1211.11961168,45413.2778779,-34568.380127,0,0,50221.2439699,63,1,0
1,0b9d343d-4e25-4d22-b49c-fa17509a0333,25,0.416666666667,2.23448827545,0,0,0,0,0,0,...,3250,-250,1923.35001536,-1115.42099899,3558.43510787,-1763.77561525,56488.7082947,42,0,1
2,0db96a94-082b-4e13-a315-860850e9ff4f,16,0.266666666667,1.68610137688,2,1,0,0,0,0,...,0,0,0,0,1191933.80318,0,857068.688664,21,0,0
3,0e1c102e-2e7d-4aed-af2d-1526c2e0720a,2,0.0333333333333,0.258198889747,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
4,0eedc336-a78e-4b25-957e-57117227ef78,48,0.8,4.77919218761,0,0,0,0,1,1,...,1934810.43636,0,18303635.8375,-10591739.9837,11833138,-5205198,62088054.3225,78,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,f68b64ae-61d5-4dd6-b448-4ae9c754bd07,35,0.583333333333,2.40966744078,0,0,0,0,0,0,...,13649.7769231,1680.5378172,33708.7201122,22172.1960451,104225.006961,-17493.7641044,345315.704962,87,0,0
110,f7b278de-7fa7-4f87-ba5d-3eb2e57d933a,13,0.216666666667,1.42723706851,0,0,0,0,0,0,...,27.863588,0,1563811.87629,-599700.122691,511721.417861,511721.417861,10000,20,0,0
111,f87b6a04-49f1-475c-8a0d-e65ddea3129c,2,0.0333333333333,0.181020334719,0,0,0,0,1,1,...,0,0,0,0,0,0,0,3,0,0
112,f9d19e1d-8637-4e7c-b808-4b4bfe18e316,26,0.433333333333,2.47952063893,0,0,0,0,0,0,...,0,0,0,0,0,0,53673.7024709,71,0,0
