In [1]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()

# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)

# load configs
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [2]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


### Training Data (profits_df) Generation

In [3]:
logger.setLevel(logging.INFO)

# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])



[15/Sep/2024 01:03:41] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 01:03:41] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[15/Sep/2024 01:04:03] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (15703125, 5) after 21.8 seconds.
[15/Sep/2024 01:04:03] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[15/Sep/2024 01:04:34] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 11.98 seconds
[15/Sep/2024 01:04:47] INFO [dreams_core.core.clean_profits_df:625] Finished cleaning profits_df after 12.68 seconds.


In [28]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')


X_train,X_test,y_train,y_test = i.build_configured_model_input(profits_df, prices_df, config, metrics_config, modeling_config)

[15/Sep/2024 01:50:47] INFO [dreams_core.core.classify_wallet_cohort:714] Wallet cohort classification complete. 759/68423 eligible wallets were added to the cohort.
[15/Sep/2024 01:50:47] INFO [dreams_core.core.generate_buysell_metrics_df:32] Preparing buysell_metrics_df...
[15/Sep/2024 01:50:49] INFO [dreams_core.core.generate_buysell_metrics_df:93] Generated buysell_metrics_df after 2.57 seconds.
[15/Sep/2024 01:50:50] INFO [dreams_core.core.flatten_coin_date_df:84] Flattening columns ['buyers_new', 'total_bought', 'total_sold', 'total_buyers'] into coin-level features...
[15/Sep/2024 01:50:50] INFO [dreams_core.core.flatten_coin_date_df:100] Flattened input df into coin-level features with shape (93, 57) after 0.10 seconds.
[15/Sep/2024 01:50:50] INFO [dreams_core.core.preprocess_coin_df:425] Preprocessed file saved at: ..//modeling/outputs/preprocessed_outputs/sharks_cohort_2024-09-15_01-50_model_period_2024-03-01_preprocessed.csv
[15/Sep/2024 01:50:50] INFO [dreams_core.core.crea

### Metrics and Feature Engineering

In [12]:
modeling_config[model]

'/outputs/flattened_outputs/'

In [14]:
print(modeling_config['modeling']['modeling_folder'])
print(os.path.join(modeling_config['modeling']['modeling_folder'],'/outputs/flattened_outputs/'))

..//modeling
/outputs/flattened_outputs/


In [24]:
modeling_config['modeling']['train_test_split']

'sharks cohort'

In [4]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')


# identify cohort
cohort_summary_df = td.classify_wallet_cohort(profits_df, config['wallet_cohorts']['sharks'])

# generate and flatten buysell_metrics
cohort_wallets = cohort_summary_df[cohort_summary_df['in_cohort']==True]['wallet_address']
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets)

# flatten, save, and preprocess the flattened df
flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'/outputs/flattened_outputs/')
cohort_name = list(config['wallet_cohorts'].keys())[0]
metric_description = f"{cohort_name}_cohort"

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])
flattened_df, flattened_filepath = fe.save_flattened_outputs(flattened_buysell_metrics_df, flattened_output_directory, metric_description, config['training_data']['modeling_period_start'])
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(flattened_filepath, modeling_config, metrics_config)

# create the training data df
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [
    preprocessed_filepath.split('preprocessed_outputs/')[1]
]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# create the target variable df
target_variable_df,_ = fe.create_target_variables_mooncrater(prices_df, config['training_data'], modeling_config)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

# split the df into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_config['modeling']['target_column'],
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['train_test_split']
)

[15/Sep/2024 01:04:53] INFO [dreams_core.core.classify_wallet_cohort:714] Wallet cohort classification complete. 759/68423 eligible wallets were added to the cohort.
[15/Sep/2024 01:04:53] INFO [dreams_core.core.generate_buysell_metrics_df:32] Preparing buysell_metrics_df...
[15/Sep/2024 01:04:56] INFO [dreams_core.core.generate_buysell_metrics_df:93] Generated buysell_metrics_df after 2.60 seconds.
[15/Sep/2024 01:04:56] INFO [dreams_core.core.flatten_coin_date_df:84] Flattening columns ['buyers_new', 'buyers_repeat', 'sellers_new', 'sellers_repeat', 'total_bought', 'total_sold', 'total_net_transfers', 'total_holders', 'total_balance'] into coin-level features...
[15/Sep/2024 01:04:56] INFO [dreams_core.core.flatten_coin_date_df:100] Flattened input df into coin-level features with shape (93, 204) after 0.24 seconds.
[15/Sep/2024 01:04:56] INFO [dreams_core.core.preprocess_coin_df:423] Preprocessed file saved at: ..//modeling/outputs/preprocessed_outputs/buysell_metrics_0.1_2024-09-15

## Full Workflow

In [44]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


from tqdm.notebook import tqdm


def run_experiments(method, config_folder, modeling_folder, max_evals=50):
    """
    Runs experiments using a specified search method (grid or random), builds models,
    and logs the results of each experiment.

    Args:
    - method (str): 'grid' or 'random' to select the search method.
    - config_folder (str): Path to the folder containing all configuration files.
    - modeling_folder (str): Path to the folder where models, logs, and results will be saved.
    - max_evals (int): Number of iterations for Random search (default is 50).
    """

    # 1. Generate the experiment configurations
    experiment_configurations = i.generate_experiment_configurations(config_folder, method=method, max_evals=max_evals)

    # Generate prices_df
    config = load_config(os.path.join(config_folder,'config.yaml'))
    prices_df = td.retrieve_prices_data()
    prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])

    # 2. Iterate through each configuration with a progress bar
    with tqdm(total=len(experiment_configurations), desc="Running Experiments") as pbar:
        for experiment in experiment_configurations:
            
            # 2.1 Prepare the full configuration by applying overrides from the current experiment config
            config, metrics_config, modeling_config = i.prepare_configs(config_folder, experiment)
            
            # 2.2 Retrieve or rebuild profits_df based on config changes
            profits_df = i.rebuild_profits_df_if_necessary(config, modeling_folder)
            
            # 2.3 Build the configured model input data (train/test data)
            X_train, X_test, y_train, y_test = i.build_configured_model_input(profits_df, prices_df, config, metrics_config, modeling_config)

            # 2.4 Train the model using the current configuration and log the results
            model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['hyperparameters'])
            
            # 2.5 Evaluate the model's performance on the test set
            metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

            # 2.6 Log the experiment results for this configuration
            m.log_experiment_results(modeling_folder, model_id, config, metrics)

            # Update the progress bar after each experiment
            pbar.update(1)


    # 3. Compare all experiments and analyze the best-performing configuration
    i.analyze_experiments(modeling_folder)


method = 'random'
config_folder = '../config'
modeling_folder = modeling_config['modeling']['modeling_folder']
max_evals = 10
run_experiments(method, config_folder, modeling_folder, max_evals)

[15/Sep/2024 11:19:24] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 11:19:24] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.


Running Experiments:   0%|          | 0/10 [00:00<?, ?it/s]

[15/Sep/2024 11:19:45] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (15703125, 5) after 20.3 seconds.
[15/Sep/2024 11:19:47] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 11:19:47] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[15/Sep/2024 11:19:48] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[15/Sep/2024 11:20:21] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 12.43 seconds
[15/Sep/2024 11:20:37] INFO [dreams_core.core.clean_profits_df:625] Finished cleaning profits_df after 16.39 seconds.
[15/Sep/2024 11:20:44] INFO [dreams_core.core.classify_wallet_cohort:714] Wallet cohort classification complete. 535/29142 eligible wallets were added to the cohort.
[15/Sep/2024 11:20:44] INFO [dreams_core.core.generate_buysell_metrics_

AttributeError: module 'insights' has no attribute 'train_model'

In [39]:
experiment_configurations[0]

{'modeling_config.target_variables.moon_threshold': 0.3,
 'modeling_config.preprocessing.drop_features': ['total_sellers_sum',
  'buyers_new_median'],
 'metrics_config.metrics.total_bought.aggregations.sum.scaling': 'standard',
 'metrics_config.metrics.total_bought.aggregations.median.scaling': 'None',
 'metrics_config.metrics.buyers_new.aggregations.sum.scaling': 'minmax',
 'metrics_config.metrics.buyers_new.aggregations.mean.scaling': 'minmax',
 'config.wallet_cohorts.sharks.wallet_minimum_inflows': 10000,
 'config.wallet_cohorts.sharks.wallet_min_coin_wins': 3,
 'config.wallet_cohorts.sharks.wallet_maximum_inflows': 100000,
 'config.wallet_cohorts.sharks.coin_return_win_threshold': 0.75,
 'config.wallet_cohorts.sharks.coin_profits_win_threshold': 5000,
 'config.training_data.training_period_duration': 180,
 'config.training_data.modeling_period_duration': 14,
 'config.data_cleaning.profitability_filter': 15000000,
 'config.data_cleaning.inflows_filter': 10000000}

In [31]:
# 1. Identify cohort of wallets (e.g., sharks) based on the cohort classification logic
cohort_summary_df = td.classify_wallet_cohort(profits_df, config['wallet_cohorts']['sharks'])
print(f"Shape of cohort_summary_df: {cohort_summary_df.shape}")

# 2. Generate buysell metrics for wallets in the identified cohort
cohort_wallets = cohort_summary_df[cohort_summary_df['in_cohort']]['wallet_address']
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    config['training_data']['training_period_end'],
    cohort_wallets
)
print(f"Shape of buysell_metrics_df: {buysell_metrics_df.shape}")

# 3. Flatten the buysell metrics DataFrame, save it, and preprocess it
flattened_output_directory = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs/'
)
cohort_name = list(config['wallet_cohorts'].keys())[0]
metric_description = f"{cohort_name}_cohort"

flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end']
)
print(f"Shape of flattened_buysell_metrics_df: {flattened_buysell_metrics_df.shape}")

# Save flattened outputs
flattened_buysell_metrics_df, flattened_filepath = fe.save_flattened_outputs(
    flattened_buysell_metrics_df,
    flattened_output_directory,
    metric_description,
    config['training_data']['modeling_period_start']
)
print(f"Shape of flattened_buysell_metrics_df after saving: {flattened_buysell_metrics_df.shape}")
print(f"Flattened outputs saved at: {flattened_filepath}")

# Preprocess the flattened DataFrame
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(
    flattened_filepath,
    modeling_config,
    metrics_config
)
print(f"Shape of preprocessed_df: {preprocessed_df.shape}")
print(f"Preprocessed outputs saved at: {preprocessed_filepath}")

# 4. Create training data from the preprocessed DataFrame
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [preprocessed_filepath.split('preprocessed_outputs/')[1]]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)
print(f"Shape of training_data_df: {training_data_df.shape}")

# 5. Create the target variable DataFrame based on price changes
target_variable_df, _ = fe.create_target_variables_mooncrater(
    prices_df,
    config['training_data'],
    modeling_config
)
print(f"Shape of target_variable_df: {target_variable_df.shape}")

# 6. Merge the training data with the target variables to create the model input DataFrame
model_input_df = fe.prepare_model_input_df(
    training_data_df,
    target_variable_df,
    modeling_config['modeling']['target_column']
)
print(f"Shape of model_input_df: {model_input_df.shape}")

# 7. Split the data into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_config['modeling']['target_column'],
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state']
)
print(f"Shapes of X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

KeyboardInterrupt: 