In [26]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()

# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)

# Load the four YAML config files used throughout the notebook.
config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)

# Project logger, set to INFO so pipeline progress messages are shown.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


def _format_float(x):
    """Render a float with up to 12 significant digits for DataFrame display."""
    return f'{x:.12g}'


# Custom format function for displaying numbers
pd.set_option('display.float_format', _format_float)
# pd.reset_option('display.float_format')

In [3]:
# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every config file in case it changed since the last run.
config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)


### Training Data (profits_df) Generation

In [5]:
logger.setLevel(logging.INFO)

# Prices: fetch the raw series, then pass it through gap filling with the
# configured max_gap_days limit (second return value is discarded).
prices_df, _ = td.fill_prices_gaps(
    td.retrieve_prices_data(),
    config['data_cleaning']['max_gap_days'],
)

# Transfers: the three period boundaries are passed positionally in this order.
training_periods = config['training_data']
transfers_df = td.retrieve_transfers_data(
    training_periods['training_period_start'],
    training_periods['modeling_period_start'],
    training_periods['modeling_period_end'],
)

# Profits: prepare -> calculate wallet profitability -> clean
# (the cleaning step's second return value is discarded).
profits_df, _ = td.clean_profits_df(
    td.calculate_wallet_profitability(
        td.prepare_profits_data(transfers_df, prices_df)
    ),
    config['data_cleaning'],
)



[15/Sep/2024 15:48:10] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 15:48:11] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[15/Sep/2024 15:48:34] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (15703125, 5) after 23.6 seconds.
[15/Sep/2024 15:48:34] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[15/Sep/2024 15:49:08] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 12.43 seconds
[15/Sep/2024 15:49:22] INFO [dreams_core.core.clean_profits_df:625] Finished cleaning profits_df after 14.43 seconds.


In [6]:
# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read the configs consumed by the model-input builder.
config, metrics_config, modeling_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
    ),
)

# Build the train/test features and targets in a single call; relies on
# profits_df and prices_df produced by an earlier cell.
X_train, X_test, y_train, y_test = i.build_configured_model_input(
    profits_df,
    prices_df,
    config,
    metrics_config,
    modeling_config,
)

[15/Sep/2024 15:49:28] INFO [dreams_core.core.classify_wallet_cohort:714] Wallet cohort classification complete. 759/68423 eligible wallets were added to the cohort.
[15/Sep/2024 15:49:29] INFO [dreams_core.core.generate_buysell_metrics_df:32] Preparing buysell_metrics_df...
[15/Sep/2024 15:49:31] INFO [dreams_core.core.generate_buysell_metrics_df:93] Generated buysell_metrics_df after 2.64 seconds.
[15/Sep/2024 15:49:31] INFO [dreams_core.core.flatten_coin_date_df:84] Flattening columns ['buyers_new', 'total_bought', 'total_sold', 'total_buyers'] into coin-level features...
[15/Sep/2024 15:49:31] INFO [dreams_core.core.flatten_coin_date_df:100] Flattened input df into coin-level features with shape (93, 57) after 0.10 seconds.
[15/Sep/2024 15:49:31] INFO [dreams_core.core.preprocess_coin_df:424] Preprocessed file saved at: ..//modeling/outputs/preprocessed_outputs/sharks_cohort_2024-09-15_15-49_model_period_2024-03-01_preprocessed.csv
[15/Sep/2024 15:49:31] INFO [dreams_core.core.crea

AttributeError: 'Series' object has no attribute 'set_index'

### Metrics and Feature Engineering

In [22]:
# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

config, metrics_config, modeling_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
    ),
)


# --- cohort selection ---------------------------------------------------
cohort_summary_df = td.classify_wallet_cohort(profits_df, config['wallet_cohorts']['sharks'])
in_cohort_mask = cohort_summary_df['in_cohort']==True
cohort_wallets = cohort_summary_df.loc[in_cohort_mask, 'wallet_address']

# --- buy/sell metrics for the cohort wallets ----------------------------
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    config['training_data']['training_period_end'],
    cohort_wallets,
)

# --- flatten, save, preprocess ------------------------------------------
flattened_output_directory = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs',
)
# The file description uses the first cohort key listed in the config.
cohort_name = list(config['wallet_cohorts'])[0]
metric_description = f"{cohort_name}_cohort"

flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end'],
)
flattened_df, flattened_filepath = fe.save_flattened_outputs(
    flattened_buysell_metrics_df,
    flattened_output_directory,
    metric_description,
    config['training_data']['modeling_period_start'],
)
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(
    flattened_filepath,
    modeling_config,
    metrics_config,
)

# --- training data ------------------------------------------------------
# Split the saved path around the 'preprocessed_outputs/' folder to recover
# the directory (with trailing slash) and the bare filename.
_path_parts = preprocessed_filepath.split('preprocessed_outputs/')
input_directory = f"{_path_parts[0]}preprocessed_outputs/"
input_filenames = [_path_parts[1]]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# --- target variable ----------------------------------------------------
target_variable_df, _ = fe.create_target_variables_mooncrater(
    prices_df,
    config['training_data'],
    modeling_config,
)

# --- final model input and train/test split -----------------------------
modeling_settings = modeling_config['modeling']
model_input_df = fe.prepare_model_input_df(
    training_data_df,
    target_variable_df,
    modeling_settings['target_column'],
)
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_settings['target_column'],
    modeling_settings['train_test_split'],
    modeling_settings['random_state'],
)

[15/Sep/2024 16:04:35] INFO [dreams_core.core.classify_wallet_cohort:714] Wallet cohort classification complete. 759/68423 eligible wallets were added to the cohort.
[15/Sep/2024 16:04:35] INFO [dreams_core.core.generate_buysell_metrics_df:32] Preparing buysell_metrics_df...
[15/Sep/2024 16:04:38] INFO [dreams_core.core.generate_buysell_metrics_df:93] Generated buysell_metrics_df after 2.91 seconds.
[15/Sep/2024 16:04:38] INFO [dreams_core.core.flatten_coin_date_df:84] Flattening columns ['buyers_new', 'total_bought', 'total_sold', 'total_buyers'] into coin-level features...
[15/Sep/2024 16:04:38] INFO [dreams_core.core.flatten_coin_date_df:100] Flattened input df into coin-level features with shape (93, 57) after 0.10 seconds.
[15/Sep/2024 16:04:38] INFO [dreams_core.core.preprocess_coin_df:424] Preprocessed file saved at: ..//modeling/outputs/preprocessed_outputs/sharks_cohort_2024-09-15_16-04_model_period_2024-03-01_preprocessed.csv
[15/Sep/2024 16:04:38] INFO [dreams_core.core.crea

In [25]:
# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

config, metrics_config, modeling_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
    ),
)


modeling_settings = modeling_config['modeling']
modeling_folder = modeling_settings['modeling_folder']
experiment_id = 'dev_work'

# 3.4 Fit a model on the training split (X_train / y_train come from an earlier cell)
model, model_id = m.train_model(
    X_train,
    y_train,
    modeling_folder,
    modeling_settings['model_params'],
)

# 3.5 Score the fitted model against the held-out test split
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# 3.6 Log the run under experiment_id; kept as the final expression so the
# returned value is displayed as the cell output
m.log_experiment_results(modeling_folder, model_id, experiment_id)

{'experiment_id': 'dev_work',
 'Model ID': 'a286e9e0-90c9-4b1e-b938-35d353e9fb82',
 'Model parameters': {'n_estimators': 100, 'random_state': 42},
 'accuracy': 0.5263157894736842,
 'precision': 0.4666666666666667,
 'recall': 0.875,
 'f1_score': 0.6086956521739131,
 'roc_auc': 0.4602272727272727,
 'log_loss': 1.0151526078156965,
 'confusion_matrix': '[[3, 8], [1, 7]]',
 'feature_importance': {'buyers_new_sum_sharks_cohort': 0.0352781251461175,
  'buyers_new_mean_sharks_cohort': 0.0330718788326681,
  'buyers_new_std_sharks_cohort': 0.0463142681819382,
  'buyers_new_median_sharks_cohort': 0.0,
  'buyers_new_sum_7d_period_1_sharks_cohort': 0.0115715749539976,
  'buyers_new_max_7d_period_1_sharks_cohort': 0.010186462921881,
  'buyers_new_change_7d_period_1_sharks_cohort': 0.0068356739344019,
  'buyers_new_pct_change_7d_period_1_sharks_cohort': 0.0021996758427585,
  'buyers_new_sum_7d_period_2_sharks_cohort': 0.0053410163890525,
  'buyers_new_max_7d_period_2_sharks_cohort': 0.005984380024878

## Full Workflow

In [28]:

# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)

# Settings for this experiment sweep.
search_method, experiment_name = 'random', 'dev'

# Run the sweep through the insights module, capped at 3 evaluations.
i.run_experiments(search_method, modeling_config, experiment_name, max_evals=3)

[15/Sep/2024 16:35:52] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 16:35:52] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
 [  0%] |                                                   | (ETA:  --:--:--)
[15/Sep/2024 16:36:09] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (15703125, 5) after 16.6 seconds.
[15/Sep/2024 16:36:12] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 16:36:12] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[15/Sep/2024 16:36:12] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[15/Sep/2024 16:36:44] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 12.02 second

experiment done


In [None]:
# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)


# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every config file in case it changed since the last run.
config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)

import progressbar

import progressbar

import insights as i
importlib.reload(i)

def run_experiments(method, config_folder, modeling_folder, max_evals=50, experiment_id=None):
    """
    Runs experiments using a specified search method (grid or random), builds models,
    and logs the results of each experiment.

    Prices are retrieved and gap-filled once up front and reused for every
    experiment; profits_df is obtained per experiment through
    i.rebuild_profits_df_if_necessary.

    Args:
    - method (str): 'grid' or 'random' to select the search method.
    - config_folder (str): Path to the folder containing all configuration files.
    - modeling_folder (str): Path to the folder where models, logs, and results will be saved.
    - max_evals (int): Number of iterations for Random search (default is 50).
    - experiment_id (str, optional): Identifier forwarded as the third argument to
        m.log_experiment_results, matching how the notebook's manual training cell
        calls it. When None (default) the original two-argument call is made.
        NOTE(review): confirm against the current m.log_experiment_results signature.

    Returns:
    - None. Outputs are whatever the helper modules write under modeling_folder.
    """

    # 1. Generate the experiment configurations
    experiment_configurations = i.generate_experiment_configurations(config_folder, method=method, max_evals=max_evals)

    # Generate prices_df once; it is passed unchanged to every experiment below
    config = load_config(os.path.join(config_folder,'config.yaml'))
    prices_df = td.retrieve_prices_data()
    prices_df,_ = td.fill_prices_gaps(prices_df, config['data_cleaning']['max_gap_days'])

    # 2. Create the progress bar
    total_experiments = len(experiment_configurations)
    bar = progressbar.ProgressBar(maxval=total_experiments, widgets=[
        ' [', progressbar.Percentage(), '] ',
        progressbar.Bar(), ' (', progressbar.ETA(), ') '
    ]).start()

    # 3. Iterate through each configuration
    for n, experiment in enumerate(experiment_configurations):

        # 3.1 Prepare the full configuration by applying overrides from the current experiment config
        config, metrics_config, modeling_config = i.prepare_configs(config_folder, experiment)

        # 3.2 Retrieve or rebuild profits_df based on config changes
        profits_df = i.rebuild_profits_df_if_necessary(config, modeling_folder)

        # 3.3 Build the configured model input data (train/test data)
        X_train, X_test, y_train, y_test = i.build_configured_model_input(profits_df, prices_df, config, metrics_config, modeling_config)

        # 3.4 Train the model using the current configuration and log the results
        model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

        # 3.5 Evaluate the model's performance on the test set; the returned
        # metrics are not used by this loop, so they are not bound to a name
        m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

        # 3.6 Log the experiment results for this configuration, tagging them
        # with experiment_id only when the caller supplied one
        if experiment_id is None:
            m.log_experiment_results(modeling_folder, model_id)
        else:
            m.log_experiment_results(modeling_folder, model_id, experiment_id)

        # Update the progress bar
        bar.update(n + 1)

    # Finish the progress bar
    bar.finish()

    # 4. Compare all experiments and analyze the best-performing configuration
    i.analyze_experiments(modeling_folder)


# Search settings for this run; kept as notebook-level names because
# a later cell reads modeling_folder.
method, max_evals = 'random', 10
config_folder = '../config'
modeling_folder = modeling_config['modeling']['modeling_folder']

run_experiments(
    method,
    config_folder,
    modeling_folder,
    max_evals,
)

[15/Sep/2024 13:26:41] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 13:26:42] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
 [  0%] |                                                   | (ETA:  --:--:--)
[15/Sep/2024 13:27:02] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (15703125, 5) after 20.4 seconds.
[15/Sep/2024 13:27:05] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[15/Sep/2024 13:27:06] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[15/Sep/2024 13:27:06] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[15/Sep/2024 13:27:39] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 12.63 second

TypeError: log_experiment_results() takes 2 positional arguments but 4 were given

In [59]:
# Display the modeling folder path currently bound in the kernel.
modeling_folder

'..//modeling'

In [67]:


# Refresh the local modules so on-disk edits take effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every config file in case it changed since the last run.
config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)


# Debug cell: only step 3.5 is live. Steps 3.1-3.4 and 3.6 are commented out,
# so `model`, `X_test`, `y_test`, `model_id` and `modeling_folder` are NOT
# defined here and must already exist in the kernel from earlier cells.
# NOTE(review): this cell will fail on a fresh kernel unless those names are
# created first.

# # 3.1 Prepare the full configuration by applying overrides from the current experiment config
# config, metrics_config, modeling_config = i.prepare_configs(config_folder, experiment)

# # 3.2 Retrieve or rebuild profits_df based on config changes
# profits_df = i.rebuild_profits_df_if_necessary(config, modeling_folder)

# # 3.3 Build the configured model input data (train/test data)
# X_train, X_test, y_train, y_test = i.build_configured_model_input(profits_df, prices_df, config, metrics_config, modeling_config)

# # 3.4 Train the model using the current configuration and log the results
# model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# 3.5 Evaluate the model's performance on the test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# # 3.6 Log the experiment results for this configuration
# m.log_experiment_results(modeling_folder, model_id)

In [65]:
# Display the training feature matrix currently bound in the kernel.
X_train

Unnamed: 0,buyers_new_sum_sharks_cohort,buyers_new_mean_sharks_cohort,buyers_new_std_sharks_cohort,buyers_new_median_sharks_cohort,buyers_new_sum_7d_period_1_sharks_cohort,buyers_new_max_7d_period_1_sharks_cohort,buyers_new_change_7d_period_1_sharks_cohort,buyers_new_pct_change_7d_period_1_sharks_cohort,buyers_new_sum_7d_period_2_sharks_cohort,buyers_new_max_7d_period_2_sharks_cohort,...,total_bought_sum_7d_period_5_sharks_cohort,total_bought_change_7d_period_5_sharks_cohort,total_bought_sum_7d_period_6_sharks_cohort,total_bought_change_7d_period_6_sharks_cohort,total_bought_sum_7d_period_7_sharks_cohort,total_bought_change_7d_period_7_sharks_cohort,total_bought_sum_7d_period_8_sharks_cohort,total_bought_change_7d_period_8_sharks_cohort,total_sold_sum_sharks_cohort,total_buyers_sum_sharks_cohort
77,0.0327868852459,0.0327868852459,-0.666334537032,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6722.13120633,-6722.13120633,-0.113769492786,7
42,0.0245901639344,0.0245901639344,-0.770686204016,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.113772970627,4
49,0.131147540984,0.131147540984,-0.528277308094,0,0,0,0,0,0,0,...,0,0,0,0,10000,0,0,0,-0.113767917747,23
11,0.639344262295,0.639344262295,0.204501633982,0,2,1,0,0,13,4,...,71755.4421336,-15993.7191636,140434.089729,-25357.1112721,337499.119114,-91873.010132,1107104.47389,50799.4446022,-0.113656063647,132
30,0.122950819672,0.122950819672,-0.0435484150835,0,0,0,0,0,0,0,...,45465.1378269,0,0,0,26766.866104,0,0,0,-0.11377215967,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,0.131147540984,0.131147540984,-0.483638976779,0,0,0,0,0,0,0,...,434421.808787,50556,27414.6244,27414.6244,214698.115571,0,136526.658399,0,-0.113759610776,35
60,0.196721311475,0.196721311475,-0.186525920006,0,3,2,0,0,2,1,...,0,0,90948.1721081,45123.5117018,0,0,3193.35975781,-3193.35975781,-0.113738596861,36
71,0.319672131148,0.319672131148,-0.318532737644,0,1,1,0,0,0,0,...,5110.31420029,-1544.48289324,10198.6842807,-4557.03070649,7679.52023748,-3991.9173667,30942.0666352,9327.7602703,-0.113772488226,80
14,0,0,-0.848313320509,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,-0.113775096713,1


In [63]:
# Display the test feature matrix currently bound in the kernel.
X_test

Unnamed: 0,buyers_new_sum_sharks_cohort,buyers_new_mean_sharks_cohort,buyers_new_std_sharks_cohort,buyers_new_median_sharks_cohort,buyers_new_sum_7d_period_1_sharks_cohort,buyers_new_max_7d_period_1_sharks_cohort,buyers_new_change_7d_period_1_sharks_cohort,buyers_new_pct_change_7d_period_1_sharks_cohort,buyers_new_sum_7d_period_2_sharks_cohort,buyers_new_max_7d_period_2_sharks_cohort,...,total_bought_sum_7d_period_5_sharks_cohort,total_bought_change_7d_period_5_sharks_cohort,total_bought_sum_7d_period_6_sharks_cohort,total_bought_change_7d_period_6_sharks_cohort,total_bought_sum_7d_period_7_sharks_cohort,total_bought_change_7d_period_7_sharks_cohort,total_bought_sum_7d_period_8_sharks_cohort,total_bought_change_7d_period_8_sharks_cohort,total_sold_sum_sharks_cohort,total_buyers_sum_sharks_cohort
78,0.016393442623,0.016393442623,-0.751113315363,0,3,2,2,100,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.113775096713,3
0,0.33606557377,0.33606557377,-0.33371154499,0,2,1,0,0,0,0,...,3425.07277788,-259.29160586,15748.2282356,3074.43734735,7353.88277376,-151.50142402,4370.14192189,0.0,-0.113772888317,92
68,0.303278688525,0.303278688525,-0.0672762638177,0,2,1,0,0,3,1,...,88170.8266494,0.0,0.0,0.0,33367.2912962,6218.66991256,147153.498702,-6150.78670442,-0.113747068665,59
22,0.0819672131148,0.0819672131148,-0.649274153714,0,1,1,0,0,0,0,...,15126.5114585,-1146.87789769,196.727782485,0.0,0.0,0.0,36694.6397012,0.0,-0.113774189302,22
12,0.66393442623,0.66393442623,2.34362386558,0,0,0,0,0,0,0,...,450.210738479,-117.359478247,154.5,0.0,1.00161403285,0.0,75.40502041,0.0,-0.113773184056,146
82,0.016393442623,0.016393442623,-0.690401335047,0,0,0,0,0,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.113775096713,3
10,0.106557377049,0.106557377049,-0.340557339792,0,0,0,0,0,0,0,...,1100.00000012,0.0,53120.5261531,0.0,52275.7768977,0.0,0.0,0.0,-0.113741324604,21
18,0.122950819672,0.122950819672,-0.624967676721,0,3,1,-1,-100,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,309045.13,0.0,-0.113747810884,23
4,0.122950819672,0.122950819672,-0.182096115728,0,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.113770961721,21
66,0.0,0.0,-0.848313320509,0,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.113775096713,1


In [31]:
# Instrumented copy of the metrics / feature-engineering pipeline: the same
# sequence of td / cwm / fe / m calls as the "Metrics and Feature Engineering"
# cell, with a shape (or path) print after each stage for debugging.
# Relies on profits_df, prices_df, config, metrics_config and modeling_config
# already being defined by earlier cells.

# 1. Identify cohort of wallets (e.g., sharks) based on the cohort classification logic
cohort_summary_df = td.classify_wallet_cohort(profits_df, config['wallet_cohorts']['sharks'])
print(f"Shape of cohort_summary_df: {cohort_summary_df.shape}")

# 2. Generate buysell metrics for wallets in the identified cohort
# ('in_cohort' is used directly as a row mask — presumably a boolean column; verify)
cohort_wallets = cohort_summary_df[cohort_summary_df['in_cohort']]['wallet_address']
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    config['training_data']['training_period_end'],
    cohort_wallets
)
print(f"Shape of buysell_metrics_df: {buysell_metrics_df.shape}")

# 3. Flatten the buysell metrics DataFrame, save it, and preprocess it
flattened_output_directory = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs/'
)
# The file description is built from the FIRST cohort key in the config, while
# step 1 hard-codes 'sharks'; the two agree only if 'sharks' is listed first.
cohort_name = list(config['wallet_cohorts'].keys())[0]
metric_description = f"{cohort_name}_cohort"

flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    config['training_data']['training_period_end']
)
print(f"Shape of flattened_buysell_metrics_df: {flattened_buysell_metrics_df.shape}")

# Save flattened outputs
# (rebinds flattened_buysell_metrics_df to the frame returned by the save helper)
flattened_buysell_metrics_df, flattened_filepath = fe.save_flattened_outputs(
    flattened_buysell_metrics_df,
    flattened_output_directory,
    metric_description,
    config['training_data']['modeling_period_start']
)
print(f"Shape of flattened_buysell_metrics_df after saving: {flattened_buysell_metrics_df.shape}")
print(f"Flattened outputs saved at: {flattened_filepath}")

# Preprocess the flattened DataFrame
# (the helper is given the saved file path, not the in-memory frame)
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(
    flattened_filepath,
    modeling_config,
    metrics_config
)
print(f"Shape of preprocessed_df: {preprocessed_df.shape}")
print(f"Preprocessed outputs saved at: {preprocessed_filepath}")

# 4. Create training data from the preprocessed DataFrame
# The saved path is split around 'preprocessed_outputs/' to recover the
# directory (with trailing slash) and the bare filename.
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [preprocessed_filepath.split('preprocessed_outputs/')[1]]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)
print(f"Shape of training_data_df: {training_data_df.shape}")

# 5. Create the target variable DataFrame based on price changes
# (second return value is discarded)
target_variable_df, _ = fe.create_target_variables_mooncrater(
    prices_df,
    config['training_data'],
    modeling_config
)
print(f"Shape of target_variable_df: {target_variable_df.shape}")

# 6. Merge the training data with the target variables to create the model input DataFrame
model_input_df = fe.prepare_model_input_df(
    training_data_df,
    target_variable_df,
    modeling_config['modeling']['target_column']
)
print(f"Shape of model_input_df: {model_input_df.shape}")

# 7. Split the data into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_config['modeling']['target_column'],
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state']
)
print(f"Shapes of X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

KeyboardInterrupt: 