In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# Load environment variables through python-dotenv
load_dotenv()


# Make the local source folder importable. The membership check keeps repeated runs
# of this cell from appending the same entry to sys.path again and again.
# pyright: reportMissingImports=false
if '..//src' not in sys.path:
    sys.path.append('..//src')

# Import each local module and reload it immediately, so that re-running this cell
# picks up edits made to the source files since the kernel first imported them.
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers (up to 12 significant digits)
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs as global variables. Names assigned at the top level of a cell are
# already module globals, so no `global` statement is needed.
CONFIG = u.load_config('../config/config.yaml')
METRICS_CONFIG = u.load_config('../config/metrics_config.yaml')
MODELING_CONFIG = u.load_config('../config/modeling_config.yaml')
EXPERIMENTS_CONFIG = u.load_config('../config/experiments_config.yaml')

In [None]:
# Refresh every local module so source edits are picked up without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read the YAML configs into the lowercase working copies used by later cells
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


## Base Tables

In [None]:
# Refresh every local module, then re-read the configs, so this cell reflects the latest edits
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# Retrieve market data, fill gaps using data_cleaning.max_gap_days, and keep the first
# frame returned by the coverage split (training period start -> modeling period end)
market_data_df = td.retrieve_market_data()
market_data_df, _ = td.fill_market_data_gaps(
    market_data_df,
    config['data_cleaning']['max_gap_days'],
)
market_data_df, _, _ = cwm.split_dataframe_by_coverage(
    market_data_df,
    start_date=config['training_data']['training_period_start'],
    end_date=config['training_data']['modeling_period_end'],
    id_column='coin_id',
)
prices_df = market_data_df[['coin_id', 'date', 'price']].copy()


# Retrieve transfers data using the configured training/modeling period boundaries
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end'],
)


# Compile profits_df: combine transfers with prices, compute wallet profitability,
# then apply the data_cleaning settings (the second return value is discarded)
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df, _ = td.clean_profits_df(profits_df, config['data_cleaning'])

## Experiment Setup

In [None]:
# Refresh every local module and re-read the configs
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# Initial steps for this model: keep only market data rows whose coin_id also
# appears in profits_df
filtered_market_data_df = market_data_df.loc[
    market_data_df['coin_id'].isin(profits_df['coin_id'])
]

# Empty accumulators that the feature-generation cells below add to
training_data_tuples, training_data_dfs = [], []

In [None]:
# Refresh every local module and re-read the default configs
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# 1. Extract config variables and store experiment metadata
# ---------------------------------------------------------
# Extract folder paths from modeling_config
modeling_folder = modeling_config['modeling']['modeling_folder']
config_folder = modeling_config['modeling']['config_folder']

# Load experiments_config.yaml from the configured folder (replaces the copy loaded above)
experiments_config = u.load_config(os.path.join(config_folder, 'experiments_config.yaml'))

# Extract metadata and experiment details from experiments_config.
# All three keys are required here; a missing one raises KeyError.
metadata = experiments_config['metadata']
experiment_name = metadata['experiment_name']
experiment_id = f"{experiment_name}_{uuid.uuid4()}"
search_method = metadata['search_method']
max_evals = metadata['max_evals']

# Add the experiment id and a start timestamp to the metadata
metadata['experiment_id'] = experiment_id
metadata['start_time'] = datetime.now().isoformat()
metadata['trial_logs'] = []  # Initialize the array for trial log filenames


# 2. Initialize trial configurations and initial variables
# -------------------------------------------------------------------------
# Generate the trial configurations based on variable_overrides
trial_configurations = i.generate_experiment_configurations(config_folder, method=search_method, max_evals=max_evals)

# Cap the number of trials at max_evals. The value read above is reused: the key is
# already required at that point, so a second lookup with a fallback could never
# reach its default.
total_trials = min(len(trial_configurations), max_evals)

# Generate prices_df
# NOTE(review): this replaces market_data_df and prices_df from the Base Tables cell and
# does not apply cwm.split_dataframe_by_coverage here - confirm that is intended.
config = u.load_config(os.path.join(config_folder, 'config.yaml'))
market_data_df = td.retrieve_market_data()
market_data_df, _ = td.fill_market_data_gaps(market_data_df, config['data_cleaning']['max_gap_days'])
prices_df = market_data_df[['coin_id','date','price']].copy()

# Initialize progress bar
trials_bar = u.create_progress_bar(total_trials)

# profits_df is intentionally NOT reset to None here: the rebuild step (3.2) in the trial
# cell is commented out, so clearing it would hand None to the model-input builder and to
# the wallet cohort cell. The frame built in the Base Tables cell is kept instead.


In [None]:
# Pick one trial configuration (index n) from the list capped at total_trials
n = 0
trial = trial_configurations[:total_trials][n]

In [None]:

# 3.1 Build this trial's configs by applying its overrides
config, metrics_config, modeling_config = i.prepare_configs(config_folder, trial)

# Record the settings used by this trial in the experiment metadata
metadata['config_settings'] = {
    "config": config,
    "metrics_config": metrics_config,
    "modeling_config": modeling_config,
}

# # 3.2 Retrieve or rebuild profits_df based on config changes
# profits_df = i.rebuild_profits_df_if_necessary(config, modeling_folder, prices_df, profits_df)
# NOTE(review): with step 3.2 disabled, profits_df is used exactly as earlier cells left
# it - confirm it holds a populated DataFrame before running this cell.

# 3.3 Build the configured model input data (train/test split)
X_train, X_test, y_train, y_test = i.build_configured_model_input(
    profits_df,
    prices_df,
    config,
    metrics_config,
    modeling_config,
)

# 3.4 Train the model with this trial's model_params
model, model_id = m.train_model(
    X_train,
    y_train,
    modeling_folder,
    modeling_config['modeling']['model_params'],
)

# 3.5 Evaluate the model on the test set (the returned value is discarded)
_ = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# 3.6 Log the trial results and remember the log filename in the metadata
trial_log_filename = m.log_trial_results(
    modeling_folder,
    model_id,
    experiment_id,
    trial,
)
metadata['trial_logs'].append(trial_log_filename)


## Model Generation Sequence

### Prices Metrics

In [None]:
# Refresh every local module. `u` is included (as in the earlier cells) because
# u.load_config is called right below and should match the source on disk.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# dataset variables
dataset_category = 'time_series'
dataset_name = 'market_data'
dataset_df = filtered_market_data_df.copy()  # copy so the filtered frame itself is not handed over
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# Generate the time series features for the market data dataset
market_data_tuples, market_data_training_dfs = fe.generate_time_series_features(
        dataset_name,
        dataset_df,
        dataset_metrics_config,
        config,
        modeling_config
    )


In [None]:
# Seed the training-data accumulators with the market data features generated in the
# previous cell, instead of calling fe.generate_time_series_features a second time with
# the same arguments. Copies are taken so later .append calls on the accumulators leave
# market_data_tuples / market_data_training_dfs untouched.
# NOTE(review): assumes both return values are lists, which the .append calls in the
# Wallet Cohorts cell already require of these accumulators - confirm.
training_data_tuples = list(market_data_tuples)
training_data_dfs = list(market_data_training_dfs)

training_data_tuples

In [None]:
# Display the accumulated feature tuples. The list must not be appended to itself:
# that adds a circular self-reference as an element, not another feature tuple.
training_data_tuples

In [None]:
# Number of entries currently held in the training-data accumulator
len(training_data_tuples)

In [None]:
# run through each dataset
# TODO: per-dataset feature generation has not been written yet. Until it is, the loop
# only reports the configured time series datasets so that this cell is valid Python.
# A dedicated loop variable is used so `dataset_name` from the cells above is not rebound.
for time_series_dataset_name in metrics_config['time_series']:
    logger.info("Configured time series dataset: '%s'", time_series_dataset_name)


### Wallet Cohorts

In [None]:
# Refresh every local module. `u` is included (as in the earlier cells) because
# u.load_config is called right below and should match the source on disk.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.DEBUG)  # DEBUG so the per-cohort feature lists below are logged

# dataset variables
dataset_category = 'wallet_cohorts'


# Build features for every cohort listed under metrics_config['wallet_cohorts'] and add
# each cohort's outputs to the training-data accumulators.
for cohort_name in metrics_config[dataset_category]:

    # load configs
    dataset_metrics_config = metrics_config[dataset_category][cohort_name]
    dataset_config = config['datasets'][dataset_category][cohort_name]
    cohort_description = dataset_config['description']

    # identify wallets in the cohort (single .loc lookup instead of chained indexing)
    cohort_summary_df = cwm.classify_wallet_cohort(profits_df, dataset_config)
    cohort_wallets = cohort_summary_df.loc[
        cohort_summary_df['in_cohort'].eq(True),
        'wallet_address'
    ]

    # If no cohort members were identified, continue
    if cohort_wallets.empty:
        logger.info("No wallets identified as members of cohort '%s'", cohort_name)
        continue

    # generate cohort buysell_metrics
    cohort_metrics_df = cwm.generate_buysell_metrics_df(
        profits_df,
        config['training_data']['training_period_end'],
        cohort_wallets
    )

    # generate features from the metrics
    dataset_features_df, dataset_tuple = fe.convert_dataset_metrics_to_features(
        cohort_metrics_df,
        dataset_config,
        dataset_metrics_config,
        config,
        modeling_config
    )

    # identify columns for logging; filtering (rather than list.remove) cannot raise
    # if 'coin_id' is not among the columns
    dataset_features = [
        column for column in dataset_features_df.columns
        if column != 'coin_id'
    ]

    logger.info("Generated %s features for %s '%s'.",
                len(dataset_features), dataset_category, cohort_name)
    logger.debug('Features generated: %s', dataset_features)

    training_data_tuples.append(dataset_tuple)
    training_data_dfs.append(dataset_features_df)



### Metadata

In [None]:
# dataset_category = 'coin_facts'
# dataset_name = 'coin_metadata'


# # load configs
# dataset_config = config['datasets'][dataset_category][dataset_name]


# # generate features
# metadata_df = td.retrieve_metadata_data()
# metadata_features_df = td.generate_coin_metadata_features(metadata_df, config)
# metadata_features_df.head()

# # save flattened output
# flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs')
# flattened_metadata_df, flattened_metadata_filepath = fe.save_flattened_outputs(
#     metadata_features_df,
#     flattened_output_directory,
#     dataset_config['description'],
#     config['training_data']['modeling_period_start']
# )

# # check preprocessed file
# preprocessed_metadata_df, preprocessed_metadata_output_path = fe.preprocess_coin_df(
#     flattened_metadata_filepath,
#     modeling_config,
#     dataset_config
# )

# metadata_tuple = (preprocessed_metadata_output_path.split('preprocessed_outputs/')[1], dataset_config['fill_method'])


# training_data_tuples.append(metadata_tuple)
# training_data_dfs.append(preprocessed_metadata_df)

### Construct Model

In [None]:
# Refresh every local module. `u` is included (as in the earlier cells) because
# u.load_config is called right below and should match the source on disk.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Merge the accumulated training data (modeling_folder is read once and reused below)
modeling_folder = modeling_config['modeling']['modeling_folder']
training_data_df, merge_logs_df = fe.create_training_data_df(modeling_folder, training_data_tuples)

# Create the target variable df (the second return value is discarded)
target_variable_df,_ = fe.create_target_variables_mooncrater(filtered_market_data_df, config['training_data'], modeling_config)

# Merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

# Split the df into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_config['modeling']['target_column'],
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state']
)

# Train the model using the current configuration and log the results
model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# Evaluate the model's performance on the test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# Log the results for this configuration
# NOTE(review): the experiment trial cell calls m.log_trial_results with two further
# arguments (experiment_id, trial) - confirm this two-argument call is still supported.
m.log_trial_results(modeling_folder, model_id)

metrics

In [None]:
# Pair every training feature with the importance score exposed by the fitted model
# (requires a model that provides `feature_importances_`)
feature_importances = model.feature_importances_
feature_names = X_train.columns

# Build the table and sort it once, most important feature first
feature_importance_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importances
    })
    .sort_values(by='importance', ascending=False)
)

# Display the feature importance
feature_importance_df
# feature_importance_df.sort_values('feature',ascending=False)

## Failing Tests

In [None]:
@pytest.mark.unit
def test_fe_flatten_date_features_bucketing():
    """
    Unit test for how fe.flatten_date_features reports bucketed and plain aggregations.

    Test Cases:
    1. A 'sum' aggregation configured with buckets is reported under
       'buyers_new_sum_bucket' as the expected bucket label.
    2. A 'mean' aggregation without buckets is reported under 'buyers_new_mean'
       as the raw value.
    """
    # Metrics configuration: 'sum' is bucketed, 'mean' is left as a raw value
    bucketed_metrics_config = {
        'buyers_new': {
            'aggregations': {
                'sum': {'buckets': [{'low': 100}, {'medium': 200}, {'high': 'remainder'}]},
                'mean': {'scaling': 'none'}
            }
        }
    }

    # buyers_new totals 210 for this coin
    high_activity_df = pd.DataFrame({
        'buyers_new': [10, 20, 30, 40, 50, 60],
        'sellers_new': [5, 10, 15, 20, 25, 30]
    })

    # buyers_new totals 67 for this coin
    low_activity_df = pd.DataFrame({
        'buyers_new': [0, 3, 4, 15, 33, 12],
        'sellers_new': [5, 10, 15, 20, 25, 30]
    })

    flat_features_high = fe.flatten_date_features(high_activity_df, bucketed_metrics_config)
    flat_features_low = fe.flatten_date_features(low_activity_df, bucketed_metrics_config)

    # Test Case 1: bucketed sum is reported as a bucket label
    assert flat_features_high['buyers_new_sum_bucket'] == 'high'  # Sum = 210
    assert flat_features_low['buyers_new_sum_bucket'] == 'low'  # Sum = 67

    # Test Case 2: non-bucketed aggregation still returns the raw value
    assert flat_features_high['buyers_new_mean'] == 35  # 210 / 6
    assert round(flat_features_low['buyers_new_mean']) == 11  # 67 / 6, rounded



In [None]:
# NOTE(review): flat_features_high is assigned inside test_fe_flatten_date_features_bucketing;
# unless it is also defined at notebook level by a cell not shown here, this raises NameError.
round(flat_features_high['buyers_new_mean'])

In [None]:
# NOTE(review): flat_features_low is assigned inside test_fe_flatten_date_features_bucketing;
# unless it is also defined at notebook level by a cell not shown here, this raises NameError.
flat_features_low