In [None]:
# pyright: reportMissingModuleSource=false
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)

# read the four YAML config files, in the same order as the names on the left
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))

# obtain the logger from dreams_core and set its level to INFO
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# render floats in DataFrame output with up to 12 significant digits
pd.set_option('display.float_format', '{:.12g}'.format)
# to go back to the pandas default: pd.reset_option('display.float_format')

In [None]:
# re-import the local modules so edits to their source files take effect
for local_module in (td, cwm, fe, m, i):
    importlib.reload(local_module)

# re-read the YAML configs from disk
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))


## Source Tables

In [None]:
# config entries for the prices time series
prices_dataset_config = config['datasets']['time_series']['prices']
prices_metrics_config = metrics_config['time_series']['prices']

# retrieve transfers data for the configured period boundaries
training_data_config = config['training_data']
transfers_df = td.retrieve_transfers_data(
    training_data_config['training_period_start'],
    training_data_config['modeling_period_start'],
    training_data_config['modeling_period_end'],
)

# retrieve prices data and pass it through fill_prices_gaps with the configured max_gap_days
data_cleaning_config = config['data_cleaning']
prices_df, prices_log = td.fill_prices_gaps(
    td.retrieve_prices_data(),
    data_cleaning_config['max_gap_days'],
)

# compile profits_df: prepare from transfers and prices, add wallet profitability, then clean
profits_df = td.calculate_wallet_profitability(
    td.prepare_profits_data(transfers_df, prices_df)
)
profits_df, _ = td.clean_profits_df(profits_df, data_cleaning_config)


## Metadata Features

In [None]:
# load the dataset config for coin metadata
dataset_config = config['datasets']['coin_facts']['coin_metadata']

# generate features
metadata_df = td.retrieve_metadata_data()
metadata_features_df = td.generate_coin_metadata_features(metadata_df, config)
# only the last expression of a cell is rendered automatically, so a preview in the
# middle of the cell has to be shown explicitly
display(metadata_features_df.head())

# save flattened output
flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs')
flattened_metadata_df, flattened_metadata_filepath = fe.save_flattened_outputs(
    metadata_features_df,
    flattened_output_directory,
    dataset_config['description'],
    config['training_data']['modeling_period_start']
)

# preprocess the flattened file that was just saved
preprocessed_metadata_df, preprocessed_metadata_output_path = fe.preprocess_coin_df(
    flattened_metadata_filepath,
    modeling_config,
    dataset_config
)

# preview the preprocessed result
preprocessed_metadata_df.head()

## Prices Metrics

In [None]:
# load the prices configs here, under the names this cell actually uses, so the cell
# does not rely on config variables bound in a different section of the notebook
prices_dataset_config = config['datasets']['time_series']['prices']
prices_metrics_config = metrics_config['time_series']['prices']

# generate prices metrics
prices_metrics_df, partial_prices_metrics_df = cwm.generate_time_series_metrics(
    prices_df,
    config,
    metrics_config,
    dataset_key='prices',
    colname='price'
)

# flatten, save, and preprocess the flattened df
flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs')

flattened_prices_metrics_df = fe.flatten_coin_date_df(
    prices_metrics_df,
    prices_metrics_config,
    config['training_data']['training_period_end']
)
flattened_prices_metrics_df, flattened_prices_metrics_filepath = fe.save_flattened_outputs(
    flattened_prices_metrics_df,
    flattened_output_directory,
    prices_dataset_config['description'],
    config['training_data']['modeling_period_start']
)
prices_preprocessed_df, prices_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_prices_metrics_filepath,
    modeling_config,
    prices_dataset_config,
    prices_metrics_config
)

# preview the preprocessed result
prices_preprocessed_df.head()

## Wallet Cohorts

In [None]:
# config entries for the cohort handled in this cell
cohort_name = 'sharks'
sharks_dataset_config = config['datasets']['wallet_cohorts'][cohort_name]
sharks_metrics_config = metrics_config['wallet_cohorts'][cohort_name]
cohort_description = sharks_dataset_config['description']

# classify wallets, then keep the addresses of the rows flagged in_cohort
cohort_summary_df = cwm.classify_wallet_cohort(profits_df, sharks_dataset_config)
in_cohort_mask = cohort_summary_df['in_cohort'] == True
cohort_wallets = cohort_summary_df.loc[in_cohort_mask, 'wallet_address']

# generate buysell metrics for the cohort wallets
training_period_end = config['training_data']['training_period_end']
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df, training_period_end, cohort_wallets)

# flatten the metrics, save the flattened df, then preprocess the saved file
flattened_output_directory = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs',
)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df, sharks_metrics_config, training_period_end
)
flattened_buysell_metrics_df, flattened_buysell_metrics_filepath = fe.save_flattened_outputs(
    flattened_buysell_metrics_df,
    flattened_output_directory,
    cohort_description,
    config['training_data']['modeling_period_start'],
)
buysell_preprocessed_df, buysell_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_buysell_metrics_filepath,
    modeling_config,
    sharks_dataset_config,
    sharks_metrics_config,
)




In [None]:
# show the dataset config entry for the sharks cohort as the cell output
sharks_dataset_config

## Model Training and Evaluation

In [None]:
# reload the local modules and re-read the configs before running the modeling steps
for local_module in (td, cwm, fe, m, i):
    importlib.reload(local_module)
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))

modeling_settings = modeling_config['modeling']
modeling_folder = modeling_settings['modeling_folder']

# create the training data df from the preprocessed buysell and prices files;
# each entry is (path portion after 'preprocessed_outputs/', 'drop_records')
input_filenames = [
    (preprocessed_filepath.split('preprocessed_outputs/')[1], 'drop_records')
    for preprocessed_filepath in (buysell_preprocessed_filepath, prices_preprocessed_filepath)
]
training_data_df, merge_logs_df = fe.create_training_data_df(modeling_folder, input_filenames)

# create the target variable df
target_variable_df, _ = fe.create_target_variables_mooncrater(
    prices_df, config['training_data'], modeling_config
)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(
    training_data_df, target_variable_df, modeling_settings['target_column']
)

# split the df into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_settings['target_column'],
    modeling_settings['train_test_split'],
    modeling_settings['random_state'],
)

# train the model with the configured model_params
model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_settings['model_params'])

# evaluate the model on the test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# log the trial results for this model_id
m.log_trial_results(modeling_folder, model_id)

# show the evaluation metrics as the cell output
metrics

## Test fixes etc

In [None]:
def mock_modeling_config():
    """
    Build a mock modeling config for tests.

    Returns:
    - dict with a single 'preprocessing' section whose 'drop_features' list
      contains 'feature_to_drop'.
    """
    preprocessing_options = {'drop_features': ['feature_to_drop']}
    return {'preprocessing': preprocessing_options}

def mock_metrics_config():
    """
    Build a mock metrics config for tests.

    Returns:
    - dict describing 'feature_1' with two aggregations: 'sum', which requests
      'standard' scaling, and 'max', which has no options.
    """
    aggregations = {
        'sum': {'scaling': 'standard'},
        'max': {},
    }
    return {'feature_1': {'aggregations': aggregations}}


def mock_input_df():
    """
    Creates a mock DataFrame and saves it as a CSV for testing.
    The CSV file is saved to 'temp/mock_input.csv', relative to the current working
    directory. The 'temp' directory is created if it does not already exist.

    Returns:
    - input_path: Path to the CSV file.
    - df: Original mock DataFrame.
    """
    data = {
        'feature_1_sum': [1, 2, 3],
        'feature_to_drop': [10, 20, 30],
        'feature_3': [100, 200, 300]
    }
    df = pd.DataFrame(data)
    input_path = 'temp/mock_input.csv'
    # to_csv does not create missing parent directories, so make sure 'temp' exists first
    os.makedirs(os.path.dirname(input_path), exist_ok=True)
    df.to_csv(input_path, index=False)
    return input_path, df

# Rebind each fixture name to the value returned by the factory function of the same name.
# The test cell reads these names as plain values (e.g. it unpacks mock_input_df as a tuple).
# NOTE: after these lines the names refer to the returned values, not to the functions, so
# the functions can only be called again after their definitions have been re-run.
mock_modeling_config = mock_modeling_config()
mock_metrics_config = mock_metrics_config()
mock_input_df = mock_input_df()

In [None]:
# NOTE(review): output_df is assigned by the preprocess_coin_df test cell that follows this one;
# no earlier assignment is visible in this notebook, so on a fresh kernel this presumably
# raises NameError. Confirm, and move this below that cell if so.
output_df

In [None]:
"""
Tests that the preprocess_coin_df function correctly applies scaling to the specified features.

Steps:
- Preprocesses the mock DataFrame by applying standard scaling to 'feature_1'.
- Asserts that the column is scaled correctly.
- Cleans up the test files after execution.
"""
input_path, original_df = mock_input_df
dataset_config = {}

# Call the function
output_df, output_path = fe.preprocess_coin_df(input_path, mock_modeling_config, dataset_config, mock_metrics_config)

# Check that 'feature_1' is scaled (mean should be near 0 and std should be near 1)
scaled_column = output_df['feature_1_sum']
assert abs(scaled_column.mean()) < 1e-6, "Standard scaling not applied correctly to 'feature_1_sum'."
assert abs(np.std(scaled_column) - 1) < 1e-6, "Standard scaling not applied correctly to 'feature_1_sum'."

# Cleanup (remove the test files)
os.remove(output_path)
os.remove(input_path)

output_df