In [None]:
# pyright: reportMissingModuleSource=false
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)

# Read the four YAML config files in one pass; the order of the paths matches
# the order of the names on the left-hand side.
config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)

# Create the notebook logger through dreams_core and set it to INFO verbosity.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Format floats with up to 12 significant digits when pandas displays them.
# To go back to the pandas default, run: pd.reset_option('display.float_format')
pd.set_option('display.float_format', lambda value: f'{value:.12g}')

In [None]:
# Re-import the local modules so code edited on disk takes effect without a kernel restart.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read the YAML configs so edited values take effect as well.
config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)


## Base Tables

In [None]:
# Refresh the local modules and the YAML configs before rebuilding the base tables.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)


# retrieve market data and pass it through the gap-filling step with the
# configured max_gap_days (the second return value is discarded)
market_data_df, _ = td.fill_market_data_gaps(
    td.retrieve_market_data(),
    config['data_cleaning']['max_gap_days'],
)


# retrieve transfers data for the configured training/modeling period boundaries
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end'],
)


# compile profits_df: prepare -> calculate wallet profitability -> clean
# (the second return value of the cleaning step is discarded)
profits_df, _ = td.clean_profits_df(
    td.calculate_wallet_profitability(
        td.prepare_profits_data(transfers_df, market_data_df)
    ),
    config['data_cleaning'],
)

## Experiment Setup

In [None]:
# Refresh the local modules and the YAML configs.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

config, metrics_config, modeling_config, experiments_config = map(
    load_config,
    (
        '../config/config.yaml',
        '../config/metrics_config.yaml',
        '../config/modeling_config.yaml',
        '../config/experiments_config.yaml',
    ),
)


# initial steps for this model: keep only market data rows whose coin_id
# also appears in profits_df
has_profits_data = market_data_df['coin_id'].isin(profits_df['coin_id'])
filtered_market_data_df = market_data_df[has_profits_data]

# accumulator for the tuples returned by fe.convert_to_features in later cells
training_data_tuples = list()

### Prices Metrics

In [None]:
# Reload local modules and configs so edits made on disk are picked up.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Start from an empty list so that re-running this cell does not append duplicate tuples.
training_data_tuples = []

# dataset variables
dataset_category = 'time_series'
dataset_name = 'market_data'
dataset_df = filtered_market_data_df.copy()

# declare dataset configs based on variables
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# calculate metrics and features for each value column declared in the metrics config
# (iterate over a snapshot of the keys in case a callee touches the config dict)
for value_column in list(dataset_metrics_config):

    print(value_column)

    # a value_column-specific df will be used for feature generation
    value_column_config = dataset_config[value_column]
    value_column_metrics_config = dataset_metrics_config[value_column]
    value_column_df = dataset_df[['date','coin_id',value_column]].copy()

    # check if there are any time series metrics to add, e.g. sma, ema, etc
    if 'metrics' in value_column_metrics_config:

        # Generate every configured metric from this value column's own df.
        # The input must be value_column_df: the previous code passed `metric_df`,
        # which is undefined on a fresh kernel (NameError) and, on a stale kernel,
        # holds the output of the previous loop iteration.
        value_column_metrics_df, _ = cwm.generate_time_series_metrics(
            value_column_df,
            config,
            value_column_metrics_config['metrics'],
            value_column,
            id_column='coin_id'
        )

    else:
        # if no additional metrics are needed, pass through coins in the original df that have values for all dates
        value_column_metrics_df, _, _ = cwm.split_dataframe_by_coverage(
            value_column_df,
            config['training_data']['training_period_start'],
            config['training_data']['training_period_end'],
            id_column='coin_id'
        )

    # generate features from the metrics
    # NOTE(review): value_column_config is paired with the dataset-level
    # dataset_metrics_config rather than value_column_metrics_config; confirm
    # against fe.convert_to_features which level it expects.
    value_column_features_df, value_column_tuple = fe.convert_to_features(
        value_column_metrics_df,
        value_column_config,
        dataset_metrics_config,
        config,
        modeling_config
    )

    logger.info('Generated features for %s.%s.%s',
                dataset_category, dataset_name, value_column)

    training_data_tuples.append(value_column_tuple)

training_data_tuples

### Wallet Cohorts

In [None]:
dataset_category = 'wallet_cohorts'
cohort_tuples = []

# walk through every cohort declared under this category in the metrics config
for cohort_name, dataset_metrics_config in metrics_config[dataset_category].items():

    # matching dataset-level config for the cohort
    dataset_config = config['datasets'][dataset_category][cohort_name]
    cohort_description = dataset_config['description']

    # classify wallets, then keep the addresses flagged as in_cohort
    cohort_summary_df = cwm.classify_wallet_cohort(profits_df, dataset_config)
    in_cohort_mask = cohort_summary_df['in_cohort'].eq(True)
    cohort_wallets = cohort_summary_df.loc[in_cohort_mask, 'wallet_address']

    # build the cohort's buysell metrics through the end of the training period
    cohort_metrics_df = cwm.generate_buysell_metrics_df(
        profits_df,
        config['training_data']['training_period_end'],
        cohort_wallets,
    )

    # turn the metrics into features
    dataset_features_df, dataset_tuple = fe.convert_to_features(
        cohort_metrics_df,
        dataset_config,
        dataset_metrics_config,
        config,
        modeling_config,
    )

    logger.info('Generated features for %s.%s',
                dataset_category, cohort_name)

    # NOTE: this appends to the shared list, so re-running the cell adds the
    # cohort tuples again.
    training_data_tuples.append(dataset_tuple)



### Construct Model

In [None]:

# every setting used below lives under the 'modeling' section of modeling_config
modeling_settings = modeling_config['modeling']
modeling_folder = modeling_settings['modeling_folder']

# merge the collected training data tuples into one DataFrame
training_data_df, merge_logs_df = fe.create_training_data_df(modeling_folder, training_data_tuples)

# create the target variable df (second return value is discarded)
target_variable_df, _ = fe.create_target_variables_mooncrater(
    filtered_market_data_df,
    config['training_data'],
    modeling_config,
)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(
    training_data_df,
    target_variable_df,
    modeling_settings['target_column'],
)

# split the df into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_settings['target_column'],
    modeling_settings['train_test_split'],
    modeling_settings['random_state'],
)

# train the model using the current configuration
model, model_id = m.train_model(
    X_train,
    y_train,
    modeling_folder,
    modeling_settings['model_params'],
)

# evaluate the model on the test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# log the results of this trial
m.log_trial_results(modeling_folder, model_id)

metrics

In [None]:
# Show the feature_importances_ attribute of the model trained in the previous cell
model.feature_importances_

In [None]:
# Re-display the evaluation metrics returned by m.evaluate_model
metrics

## Test fixes etc

In [None]:
def generate_preprocessed_output(dataset_category, dataset_name, config, metrics_config, modeling_config):
    """
    Generate the features for one dataset, then flatten, save and preprocess them.

    Note: the 'wallet_cohorts' branch reads the notebook-level `profits_df`, so that
    variable must already exist when this function is called with that category.

    Params:
    - dataset_category (str): one of 'coin_facts', 'time_series' or 'wallet_cohorts'
    - dataset_name (str): key of the dataset under config['datasets'][dataset_category]
    - config (dict): main config; 'datasets' and 'training_data' are read here
    - metrics_config (dict): metrics config; the entry for this dataset may be absent,
        in which case None is passed on to the preprocessing step
    - modeling_config (dict): modeling config; ['modeling']['modeling_folder'] is read here

    Returns:
    - preprocessed_df: first value returned by fe.preprocess_coin_df
    - preprocessed_filepath: second value returned by fe.preprocess_coin_df

    Raises:
    - KeyError: if the dataset is not declared in config['datasets']
    - ValueError: if dataset_category is not one of the supported categories
    """
    # Load configs
    dataset_config = config['datasets'][dataset_category][dataset_name]
    dataset_metrics_config = metrics_config.get(dataset_category, {}).get(dataset_name, None)

    # Generate features/metrics based on dataset_category
    if dataset_category == 'coin_facts':
        df = td.retrieve_metadata_data()
        features_df = td.generate_coin_metadata_features(df, config)
    elif dataset_category == 'time_series':
        # NOTE(review): td.get_time_series_data is assumed to exist, and this call's
        # arguments differ from the other generate_time_series_metrics call in this
        # notebook (df, config, metrics, value_column, id_column=...) - confirm
        # against the current cwm signature before relying on this branch.
        df = td.get_time_series_data()
        features_df, _ = cwm.generate_time_series_metrics(df, config, metrics_config, dataset_key=dataset_name, value_column=dataset_config['value_column'])
    elif dataset_category == 'wallet_cohorts':
        # profits_df is read from the notebook's global namespace
        cohort_summary_df = cwm.classify_wallet_cohort(profits_df, dataset_config)
        cohort_wallets = cohort_summary_df.loc[cohort_summary_df['in_cohort'] == True, 'wallet_address']
        features_df = cwm.generate_buysell_metrics_df(profits_df, config['training_data']['training_period_end'], cohort_wallets)
    else:
        # Without this guard an unknown category would leave features_df unbound
        # and only fail further down with an UnboundLocalError.
        raise ValueError(
            f"Unsupported dataset_category '{dataset_category}'; expected one of "
            "'coin_facts', 'time_series', 'wallet_cohorts'."
        )

    # Flatten, save, and preprocess the DataFrame
    flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'], 'outputs/flattened_outputs')
    flattened_df, flattened_filepath = fe.save_flattened_outputs(
        features_df,
        flattened_output_directory,
        dataset_config['description'],
        config['training_data']['modeling_period_start']
    )
    preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(
        flattened_filepath,
        modeling_config,
        dataset_config,
        dataset_metrics_config
    )

    return preprocessed_df, preprocessed_filepath





In [None]:
# choose the dataset to run through the preprocessing helper
dataset_category, dataset_name = 'wallet_cohorts', 'sharks'

preprocessed_df, preprocessed_filepath = generate_preprocessed_output(
    dataset_category=dataset_category,
    dataset_name=dataset_name,
    config=config,
    metrics_config=metrics_config,
    modeling_config=modeling_config,
)

In [None]:
# Display the preprocessed DataFrame returned by generate_preprocessed_output
preprocessed_df