In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)

# Parse each YAML config file into its own notebook-level variable
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# Create the notebook logger and emit INFO-level messages and above
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Show floats with up to 12 significant digits in pandas output
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

In [None]:
# Re-execute the local pipeline modules so on-disk edits take effect in this kernel
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every YAML config so the variables reflect the files' current contents
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Pydantic

In [None]:
from pydantic import TypeAdapter
sys.path.append('..//src/config_models')
# NOTE: this import rebinds the notebook-level name `metrics_config` (assigned to the
# dict returned by load_config in earlier cells) to the config_models module; later
# pipeline cells re-assign it to the dict via load_config before using it.
import metrics_config
importlib.reload(metrics_config)


# raw, unvalidated metrics config as loaded from YAML
metrics_config_dict = load_config('../config/metrics_config.yaml')
print(metrics_config_dict)


# validate the raw dict against the MetricsConfig pydantic model
metrics_config_adapter = TypeAdapter(metrics_config.MetricsConfig)

pydantic_config = metrics_config_adapter.validate_python(metrics_config_dict)

# model_dump() is the Pydantic v2 replacement for the deprecated .dict()
print(pydantic_config.model_dump())


In [None]:
# show the raw metrics config dict loaded from metrics_config.yaml (before validation)
print(metrics_config_dict)

In [None]:
# `config_data` is only ever bound as a local variable inside load_and_validate_config,
# so referencing it at notebook scope raises NameError. Inspect the raw metrics config
# dict instead, which is loaded from the same metrics_config.yaml file.
metrics_config_dict

In [None]:
# from pydantic import ValidationError
# sys.path.append('..//src/config_models')
# from config as pyc
# importlib.reload(pyc)


In [None]:
import yaml
from pydantic import ValidationError
sys.path.append('..//src/config_models')
from config import MainConfig

def load_and_validate_config(config_path: str) -> MainConfig:
    """
    Load the YAML configuration file and validate it against the MainConfig model.

    Args:
        config_path: path to the YAML config file.

    Returns:
        The MainConfig instance built from the file's contents.

    Raises:
        ValidationError: if the contents do not satisfy MainConfig. The error
            details are printed before the exception is re-raised.
    """
    # read as UTF-8 explicitly so parsing doesn't depend on the platform's default encoding
    with open(config_path, 'r', encoding='utf-8') as file:
        # an empty YAML file parses to None; fall back to {} so it goes through model
        # validation instead of failing with a TypeError on `**None`
        config_dict = yaml.safe_load(file) or {}

    try:
        return MainConfig(**config_dict)
    except ValidationError as e:
        print("Configuration validation error:")
        print(e)
        raise

def print_config_summary(config: MainConfig):
    """
    Print selected fields of a validated configuration, one per line: the modeling and
    training period settings, the number of wallet cohorts, the keys of the market_data
    time series and coin_facts datasets, and the data cleaning thresholds.
    """
    print("Configuration Summary:")

    # training_data section
    training = config.training_data
    print(f"Modeling period start: {training.modeling_period_start}")
    print(f"Modeling period duration: {training.modeling_period_duration} days")
    print(f"Training period duration: {training.training_period_duration} days")

    # datasets section
    datasets = config.datasets
    print(f"Number of wallet cohorts: {len(datasets.wallet_cohorts)}")
    market_data_types = list(datasets.time_series['market_data'].keys())
    print(f"Time series data types: {market_data_types}")
    coin_facts_types = list(datasets.coin_facts.keys())
    print(f"Coin facts data types: {coin_facts_types}")

    # data_cleaning section
    cleaning = config.data_cleaning
    print(f"Profitability filter: {cleaning.profitability_filter}")
    print(f"Inflows filter: {cleaning.inflows_filter}")
    print(f"Max gap days: {cleaning.max_gap_days}")


config_path = "../config/config.yaml"  # Update this path to your config.yaml location

# NOTE: on success this rebinds the notebook-level `config` (previously the dict
# returned by load_config) to the validated MainConfig instance.
try:
    config = load_and_validate_config(config_path)
except ValidationError:
    # the error details were already printed by load_and_validate_config
    print("Configuration validation failed. Please check the errors above and correct your config.yaml file.")
else:
    print("Configuration validated successfully!")
    print_config_summary(config)

In [None]:
import yaml
from pydantic import ValidationError
sys.path.append('..//src/config_models')
# Import the module itself so the object passed to importlib.reload is guaranteed to be
# the module (other cells bind the name `metrics_config` to a config dict), and reload
# BEFORE the from-import so MetricsConfig comes from the freshly reloaded code.
import metrics_config
importlib.reload(metrics_config)
from metrics_config import MetricsConfig


# raw, unvalidated metrics config as loaded from YAML
metrics_config_dict = load_config('../config/metrics_config.yaml')

# validate by constructing the model directly from the dict's top-level keys
pydantic_config = MetricsConfig(**metrics_config_dict)
pydantic_config

In [None]:
# Validate the raw metrics config and print it back as a plain dict; a validation
# failure is reported instead of raised so the notebook keeps running.
try:
    pydantic_config = MetricsConfig(**metrics_config_dict)
    # model_dump() is the Pydantic v2 replacement for the deprecated .dict()
    print(pydantic_config.model_dump())
except ValidationError as e:
    print(f"Validation error: {e}")

In [None]:
from pydantic import TypeAdapter

# Same validation as constructing MetricsConfig directly, but routed through a
# TypeAdapter; a validation failure is reported instead of raised.
metrics_config_adapter = TypeAdapter(MetricsConfig)
try:
    pydantic_config = metrics_config_adapter.validate_python(metrics_config_dict)
    # model_dump() is the Pydantic v2 replacement for the deprecated .dict()
    print(pydantic_config.model_dump())
except ValidationError as e:
    print(f"Validation error: {e}")

In [None]:
import yaml
from pydantic import TypeAdapter
sys.path.append('..//src/config_models')
from metrics_config import MetricsConfig, WalletCohortMetricType

def load_and_validate_config(file_path):
    """
    Load a YAML file and validate its contents against the MetricsConfig model.

    NOTE: this redefines the `load_and_validate_config` declared in an earlier cell
    (which validates against MainConfig and re-raises on failure); once this cell
    runs, this version is the one bound to the name.

    Args:
        file_path: path to the metrics YAML file.

    Returns:
        The validated MetricsConfig instance, or None if loading succeeded but
        validation raised any exception (the error text is printed).
    """
    # read as UTF-8 explicitly so parsing doesn't depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        config_data = yaml.safe_load(file)

    metrics_config_adapter = TypeAdapter(MetricsConfig)
    try:
        # local name chosen so it doesn't shadow the notebook-level `metrics_config`
        validated_config = metrics_config_adapter.validate_python(config_data)
        print("Configuration is valid!")
        return validated_config
    except Exception as e:
        # deliberate best-effort: report the problem and signal failure with None
        print(f"Configuration is invalid: {e}")
        return None

def print_config_details(config):
    """
    Print each wallet cohort in the config along with its metric types.

    For every metric that has a truthy `aggregations.sum`, the sum aggregation's
    `scaling` value is printed as well. Nothing is printed when `config` or
    `config.wallet_cohorts` is falsy.
    """
    # guard clause: nothing to report without a config and its wallet cohorts
    if not (config and config.wallet_cohorts):
        return

    for name, cohort_cfg in config.wallet_cohorts.cohorts.items():
        print(f"\nCohort: {name}")
        for metric_name, metric_cfg in cohort_cfg.metrics.items():
            print(f"  Metric: {metric_name}")
            aggregations = metric_cfg.aggregations
            # scaling is only reported for metrics that define a sum aggregation
            if not aggregations or not aggregations.sum:
                continue
            print(f"    Scaling: {aggregations.sum.scaling}")

# In a notebook kernel __name__ is "__main__", so this guard passes and the block runs
# every time the cell is executed.
if __name__ == "__main__":
    # NOTE(review): this rebinds the notebook-level `config` to the validated metrics
    # config (or None on failure); later cells re-assign `config` via load_config.
    config = load_and_validate_config('../config/metrics_config.yaml')
    if config:
        print_config_details(config)

In [None]:
# display whatever is currently bound to `config` (the previous cell assigns it the
# result of load_and_validate_config)
config

## Base Tables

In [None]:
# Re-execute the local pipeline modules so on-disk edits take effect in this kernel
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every YAML config
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# retrieve market data and fill gaps using the configured max_gap_days
# (the second return value of fill_market_data_gaps is discarded)
market_data_df = td.retrieve_market_data()
market_data_df, _ = td.fill_market_data_gaps(
    market_data_df,
    config['data_cleaning']['max_gap_days']
)
# independent copy holding only the coin/date/price columns
prices_df = market_data_df.loc[:, ['coin_id', 'date', 'price']].copy()


# retrieve transfers data for the configured training and modeling period boundaries
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
)


# compile profits_df: prepare from transfers + prices, add wallet profitability,
# then clean using the data_cleaning config (second return value is discarded)
profits_df = td.calculate_wallet_profitability(
    td.prepare_profits_data(transfers_df, prices_df)
)
profits_df, _ = td.clean_profits_df(profits_df, config['data_cleaning'])

## Experiment Setup

In [None]:
# Re-execute the local pipeline modules so on-disk edits take effect in this kernel
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every YAML config
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# initial steps for this model: keep market data only for coins present in profits_df
filtered_market_data_df = market_data_df.loc[
    market_data_df['coin_id'].isin(profits_df['coin_id'])
]

# accumulators that the dataset cells below append to
training_data_tuples, training_data_dfs = [], []

### Prices Metrics

In [None]:
import pdb
# NOTE(review): DEBUG logging (and the pdb import) look like leftover debugging aids;
# the level set here stays in effect for later cells.
logger.setLevel(logging.DEBUG)
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# dataset variables
dataset_category = 'time_series'
dataset_name = 'market_data'
dataset_df = filtered_market_data_df.copy()


# declare dataset configs based on variables
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# calculate metrics for each value column
for value_column in list(dataset_metrics_config.keys()):

    # a value_column-specific df will be used for feature generation
    value_column_config = dataset_config[value_column]
    value_column_metrics_config = dataset_metrics_config[value_column]
    value_column_df = dataset_df[['date','coin_id',value_column]].copy()

    # check if there are any time series metrics to add, e.g. sma, ema, etc
    if 'metrics' in value_column_metrics_config:

        # generate the configured metrics from the value_column-specific df built
        # above (the second return value is discarded)
        value_column_metrics_df, _ = cwm.generate_time_series_metrics(
            value_column_df,
            config,
            value_column_metrics_config['metrics'],
            value_column,
            id_column='coin_id'
        )

    else:
        # if no additional metrics are needed, pass through coins in the original df that have values for all dates
        value_column_metrics_df, _, _ = cwm.split_dataframe_by_coverage(
            value_column_df,
            config['training_data']['training_period_start'],
            config['training_data']['training_period_end'],
            id_column='coin_id'
        )

    # generate features from the metrics
    value_column_features_df, value_column_tuple = fe.convert_coin_date_metrics_to_features(
        value_column_metrics_df,
        value_column_config,
        dataset_metrics_config,
        config,
        modeling_config
    )

    logger.info('Generated features for %s.%s.%s',
                dataset_category, dataset_name, value_column)

    # collect this value column's outputs for the model construction step
    training_data_tuples.append(value_column_tuple)
    training_data_dfs.append(value_column_features_df)


### Wallet Cohorts

In [None]:
# Re-execute the local pipeline modules so on-disk edits take effect in this kernel
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every YAML config
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# dataset variables
dataset_category = 'wallet_cohorts'


# build features for every cohort listed in the metrics config
for cohort_name in metrics_config[dataset_category]:

    # look up this cohort's metrics config and its dataset definition
    dataset_metrics_config = metrics_config[dataset_category][cohort_name]
    dataset_config = config['datasets'][dataset_category][cohort_name]
    cohort_description = dataset_config['description']

    # classify wallets, then keep the addresses of those flagged in_cohort
    cohort_summary_df = cwm.classify_wallet_cohort(profits_df, dataset_config)
    cohort_wallets = cohort_summary_df.loc[
        cohort_summary_df['in_cohort'].eq(True),
        'wallet_address'
    ]

    # a cohort without members produces no features; move on to the next one
    if not len(cohort_wallets):
        logger.info("No wallets identified as members of cohort '%s'", cohort_name)
        continue

    # generate cohort buysell_metrics
    cohort_metrics_df = cwm.generate_buysell_metrics_df(
        profits_df,
        config['training_data']['training_period_end'],
        cohort_wallets
    )

    # generate features from the metrics
    dataset_features_df, dataset_tuple = fe.convert_coin_date_metrics_to_features(
        cohort_metrics_df,
        dataset_config,
        dataset_metrics_config,
        config,
        modeling_config
    )

    logger.info('Generated features for %s.%s', dataset_category, cohort_name)

    # collect this cohort's outputs for the model construction step
    training_data_tuples.append(dataset_tuple)
    training_data_dfs.append(dataset_features_df)



In [None]:
# inspect the buy/sell metrics left over from the most recent loop iteration in the
# previous cell that reached generate_buysell_metrics_df
cohort_metrics_df

### Metadata

In [None]:
# dataset variables
dataset_category = 'coin_facts'
dataset_name = 'coin_metadata'

# load configs
dataset_config = config['datasets'][dataset_category][dataset_name]


# generate features
metadata_df = td.retrieve_metadata_data()
metadata_features_df = td.generate_coin_metadata_features(metadata_df, config)
# not the last expression in the cell, so this preview is evaluated but not displayed
metadata_features_df.head()

# save flattened output under <modeling_folder>/outputs/flattened_outputs
flattened_output_directory = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs'
)
flattened_metadata_df, flattened_metadata_filepath = fe.save_flattened_outputs(
    metadata_features_df,
    flattened_output_directory,
    dataset_config['description'],
    config['training_data']['modeling_period_start']
)

# check preprocessed file
preprocessed_metadata_df, preprocessed_metadata_output_path = fe.preprocess_coin_df(
    flattened_metadata_filepath,
    modeling_config,
    dataset_config
)

# tuple of (path portion after 'preprocessed_outputs/', configured fill method)
metadata_tuple = (
    preprocessed_metadata_output_path.split('preprocessed_outputs/')[1],
    dataset_config['fill_method'],
)

# collect the metadata outputs for the model construction step
training_data_tuples.append(metadata_tuple)
training_data_dfs.append(preprocessed_metadata_df)

In [None]:
# keep only the first 7 entries of training_data_tuples
# NOTE(review): the 7 is hard-coded and training_data_dfs is not truncated to match;
# presumably this drops duplicate tuples appended by re-running the cells above - confirm
training_data_tuples = training_data_tuples[:7]

### Construct Model

In [None]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# merge training data
modeling_folder = modeling_config['modeling']['modeling_folder']
training_data_df, merge_logs_df = fe.create_training_data_df(modeling_folder, training_data_tuples)

# create the target variable df (the second return value is discarded)
target_variable_df,_ = fe.create_target_variables_mooncrater(filtered_market_data_df, config['training_data'], modeling_config)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

# split the df into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_config['modeling']['target_column'],
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state']
)

# train the model using the current configuration (modeling_folder was set above)
model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# evaluate the model's performance on the test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# log the experiment results for this configuration
m.log_trial_results(modeling_folder, model_id)

# display the evaluation metrics
metrics

In [None]:
# Pair each training feature with the importance score reported by the fitted model
feature_importances = model.feature_importances_
feature_names = X_train.columns

# Build the table and sort it once, most important features first
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values(by='importance', ascending=False)

# Display the feature importance
feature_importance_df
# feature_importance_df.sort_values('feature',ascending=False)

## Tests Failing