In [None]:
# pyright: reportMissingModuleSource=false
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)

# Read the four YAML configuration files used throughout the notebook
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# Create the shared logger and surface INFO-level messages
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


def _format_float(x):
    """Render a float with up to 12 significant digits (general format)."""
    return f'{x:.12g}'


# Use the formatter above whenever pandas displays floating point values
pd.set_option('display.float_format', _format_float)
# pd.reset_option('display.float_format')

In [None]:
# Re-import the local modules (same order as before: td, cwm, fe, m, i) so
# edits made to their source files since the last run take effect
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read every YAML config so on-disk changes are reflected in the kernel
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


## Base Tables

In [None]:
# Refresh the local modules and the YAML configs before rebuilding the base tables
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Retrieve market data, run it through the gap filler with the configured
# max_gap_days, and keep a price-only copy for the profits calculations
market_data_df = td.retrieve_market_data()
market_data_df, _ = td.fill_market_data_gaps(
    market_data_df,
    config['data_cleaning']['max_gap_days'],
)
prices_df = market_data_df.loc[:, ['coin_id', 'date', 'price']].copy()


# Retrieve transfers data using the period boundaries from the training_data config
training_data_config = config['training_data']
transfers_df = td.retrieve_transfers_data(
    training_data_config['training_period_start'],
    training_data_config['modeling_period_start'],
    training_data_config['modeling_period_end'],
)


# Compile profits_df: prepare -> calculate wallet profitability -> clean
profits_df = td.calculate_wallet_profitability(
    td.prepare_profits_data(transfers_df, prices_df)
)
profits_df, _ = td.clean_profits_df(profits_df, config['data_cleaning'])

In [None]:
# Count missing values in each column of market_data_df
market_data_df.isna().sum()

In [None]:
# Rebuild profits_df from transfers_df and the full market_data_df.
# NOTE(review): this rebinds `profits_df`, replacing the frame the Base Tables
# cell produced via calculate_wallet_profitability and clean_profits_df, and it
# passes market_data_df where that cell passed prices_df — confirm both are
# intended before running the cells below.
profits_df = td.prepare_profits_data(transfers_df, market_data_df)


In [None]:
# Count missing values in each column of profits_df
profits_df.isna().sum()

## Experiment Setup

In [None]:
# Refresh the local modules and the YAML configs for this experiment
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Keep only market data rows whose coin_id also appears in profits_df
has_profits_data = market_data_df['coin_id'].isin(profits_df['coin_id'])
filtered_market_data_df = market_data_df.loc[has_profits_data]

# Accumulators that the dataset-specific cells below append to
training_data_tuples, training_data_dfs = [], []

### Prices Metrics

In [None]:
# Refresh the local modules and the YAML configs so on-disk edits are picked up
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# dataset variables
dataset_category = 'time_series'
dataset_name = 'market_data'
dataset_df = filtered_market_data_df.copy()


# declare dataset configs based on variables
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# calculate metrics for each value column
# (iterate over a snapshot of the keys: dataset_metrics_config is handed to
# fe.convert_coin_date_metrics_to_features inside the loop)
for value_column in list(dataset_metrics_config):

    # a value_column-specific df will be used for feature generation
    value_column_config = dataset_config[value_column]
    value_column_metrics_config = dataset_metrics_config[value_column]
    value_column_df = dataset_df[['date', 'coin_id', value_column]].copy()

    # check if there are any time series metrics to add, e.g. sma, ema, etc
    if 'metrics' in value_column_metrics_config:

        # calculate and merge all metrics in the config
        all_metrics = []

        # generate metrics from the value_column-specific df. The previous
        # version passed `metric_df` here, which is unbound on the first
        # iteration (NameError on a fresh kernel) and would otherwise be the
        # output of the previous value column.
        metric_df, _ = cwm.generate_time_series_metrics(
            value_column_df,
            config,
            value_column_metrics_config['metrics'],
            value_column,
            id_column='coin_id'
        )

        all_metrics.append(metric_df)

        # join all generated metrics for the value_column together
        # NOTE(review): only one frame is appended above, so this loop is
        # currently a no-op; if more frames are added, confirm that joining on
        # 'date' alone (without 'coin_id') is the intended key.
        value_column_metrics_df = all_metrics[0]
        for metrics_df in all_metrics[1:]:
            value_column_metrics_df = pd.merge(value_column_metrics_df, metrics_df, on='date', how='outer')

    else:
        # if no additional metrics are needed, pass through coins in the original df that have values for all dates
        value_column_metrics_df, _, _ = cwm.split_dataframe_by_coverage(
            value_column_df,
            config['training_data']['training_period_start'],
            config['training_data']['training_period_end'],
            id_column='coin_id'
        )

    # generate features from the metrics
    value_column_features_df, value_column_tuple = fe.convert_coin_date_metrics_to_features(
        value_column_metrics_df,
        value_column_config,
        dataset_metrics_config,
        config,
        modeling_config
    )

    logger.info('Generated features for %s.%s.%s',
                dataset_category, dataset_name, value_column)

    # collect the outputs for the model construction cell
    training_data_tuples.append(value_column_tuple)
    training_data_dfs.append(value_column_features_df)


### Wallet Cohorts

In [None]:
# Ad-hoc check: prints 'x' when market_data_df has a 'market_cap' column
if 'market_cap' in market_data_df.columns:
    print('x')

In [None]:
# Refresh the local modules and the YAML configs so on-disk edits are picked up
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# Every cohort listed under this category in metrics_config gets its own feature set
dataset_category = 'wallet_cohorts'


for cohort_name in metrics_config[dataset_category]:

    # Cohort-level entries from the metrics config and the main config
    dataset_metrics_config = metrics_config[dataset_category][cohort_name]
    dataset_config = config['datasets'][dataset_category][cohort_name]
    cohort_description = dataset_config['description']

    # Classify wallets, then keep the addresses flagged as cohort members
    cohort_summary_df = cwm.classify_wallet_cohort(profits_df, dataset_config)
    is_member = cohort_summary_df['in_cohort'] == True
    cohort_wallets = cohort_summary_df.loc[is_member, 'wallet_address']

    # Nothing to build for a cohort without members: log it and move on
    if cohort_wallets.empty:
        logger.info("No wallets identified as members of cohort '%s'", cohort_name)
        continue

    # Buy/sell metrics for the cohort's wallets, using training_period_end from the config
    cohort_metrics_df = cwm.generate_buysell_metrics_df(
        profits_df,
        config['training_data']['training_period_end'],
        cohort_wallets,
    )

    # Turn the coin-date metrics into model features
    dataset_features_df, dataset_tuple = fe.convert_coin_date_metrics_to_features(
        cohort_metrics_df,
        dataset_config,
        dataset_metrics_config,
        config,
        modeling_config,
    )

    logger.info('Generated features for %s.%s',
                dataset_category, cohort_name)

    # Collect the outputs for the model construction cell
    training_data_tuples.append(dataset_tuple)
    training_data_dfs.append(dataset_features_df)



### Construct Model

In [None]:
# Refresh the local modules and the YAML configs so on-disk edits are picked up
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Merge the feature sets collected in training_data_tuples into one training frame
modeling_folder = modeling_config['modeling']['modeling_folder']
training_data_df, merge_logs_df = fe.create_training_data_df(modeling_folder, training_data_tuples)

# Build the target variable frame from the filtered market data
target_variable_df, _ = fe.create_target_variables_mooncrater(
    filtered_market_data_df,
    config['training_data'],
    modeling_config,
)

# Combine features and target into the final model input
target_column = modeling_config['modeling']['target_column']
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, target_column)

# Split into train and test sets using the configured split and random state
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    target_column,
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state'],
)

# Train the model with the configured parameters
model, model_id = m.train_model(
    X_train,
    y_train,
    modeling_folder,
    modeling_config['modeling']['model_params'],
)

# Evaluate on the held-out test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# Log the results of this trial
m.log_trial_results(modeling_folder, model_id)

metrics

In [None]:
# Pair every training column with the importance reported by the fitted model
feature_importances = model.feature_importances_
feature_names = X_train.columns

# Build the table and order it from most to least important
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Display the table, highest importance first
feature_importance_df.sort_values('importance', ascending=False)
# feature_importance_df.sort_values('feature',ascending=False)

## Failing Tests

In [None]:
# Sample data: three consecutive daily rows for each of two coins
sample_dates = [pd.Timestamp(day) for day in ('2024-01-01', '2024-01-02', '2024-01-03')]
sample_df = pd.DataFrame({
    'date': sample_dates * 2,
    'coin_id': [1, 1, 1, 2, 2, 2],
    'buyers_new': [10, 20, 30, 40, 50, 60],
    'sellers_new': [5, 10, 15, 20, 25, 30]
})

# Sample metrics configuration: every aggregation is requested without scaling
buyers_aggregations = ('sum', 'mean', 'max', 'min', 'median', 'std')
sellers_aggregations = ('sum', 'mean', 'max')
df_metrics_config = {
    'buyers_new': {
        'aggregations': {agg: {'scaling': 'none'} for agg in buyers_aggregations}
    },
    'sellers_new': {
        'aggregations': {agg: {'scaling': 'none'} for agg in sellers_aggregations}
    }
}

# Last date of the sample window
training_period_end = '2024-01-03'

# Test Case 1: basic functionality with multiple coins
result = fe.flatten_coin_date_df(sample_df, df_metrics_config, training_period_end)

# Both coins must be present in the output
result_coin_ids = result['coin_id'].unique()
assert len(result_coin_ids) == 2
assert sorted(result_coin_ids) == [1, 2]

# Every configured aggregation must have produced a column
expected_columns = [
    'coin_id', 'buyers_new_sum', 'buyers_new_mean', 'buyers_new_max', 'buyers_new_min',
    'buyers_new_median', 'buyers_new_std', 'sellers_new_sum', 'sellers_new_mean', 'sellers_new_max'
]
missing_columns = [col for col in expected_columns if col not in result.columns]
assert not missing_columns

# Test Case 2: one coin whose frame lacks the configured 'buyers_new' metric.
# The pytest.raises wrapper below is disabled, so the call runs unguarded and
# its result is displayed.
df_missing_metric = pd.DataFrame({
    'date': list(sample_dates),
    'coin_id': [1, 1, 1],
    'sellers_new': [5, 10, 15]
})

# with pytest.raises(ValueError, match="Metric 'buyers_new' is missing from the input DataFrame."):
df = fe.flatten_coin_date_df(df_missing_metric, df_metrics_config, training_period_end)
df

# # Test Case 3: Empty DataFrame (should raise ValueError)
# df_empty = pd.DataFrame(columns=['coin_id', 'buyers_new', 'sellers_new'])
# with pytest.raises(ValueError, match="Input DataFrame is empty"):
#     fe.flatten_coin_date_df(df_empty, df_metrics_config, training_period_end)

# # Test Case 4: One coin in the dataset
# df_one_coin = pd.DataFrame({
#     'date': [pd.Timestamp('2024-01-01'), pd.Timestamp('2024-01-02'), pd.Timestamp('2024-01-03')],
#     'coin_id': [1, 1, 1],
#     'buyers_new': [10, 20, 30],
#     'sellers_new': [5, 10, 15]
# })
# result_one_coin = fe.flatten_coin_date_df(df_one_coin, df_metrics_config, training_period_end)

# # Check that the single coin is processed correctly and the columns are as expected
# assert len(result_one_coin['coin_id'].unique()) == 1
# assert 'buyers_new_sum' in result_one_coin.columns
# assert result_one_coin['buyers_new_sum'].iloc[0] == 60  # Sum of buyers_new for coin 1



In [None]:
# Display the current value of rolling_features.
# NOTE(review): `rolling_features` is not assigned in any cell visible above, so
# this raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
rolling_features

In [None]:
# Print the value of every top-level entry in metrics_config.
# The loop variable is named `metric_config` rather than `config` so that
# running this cell does not rebind the notebook-wide `config` dict loaded from
# config.yaml, which the pipeline cells above index by 'datasets',
# 'training_data' and 'data_cleaning'.
for metric, metric_config in metrics_config.items():
    print(metric_config)

In [None]:
# Debug harness: run a per-metric aggregation loop at notebook level so the
# intermediate values can be inspected.
# NOTE(review): `sample_coin_df`, `calculate_stat` and `bucketize_value` are not
# defined in any visible cell — presumably they come from feature_engineering
# or an earlier session; confirm before running on a fresh kernel.
# NOTE(review): the loop variable `config` rebinds the notebook-wide `config`
# dict; it is left as-is because a later cell reads config['aggregations'].
df_metrics_config = metrics_config
time_series_df = sample_coin_df

flat_features = {}
matched_columns = False

for metric, config in df_metrics_config.items():
    print(config)

    # Metrics without a matching column in the time series frame are skipped
    if metric not in time_series_df.columns:
        continue

    matched_columns = True
    ts = time_series_df[metric].copy()  # the time series for this metric

    # Nothing more to do when the metric has no aggregations configured
    if 'aggregations' not in config:
        continue

    print(config)
    for agg, agg_config in config['aggregations'].items():
        agg_value = calculate_stat(ts, agg)

        if 'buckets' not in agg_config:
            # No buckets configured: store the raw aggregate
            flat_features[f'{metric}_{agg}'] = agg_value
        else:
            # Buckets configured: store the bucket category instead
            bucket_category = bucketize_value(agg_value, agg_config['buckets'])
            flat_features[f'{metric}_{agg}_bucket'] = bucket_category



In [None]:
# Display the 'aggregations' entry of `config`.
# NOTE(review): the preceding loops use `config` as their loop variable, so this
# likely reads the last metric entry they visited rather than the dict loaded
# from config.yaml — confirm which one is intended.
config['aggregations']

In [None]:
# Load metrics_config.yaml into a separate name and display the entry for the
# 'volume' column under time_series -> market_data
metrics_config_real = load_config('../config/metrics_config.yaml')
metrics_config_real['time_series']['market_data']['volume']
