In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
import datetime
import json
from datetime import datetime, timedelta
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# Load environment variables from a .env file
load_dotenv()

# Pandas display: format floats with up to 12 significant digits and cap column text width
pd.set_option('display.float_format', lambda value: f'{value:.12g}')
pd.set_option('display.max_colwidth', 70)  # Increase to desired number of characters
# pd.reset_option('display.float_format')

# Dark mode charts: one shared background color and one shared text/tick color
plt.rcParams.update({
    'figure.facecolor': '#181818',  # Custom background color (dark gray in this case)
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp


# Reload every local module so source edits are picked up without restarting the kernel
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp]
for module in modules:
    importlib.reload(module)

# Load all configs from the config folder
(config,
 metrics_config,
 modeling_config,
 experiments_config) = u.load_all_configs('../config')

# Configure the logger and set it to INFO level
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Aggregate training data function

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Generate the model inputs for all time windows in one call
training_data_df, prices_df, join_logs_df = tw.generate_all_time_windows_model_inputs(
    config,
    metrics_config,
    modeling_config,
)

#### Stepwise function

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# 1. Retrieve base datasets used by all windows
# ---------------------------------------------
macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(config,
                                                                                            metrics_config)


# 2. Generate flattened features for each dataset in each window
# --------------------------------------------------------------
# Generate time_windows config overrides that will modify each window's config settings
time_windows = tw.generate_time_windows(config)

all_flattened_dfs = []
all_flattened_filepaths = []

# Iterate the windows directly: the enumerate() index was never used, and binding it to `_`
# shadows IPython's last-output variable for the rest of the session.
for time_window in time_windows:

    # Prepare time window config files
    window_config, window_metrics_config, window_modeling_config = (
        exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window))

    # Generate flattened feature dfs for all datasets for the window
    window_flattened_dfs, window_flattened_filepaths = tw.generate_window_flattened_dfs(
        market_data_df,
        macro_trends_df,
        profits_df,
        prices_df,
        window_config,
        window_metrics_config,
        window_modeling_config
    )

    # Store window's flattened features
    all_flattened_dfs.extend(window_flattened_dfs)
    all_flattened_filepaths.extend(window_flattened_filepaths)


# 3. Combine features from all datasets in all time windows with target variables
# -------------------------------------------------------------------------------
# Combine all time windows for each dataset, then join the datasets together.
# Only the filepaths are passed on; all_flattened_dfs is kept for inspection.
concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)


## Modeling Sequence

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# Create target variables for all time windows
target_variable_df, returns_df = tv.create_target_variables_for_all_time_windows(
    training_data_df,
    prices_df,
    config,
    modeling_config,
)

# Split target variables into the train/test/validation/future sets
sets_X_y_dict = ds.perform_train_test_validation_future_splits(
    training_data_df,
    target_variable_df,
    modeling_config,
)

# Preprocess X data for all sets
preprocessed_sets_X_y_dict = prp.preprocess_sets_X_y(
    sets_X_y_dict,
    config,
    metrics_config,
    modeling_config,
)



In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Each set is stored as a sequence whose first item is X and second item is y
X_train, y_train = preprocessed_sets_X_y_dict['train'][0], preprocessed_sets_X_y_dict['train'][1]
X_test, y_test = preprocessed_sets_X_y_dict['test'][0], preprocessed_sets_X_y_dict['test'][1]

# Returns for the same rows as the test targets
returns_test = returns_df.loc[y_test.index, ['returns']]


# # Winsorize returns
# y_train['returns'] = m.winsorize(y_train['returns'],0.01)


# 3.4 Train the model using the current configuration and log the results
model, model_id, cv_scores = m.train_model(X_train, y_train, modeling_config)

# 3.5 Evaluate and save the model performance on the test set to a CSV
metrics_dict, y_pred, y_pred_prob = m.evaluate_model(
    model,
    X_test,
    y_test,
    model_id,
    returns_test,
    modeling_config,
)

m.log_trial_results(modeling_config, model_id)
print(cv_scores)
metrics_dict

In [None]:
pd.set_option('display.max_colwidth', 100)  # Increase to desired number of characters

features = X_train.columns  # Feature names
feature_importances = model.feature_importances_

# Pair each feature name with its importance, sorted by importance in descending order
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
importance_df.head(10)

In [None]:
# Reload all modules so code edits take effect
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor
if y_pred_prob is not None:
    predictions = y_pred_prob
else:
    predictions = y_pred

returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Profitability curves by time window; the cutoff is passed as a literal 0.01 here
ia.generate_profitability_curves_by_time_window(
    predictions,
    returns,
    winsorization_cutoff=0.01,
)

In [None]:
# Put predictions and returns side by side in one frame
df = pd.DataFrame({'predictions': predictions, 'returns': returns})

# Show the 20 rows with the highest predictions
df.sort_values(by='predictions', ascending=False).head(20)

In [None]:
# coin_id values to look up in the predictions/returns frame
bad_coin_ids = [
    "d710a818-ce6e-4bab-b5ff-e39d06099c1d",
    "7bb63899-80d5-4a35-8ff2-09dc74c6ce0d",
    "f2c5bdc0-93a9-416c-adbe-abf19b0247d8",
    "4f3bd04c-9f8b-47c9-85de-af46b7d095bf",
    "6267c4b3-4f70-45b9-8574-9028d53775ee",
    "184d124c-d38a-4669-93ff-25dda20901d8",
    "ae0e5b04-0e47-480d-abfd-10ed64df0df9",
    "aab2214e-52d9-4506-bc67-6b121e57c735",
    "2b5050a3-4558-4cba-be44-973a4a6dadd9",
]

# Keep rows whose coin_id index level is in the list, highest predictions first
df.loc[df.index.get_level_values('coin_id').isin(bad_coin_ids)].sort_values('predictions', ascending=False)

In [None]:
# NOTE(review): this bare expression is not the cell's last statement, so the notebook will not display it
y_train
# Sum of the is_moon column per time_window index level in the test targets
y_test.groupby(level='time_window')['is_moon'].sum()

In [None]:
# Inspect the column labels of the training feature set
X_train.columns

## Modeling Sequence old

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Each set is stored as a sequence whose first item is X and second item is y
X_train = preprocessed_sets_X_y_dict['train'][0]
y_train = preprocessed_sets_X_y_dict['train'][1]
X_test = preprocessed_sets_X_y_dict['test'][0]
y_test = preprocessed_sets_X_y_dict['test'][1]
# Returns for the same rows as the test targets
returns_test = returns_df.loc[y_test.index, ['returns']]

# 3.4 Train the model using the current configuration and log the results
# The other modeling cell in this notebook unpacks three values from m.train_model
# (model, model_id, cv_scores). Unpacking only two here would raise
# "ValueError: too many values to unpack" against the same reloaded module.
model, model_id, cv_scores = m.train_model(
                    X_train,
                    y_train,
                    modeling_config)

# 3.5 Evaluate and save the model performance on the test set to a CSV
metrics_dict, y_pred, y_pred_prob = m.evaluate_model(model, X_test, y_test, model_id, returns_test, modeling_config)

metrics_dict

In [None]:
features = X_train.columns  # Feature names
feature_importances = model.feature_importances_

# Pair each feature name with its importance, sorted by importance in descending order
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
importance_df.head(20)

In [None]:
# Reload all modules so code edits take effect
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor
if y_pred_prob is not None:
    predictions = y_pred_prob
else:
    predictions = y_pred

returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

In [None]:
# Profitability curves by time window; the cutoff is passed as a literal 0 here
# instead of the winsorization_cutoff variable
ia.generate_profitability_curves_by_time_window(predictions, returns, winsorization_cutoff=0)

## Codespace

In [None]:
# Wallet cohort to inspect; used as a key into config['datasets']['wallet_cohorts']
cohort_name = 'whales'
dataset_config = config['datasets']['wallet_cohorts'][cohort_name]
dataset_config

In [None]:
# Training period start as configured
training_period_start = config['training_data']['training_period_start']
# Lookback length for the selected cohort; used as a number of days below
cohort_lookback = config['datasets']['wallet_cohorts'][cohort_name]['lookback_period']
# Training period start minus the cohort lookback days
cohort_lookback_start = pd.to_datetime(training_period_start) - timedelta(days=cohort_lookback)
cohort_lookback_start

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# 1. Retrieve base datasets used by all windows
# ---------------------------------------------
macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(config,
                                                                                            metrics_config)


# 2. Generate flattened features for each dataset in each window
# --------------------------------------------------------------
# Generate time_windows config overrides that will modify each window's config settings
time_windows = tw.generate_time_windows(config)

all_flattened_dfs = []
all_flattened_filepaths = []

# Iterate the windows directly: the enumerate() index was never used, and binding it to `_`
# shadows IPython's last-output variable for the rest of the session.
for time_window in time_windows:

    # Prepare time window config files
    window_config, window_metrics_config, window_modeling_config = (
        exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window))

    # Generate flattened feature dfs for all datasets for the window
    window_flattened_dfs, window_flattened_filepaths = tw.generate_window_flattened_dfs(
        market_data_df,
        macro_trends_df,
        profits_df,
        prices_df,
        window_config,
        window_metrics_config,
        window_modeling_config
    )

    # Store window's flattened features
    all_flattened_dfs.extend(window_flattened_dfs)
    all_flattened_filepaths.extend(window_flattened_filepaths)


# 3. Combine features from all datasets in all time windows with target variables
# -------------------------------------------------------------------------------
# Combine all time windows for each dataset, then join the datasets together.
# Only the filepaths are passed on; all_flattened_dfs is kept for inspection.
concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)


In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# 1. Retrieve and clean base datasets
# -----------------------------------
# Market data: retrieve and clean full history
market_data_df = dr.retrieve_market_data()
market_data_df = dr.clean_market_data(market_data_df, config)

# Profits: retrieve and clean profits data spanning the earliest to latest training periods
profits_df = dr.retrieve_profits_data(config['training_data']['earliest_cohort_lookback_start'],
                                    config['training_data']['training_period_end'],
                                    config['data_cleaning']['minimum_wallet_inflows'])
profits_df, _ = dr.clean_profits_df(profits_df, config['data_cleaning'])


# 2. Filtering based on dataset overlap
# -------------------------------------
# Filter market_data to only coins with transfers data if configured to
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]
# Create prices_df: lightweight reference for other functions
prices_df = market_data_df[['coin_id','date','price']].copy()

# Filter profits_df to remove records for any coins that were removed in data cleaning
profits_df = profits_df[profits_df['coin_id'].isin(market_data_df['coin_id'])]


# 3. Add indicators (additional time series)
# ------------------------------------------
# Macro trends: add indicators
# NOTE: macro_trends_df is not retrieved in this cell; it must already exist in the kernel
# (e.g. from tw.prepare_all_windows_base_data). The id column argument is None here; it had
# been commented out together with the closing parenthesis, which made the whole cell a
# SyntaxError.
macro_trends_df = ind.generate_time_series_indicators(macro_trends_df,
                                                    metrics_config['macro_trends'],
                                                    None)
# Market data: add indicators, passing 'coin_id' as the id column
market_data_df_full = ind.generate_time_series_indicators(market_data_df,
                                                    metrics_config['time_series']['market_data'],
                                                    'coin_id')
# market_data_df = ind.add_market_data_dualcolumn_indicators(market_data_df)
market_data_df_full.head()

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Work on a copy so market_data_df_full itself is left untouched by later cells
market_data_df = market_data_df_full.copy()
market_data_df.columns

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Apply ind.add_mfi_column using the price and volume columns with a window of 14
market_data_df = ind.add_mfi_column(
    market_data_df,
    price_col='price',
    volume_col='volume',
    window=14,
)
market_data_df.columns

In [None]:
# Display market_data_df as the cell output
market_data_df

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Apply the dual-column indicators helper and preview the first rows
market_data_df = ind.add_market_data_dualcolumn_indicators(market_data_df)
market_data_df.head()

In [None]:
# Market data: retrieve and clean full history
# NOTE(review): this rebinds market_data_df to a fresh retrieval, replacing whatever
# earlier cells assigned to that name
market_data_df = dr.retrieve_market_data()
market_data_df = dr.clean_market_data(market_data_df, config)
market_data_df.columns

In [None]:
def add_market_data_multicolumn_indicators(market_data_df):
    """
    Adds the indicators that are computed from more than one column of market_data_df.

    Params:
    - market_data_df (DataFrame): market data with 'price' and 'volume' columns

    Returns:
    - (DataFrame): the result of ind.add_mfi_column() (price/volume, window of 14) with an
        additional 'obv' column from ind.generalized_obv()
    """
    with_mfi_df = ind.add_mfi_column(
        market_data_df,
        price_col='price',
        volume_col='volume',
        window=14,
    )

    # OBV is calculated from the price and volume columns of the MFI output
    with_mfi_df['obv'] = ind.generalized_obv(with_mfi_df['price'], with_mfi_df['volume'])

    return with_mfi_df

In [None]:
# Preview the first rows of market_data_df
market_data_df.head()

In [None]:
# Summary statistics for market_data_df
market_data_df.describe()

In [None]:
def generate_time_series_indicators(dataset_df, dataset_metrics_config, id_column):
    """
    Generates all indicators for a time series dataframe keyed on coin_id and date. This is
    a wrapper function to apply ind.generate_column_time_series_indicators() to each dataset
    column with indicator configurations.

    Params:
    - dataset_df (DataFrame): The df containing dataset metrics and a coin_id and date column,
        as well as columns needing indicator calculations.
    - dataset_metrics_config (dict): The subcomponent of metrics_config that has keys for the
        columns needing indicators, e.g. metrics_config['time_series']['market_data']
    - id_column: the id column that needs to be grouped on (e.g. 'coin_id'), passed through
        unchanged to ind.generate_column_time_series_indicators(). Presumably None when the
        df has no id column - confirm against that function.

    Returns:
    - dataset_indicators_df (DataFrame): The original dataset_df with added columns for all
        configured indicators. Returned unchanged if no column has an 'indicators' key.
    """
    # Calculate indicators for each value column that has an 'indicators' config
    for value_column, column_config in dataset_metrics_config.items():

        if 'indicators' not in column_config:
            continue

        # Qualified with the indicators module: the bare function name is not defined in
        # this notebook, so the unqualified call raised NameError.
        dataset_df = ind.generate_column_time_series_indicators(
            dataset_df,
            value_column,
            column_config['indicators'],
            id_column
        )

    return dataset_df


In [None]:
def generate_time_windows(config):
    """
    Generates the parameter dicts used by exp.prepare_configs() to generate the full set
    of config files.

    Params:
        config (dict): config.yaml. Reads modeling_period_start, time_window_frequency
            (in days) and additional_windows from config['training_data'].

    Returns:
        time_windows (list of dicts): a list of dicts that can be used to override the
        config.yaml settings for each time window. The first dict holds the configured
        modeling_period_start and each additional window moves it time_window_frequency
        days earlier.
    """
    training_data_config = config['training_data']
    start_date = pd.to_datetime(training_data_config['modeling_period_start'])
    window_frequency = training_data_config['time_window_frequency']
    additional_windows = training_data_config['additional_windows']

    # Window 0 is the configured start date; window i starts i * window_frequency days earlier
    time_windows = [
        {
            'config.training_data.modeling_period_start':
                (start_date - timedelta(days=window_frequency * window_number)).strftime('%Y-%m-%d')
        }
        for window_number in range(additional_windows + 1)
    ]

    # The original ended with a bare `time_windows` expression, so the function returned None
    return time_windows

In [None]:
# Reload all modules and configs so code/config edits take effect
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


training_data_config = config['training_data']

# Extract the config values
modeling_period_start = datetime.strptime(
    training_data_config['modeling_period_start'],
    '%Y-%m-%d',
)
modeling_period_duration = training_data_config['modeling_period_duration']  # in days
training_period_duration = training_data_config['training_period_duration']  # in days

# Training and Modeling Period Dates
# ----------------------------------
# modeling_period_end counts the start date as day one
modeling_period_end = modeling_period_start + timedelta(days=modeling_period_duration - 1)

# training_period_end is the day before modeling_period_start
training_period_end = modeling_period_start - timedelta(days=1)

# training_period_start counts the end date as the last day of the duration
training_period_start = training_period_end - timedelta(days=training_period_duration - 1)

# Lookback Dates
# --------------
# Start date of the earliest window: step back from modeling_period_end by the days added
# by the additional time windows plus the modeling and training durations
window_frequency = training_data_config['time_window_frequency']
additional_windows = training_data_config['additional_windows']
total_days_range = (
    (window_frequency * additional_windows)
    + (modeling_period_duration + training_period_duration)
)
earliest_window_start = pd.to_datetime(modeling_period_end) - timedelta(days=total_days_range)

# Earliest cohort lookback date for the earliest window: step back further by the longest
# lookback_period among the configured wallet cohorts
cohort_lookback_periods = [
    cohort_config['lookback_period']
    for cohort_config in config['datasets']['wallet_cohorts'].values()
]
earliest_cohort_lookback_start = (
    earliest_window_start - timedelta(days=max(cohort_lookback_periods))
)


earliest_window_start

In [None]:
# Date 80 days before modeling_period_end
pd.to_datetime(modeling_period_end) - timedelta(days=80)


## Junkyard

In [None]:
# Display the returns object assigned in an earlier cell
returns

In [None]:
# Number of entries in predictions
len(predictions)

## Tests failing