In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# Pull environment variables from the local .env file
load_dotenv()

# Render floats with 12 significant digits
# (undo with pd.reset_option('display.float_format'))
pd.set_option('display.float_format', lambda x: f'{x:.12g}')

# Dark mode charts: dark gray canvas with a single muted text color for labels, ticks, and titles
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data as td
import feature_engineering as fe
import coin_wallet_metrics as cwm
import modeling as m
import insights.analysis as ia
import insights.model_input_flows as mf
import utils as u


# Re-import the local modules so edits on disk are picked up without restarting the kernel
modules = [td, fe, cwm, m, ia, mf, u]
for module in modules:
    importlib.reload(module)

# Read all four configs from the config folder in a single call
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Create the notebook logger and surface debug-level messages
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)


## Modeling Sequence

In [None]:
# Pick up any edits made to the local modules since the last run
for module in modules:
    importlib.reload(module)

# Re-read each config file so changes on disk take effect
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# Generate time_windows config overrides that will modify each window's config settings
time_windows = mf.generate_time_windows(config)

# Build the model input for every window, in window order
window_model_data = [
    mf.build_time_window_model_input(n, window, config, metrics_config, modeling_config)
    for n, window in enumerate(time_windows)
]

# Split the per-window results into one list per dataset part
X_train_list = [model_data['X_train'] for model_data in window_model_data]
X_test_list = [model_data['X_test'] for model_data in window_model_data]
y_train_list = [model_data['y_train'] for model_data in window_model_data]
y_test_list = [model_data['y_test'] for model_data in window_model_data]
returns_test_list = [model_data['returns_test'] for model_data in window_model_data]

# Stack the windows row-wise for each part
X_train = pd.concat(X_train_list, axis=0)
X_test = pd.concat(X_test_list, axis=0)
y_train = pd.concat(y_train_list, axis=0)
y_test = pd.concat(y_test_list, axis=0)
returns_test = pd.concat(returns_test_list, axis=0)

In [None]:
# Refresh the local modules, then re-read all configs in one call
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

In [None]:
# Refresh the local modules and configs before training
for module in modules:
    importlib.reload(module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# `modeling_folder` is not assigned anywhere else in this notebook, so it is read from the
# modeling config here (the same key the feature cells use) to avoid a NameError on a
# fresh kernel
modeling_folder = modeling_config['modeling']['modeling_folder']

# 3.4 Train the model using the current configuration and log the results
model, model_id = m.train_model(
    X_train,
    y_train,
    modeling_folder,
    modeling_config,
)

# 3.5 Evaluate and save the model performance on the test set to a CSV
metrics_dict, y_pred, y_pred_prob = m.evaluate_model(
    model,
    X_test,
    y_test,
    model_id,
    returns_test,
    modeling_config,
)

# Display the evaluation metrics
metrics_dict

In [None]:
# Refresh the local modules so the latest analysis code is used
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# The check must be an explicit `is not None`: `y_pred_prob or y_pred` evaluates the truth
# value of y_pred_prob, which raises ValueError for a multi-element numpy array or pandas Series.
# NOTE(review): assumes evaluate_model returns None for y_pred_prob when the model is a
# regressor — confirm
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


# Plot the profitability curves for the selected predictions
ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

In [None]:

# Apply caps to the top n % of return values
returns_winsorized = m.winsorize(returns, winsorization_cutoff)

# Pair each prediction with its winsorized return
df = pd.DataFrame({
    'predictions': predictions,
    'returns': returns_winsorized,
})

# Optimal performance: rank rows by their actual returns
df_by_returns = df.sort_values('returns', ascending=False)
cumulative_best_returns = df_by_returns['returns'].cumsum()
cumulative_best_avg_returns = df_by_returns['returns'].expanding().mean()

# Modeled performance: rank rows by model score
df_sorted = df.sort_values('predictions', ascending=False)
cumulative_model_returns = df_sorted['returns'].cumsum()
cumulative_model_avg_returns = df_sorted['returns'].expanding().mean()

# Two side-by-side panels
_, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_cumulative, ax_average = axes

# Left panel: running total of returns by rank
ax_cumulative.plot(cumulative_best_returns.values, label='Optimal Performance')
ax_cumulative.plot(cumulative_model_returns.values, label='Model Performance')
ax_cumulative.set(
    title='Cumulative Returns Performance',
    ylabel='Cumulative Returns',
    xlabel='Rank Number',
)
ax_cumulative.legend()

# Right panel: running average of returns by rank
ax_average.plot(cumulative_best_avg_returns.values, label='Optimal Avg Performance')
ax_average.plot(cumulative_model_avg_returns.values, label='Model Avg Performance')
ax_average.set(
    title='Average Returns Performance',
    ylabel='Average Returns',
    xlabel='Rank Number',
)
ax_average.legend()

In [None]:
# Scratch check of how an unset profits_df is detected.
# NOTE(review): this rebinds profits_df to None, discarding any dataframe already loaded
# in the kernel
profits_df = None

# Compare against None explicitly: `if profits_df:` raises ValueError once profits_df holds
# a DataFrame, because a DataFrame's truth value is ambiguous
if profits_df is not None:
    print('x')

In [None]:
# Draw both cumulative return series on the current axes
ax = plt.gca()
ax.plot(cumulative_best_returns.values, label='Optimal Performance')
ax.plot(cumulative_model_returns.values, label='Model Performance')

# Label both axes and add the legend
ax.set_ylabel('Cumulative Returns')
ax.set_xlabel('Rank Number')
ax.legend()

# Render the figure
plt.show()

In [None]:
# Show the 20 rows with the highest returns
df.sort_values('returns', ascending=False).head(20)

In [None]:
# Rank rows from highest to lowest model score and add the running total of returns
# in that order
df_sorted = (
    df
    .sort_values('predictions', ascending=False)
    .assign(cumulative_returns=lambda ranked: ranked['returns'].cumsum())
)

# Preview the 20 highest-scored rows
df_sorted.head(20)

## Macro Trends

In [None]:
# Reload through the shared `modules` list: no module is bound to the name `i`
# (insights.analysis is imported as `ia`), so `importlib.reload(i)` raises NameError
for module in modules:
    importlib.reload(module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# Retrieve the Google Trends dataset
google_trends_df = td.retrieve_google_trends_data()

In [None]:
# Reload through the shared `modules` list: no module is bound to the name `i`
# (insights.analysis is imported as `ia`), so `importlib.reload(i)` raises NameError
for module in modules:
    importlib.reload(module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# NOTE(review): market_data_df and a populated profits_df are only assigned in a cell
# further down this notebook (under "Junkyard"); that cell must run first, or be moved
# above this one, for a top-to-bottom run to succeed

# Accumulates the feature tuples from every dataset before the merge
training_data_tuples = []

# 1. Generate and merge features for all datasets
# -------------------------------------
# Time series features
dataset_name = 'market_data'  # update to loop through all time series
market_data_tuples, _ = fe.generate_time_series_features(
        dataset_name,
        market_data_df,
        config,
        metrics_config,
        modeling_config
    )
training_data_tuples.extend(market_data_tuples)

# Wallet cohort features
wallet_cohort_tuples, _ = fe.generate_wallet_cohort_features(
        profits_df,
        config,
        metrics_config,
        modeling_config
    )
training_data_tuples.extend(wallet_cohort_tuples)

# Google trends features
dataset_name = 'google_trends'  # update to loop through all macro trends
google_trends_tuples, _ = fe.generate_macro_trends_features(
        dataset_name,
        google_trends_df,
        config,
        metrics_config,
        modeling_config
    )
training_data_tuples.extend(google_trends_tuples)

# Merge all the features
training_data_df, _ = fe.create_training_data_df(
                        modeling_config['modeling']['modeling_folder'],
                        training_data_tuples)



In [None]:
# Inspect the feature tuples collected so far
training_data_tuples

In [None]:
# Last rows of the Google Trends data
google_trends_df.tail()

In [None]:
# First rows of the Google Trends data
google_trends_df.head()

In [None]:
# Look up the macro trends config entry for the dataset currently named by dataset_name
dataset_config = config['datasets']['macro_trends'][dataset_name]

In [None]:
# Display the dataset's metrics config
# NOTE(review): in the visible notebook dataset_metrics_config is first assigned in a later
# cell (the one that sets up the config variables), so this raises NameError on a fresh
# top-to-bottom run
dataset_metrics_config

In [None]:
# Reload through the shared `modules` list: no module is bound to the name `i`
# (insights.analysis is imported as `ia`), so `importlib.reload(i)` raises NameError
for module in modules:
    importlib.reload(module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# set parameters
dataset_name = 'google_trends'
dataset_df = google_trends_df

# Generate the macro trends features. The configs are passed directly; bare expression
# statements such as `config,` only build and discard a tuple, so none are kept here
training_data_tuples, training_data_dfs = fe.generate_macro_trends_features(
        dataset_name,
        dataset_df,
        config,
        metrics_config,
        modeling_config
    )

In [None]:
# Inspect the second element of training_data_dfs
training_data_dfs[1]

In [None]:
# Metrics config entry for the macro trends dataset currently named by dataset_name
metrics_config['macro_trends'][dataset_name]

In [None]:
# Look up and display the macro trends config entry for the dataset named by dataset_name
dataset_config = config['datasets']['macro_trends'][dataset_name]
dataset_config

In [None]:
# NOTE(review): value_column_metrics_df, dataset_metrics_config, and value_column_metrics_config
# are only assigned in a later cell, and value_column_config is not assigned anywhere in the
# visible notebook — confirm this cell is still needed

# Flatten the metrics into a one-row dataframe
flattened_features = fe.flatten_date_features(value_column_metrics_df, dataset_metrics_config)
flattened_google_trends_df = pd.DataFrame([flattened_features])

# Save the flattened metrics under the modeling folder
flattened_outputs_folder = os.path.join(
    modeling_config['modeling']['modeling_folder'],
    'outputs/flattened_outputs',
)
flattened_google_trends_df, flattened_google_trends_filepath = fe.save_flattened_outputs(
    flattened_google_trends_df,
    flattened_outputs_folder,
    value_column_config['description'],
    config['training_data']['modeling_period_start'],
)

# Preprocess the saved flattened file
google_trends_preprocessed_df, google_trends_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_google_trends_filepath,
    modeling_config,
    value_column_config,
    value_column_metrics_config,
)

# Pair the path relative to preprocessed_outputs/ with the configured fill method
preprocessed_relative_path = google_trends_preprocessed_filepath.split('preprocessed_outputs/')[1]
google_trends_tuple = (preprocessed_relative_path, value_column_config['fill_method'])

In [None]:
# Display the current dataset config
dataset_config

In [None]:
# Display the macro trends config entry for the dataset named by dataset_name
config['datasets']['macro_trends'][dataset_name]

In [None]:
# Reload through the shared `modules` list: no module is bound to the name `i`
# (insights.analysis is imported as `ia`), so `importlib.reload(i)` raises NameError
for module in modules:
    importlib.reload(module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# set up config variables
dataset_category = 'macro_trends'
dataset_name = 'google_trends'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# load dataset
google_trends_df = td.retrieve_google_trends_data()


# calculate and merge all metrics in the config
all_metrics = []
for key, value_column_metrics_config in dataset_metrics_config.items():
    # Each config key names a value column in the dataset
    metric_df = google_trends_df[['date', key]]

    # check if there are any time series indicators to add, e.g. sma, ema, etc
    if 'indicators' in value_column_metrics_config:
        value_column_metrics_df, _ = cwm.generate_time_series_indicators(
            metric_df,
            config,
            value_column_metrics_config,
            key,
            id_column=None
        )

    else:
        # if no indicators are needed, pass through rows with complete date coverage.
        # The input is metric_df (this branch used to reference an undefined name).
        # NOTE(review): metric_df only has the 'date' and value columns, so
        # id_column='coin_id' looks wrong for this dataset — confirm what
        # split_dataframe_by_coverage expects here
        logging.getLogger().setLevel(logging.WARNING)
        try:
            value_column_metrics_df, _ = cwm.split_dataframe_by_coverage(
                metric_df,
                config['training_data']['training_period_start'],
                config['training_data']['training_period_end'],
                id_column='coin_id'
            )
        finally:
            # Restore the root logger level even if the split raises
            logging.getLogger().setLevel(logging.INFO)

    # Collect the computed metrics (not the raw input column) so any indicator columns
    # reach the flattening step.
    # NOTE(review): assumes the returned dataframe keeps the 'date' column used by the
    # merge below — confirm
    all_metrics.append(value_column_metrics_df)

# Outer-join every metric dataframe on date
all_metrics_df = all_metrics[0]
for metrics_df in all_metrics[1:]:
    all_metrics_df = pd.merge(all_metrics_df, metrics_df, on='date', how='outer')


# flatten metrics
flattened_features = fe.flatten_date_features(all_metrics_df, dataset_metrics_config)
flattened_google_trends_df = pd.DataFrame([flattened_features])

# save flattened metrics
flattened_google_trends_df, flattened_google_trends_filepath = fe.save_flattened_outputs(
    flattened_google_trends_df,
    os.path.join(modeling_config['modeling']['modeling_folder'], 'outputs/flattened_outputs'),
    dataset_config['description'],
    config['training_data']['modeling_period_start']
)

# preprocess metrics
google_trends_preprocessed_df, google_trends_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_google_trends_filepath,
    modeling_config,
    dataset_config,
    dataset_metrics_config,
)

# Pair the path relative to preprocessed_outputs/ with the configured fill method
google_trends_tuple = (
    google_trends_preprocessed_filepath.split('preprocessed_outputs/')[1],
    dataset_config['fill_method'],
)

## Junkyard

In [None]:
# Reload through the shared `modules` list: no module is bound to the name `i`
# (insights.analysis is imported as `ia`), so `importlib.reload(i)` raises NameError
for module in modules:
    importlib.reload(module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# `modeling_folder` is not assigned anywhere else in this notebook, so it is read from the
# modeling config here to avoid a NameError on a fresh kernel
modeling_folder = modeling_config['modeling']['modeling_folder']
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Retrieve market data and keep the first dataframe returned by the coverage split
market_data_df = td.retrieve_market_data()
market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
prices_df = market_data_df[['coin_id','date','price']].copy()

# Retrieve profits data if necessary
if 'profits_df' not in globals():
    profits_df = None
# NOTE(review): the `i` alias this call used is not defined in the notebook; this assumes
# the helper lives in insights.analysis (`ia`) — confirm, it may be in another module
profits_df = ia.rebuild_profits_df_if_necessary(
                config,
                modeling_folder,
                prices_df,
                profits_df)

# Filter market_data rows without transfers if configured to do so
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]
    prices_df = market_data_df[['coin_id','date','price']].copy()



## tests failing