In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# load dotenv
# Load environment variables from a local .env file (python-dotenv).
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
# Add the sibling src/ directory (relative to the working directory) to the import path.
sys.path.append('..//src')
# Each project module is reloaded immediately after import so that re-running this
# cell picks up source edits without restarting the kernel.
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# Create the project logger and start at DEBUG verbosity
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Display floats with up to 12 significant digits
pd.set_option('display.float_format', lambda value: f'{value:.12g}')
# pd.reset_option('display.float_format')


# Load every config once and bind it to both its lowercase working name and
# its uppercase alias (module-level names are already global in a notebook)
CONFIG = config = u.load_config('../config/config.yaml')
METRICS_CONFIG = metrics_config = u.load_config('../config/metrics_config.yaml')
MODELING_CONFIG = modeling_config = u.load_config('../config/modeling_config.yaml')
EXPERIMENTS_CONFIG = experiments_config = u.load_config('../config/experiments_config.yaml')
MODELING_FOLDER = modeling_folder = MODELING_CONFIG['modeling']['modeling_folder']

## Overall Sequencing

In [None]:
# Refresh the project modules so source edits apply without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# The date range runs from the start of training to the end of modeling
training_data_config = config['training_data']
start_date = training_data_config['training_period_start']
end_date = training_data_config['modeling_period_end']

# Market data: keep the first dataframe returned by the coverage split
market_data_df = td.retrieve_market_data()
market_data_df, _ = cwm.split_dataframe_by_coverage(
    market_data_df,
    start_date,
    end_date,
    id_column='coin_id',
)
price_columns = ['coin_id','date','price']
prices_df = market_data_df[price_columns].copy()

# Profits data: hand over the profits_df already in the notebook namespace,
# or None when the cell runs in a fresh kernel
profits_df = globals().get('profits_df')
profits_df = i.rebuild_profits_df_if_necessary(
    config,
    modeling_folder,
    prices_df,
    profits_df,
)

# Optionally restrict market data to coins that also appear in profits_df
if config['data_cleaning']['exclude_coins_without_transfers']:
    has_profits_rows = market_data_df['coin_id'].isin(profits_df['coin_id'])
    market_data_df = market_data_df[has_profits_rows]
    prices_df = market_data_df[price_columns].copy()

# Google Trends data
google_trends_df = td.retrieve_google_trends_data()

In [None]:
# Refresh the project modules so source edits apply without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# One list per output; each window appends its piece and the lists are
# stacked once the loop finishes
X_train_all, X_test_all, y_train_all, y_test_all, returns_test_all = [], [], [], [], []

time_windows = td.generate_time_windows(config)

for n, window in enumerate(time_windows):

    # Apply this window's overrides to obtain the three working configs
    config_folder = modeling_config['modeling']['config_folder']
    config, metrics_config, modeling_config = i.prepare_configs(config_folder, window)

    # Window boundaries
    start_date = config['training_data']['training_period_start']
    end_date = config['training_data']['modeling_period_end']

    # Market data and prices for this window
    market_data_df = td.retrieve_market_data()
    market_data_df, _ = cwm.split_dataframe_by_coverage(
        market_data_df,
        start_date,
        end_date,
        id_column='coin_id',
    )
    prices_df = market_data_df[['coin_id','date','price']].copy()

    # Profits data, rebuilt only when the helper decides it is necessary
    profits_df = i.rebuild_profits_df_if_necessary(
        config,
        modeling_folder,
        prices_df,
        profits_df,
    )

    # Model input for the nth window
    X_train_n, X_test_n, y_train_n, y_test_n, returns_test_n = i.build_configured_model_input(
        profits_df,
        market_data_df,
        google_trends_df,
        config,
        metrics_config,
        modeling_config,
    )

    # Stash this window's pieces
    X_train_all.append(X_train_n)
    X_test_all.append(X_test_n)
    y_train_all.append(y_train_n)
    y_test_all.append(y_test_n)
    returns_test_all.append(returns_test_n)

# Stack the per-window pieces row-wise (pd.concat defaults to axis=0)
X_train = pd.concat(X_train_all)
X_test = pd.concat(X_test_all)
y_train = pd.concat(y_train_all)
y_test = pd.concat(y_test_all)
returns_test = pd.concat(returns_test_all)

In [None]:
# 3.4 Fit the model on the stacked training data; train_model hands back the
# fitted model together with its id
training_result = m.train_model(X_train, y_train, modeling_folder, modeling_config)
model, model_id = training_result

In [None]:
# Refresh the project modules so source edits apply without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# 3.5 Evaluate the fitted model on the test set; the call returns the metrics,
# the predictions and the prediction probabilities
evaluation_outputs = m.evaluate_model(
    model,
    X_test,
    y_test,
    model_id,
    returns_test,
    modeling_config,
)
metrics_dict, y_pred, y_pred_prob = evaluation_outputs


# Final expression of the cell: display the metrics
metrics_dict

In [None]:
# Display the probabilities returned by m.evaluate_model; the evaluation cell
# binds them to y_pred_prob (y_pred_proba is never defined and raised NameError)
y_pred_prob

In [None]:
# Length of y_pred_prob, the third value returned by m.evaluate_model
len(y_pred_prob)

In [None]:
# Dark plotting theme: dark gray canvas with a single muted foreground color
# for text, labels, ticks and titles
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})


In [None]:
# Display y_pred_prob as returned by m.evaluate_model
y_pred_prob

In [None]:
# Display the test-set returns as a dataframe ordered by the 'returns' column (ascending)
pd.DataFrame(returns_test).sort_values('returns')

In [None]:
# NOTE(review): running_profitability_scores is assigned by a cell further down
# (m.calculate_running_profitability_score); that cell must have been run first
running_profitability_scores

In [None]:
# Display the running profitability scores as a dataframe ordered by 'returns'.
# NOTE(review): relies on running_profitability_scores from a cell further down
pd.DataFrame(running_profitability_scores).sort_values('returns')

In [None]:
# Refresh the project modules so source edits apply without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Winsorization cutoff comes from the evaluation section of the modeling config
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]

running_profitability_scores = m.calculate_running_profitability_score(
    y_pred_prob,
    returns_test,
    winsorization_cutoff,
)


# Line chart of the scores with the index moved into a regular column
scores_for_plot = running_profitability_scores.reset_index()
scores_for_plot.plot(kind='line')
plt.show()

## Macro Trends

In [None]:
# Refresh the project modules so source edits apply without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# Fetch the Google Trends dataset used by the macro trends cells below
google_trends_df = td.retrieve_google_trends_data()

In [None]:
# Refresh the project modules so source edits apply without restarting the kernel
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# Every feature generator below contributes its tuples to this list
training_data_tuples = []

# 1. Generate and merge features for all datasets
# -------------------------------------
# Time series features (market data)
dataset_name = 'market_data'  # update to loop through all time series
market_data_tuples, _ = fe.generate_time_series_features(
    dataset_name, market_data_df, config, metrics_config, modeling_config
)
training_data_tuples += market_data_tuples

# Wallet cohort features (built from profits_df)
wallet_cohort_tuples, _ = fe.generate_wallet_cohort_features(
    profits_df, config, metrics_config, modeling_config
)
training_data_tuples += wallet_cohort_tuples

# Macro trends features (Google Trends)
dataset_name = 'google_trends'  # update to loop through all macro trends
google_trends_tuples, _ = fe.generate_macro_trends_features(
    dataset_name, google_trends_df, config, metrics_config, modeling_config
)
training_data_tuples += google_trends_tuples

# Merge everything that was collected into a single training dataframe
configured_modeling_folder = modeling_config['modeling']['modeling_folder']
training_data_df, _ = fe.create_training_data_df(
    configured_modeling_folder,
    training_data_tuples,
)



In [None]:
# Display the feature tuples collected in training_data_tuples
training_data_tuples

In [None]:
# Show the last rows of google_trends_df
google_trends_df.tail()

In [None]:
# Show the first rows of google_trends_df
google_trends_df.head()

In [None]:
# Look up the macro_trends settings for whichever dataset dataset_name currently holds
dataset_config = config['datasets']['macro_trends'][dataset_name]

In [None]:
# NOTE(review): in the visible notebook dataset_metrics_config is only assigned in a
# later cell ("set up config variables"); this raises NameError unless that cell ran first
dataset_metrics_config

In [None]:
# Refresh the project modules and configs so this cell reflects the latest source edits
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# set parameters
# (the stray `config,` / `metrics_config,` / `modeling_config` lines that used to sit
# here only built throwaway tuples; the configs are passed explicitly below)
dataset_name = 'google_trends'
dataset_df = google_trends_df

# Generate the macro trends features for the Google Trends dataset; the call returns
# the feature tuples and the corresponding dataframes
training_data_tuples, training_data_dfs = fe.generate_macro_trends_features(
        dataset_name,
        dataset_df,
        config,
        metrics_config,
        modeling_config
    )

In [None]:
# Inspect the second element of training_data_dfs returned by fe.generate_macro_trends_features
training_data_dfs[1]

In [None]:
# Show the macro_trends metrics configuration for the dataset named by dataset_name
metrics_config['macro_trends'][dataset_name]

In [None]:
# Look up and display the macro_trends dataset settings for dataset_name
dataset_config = config['datasets']['macro_trends'][dataset_name]
dataset_config

In [None]:
# Draft of the flatten -> save -> preprocess steps for the Google Trends metrics.
# NOTE(review): value_column_metrics_df, dataset_metrics_config and
# value_column_metrics_config are only assigned in a later cell of this notebook, and
# value_column_config is not assigned anywhere in the visible notebook, so this cell
# raises NameError as written. It looks superseded by the later cell that uses
# dataset_config instead - confirm before relying on it.

# flatten metrics
flattened_features = fe.flatten_date_features(value_column_metrics_df,dataset_metrics_config)
# Wrap the flattened features in a list so they form a one-row dataframe
flattened_google_trends_df = pd.DataFrame([flattened_features])

# save flattened metrics
flattened_google_trends_df, flattened_google_trends_filepath = fe.save_flattened_outputs(
    flattened_google_trends_df,
    os.path.join(
        modeling_config['modeling']['modeling_folder'],  # Folder to store flattened outputs
        'outputs/flattened_outputs'
    ),
    value_column_config['description'],  # Descriptive metadata for the dataset
    config['training_data']['modeling_period_start']  # Ensure data starts from modeling period
)

# preprocess metrics
google_trends_preprocessed_df, google_trends_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_google_trends_filepath
    ,modeling_config
    ,value_column_config
    ,value_column_metrics_config
)

# Pair the path portion after 'preprocessed_outputs/' with the configured fill method
google_trends_tuple = (google_trends_preprocessed_filepath.split('preprocessed_outputs/')[1], value_column_config['fill_method'])

In [None]:
# Display the dataset_config looked up in an earlier cell
dataset_config

In [None]:
# Display the macro_trends dataset settings for dataset_name straight from config
config['datasets']['macro_trends'][dataset_name]

In [None]:
# Refresh the project modules and configs so this cell reflects the latest source edits
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# set up config variables
dataset_category = 'macro_trends'
dataset_name = 'google_trends'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# load dataset
google_trends_df = td.retrieve_google_trends_data()


# calculate and merge all metrics in the config
all_metrics = []
for key, value_column_metrics_config in dataset_metrics_config.items():
    # Isolate the date column plus the single value column this config entry describes
    metric_df = google_trends_df[['date',key]]

    # check if there are any time series indicators to add, e.g. sma, ema, etc
    if 'indicators' in value_column_metrics_config:
        value_column_metrics_df, _ = cwm.generate_time_series_indicators(
            metric_df,
            config,
            value_column_metrics_config,
            key,
            id_column=None
        )

    else:
        # if no indicators are needed, pass through the series with complete date coverage.
        # The root logger is quietened for the duration of the call; `finally` guarantees
        # it returns to INFO even if the split raises.
        # NOTE(review): metric_df only holds 'date' and the metric column, so
        # id_column='coin_id' looks wrong for macro data (the indicator branch passes
        # id_column=None) - confirm what split_dataframe_by_coverage accepts.
        logging.getLogger().setLevel(logging.WARNING)
        try:
            value_column_metrics_df, _ = cwm.split_dataframe_by_coverage(
                metric_df,  # was the undefined name `value_column_df`
                config['training_data']['training_period_start'],
                config['training_data']['training_period_end'],
                id_column='coin_id'
            )
        finally:
            logging.getLogger().setLevel(logging.INFO)

    # Collect the computed metrics; appending the raw metric_df here would silently
    # drop whatever the branch above produced
    all_metrics.append(value_column_metrics_df)

# Outer-join every metric dataframe on date so no date is lost
all_metrics_df = all_metrics[0]
for metrics_df in all_metrics[1:]:
    all_metrics_df = pd.merge(all_metrics_df, metrics_df, on='date', how='outer')


# flatten metrics
flattened_features = fe.flatten_date_features(all_metrics_df,dataset_metrics_config)
# Wrap the flattened features in a list so they form a one-row dataframe
flattened_google_trends_df = pd.DataFrame([flattened_features])

# save flattened metrics
flattened_google_trends_df, flattened_google_trends_filepath = fe.save_flattened_outputs(
    flattened_google_trends_df,
    os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs'),
    dataset_config['description'],
    config['training_data']['modeling_period_start']
)

# preprocess metrics
google_trends_preprocessed_df, google_trends_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_google_trends_filepath
    ,modeling_config
    ,dataset_config
    ,dataset_metrics_config
)

# Pair the path portion after 'preprocessed_outputs/' with the configured fill method
google_trends_tuple = (google_trends_preprocessed_filepath.split('preprocessed_outputs/')[1], dataset_config['fill_method'])

## Junkyard

## tests failing