In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# Pull environment variables from a local .env file into the process environment
load_dotenv()

# Custom format function for displaying numbers: up to 12 significant digits
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Dark mode charts: one background color for figure/axes, one color for all text elements
dark_mode_rc = {
    'figure.facecolor': '#181818',  # Custom background color (dark gray in this case)
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
}
for rc_key, rc_value in dark_mode_rc.items():
    plt.rcParams[rc_key] = rc_value

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as ri
import feature_engineering as fe
import coin_wallet_metrics as cwm
import modeling as m
import insights.analysis as ia
import insights.model_input_flows as mf
import utils as u


# reload all modules so edits made to the local source files are picked up by the kernel
modules = [dr, ri, fe, cwm, m, ia, mf, u]
for module in modules:
    importlib.reload(module)

# load all configs
(config,
 metrics_config,
 modeling_config,
 experiments_config) = u.load_all_configs('../config')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Modeling Sequence

In [None]:
for module in modules:  # Reload all modules
    importlib.reload(module)
(config,
 metrics_config,
 modeling_config,
 experiments_config) = u.load_all_configs('../config')  # Reload all configs
logger.setLevel(logging.INFO)


# One accumulator list per model-input split, keyed by the name used in model_data
split_names = ['X_train', 'X_test', 'y_train', 'y_test', 'returns_test']
window_splits = {split_name: [] for split_name in split_names}

# Generate time_windows config overrides that will modify each window's config settings
time_windows = mf.generate_time_windows(config)

for n, window in enumerate(time_windows):

    model_data = mf.build_time_window_model_input(n, window, config, metrics_config, modeling_config)

    # Append the current window's data to the matching accumulator
    for split_name in split_names:
        window_splits[split_name].append(model_data[split_name])


# Expose the per-split lists under their individual names
X_train_list, X_test_list = window_splits['X_train'], window_splits['X_test']
y_train_list, y_test_list = window_splits['y_train'], window_splits['y_test']
returns_test_list = window_splits['returns_test']

# Concatenate all the data for each part (rows from every window stacked together)
X_train = pd.concat(X_train_list, axis=0)
X_test = pd.concat(X_test_list, axis=0)
y_train = pd.concat(y_train_list, axis=0)
y_test = pd.concat(y_test_list, axis=0)
returns_test = pd.concat(returns_test_list, axis=0)

In [None]:
# Dimensions of the training features concatenated across all time windows
X_train.shape

In [None]:
for module in modules:  # Reload all modules
    importlib.reload(module)
(config,
 metrics_config,
 modeling_config,
 experiments_config) = u.load_all_configs('../config')  # Reload all configs
logger.setLevel(logging.INFO)


# 3.4 Train the model using the current configuration and log the results
model, model_id = m.train_model(X_train, y_train, modeling_config)

# 3.5 Evaluate and save the model performance on the test set to a CSV
evaluation_outputs = m.evaluate_model(
    model,
    X_test,
    y_test,
    model_id,
    returns_test,
    modeling_config,
)
metrics_dict, y_pred, y_pred_prob = evaluation_outputs

metrics_dict

In [None]:
# Pair each training column with the importance score exposed by the fitted model
feature_importances = model.feature_importances_
features = X_train.columns  # Feature names

# Build the feature/importance table and rank it from most to least important
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
importance_df.head(20)

In [None]:
# Display the full sorted importance table (the previous cell showed only the top 20 rows)
importance_df

In [None]:
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# Use an explicit None check: `y_pred_prob or y_pred` evaluates the truth value of
# y_pred_prob, which raises ValueError when it is a multi-element numpy array or
# pandas Series.
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


# Chart profitability using the predictions, the test-set returns and the configured cutoff
ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

In [None]:
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# Use an explicit None check: `y_pred_prob or y_pred` evaluates the truth value of
# y_pred_prob, which raises ValueError when it is a multi-element numpy array or
# pandas Series.
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


# Chart profitability using the predictions, the test-set returns and the configured cutoff
ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

#### Global market data from CoinGecko

In [None]:
# Source export and destination of the reformatted global market data
raw_global_market_path = "/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/CoinGecko-GlobalCryptoMktCap-2024-10-04.csv"
formatted_global_market_path = "/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/formatted/crypto_global_market.csv"

df = pd.read_csv(raw_global_market_path)

# 'snapped_at' is parsed as epoch milliseconds into a proper datetime column
df['date'] = pd.to_datetime(df['snapped_at'], unit='ms')

# Keeping only the three output columns also discards the raw 'snapped_at' column
df = df[['date','market_cap','total_volume']]

df.to_csv(formatted_global_market_path, index=False)

### Bitcoin indicators

In [None]:
import os
import pandas as pd

def load_and_process_bitcoin_data(directory_path):
    """
    Load every CSV in a directory and combine them column-wise into one date-indexed dataframe.

    Each CSV is expected to contain a 'DateTime' column, which is parsed to datetime and used
    as the index. The files are outer-joined on that index, columns whose names repeat across
    files (e.g. 'BTC price') are kept once, and the known indicator columns are renamed to
    snake_case.

    Params:
    - directory_path (str): directory containing the indicator CSV files. Files without a
        '.csv' extension are ignored.

    Returns:
    - combined_df (pd.DataFrame): one row per DateTime value found in any file, indexed by
        'date', with one column per distinct column name.

    Raises:
    - ValueError: if the directory contains no '.csv' files.
    """
    # Sort the filenames so that column order, and which copy of a duplicated column is kept,
    # does not depend on the arbitrary ordering returned by os.listdir
    csv_filenames = sorted(
        filename for filename in os.listdir(directory_path)
        if filename.endswith('.csv')
    )
    if not csv_filenames:
        raise ValueError(f"No CSV files found in directory '{directory_path}'.")

    # Load each file with its 'DateTime' column parsed and promoted to the index
    dfs = []
    for filename in csv_filenames:
        file_df = pd.read_csv(os.path.join(directory_path, filename))
        file_df['DateTime'] = pd.to_datetime(file_df['DateTime'])
        dfs.append(file_df.set_index('DateTime'))

    # Join all dataframes side by side, keeping every date that appears in any file
    combined_df = pd.concat(dfs, axis=1, join='outer')

    # Remove duplicate columns such as 'BTC price' (the first occurrence is kept)
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    # Source column name -> output column name; keys must match the CSV headers exactly
    rename_dict = {
        'BTC price': 'btc_price',
        'CDD Terminal Ajusted 90dma': 'cdd_terminal_adjusted_90dma',
        'Fear and Greed': 'fear_and_greed',
        'MVRV Z-Score': 'mvrv_z_score',
        'VDD Multiple': 'vdd_multiple'
    }
    combined_df = combined_df.rename(columns=rename_dict)

    # Rename the index (DateTime column)
    combined_df.index.name = 'date'

    return combined_df

# Example usage:
# df = load_and_process_bitcoin_data('/path/to/csv/directory')
# print(df.head())

bitcoin_indicators_dir = '/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/bitcoin_macro_indicators'
df = load_and_process_bitcoin_data(bitcoin_indicators_dir)

# remove partial recent records by keeping rows up to and including the last complete date
last_complete_date = '2024-10-02'
df = df.loc[:last_complete_date]

# the 'date' index is written out as the first CSV column
bitcoin_indicators_output_path = "/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/formatted/bitcoin_indicators.csv"
df.to_csv(bitcoin_indicators_output_path, index=True)

## Macro Trends

In [None]:
for module in modules:  # Reload all modules
    importlib.reload(module)
(config,
 metrics_config,
 modeling_config,
 experiments_config) = u.load_all_configs('../config')  # Reload all configs



# Retrieve data, then derive the macro trends features from it
macro_trends_df = dr.retrieve_macro_trends_data()
macro_trends_df = cwm.generate_macro_trends_features(macro_trends_df, config)

# Only the first element of the returned pair is kept
macro_trends_outputs = fe.generate_macro_trends_features(
    macro_trends_df, config, metrics_config, modeling_config
)
macro_trends_tuples, _ = macro_trends_outputs

macro_trends_tuples

In [None]:
# Inspect the currently loaded config
config

In [None]:
for module in modules:  # Reload all modules
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# Work on a copy with the index moved back into a column so 'date' can be selected below
dataset_df = macro_trends_df.copy().reset_index()


# function variables we want to reference
dataset_category = 'macro_trends'
id_column=None
dataset_metrics_config = metrics_config[dataset_category]

# Model-ready tuples and dfs will go here
training_data_tuples = []
training_data_dfs = []


# calculate metrics for each value column
for value_column in list(dataset_metrics_config):

    # a value_column-specific df will be used for feature generation
    value_column_config = config['datasets'][dataset_category][value_column]
    value_column_metrics_config = dataset_metrics_config[value_column]
    value_column_df = dataset_df[['date',value_column]].copy()

    # check if there are any time series indicators to add, e.g. sma, ema, etc
    if 'indicators' in value_column_metrics_config:
        value_column_metrics_df, _ = cwm.generate_time_series_indicators(
            value_column_df,
            config,
            value_column_metrics_config['indicators'],
            value_column,
            id_column
        )

    else:
        # if no indicators are needed, pass through coins with complete date coverage.
        # INFO logs about splits are suppressed on the root logger for the duration of the
        # call; the previous level is restored afterwards, even if the call raises, so the
        # cell does not leave the root logger at a different level than it found it.
        root_logger = logging.getLogger()
        original_root_level = root_logger.level
        root_logger.setLevel(logging.WARNING)
        try:
            value_column_metrics_df, _ = cwm.split_dataframe_by_coverage(
                value_column_df,
                config['training_data']['training_period_start'],
                config['training_data']['training_period_end'],
                id_column
            )
        finally:
            root_logger.setLevel(original_root_level)


    # flatten metrics into a single-row dataframe
    flattened_features = fe.flatten_date_features(value_column_metrics_df,dataset_metrics_config)
    flattened_macro_trends_df = pd.DataFrame([flattened_features])

    # save flattened metrics
    flattened_macro_trends_df, flattened_macro_trends_filepath = fe.save_flattened_outputs(
        flattened_macro_trends_df,
        os.path.join(
            modeling_config['modeling']['modeling_folder'],  # Folder to store flattened outputs
            'outputs/flattened_outputs'
        ),
        value_column_config['description'],  # Descriptive metadata for the dataset
        config['training_data']['modeling_period_start']  # Ensure data starts from modeling period
    )

    # preprocess metrics
    macro_trends_preprocessed_df, macro_trends_preprocessed_filepath = fe.preprocess_coin_df(
        flattened_macro_trends_filepath
        ,modeling_config
        ,value_column_config
        ,value_column_metrics_config
    )

    # keep the part of the path after 'preprocessed_outputs/' together with the fill method
    macro_trends_tuple = (
        macro_trends_preprocessed_filepath.split('preprocessed_outputs/')[1],
        value_column_config['fill_method']
    )
    logger.info('Generated features for %s.%s',
                dataset_category, value_column)

    training_data_tuples.append(macro_trends_tuple)
    training_data_dfs.append(macro_trends_preprocessed_df)




In [None]:
# (preprocessed file path suffix, fill_method) tuples collected by the value-column loop
training_data_tuples

In [None]:
# Metrics dataframe left over from the last value_column processed by the loop
value_column_metrics_df

In [None]:
for module in modules:  # Reload all modules
    importlib.reload(module)
(config,
 metrics_config,
 modeling_config,
 experiments_config) = u.load_all_configs('../config')  # Reload all configs
logger.setLevel(logging.INFO)


training_data_tuples = []

# 1. Generate and merge features for all datasets
# -------------------------------------

# Macro trends features (only the first element of the returned pair is used)
macro_trends_outputs = fe.generate_macro_trends_features(
    macro_trends_df, config, metrics_config, modeling_config
)
macro_trends_tuples, _ = macro_trends_outputs
training_data_tuples.extend(macro_trends_tuples)

# Merge all the features
modeling_folder_path = modeling_config['modeling']['modeling_folder']
training_data_df, _ = fe.create_training_data_df(modeling_folder_path, training_data_tuples)



## CSV Uploads

## Junkyard

In [None]:
# Inspect whatever training_data_tuples currently holds in the kernel
training_data_tuples

In [None]:
# NOTE(review): google_trends_df is only assigned in a later cell of this notebook, so on a
# fresh kernel this cell depends on out-of-order execution
google_trends_df.tail()

In [None]:
# NOTE(review): google_trends_df is only assigned in a later cell of this notebook, so on a
# fresh kernel this cell depends on out-of-order execution
google_trends_df.head()

In [None]:
# NOTE(review): dataset_name is first assigned in a later cell, so this lookup relies on
# leftover kernel state when the notebook is run top to bottom
dataset_config = config['datasets']['macro_trends'][dataset_name]

In [None]:
# Inspect the metrics config subset currently bound to dataset_metrics_config
dataset_metrics_config

In [None]:
# NOTE(review): stale scratch cell. `td` and `i` are not bound by the visible import cell
# (which binds dr, ri, fe, cwm, m, ia, mf, u), so these reloads raise NameError on a fresh
# kernel unless those aliases are defined somewhere not shown — confirm before reuse.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
# Each config file is loaded individually here instead of through u.load_all_configs
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

# set parameters
dataset_name = 'google_trends'
# NOTE(review): google_trends_df is not assigned in this cell; it must already exist in the kernel
dataset_df = google_trends_df
# NOTE(review): the next three lines are bare expression statements (the trailing commas
# just build throwaway 1-tuples); they assign nothing and have no effect
config,
metrics_config,
modeling_config

# NOTE(review): this call passes five positional arguments starting with dataset_name, while
# the other calls to fe.generate_macro_trends_features in this notebook pass four starting
# with the dataframe — this looks like an outdated signature; verify against the module.
training_data_tuples, training_data_dfs = fe.generate_macro_trends_features(
        dataset_name,
        dataset_df,
        config,
        metrics_config,
        modeling_config
    )

In [None]:
# Inspect the second element of training_data_dfs
training_data_dfs[1]

In [None]:
# Inspect the metrics config entry for the dataset currently named by dataset_name
metrics_config['macro_trends'][dataset_name]

In [None]:
# Look up and display the config entry for the dataset currently named by dataset_name
dataset_config = config['datasets']['macro_trends'][dataset_name]
dataset_config

In [None]:
# NOTE(review): this cell assigns none of its inputs. value_column_metrics_df,
# dataset_metrics_config, value_column_config and value_column_metrics_config must already
# exist in the kernel from previously executed cells, so results depend on execution order.

# flatten metrics
flattened_features = fe.flatten_date_features(value_column_metrics_df,dataset_metrics_config)
# Wrap the flattened features in a single-row dataframe
flattened_google_trends_df = pd.DataFrame([flattened_features])

# save flattened metrics
flattened_google_trends_df, flattened_google_trends_filepath = fe.save_flattened_outputs(
    flattened_google_trends_df,
    os.path.join(
        modeling_config['modeling']['modeling_folder'],  # Folder to store flattened outputs
        'outputs/flattened_outputs'
    ),
    value_column_config['description'],  # Descriptive metadata for the dataset
    config['training_data']['modeling_period_start']  # Ensure data starts from modeling period
)

# preprocess metrics
google_trends_preprocessed_df, google_trends_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_google_trends_filepath
    ,modeling_config
    ,value_column_config
    ,value_column_metrics_config
)

# Pair the path portion after 'preprocessed_outputs/' with the configured fill method
google_trends_tuple = (google_trends_preprocessed_filepath.split('preprocessed_outputs/')[1], value_column_config['fill_method'])

In [None]:
# Inspect the dataset config currently bound in the kernel
dataset_config

In [None]:
# Inspect the config entry for the dataset currently named by dataset_name
config['datasets']['macro_trends'][dataset_name]

In [None]:
# NOTE(review): stale scratch cell. `td` and `i` are not bound by the visible import cell
# (which binds dr, ri, fe, cwm, m, ia, mf, u), so this cell raises NameError on a fresh
# kernel unless those aliases are defined somewhere not shown — confirm before reuse.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# set up config variables
dataset_category = 'macro_trends'
dataset_name = 'google_trends'
dataset_config = config['datasets'][dataset_category][dataset_name]
dataset_metrics_config = metrics_config[dataset_category][dataset_name]

# load dataset
google_trends_df = td.retrieve_google_trends_data()


# calculate and merge all metrics in the config
all_metrics = []
for key in list(dataset_metrics_config.keys()):
    value_column_metrics_config = metrics_config[dataset_category][dataset_name][key]
    # Two-column slice: 'date' plus the current value column
    metric_df = google_trends_df[['date',key]]

    # check if there are any time series indicators to add, e.g. sma, ema, etc
    if 'indicators' in value_column_metrics_config:
        # NOTE(review): the whole value_column_metrics_config is passed here, whereas the
        # macro trends loop earlier in this notebook passes
        # value_column_metrics_config['indicators'] — verify which one the function expects
        value_column_metrics_df, _ = cwm.generate_time_series_indicators(
            metric_df,
            config,
            value_column_metrics_config,
            key,
            id_column=None
        )

    else:
        # if no indicators are needed, pass through coins with complete date coverage
        logging.getLogger().setLevel(logging.WARNING)
        # NOTE(review): value_column_df is not assigned in this cell (metric_df is the local
        # slice), so this uses whatever an earlier cell left in the kernel; id_column='coin_id'
        # also differs from the id_column=None used in the branch above — confirm intent
        value_column_metrics_df, _ = cwm.split_dataframe_by_coverage(
            value_column_df,
            config['training_data']['training_period_start'],
            config['training_data']['training_period_end'],
            id_column='coin_id'
        )
        logging.getLogger().setLevel(logging.INFO)

    # NOTE(review): the raw metric_df is appended, so value_column_metrics_df computed above
    # is never used in this cell — presumably it was meant to be appended instead; verify
    all_metrics.append(metric_df)

# Outer-merge the per-column dataframes on 'date' into one wide dataframe
all_metrics_df = all_metrics[0]
for metrics_df in all_metrics[1:]:
    all_metrics_df = pd.merge(all_metrics_df, metrics_df, on='date', how='outer')


# flatten metrics
flattened_features = fe.flatten_date_features(all_metrics_df,dataset_metrics_config)
flattened_google_trends_df = pd.DataFrame([flattened_features])

# save flattened metrics
flattened_google_trends_df, flattened_google_trends_filepath = fe.save_flattened_outputs(
    flattened_google_trends_df,
    os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs'),
    dataset_config['description'],
    config['training_data']['modeling_period_start']
)

# preprocess metrics
google_trends_preprocessed_df, google_trends_preprocessed_filepath = fe.preprocess_coin_df(
    flattened_google_trends_filepath
    ,modeling_config
    ,dataset_config
    ,dataset_metrics_config
)

# Pair the path portion after 'preprocessed_outputs/' with the configured fill method
google_trends_tuple = (google_trends_preprocessed_filepath.split('preprocessed_outputs/')[1], dataset_config['fill_method'])

In [None]:
# NOTE(review): stale scratch cell. `td`, `i` and `modeling_folder` are not assigned anywhere
# in the visible notebook, so this cell raises NameError on a fresh kernel unless they are
# defined somewhere not shown — confirm before reuse.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# The coverage window runs from the training period start to the modeling period end
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Retrieve market data
market_data_df = td.retrieve_market_data()
# Only the first element returned by split_dataframe_by_coverage is kept
market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
prices_df = market_data_df[['coin_id','date','price']].copy()

# Retrieve profits data if necessary
# A profits_df already present in the kernel is passed through; otherwise None is passed
if 'profits_df' not in globals():
    profits_df = None
profits_df = i.rebuild_profits_df_if_necessary(
                config,
                modeling_folder,
                prices_df,
                profits_df)

# Filter market_data rows without transfers if configured to do so
if config['data_cleaning']['exclude_coins_without_transfers']:
    # Keep only coins whose coin_id appears in profits_df, then rebuild prices_df to match
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]
    prices_df = market_data_df[['coin_id','date','price']].copy()



## tests failing