In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# Load variables from a local .env file (if one exists) into the process environment
load_dotenv()

# Show floats with up to 12 significant digits in DataFrame output
# (undo with: pd.reset_option('display.float_format'))
pd.set_option('display.float_format', '{:.12g}'.format)

# Dark mode charts: dark gray backgrounds with a light green-gray foreground
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as ri
import feature_engineering as fe
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import modeling as m
import insights.analysis as ia
import insights.model_input_flows as mf
import utils as u


# Local modules that later cells re-import so on-disk edits are picked up
# without restarting the kernel
modules = [dr, ri, fe, cwm, ind, m, ia, mf, u]
for module in modules:
    importlib.reload(module)

# Load the four config objects from the shared config folder
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Create the notebook logger and show INFO-level messages and above
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Modeling Sequence

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
logger.setLevel(logging.INFO)


# Generate time_windows config overrides that will modify each window's config settings
time_windows = mf.generate_time_windows(config)

# One bucket per data split, keyed by the name used in each window's model_data dict
split_buckets = {
    'X_train': [],
    'X_test': [],
    'y_train': [],
    'y_test': [],
    'returns_test': [],
}

for n, window in enumerate(time_windows):

    model_data = mf.build_time_window_model_input(n, window, config, metrics_config, modeling_config)

    # File this window's pieces into their matching buckets
    for split_name, bucket in split_buckets.items():
        bucket.append(model_data[split_name])

# Expose the buckets under the list names the rest of the notebook uses
X_train_list, X_test_list = split_buckets['X_train'], split_buckets['X_test']
y_train_list, y_test_list = split_buckets['y_train'], split_buckets['y_test']
returns_test_list = split_buckets['returns_test']

# Stack every window's rows into one object per split
X_train = pd.concat(X_train_list, axis=0)
X_test = pd.concat(X_test_list, axis=0)
y_train = pd.concat(y_train_list, axis=0)
y_test = pd.concat(y_test_list, axis=0)
returns_test = pd.concat(returns_test_list, axis=0)

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
logger.setLevel(logging.INFO)


# 3.4 Train the model on the concatenated training data with the current modeling config
model, model_id = m.train_model(X_train, y_train, modeling_config)

# 3.5 Evaluate the trained model on the held-out test data
# (per the original note, evaluate_model also saves the performance to a CSV)
metrics_dict, y_pred, y_pred_prob = m.evaluate_model(
    model,
    X_test,
    y_test,
    model_id,
    returns_test,
    modeling_config,
)

# Display the evaluation metrics
metrics_dict

In [None]:
# Pair each training column with the importance the fitted model assigned to it
features = X_train.columns
feature_importances = model.feature_importances_

# One row per feature, most important first
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Show the 20 most important features
importance_df.head(20)

In [None]:
# Show the complete feature-importance table (the previous cell shows only the top 20 rows).
importance_df

In [None]:
# Pick up on-disk edits to the local modules
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# An explicit None check is required: `y_pred_prob or y_pred` calls bool() on the
# predictions, which raises "truth value ... is ambiguous" for an array or Series
# with more than one element.
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


# Pass the predictions, the test-set returns and the configured cutoff to the insights helper
ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

In [None]:
# Pick up on-disk edits to the local modules
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# An explicit None check is required: `y_pred_prob or y_pred` calls bool() on the
# predictions, which raises "truth value ... is ambiguous" for an array or Series
# with more than one element.
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


# Pass the predictions, the test-set returns and the configured cutoff to the insights helper
ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

#### Global market data (`global_market_data`) from CoinGecko

In [None]:
# Read the raw CoinGecko export, turn the millisecond epoch column `snapped_at`
# into a `date` column, and keep only the three columns the formatted file needs
df = (
    pd.read_csv("/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/CoinGecko-GlobalCryptoMktCap-2024-10-04.csv")
    .assign(date=lambda raw: pd.to_datetime(raw['snapped_at'], unit='ms'))
    .drop(columns='snapped_at')
    [['date','market_cap','total_volume']]
)

# Write the reformatted table next to the other formatted macro-trend files
df.to_csv("/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/formatted/crypto_global_market.csv", index=False)

### bitcoin indicators

In [None]:
import os
import pandas as pd

def load_and_process_bitcoin_data(directory_path):
    """
    Combine every CSV in a directory into one date-indexed DataFrame.

    Each CSV must contain a 'DateTime' column, which is parsed and used as the
    index. The files are joined column-wise on that index with an outer join,
    duplicated column names are reduced to their first occurrence, and the known
    indicator columns are renamed to snake_case.

    Files are processed in sorted filename order so that the resulting column
    order (and which copy of a duplicated column is kept) is the same on every
    run; os.listdir alone returns names in arbitrary order.

    Params:
    - directory_path (str): directory containing the indicator CSV files.

    Returns:
    - combined_df (pd.DataFrame): all indicator columns, indexed by 'date'.

    Raises:
    - ValueError: if the directory contains no '.csv' files.
    """
    csv_filenames = sorted(
        filename for filename in os.listdir(directory_path)
        if filename.endswith('.csv')
    )
    if not csv_filenames:
        # Fail with a message that names the directory instead of pd.concat's
        # generic "No objects to concatenate"
        raise ValueError(f"No CSV files found in '{directory_path}'")

    # Load each file and index it by its parsed 'DateTime' column
    dfs = []
    for filename in csv_filenames:
        indicator_df = pd.read_csv(os.path.join(directory_path, filename))
        indicator_df['DateTime'] = pd.to_datetime(indicator_df['DateTime'])
        dfs.append(indicator_df.set_index('DateTime'))

    # Join all dataframes side by side, keeping every timestamp seen in any file
    combined_df = pd.concat(dfs, axis=1, join='outer')

    # Remove duplicate columns (e.g. 'BTC price' repeated across files), keeping the first
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    # Source column name -> snake_case name. The keys must match the CSV headers
    # exactly; 'Ajusted' is presumably the spelling used in the source file - verify.
    rename_dict = {
        'BTC price': 'btc_price',
        'CDD Terminal Ajusted 90dma': 'cdd_terminal_adjusted_90dma',
        'Fear and Greed': 'fear_and_greed',
        'MVRV Z-Score': 'mvrv_z_score',
        'VDD Multiple': 'vdd_multiple'
    }
    combined_df = combined_df.rename(columns=rename_dict)

    # Rename the index (DateTime column)
    combined_df.index.name = 'date'

    return combined_df

# Example usage:
# df = load_and_process_bitcoin_data('/path/to/csv/directory')
# print(df.head())

# Load the combined indicators and drop the partial most-recent records by
# keeping index labels up to and including 2024-10-02
df = load_and_process_bitcoin_data(
    '/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/bitcoin_macro_indicators'
).loc[:'2024-10-02']

# Save with the date index included as a column
df.to_csv("/Users/jeremymeadow/DreamsData/Local/datasets/macro_trends/formatted/bitcoin_indicators.csv", index=True)

## Time Window Sequencing

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
logger.setLevel(logging.INFO)


# # Calculate overall start and end dates
# end_date = pd.to_datetime(config['training_data']['modeling_period_end'])

# window_duration = config['training_data']['modeling_period_duration'] + config['training_data']['training_period_duration']
# window_count = config['training_data']['additional_windows'] + 1
# total_days = window_duration * window_count
# start_date = pd.to_datetime(end_date) - timedelta(days=total_days)

# Five independent empty lists that will hold each window's data
X_train_list, X_test_list, y_train_list, y_test_list, returns_test_list = (
    [] for _ in range(5)
)

# Generate time_windows config overrides that will modify each window's config settings
time_windows = mf.generate_time_windows(config)

# for n, window in enumerate(time_windows):

#     model_data = mf.build_time_window_model_input(n, window, config, metrics_config, modeling_config)

# Work on the first window only while the per-window steps are developed below
n = 0
window = time_windows[n]

### 1. Market Data resequencing

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# Take the first window's overrides from the freshly loaded base config
n = 0
time_windows = mf.generate_time_windows(config)
window = time_windows[n]

# Prepare the full configuration by applying that window's overrides
config_folder = modeling_config['modeling']['config_folder']
config, metrics_config, modeling_config = mf.prepare_configs(config_folder, window)

# Regenerate the windows from the overridden config and re-select the first one
time_windows = mf.generate_time_windows(config)
window = time_windows[n]

# Retrieve market data and clean it according to the config
market_data_df = dr.clean_market_data(dr.retrieve_market_data(), config)
# market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
# prices_df = market_data_df[['coin_id','date','price']].copy()

# Keep an untouched copy for later cells, then show the frame's dimensions
market_data_df_full = market_data_df.copy()
market_data_df.shape

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs



def sample_timeseries_ema1():
    """
    Fixture to provide a sample time series for EMA calculation in test_ema_scenario1.
    """
    return pd.Series([1, 2, 3, 4, 5, 6])
# Replace the fixture function with its value, standing in for pytest's fixture injection
sample_timeseries_ema1 = sample_timeseries_ema1()

# Unit test for calculating Exponential Moving Average (EMA) for a normal case.
#
# Scenario: Calculate EMA for a time series [1, 2, 3, 4, 5, 6] with a window of 3.
# The EMA should apply exponential weighting to each value, giving more weight to recent values.

# Step-by-step logic for EMA calculation:
# - The first two values should be NaN due to insufficient data.
# - From the third value onwards, use the EMA formula:
#   EMA(current) = alpha * current_price + (1 - alpha) * EMA(previous)
#   where alpha = 2 / (window + 1) = 2 / (3 + 1) = 0.5
#
#   Step 3: EMA(3) = 0.5 * 3 + 0.5 * 1.5 = 2.25
#   Step 4: EMA(4) = 0.5 * 4 + 0.5 * 2.25 = 3.125
#   Step 5: EMA(5) = 0.5 * 5 + 0.5 * 3.125 = 4.0625
#   Step 6: EMA(6) = 0.5 * 6 + 0.5 * 4.0625 = 5.03125
#
# Keep the expected values as plain floats: a pytest.approx object inside the Series
# forces object dtype, which numeric comparisons such as np.allclose cannot handle.
# Any tolerance belongs in the comparison itself.
expected_ema = pd.Series([np.nan, np.nan, 2.25, 3.125, 4.0625, 5.03125])

# Call the function under test
result = ind.calculate_ema(sample_timeseries_ema1, 3)
result

In [None]:
expected_ema = pd.Series([np.nan, np.nan, 2.25, 3.125, 4.0625, 5.03125])

# Compare from the third value onwards with a small absolute tolerance rather than
# exact float equality, so floating-point rounding in the EMA cannot fail the check
# spuriously. On failure this also reports which elements differ.
np.testing.assert_allclose(result[2:], expected_ema[2:], atol=1e-4)

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# Inputs for the prototype: the indicator settings configured for the price column
# time_series_df = market_data_df[['date','coin_id','price']].copy()
value_column_indicators_config = metrics_config['time_series']['market_data']['price']['indicators']
value_column = 'price'
id_column = 'coin_id'


# Start from the untouched copy so re-running this cell does not stack columns or
# re-index an already-indexed frame
market_data_df = market_data_df_full.copy()

# 1. Set indices
# ----------------------
if id_column:
    # Multi-series data (e.g., multiple coins)
    market_data_df = market_data_df.sort_values(by=[id_column, 'date']).set_index([id_column, 'date'])
else:
    # Single time series data
    market_data_df = market_data_df.sort_values(by=['date']).set_index('date')


def _indicator_per_series(calculate, w):
    """
    Apply calculate(series, w) to the value column.

    With an id_column the calculation runs separately within each id group, so the
    windows of one series never include rows from another; without one it runs on
    the whole column.
    """
    values = market_data_df[value_column]
    if id_column:
        return values.groupby(level=id_column, observed=True).transform(
            lambda x: calculate(x, w))
    return calculate(values, w)


# 2. Indicator Calculations
# ----------------------
# Indicators that share the same "one output column per requested window" shape
windowed_indicators = {
    'sma': ind.calculate_sma,
    'ema': ind.calculate_ema,
    'rsi': ind.calculate_rsi,
}

for indicator, indicator_config in value_column_indicators_config.items():
    if indicator in windowed_indicators:
        # retrieve the list of requested windows and make a column for each
        for w in indicator_config['parameters']['window']:
            market_data_df[f"{value_column}_{indicator}_{w}"] = _indicator_per_series(
                windowed_indicators[indicator], w)

    elif indicator == 'bollinger_bands_upper':
        # TODO: this branch was left unfinished; skip it with a warning so the rest
        # of the cell still runs
        logger.warning("Indicator '%s' is not implemented yet; skipping.", indicator)


# # Logging
# logger.debug("Generated time series indicators data.")

# return full_indicators_df, partial_time_series_indicators_df

In [None]:
# Preview the first 20 rows of market_data_df as left by the indicator cell above.
market_data_df.head(20)

### indicators implementation

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# Build the working frame from the untouched market data rather than from
# market_data_df: the indicator prototype cell above re-indexes market_data_df, so
# calling set_index on it again would raise a KeyError on a clean top-to-bottom run.
# set_index returns a new frame, so market_data_df_full is not modified.
df = (
    market_data_df_full
    .sort_values(by=['coin_id', 'date'])
    .set_index(['coin_id', 'date'])
)


# Add Relative Strength Index (RSI), computed within each coin over a 14-row window
df['rsi'] = df.groupby(level='coin_id', observed=True)['price'].transform(
    lambda x: ind.calculate_rsi(x, 14))
# Add Money Flow Index (MFI)
df = ind.add_mfi_column(df)

# Calculate MACD with EMAs: per-coin 12- and 26-period EMAs, then their crossover
df['ema_12'] = df.groupby(level='coin_id', observed=True)['price'].transform(lambda x: ind.calculate_ema(x, 12))
df['ema_26'] = df.groupby(level='coin_id', observed=True)['price'].transform(lambda x: ind.calculate_ema(x, 26))
df = ind.add_crossover_column(df, 'ema_12', 'ema_26', drop_col1=True, drop_col2=True)

# Add Bollinger Bands
df = ind.add_bollinger_bands(df, include_middle=False)
# Add crossover for price and upper band
df = ind.add_crossover_column(df, 'price', 'bollinger_band_upper', drop_col1=False, drop_col2=True)
# Add crossover for price and lower band
df = ind.add_crossover_column(df, 'price', 'bollinger_band_lower', drop_col1=False, drop_col2=True)

# Calculate OBV
# NOTE(review): unlike RSI and the EMAs, this is not grouped by coin_id here - confirm
# whether generalized_obv separates coins itself
df['obv_price_volume'] = ind.generalized_obv(df['price'],df['volume'])


df.head()

## Junkyard

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

def apply_mfi(group):
    """
    Add an 'mfi' column to one coin's rows.

    The group is flattened to ordinary columns first so the multi-index does not
    interfere with the calculation, then re-indexed by (coin_id, date).
    """
    flat = group.reset_index()
    flat['mfi'] = ind.calculate_mfi(flat['price'], flat['volume'])
    return flat.set_index(['coin_id', 'date'])

# Run the MFI calculation separately for every coin and reassemble the results
per_coin = df.groupby('coin_id', observed=True, group_keys=False)
df = per_coin.apply(apply_mfi)

# Display the updated DataFrame with the MFI column
df.head()

In [None]:
# Preview the first 20 rows of df after the per-coin MFI step in the previous cell.
df.head(20)

In [None]:
# Pick up on-disk edits to the local modules and config files
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Try the crossover helper on a copy holding only the two EMA columns, so df itself
# is left unchanged
df2 = ind.add_crossover_column(
    df[['ema_12','ema_26']].copy(),
    'ema_12',
    'ema_26',
    drop_col1=True,
    drop_col2=True,
)
df2.head()

In [None]:
def identify_crossovers(series1, series2):
    """
    Identify crossovers between two time series.

    This function calculates the points where series1 crosses over series2.
    It handles NaN values by converting them to 0.

    Parameters:
    series1 (array-like): The first time series
    series2 (array-like): The second time series

    Returns:
    numpy.ndarray: An array of the same length as the input series, where:
        0 indicates no crossover
        1 indicates an upward crossover (series1 crosses above series2)
        -1 indicates a downward crossover (series1 crosses below series2)
    """
    diff = series1 - series2

    # Handle NaN values: a NaN difference is treated as "no difference"
    diff = np.nan_to_num(diff, nan=0.0)

    # Initialize crossovers array (0 = no crossover)
    crossovers = np.zeros(len(series1))

    # A crossover is any position whose sign differs from the previous position's sign
    signs = np.sign(diff)
    sign_changes = signs[1:] != signs[:-1]
    crossover_indices = np.where(sign_changes)[0] + 1

    # Assign 1 where the new sign is positive, -1 otherwise.
    # NOTE(review): a change to a sign of exactly 0 (equal values, or NaN converted
    # to 0) is therefore labelled -1 - confirm this is intended.
    crossovers[crossover_indices] = np.where(signs[crossover_indices] > 0, 1, -1)

    # Previously missing: without this the function returned None
    return crossovers


In [None]:
# Inspect the two EMA columns that the crossover prototype in the next cell compares.
df[['ema_12','ema_26']]

In [None]:

# Expects `df` to carry a (coin_id, date) multi-index plus ema_12 and ema_26 columns

def apply_crossovers(group):
    """Store the ema_12 / ema_26 crossover result for one coin in a 'crossovers' column."""
    fast_ema, slow_ema = group['ema_12'], group['ema_26']
    group['crossovers'] = identify_crossovers(fast_ema, slow_ema)
    return group

# Run the crossover detection separately for every coin
per_coin = df.groupby('coin_id', group_keys=False)
df = per_coin.apply(apply_crossovers)

# Display the resulting DataFrame with the new 'crossovers' column
df

In [None]:
import pandas as pd



# Display the resulting DataFrame with the new 'crossovers' column
df

## Tests failing

In [None]:
def sample_timeseries_bollinger1():
    """
    Fixture to provide a sample timeseries for Bollinger Bands calculation in test_calculate_bollinger_bands_scenario1.
    """
    return pd.Series([100, 102, 104, 103, 101, 102, 103, 104, 105, 106, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98])
# Replace the fixture function with its value, standing in for pytest's fixture injection
sample_timeseries_bollinger1=sample_timeseries_bollinger1()

# def test_calculate_bollinger_bands_scenario1(sample_timeseries_bollinger1):
#     """
#     Unit test for calculating Bollinger Bands for a normal case.

#     Scenario: Calculate Bollinger Bands for a timeseries [100, 102, 104, 103, 101, ...] with a window of 5 and num_std of 2.

#     Expected Behavior: The function should return three Series: the middle band (SMA), upper band, and lower band,
#     correctly calculated using the provided window and standard deviation.
#     """

# Define window and standard deviation multiplier
# NOTE: this rebinds the notebook-level name `window`, which earlier cells use for a
# time-window config; those cells reassign it before use.
window = 5
num_std = 2

# Call the function under test
middle_band, upper_band, lower_band = ind.calculate_bollinger_bands(sample_timeseries_bollinger1, window=window, num_std=num_std)

# Manually calculate expected values for comparison
expected_middle_band = sample_timeseries_bollinger1.rolling(window=window).mean()
expected_std_dev = sample_timeseries_bollinger1.rolling(window=window).std()
expected_upper_band = expected_middle_band + (expected_std_dev * num_std)
expected_lower_band = expected_middle_band - (expected_std_dev * num_std)

# The first position with a full window of data is window - 1. Comparing from there
# (instead of from `window`) means no position is skipped between the NaN checks and
# the value checks below.
first_valid = window - 1

# Assert that the middle band, upper band, and lower band match expected values
assert np.allclose(middle_band[first_valid:], expected_middle_band[first_valid:], atol=1e-4), \
    f"Expected middle band values: {expected_middle_band.values}, but got {middle_band.values}"

assert np.allclose(upper_band[first_valid:], expected_upper_band[first_valid:], atol=1e-4), \
    f"Expected upper band values: {expected_upper_band.values}, but got {upper_band.values}"

assert np.allclose(lower_band[first_valid:], expected_lower_band[first_valid:], atol=1e-4), \
    f"Expected lower band values: {expected_lower_band.values}, but got {lower_band.values}"

# Ensure the first values before the window period are NaN (due to insufficient data)
assert middle_band[:first_valid].isna().all(), "Expected NaN values for middle band before the window period."
assert upper_band[:first_valid].isna().all(), "Expected NaN values for upper band before the window period."
assert lower_band[:first_valid].isna().all(), "Expected NaN values for lower band before the window period."



In [None]:
# Inspect the middle band returned by ind.calculate_bollinger_bands in the previous cell.
middle_band

In [None]:
# NOTE(review): `result_mfi` is not assigned in any cell visible here; this presumably
# relies on leftover kernel state and would raise NameError on a fresh run - confirm.
result_mfi

In [None]:
# NOTE(review): `expected_rsi` is not assigned in any cell visible here; this presumably
# relies on leftover kernel state and would raise NameError on a fresh run - confirm.
expected_rsi.values