In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# load dotenv
load_dotenv()

# Custom float format for displaying numbers (up to 12 significant digits)
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Dark mode charts
plt.rcParams['figure.facecolor'] = '#181818'  # Custom background color (dark gray in this case)
plt.rcParams['axes.facecolor'] = '#181818'
plt.rcParams['text.color'] = '#afc6ba'
plt.rcParams['axes.labelcolor'] = '#afc6ba'
plt.rcParams['xtick.color'] = '#afc6ba'
plt.rcParams['ytick.color'] = '#afc6ba'
plt.rcParams['axes.titlecolor'] = '#afc6ba'

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp


# reload all modules
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp]
[importlib.reload(module) for module in modules]

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Aggregate function

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

sets_X_y_dict, returns_df, join_logs_df = tw.generate_all_time_windows_model_inputs(config,metrics_config,modeling_config)

In [None]:
df = sets_X_y_dict['train_set'][0]
df.columns

In [None]:
# 1. Retrieve base datasets used by all windows
# ---------------------------------------------
macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(config,
                                                                                            metrics_config)


# 2. Generate flattened features for each dataset in each window
# --------------------------------------------------------------
# Generate time_windows config overrides that will modify each window's config settings
time_windows = tw.generate_time_windows(config)

all_flattened_dfs = []
all_flattened_filepaths = []

for _, time_window in enumerate(time_windows):

    # Prepare time window config files
    window_config, window_metrics_config, window_modeling_config = (
        exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window))

    # Generate flattened feature dfs for all datasets for the window
    window_flattened_dfs, window_flattened_filepaths = tw.generate_window_flattened_dfs(
        market_data_df,
        macro_trends_df,
        profits_df,
        prices_df,
        window_config,
        window_metrics_config,
        window_modeling_config
    )

    # Store window's flattened features
    all_flattened_dfs.extend(window_flattened_dfs)
    all_flattened_filepaths.extend(window_flattened_filepaths)



In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# 3. Combine features from all datasets in all time windows with target variables
# -------------------------------------------------------------------------------
# Combine all time windows for each dataset, then join the datasets together
concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)

# Create target variables for all time windows
target_variable_df, returns_df, = tw.create_target_variables_for_all_time_windows(training_data_df,
                                                                                    prices_df,
                                                                                    config,
                                                                                    modeling_config)

# Split target variables into the train/test/validation/future sets
sets_X_y_dict = ds.perform_train_test_validation_future_splits(training_data_df,
                                                                target_variable_df,
                                                                modeling_config)


## Preprocessing

In [None]:
df = sets_X_y_dict['train_set'][0]
df.columns

In [None]:
df.columns

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# Bundle the frames to preprocess, keyed by set name. Only the train features frame is
# passed here (`df` was taken from sets_X_y_dict['train_set'][0] in an earlier cell).
datasets = {
    'train': df
}

# Initialize and run the preprocessor
preprocessor = prp.DataPreprocessor(config, metrics_config, modeling_config)
preprocessed_datasets = preprocessor.preprocess(datasets)

# Print the resulting column names for each set.
# NOTE(review): this loop rebinds the notebook-level `df`, so after the cell runs `df` refers
# to the last value in preprocessed_datasets (presumably the preprocessed train frame) rather
# than the raw train frame. The cells that follow appear to rely on this — confirm.
for dataset_name, df in preprocessed_datasets.items():
    print(f"Columns in {dataset_name} set: {df.columns.tolist()}")

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Confirm there are no null values
if df.isnull().values.any():
    raise ValueError("Missing values detected in the DataFrame.")

# Convert all columns to numeric
df = prp.preprocess_categorical_and_boolean(df)

# Feature Selection
# Drop features listed under modeling_config['preprocessing']['drop_features'].
# DataFrame.drop only supports errors='raise' or errors='ignore', so the intended
# "warn" behaviour is implemented explicitly: log any configured names that are
# absent from the DataFrame, then drop the ones that exist.
drop_features = modeling_config['preprocessing'].get('drop_features') or []
missing_drop_features = [col for col in drop_features if col not in df.columns]
if missing_drop_features:
    logger.warning("Configured drop_features not found in the DataFrame: %s", missing_drop_features)
if drop_features:
    df = df.drop(columns=drop_features, errors='ignore')

# # Apply feature selection based on sameness_threshold and retain_columns from dataset_config
# sameness_threshold = dataset_config.get('sameness_threshold', 1.0)
# retain_columns = dataset_config.get('retain_columns', [])

# # Drop columns with more than `sameness_threshold` of the same value, unless in retain_columns
# for column in df.columns:
#     if column not in retain_columns:
#         max_value_ratio = df[column].value_counts(normalize=True).max()
#         if max_value_ratio > sameness_threshold:
#             df = df.drop(columns=[column])
#             logger.debug("Dropped column %s due to sameness_threshold", column)


# # Step 4: Scaling and Transformation
# # ----------------------------------------------------
# # Apply scaling if df_metrics_config is provided
# if df_metrics_config:
#     df = apply_scaling(df, df_metrics_config)




In [None]:
from typing import Dict, Any
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs



def calculate_sameness_percentage(column: pd.Series) -> float:
    """
    Calculate the share of rows taken up by the most common value in a column.

    Missing values are counted as a value of their own, so the numerator and the
    denominator are both measured over every row. A column that is mostly (or
    entirely) NaN is therefore reported as highly uniform instead of being
    under-counted or raising.

    Parameters:
    column (pd.Series): The column to analyze.

    Returns:
    float: The fraction (0 to 1) of rows holding the most common value. An empty
        column has no dominant value and returns 0.0.
    """
    total_rows = len(column)
    if total_rows == 0:
        return 0.0

    # value_counts sorts in descending order, so the first entry is the most common value
    counts = column.value_counts(dropna=False)
    return float(counts.iloc[0] / total_rows)

def create_prefix_mapping(config: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Create a mapping of column prefixes to their config paths and sameness thresholds.

    The 'datasets' section of the config is walked two levels deep. A category whose
    config directly contains 'sameness_threshold' is registered under the prefix
    "<category>_"; otherwise each of its subcategories that contains
    'sameness_threshold' is registered under "<subcategory>_". Entries that are not
    dictionaries (scalars, lists, None) cannot carry a threshold and are skipped.

    Parameters:
    config (Dict[str, Any]): The configuration dictionary containing a 'datasets' key.

    Returns:
    Dict[str, Dict[str, Any]]: A dictionary where keys are column prefixes and values are
    dictionaries containing 'path' (str, dotted config path) and 'threshold' (the configured
    sameness_threshold) for each prefix.
    """
    mapping = {}

    for dataset_type, dataset_config in config['datasets'].items():
        if not isinstance(dataset_config, dict):
            continue

        for category, category_config in dataset_config.items():
            if not isinstance(category_config, dict):
                continue

            # Threshold defined directly on the category
            if 'sameness_threshold' in category_config:
                mapping[f"{category}_"] = {
                    'path': f"datasets.{dataset_type}.{category}",
                    'threshold': category_config['sameness_threshold']
                }
                continue

            # Otherwise look one level deeper; only dict values can hold a threshold
            for subcategory, subcategory_config in category_config.items():
                if isinstance(subcategory_config, dict) and 'sameness_threshold' in subcategory_config:
                    mapping[f"{subcategory}_"] = {
                        'path': f"datasets.{dataset_type}.{category}.{subcategory}",
                        'threshold': subcategory_config['sameness_threshold']
                    }

    return mapping

def check_and_drop_columns(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
    """
    Check column sameness and drop columns exceeding the threshold.

    Each column is matched to the first configured prefix it starts with (see
    create_prefix_mapping). Its sameness percentage is then compared with that prefix's
    threshold, and columns whose sameness is strictly greater than the threshold are dropped.

    Parameters:
    df (pd.DataFrame): The input DataFrame to process. It is not modified.
    config (Dict[str, Any]): The configuration dictionary containing sameness thresholds.

    Returns:
    pd.DataFrame: A new DataFrame with columns dropped based on the sameness criteria.

    Raises:
    ValueError: If any columns can't be mapped to a sameness threshold or if any config keys
                can't be mapped to columns.
    """
    prefix_mapping = create_prefix_mapping(config)
    columns_to_drop = []
    unmapped_columns = []
    used_config_keys = set()

    for column in df.columns:
        mapped = False
        for prefix, config_info in prefix_mapping.items():
            if column.startswith(prefix):
                mapped = True
                used_config_keys.add(prefix)
                sameness = calculate_sameness_percentage(df[column])
                if sameness > config_info['threshold']:
                    columns_to_drop.append(column)
                # Only the first matching prefix is applied to a column
                break
        if not mapped:
            unmapped_columns.append(column)

    unused_config_keys = set(prefix_mapping.keys()) - used_config_keys

    if unmapped_columns:
        raise ValueError(f"The following columns could not be mapped to a sameness threshold: {unmapped_columns}")

    if unused_config_keys:
        raise ValueError(f"The following config keys could not be mapped to columns: {unused_config_keys}")

    # DataFrame.drop returns a new frame, so the result must be captured for the drop to take effect
    df = df.drop(columns=columns_to_drop)
    logger.info("Dropped %s columns %s due to sameness thresholds.", len(columns_to_drop), columns_to_drop)

    return df




df_cleaned = check_and_drop_columns(df, config)

In [None]:
df_cleaned

In [None]:
# Step 2: Convert categorical and boolean columns to integers
# ---------------------------------------------------------------
# One-hot encode every object/category column, one column at a time
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
for col in categorical_columns:
    num_categories = df[col].nunique()
    # High-cardinality columns produce many dummy columns, so flag them
    if num_categories > 8:
        logger.warning("Column '%s' has %s categories, consider reducing categories.",
                        col, num_categories)
    df = pd.get_dummies(df, columns=[col], drop_first=True)


# Cast boolean columns to integers and leave every other column untouched
df = df.apply(lambda series: series.astype(int) if series.dtype == bool else series)
df

## All windows datasets

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


def generate_all_time_windows_model_inputs(config,metrics_config,modeling_config):
    """
    Generates the X and y splits for all sets across all time windows.

    Sequence:
    1. Retrieve the base datasets that contain records across all windows
    2. Loop through each time window and generate flattened features for the window
    3a. Concat each dataset's window dfs, then join all the dataset dfs with the target variable to
        create a comprehensive feature set keyed on coin_id.
    3b. Split the full feature set into train/test/validation/future sets.

    Params:
    - config, metrics_config, modeling_config: loaded config yaml files

    Returns:
    - sets_X_y_dict (dict): Dict with keys for each set type (e.g. train_set, future_set, etc)
        that contains the X and y data for the set.
    - returns_df (pd.DataFrame): DataFrame with MultiIndex on time_window,coin_id that contains a
        'returns' column showing actual returns during each time_window's modeling period.
    - join_logs_df (pd.DataFrame): DataFrame showing the outcomes of each dataset's join and fill
        methods
    """

    # 1. Retrieve base datasets used by all windows
    # ---------------------------------------------
    macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(config,
                                                                                              metrics_config)


    # 2. Generate flattened features for each dataset in each window
    # --------------------------------------------------------------
    # Generate time_windows config overrides that will modify each window's config settings
    time_windows = tw.generate_time_windows(config)

    all_flattened_filepaths = []

    for time_window in time_windows:

        # Prepare time window config files
        window_config, window_metrics_config, window_modeling_config = (
            exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window))

        # Generate flattened feature dfs for all datasets for the window. Only the filepaths are
        # used below, so the in-memory dfs are not accumulated across windows.
        _, window_flattened_filepaths = tw.generate_window_flattened_dfs(
            market_data_df,
            macro_trends_df,
            profits_df,
            prices_df,
            window_config,
            window_metrics_config,
            window_modeling_config
        )

        # Store the filepaths of the window's flattened features
        all_flattened_filepaths.extend(window_flattened_filepaths)


    # 3. Combine features from all datasets in all time windows with target variables
    # -------------------------------------------------------------------------------
    # Combine all time windows for each dataset, then join the datasets together
    concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
    training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)

    # Create target variables for all time windows
    target_variable_df, returns_df, = tw.create_target_variables_for_all_time_windows(training_data_df,
                                                                                        prices_df,
                                                                                        config,
                                                                                        modeling_config)

    # Split target variables into the train/test/validation/future sets
    # NOTE(review): an earlier cell in this notebook calls this function as
    # ds.perform_train_test_validation_future_splits — confirm which module currently owns it.
    sets_X_y_dict = prp.perform_train_test_validation_future_splits(training_data_df,
                                                                    target_variable_df,
                                                                    modeling_config)

    return sets_X_y_dict, returns_df, join_logs_df

In [None]:
returns_df.head()

## Generate training data, target vars, and split sets

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Combine all time windows for each dataset, then join the datasets together
concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)

# Create target variables for all time windows
target_variable_df, returns_df, = tw.create_target_variables_for_all_time_windows(training_data_df,
                                                                                    prices_df,
                                                                                    config,
                                                                                    modeling_config)

sets_X_y_dict = prp.perform_train_test_validation_future_splits(training_data_df, target_variable_df, modeling_config)

## split sets

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

sets_X_y_dict = prp.perform_train_test_validation_future_splits(training_data_df, target_variable_df, modeling_config)

In [None]:
y_test

In [None]:
# Read the partitioning settings here so the cell does not depend on a later cell having run
data_partitioning_config = modeling_config['preprocessing']['data_partitioning']

unique_time_windows = training_data_df.index.get_level_values('time_window').unique()

# Select the last n time windows (in index order) as the future set; n == 0 disables it
n_future_windows = data_partitioning_config['future_set_time_windows']
if n_future_windows == 0:
    future_time_windows = []
else:
    future_time_windows = unique_time_windows[-n_future_windows:]
future_mask = training_data_df.index.get_level_values('time_window').isin(future_time_windows)

# Rows in the future windows form the future set; everything else remains available
# for the validation/train/test splits
X_future = training_data_df[future_mask]
y_future = target_variable_df[future_mask]
temp_training_data_df = training_data_df[~future_mask]
temp_target_variable_df = target_variable_df[~future_mask]

# return X_future, y_future, temp_training_data_df, temp_target_variable_df


In [None]:
y_future

In [None]:
data_partitioning_config['future_set_time_windows']

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


original_row_count = len(training_data_df)

# 2. Train Test Split
# -------------------
data_partitioning_config = modeling_config['preprocessing']['data_partitioning']
np.random.seed(modeling_config['modeling']['random_seed'])

# Split future set if specified
X_future, y_future, temp_training_data_df, temp_target_variable_df = prp.split_future_set(
    training_data_df,
    target_variable_df,
    data_partitioning_config)

# Split validation set
X_validation, y_validation, temp_training_data_df, temp_target_variable_df = prp.split_validation_set(
    temp_training_data_df,
    temp_target_variable_df,
    data_partitioning_config,
    training_data_df)

# Split train and test sets
X_train, X_test, y_train, y_test = prp.split_train_test_sets(
    temp_training_data_df,
    temp_target_variable_df,
    data_partitioning_config,
    training_data_df,
)

# Create the result dictionary
sets_dict = {
    'train_set': (X_train, y_train),
    'test_set': (X_test, y_test),
    'validation_set': (X_validation, y_validation),
    'future_set': (X_future, y_future)
}

# 3. Logs and additional data quality checks
# ------------------------------------------

# Prepare log message
target_column = modeling_config['modeling']['target_column']
unique_values = target_variable_df[target_column].unique()
is_binary = len(unique_values) == 2 and set(unique_values).issubset({0, 1})

log_message = "Data Partitioning Results:\n"
total_partitioned_rows = 0
for set_name, (X, y) in sets_dict.items():
    row_count = len(X)
    total_partitioned_rows += row_count
    log_message += f"- {set_name}: {row_count} rows"
    if is_binary:

        positive_count = (y[target_column] == 1).sum()
        total_count = len(y)
        percentage = (positive_count / total_count) * 100 if total_count > 0 else 0
        log_message += f", Positive samples: {positive_count} ({percentage:.2f}%)"
    log_message += "\n"

# Check if total rows in all sets equals original row count
if total_partitioned_rows != original_row_count:
    raise ValueError(f"Data partitioning error: Total rows in all sets ({total_partitioned_rows}) "
                        f"does not match original row count ({original_row_count})")

# Log the consolidated message
logger.info(log_message)


In [None]:
future_set

In [None]:
f", Positive samples: {positive_count[0]} ({percentage[0]:.2f}%)"


In [None]:
# Get unique coin_ids
unique_coin_ids = temp_training_data_df.index.get_level_values('coin_id').unique()
total_coin_ids = len(unique_coin_ids)

# Calculate the number of coin_ids for the validation set
num_validation_coins = int(np.round(data_partitioning_config['validation_set_share'] * total_coin_ids))

# Randomly select coin_ids for the validation set
validation_coin_ids = np.random.choice(unique_coin_ids, size=num_validation_coins, replace=False)

# Create masks for the validation and training sets
validation_mask = temp_training_data_df.index.get_level_values('coin_id').isin(validation_coin_ids)

# # Split the data
# X_val = temp_training_data_df[validation_mask]
# y_val = temp_target_variable_df[validation_mask]
# temp_training_data_df = temp_training_data_df[~validation_mask]
# temp_target_variable_df = temp_target_variable_df[~validation_mask]

In [None]:
len(validation_coin_ids)

In [None]:
training_data_df = training_data_df.sort_index()
target_variable_df = target_variable_df.sort_index()
training_data_df.index.equals(target_variable_df.index)

In [None]:
# Compare the (sorted) indices of the training data and the target variable frames
df1_name = 'training_data'
df2_name = 'target_var'

df1 = training_data_df.sort_index()
df2 = target_variable_df.sort_index()

print(f"Indices_match: {df1.index.equals(df2.index)}")

# Index entries present in only one of the two frames
only_in_df1 = df1.index.difference(df2.index)
only_in_df2 = df2.index.difference(df1.index)

# Index entries that occur more than once within a frame
duplicates_df1 = df1.index[df1.index.duplicated()].unique()
duplicates_df2 = df2.index[df2.index.duplicated()].unique()

# Print each finding under its heading, or "None" when there is nothing to report
for heading, index_subset in (
    (f"Indices only in {df1_name}:", only_in_df1),
    (f"\nIndices only in {df2_name}:", only_in_df2),
    (f"\nDuplicate indices in {df1_name}:", duplicates_df1),
    (f"\nDuplicate indices in {df2_name}:", duplicates_df2),
):
    print(heading)
    print(index_subset.to_frame(index=False) if len(index_subset) > 0 else "None")


In [None]:
print(f"Detailed index comparison between {df1_name} and {df2_name}:")

# Check index types
print(f"\nIndex types:")
print(f"{df1_name}: {type(df1.index)}")
print(f"{df2_name}: {type(df2.index)}")

# Check index names
print(f"\nIndex names:")
print(f"{df1_name}: {df1.index.names}")
print(f"{df2_name}: {df2.index.names}")

# Check index lengths
print(f"\nIndex lengths:")
print(f"{df1_name}: {len(df1.index)}")
print(f"{df2_name}: {len(df2.index)}")

# Check dtypes of each level
print(f"\nDtypes of each level:")
for level in df1.index.names:
    print(f"Level '{level}':")
    print(f"  {df1_name}: {df1.index.get_level_values(level).dtype}")
    print(f"  {df2_name}: {df2.index.get_level_values(level).dtype}")

# Check for NaN values in index
print(f"\nNaN values in index:")
for level in df1.index.names:
    print(f"Level '{level}':")
    print(f"  {df1_name}: {df1.index.get_level_values(level).isnull().sum()} NaN values")
    print(f"  {df2_name}: {df2.index.get_level_values(level).isnull().sum()} NaN values")

# Compare a sample of index values
print(f"\nSample comparison of index values:")
sample_size = min(5, len(df1.index))
sample_indices = np.random.choice(len(df1.index), sample_size, replace=False)
for i in sample_indices:
    print(f"\nSample {i}:")
    print(f"  {df1_name}: {df1.index[i]}")
    print(f"  {df2_name}: {df2.index[i]}")
    print(f"  Equal: {df1.index[i] == df2.index[i]}")


In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


def create_target_variables_for_all_time_windows(training_data_df, prices_df, config, modeling_config):
    """
    Build target variables and returns for every time window found in training_data_df.

    For each time window, prices are restricted to the coins present in that window and the
    modeling period is set to start at the window value and run for the configured
    'modeling_period_duration' days. prp.create_target_variables is then called with that
    window-specific config.

    Parameters:
    - training_data_df: DataFrame with multi-index levels 'time_window' and 'coin_id'
    - prices_df: DataFrame with a 'coin_id' column; it is passed on to prp.create_target_variables
    - config: config.yaml
    - modeling_config: modeling_config.yaml

    Returns:
    - combined_target_variables: every window's target variable frame concatenated with a fresh
        integer index, each tagged with a 'time_window' column
    - combined_returns: every window's returns frame concatenated with its original index kept,
        each tagged with a 'time_window' column
    """
    target_frames = []
    returns_frames = []

    window_labels = training_data_df.index.get_level_values('time_window').unique()
    for time_window in window_labels:
        # Restrict prices to the coins that have training rows in this window
        window_coin_ids = training_data_df.loc[time_window].index.get_level_values('coin_id').tolist()
        window_prices_df = prices_df[prices_df['coin_id'].isin(window_coin_ids)]

        # Window-specific copy of the training data config with this window's modeling period
        window_training_config = config['training_data'].copy()
        window_training_config['modeling_period_start'] = time_window
        period_end = pd.to_datetime(time_window) + timedelta(
            days=window_training_config['modeling_period_duration'])
        window_training_config['modeling_period_end'] = period_end.strftime('%Y-%m-%d')

        window_targets_df, window_returns_df = prp.create_target_variables(
            window_prices_df,
            window_training_config,
            modeling_config
        )

        # Tag both results with the window they belong to
        window_targets_df['time_window'] = time_window
        window_returns_df['time_window'] = time_window

        target_frames.append(window_targets_df)
        returns_frames.append(window_returns_df)

    # Targets get a fresh index; returns keep the index produced by create_target_variables
    combined_target_variables = pd.concat(target_frames, ignore_index=True)
    combined_returns = pd.concat(returns_frames, ignore_index=False)

    return combined_target_variables, combined_returns

# Usage
combined_target_variables, combined_returns = create_target_variables_for_all_time_windows(
    training_data_df,
    prices_df,
    config,
    modeling_config
)

In [None]:
combined_target_variables

In [None]:
def calculate_coin_returns(prices_df, training_data_config):
    """
    Computes each coin's price return between the modeling period start and end dates.

    Parameters:
    - prices_df: DataFrame containing price data with columns 'coin_id', 'date', and 'price'.
        The input is copied and not modified.
    - training_data_config: Configuration with 'modeling_period_start' and
        'modeling_period_end' dates.

    Returns:
    - returns_df: DataFrame indexed by coin_id with a single 'returns' column, calculated as
        (end price - start price) / start price.

    Raises:
    - ValueError: if any coin in prices_df lacks a price on the start date or the end date.
    """
    prices = prices_df.copy()
    prices['date'] = pd.to_datetime(prices['date'])
    period_start = pd.to_datetime(training_data_config['modeling_period_start'])
    period_end = pd.to_datetime(training_data_config['modeling_period_end'])

    def _prices_on(day):
        # Prices recorded exactly on `day`, keyed by coin_id
        return prices[prices['date'] == day].set_index('coin_id')['price']

    opening_prices = _prices_on(period_start)
    closing_prices = _prices_on(period_end)

    # Coins that have a price on both boundary dates
    priced_coins = opening_prices.index.intersection(closing_prices.index)

    # Any coin present in the data but missing a boundary price is an error
    unpriced_coins = set(prices['coin_id'].unique()) - set(priced_coins)
    if unpriced_coins:
        missing = ', '.join(map(str, unpriced_coins))
        raise ValueError(f"Missing price for coins at start or end date: {missing}")

    opening = opening_prices[priced_coins]
    closing = closing_prices[priced_coins]

    return pd.DataFrame({'returns': (closing - opening) / opening})

returns_df = calculate_coin_returns(prices_df, config['training_data'])

In [None]:
returns_df.head()

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


def create_target_variables_for_all_time_windows(training_data_df, prices_df, config, modeling_config):
    """
    Build target variables and returns for every time window found in training_data_df.

    For each time window, prices are restricted to dates between the window value and the
    'modeling_period_end' value stored on the window's first row. prp.create_target_variables
    is then called with a window-specific copy of the training data config.

    NOTE(review): this notebook defines a function with this name in more than one cell, so
    whichever cell ran last is the one in effect.

    Parameters:
    - training_data_df: DataFrame with multi-index (time_window, coin_id) and 'modeling_period_end' column
    - prices_df: DataFrame with a 'date' column; it is passed on to prp.create_target_variables
    - config: config.yaml
    - modeling_config: modeling_config.yaml

    Returns:
    - combined_target_variables: every window's target variable frame concatenated with a fresh
        integer index, each tagged with a 'time_window' column
    - combined_returns: every window's returns frame concatenated with a fresh integer index,
        each tagged with a 'time_window' column
    """
    per_window_targets = []
    per_window_returns = []

    for time_window in training_data_df.index.get_level_values('time_window').unique():
        # The window's modeling period end is read from its first row
        window_end = training_data_df.loc[time_window, 'modeling_period_end'].iloc[0]

        # Keep only prices dated within [time_window, window_end]
        in_window = (prices_df['date'] >= time_window) & (prices_df['date'] <= window_end)
        window_prices_df = prices_df[in_window]

        # Window-specific copy of the training data config with this window's modeling period
        window_training_config = config['training_data'].copy()
        window_training_config['modeling_period_start'] = time_window
        window_training_config['modeling_period_end'] = window_end

        window_targets_df, window_returns_df = prp.create_target_variables(
            window_prices_df,
            window_training_config,
            modeling_config
        )

        # Tag both results with the window they belong to
        window_targets_df['time_window'] = time_window
        window_returns_df['time_window'] = time_window

        per_window_targets.append(window_targets_df)
        per_window_returns.append(window_returns_df)

    combined_target_variables = pd.concat(per_window_targets, ignore_index=True)
    # NOTE(review): ignore_index=True discards whatever index create_target_variables put on each
    # returns frame; the other definition of this function in the notebook concatenates returns
    # with ignore_index=False — confirm which is intended.
    combined_returns = pd.concat(per_window_returns, ignore_index=True)

    return combined_target_variables, combined_returns

# Usage
combined_target_variables, combined_returns = create_target_variables_for_all_time_windows(
    training_data_df,
    prices_df,
    config,
    modeling_config
)

In [None]:
combined_target_variables

In [None]:
combined_returns

## Window specific metrics

#### Window-specific metrics

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# Prepare time window config files. `mif` is never imported in this notebook; prepare_configs
# is called through the `exp` module (insights.experiments) everywhere else.
# NOTE: `time_window` must already be defined, e.g. left over from the window loop in an earlier cell.
config, metrics_config, modeling_config = exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window)


In [None]:
all_flattened_filepaths

In [None]:

all_flattened_dfs[3]

In [None]:
all_flattened_filepaths.extend([flattened_market_data_filepath])

all_flattened_filepaths

#### Cohorts

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

def generate_wallet_cohort_features(
        profits_df,
        prices_df,
        config,
        metrics_config,
        modeling_config
    ):
    """
    Generates flattened, window-specific features for every wallet cohort listed in
    metrics_config['wallet_cohorts'].

    For each cohort the function classifies the cohort's wallets, generates buysell metrics
    and time series indicators, and flattens them with fe.generate_window_time_series_features.
    Cohorts with no member wallets are logged and skipped.

    Params:
    - profits_df (DataFrame): profits data that is imputed for the window's required dates and
        used to generate the cohort buysell metrics.
    - prices_df (DataFrame): prices data passed to the profits imputation.
    - config: config.yaml that has the dates for the specific time window
    - metrics_config: metrics_config.yaml
    - modeling_config: modeling_config.yaml

    Returns:
    - flattened_cohort_dfs (list of DataFrames): one flattened df per cohort that had members,
        in the iteration order of metrics_config['wallet_cohorts'].
    - flattened_cohort_filepaths (list of strings): the filepath returned alongside each
        flattened df, in the same order.
    """

    # 1. Impute all required dates
    # ----------------------------
    # Identify all required imputation dates
    imputation_dates = pri.identify_imputation_dates(config)

    # Impute all required dates, then keep only rows between the earliest and latest of them
    window_profits_df = pri.impute_profits_for_multiple_dates(profits_df, prices_df, imputation_dates, n_threads=24)
    window_profits_df = (window_profits_df[(window_profits_df['date'] >= pd.to_datetime(min(imputation_dates))) &
                                        (window_profits_df['date'] <= pd.to_datetime(max(imputation_dates)))])


    # 2. Generate metrics and indicators for all cohorts
    # --------------------------------------------------
    # Results are accumulated per cohort so they are not overwritten on the next iteration
    flattened_cohort_dfs = []
    flattened_cohort_filepaths = []

    for cohort_name in metrics_config['wallet_cohorts']:

        # load configs
        dataset_metrics_config = metrics_config['wallet_cohorts'][cohort_name]
        dataset_config = config['datasets']['wallet_cohorts'][cohort_name]

        # identify wallets in the cohort based on the full lookback period
        cohort_summary_df = cwm.classify_wallet_cohort(window_profits_df, dataset_config, cohort_name)
        cohort_wallets = cohort_summary_df[cohort_summary_df['in_cohort']]['wallet_address']

        # If no cohort members were identified, continue
        if len(cohort_wallets) == 0:
            logger.info("No wallets identified as members of cohort '%s'", cohort_name)
            continue

        # Generate cohort buysell_metrics
        # NOTE(review): this uses the un-imputed profits_df rather than window_profits_df —
        # confirm which of the two is intended here
        cohort_metrics_df = cwm.generate_buysell_metrics_df(profits_df,
                                                            config['training_data']['training_period_end'],
                                                            cohort_wallets)

        # Generate cohort indicator metrics
        cohort_metrics_df = ind.generate_time_series_indicators(cohort_metrics_df,
                                                                dataset_metrics_config,
                                                                'coin_id')

        # Flatten cohort metrics
        flattened_cohort_df, flattened_cohort_filepath = fe.generate_window_time_series_features(
            cohort_metrics_df,
            config,
            dataset_metrics_config,
            modeling_config
        )

        flattened_cohort_dfs.append(flattened_cohort_df)
        flattened_cohort_filepaths.append(flattened_cohort_filepath)

    return flattened_cohort_dfs, flattened_cohort_filepaths

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs




In [None]:
# Flatten the metrics DataFrame to be keyed only on coin_id
flattened_metrics_df = fe.flatten_coin_date_df(
    cohort_metrics_df,
    metrics_config['wallet_cohorts'][cohort_name],
    config['training_data']['training_period_end']  # Ensure data is up to training period end
)
flattened_metrics_df.columns

In [None]:
# Tag every row with the window's modeling period start
flattened_metrics_df.loc[:,'time_window'] = config['training_data']['modeling_period_start']

# Save the flattened output and keep only the returned file path (first return value is discarded)
# NOTE(review): the dataset label passed below is 'market_data', but flattened_metrics_df was
# flattened from cohort_metrics_df — confirm the label is intended and not a copy-paste leftover
_, flattened_metrics_filepath = fe.save_flattened_outputs(
    flattened_metrics_df,
    os.path.join(
        modeling_config['modeling']['modeling_folder'],  # Base folder from the modeling config
        'outputs/flattened_outputs'
    ),
    'market_data',  # Dataset label (see NOTE above)
    config['training_data']['modeling_period_start']  # Modeling period start for this window
)

In [None]:
# Generate cohort buysell_metrics
cohort_metrics_df = cwm.generate_buysell_metrics_df(profits_df,
                                                    config['training_data']['training_period_end'],
                                                    cohort_wallets)


cohort_metrics_df.shape

In [None]:

# Generate cohort indicator metrics
cohort_metrics_df = ind.generate_time_series_indicators(cohort_metrics_df,
                                                        metrics_config['wallet_cohorts'][cohort_name],
                                                        'coin_id')

cohort_metrics_df.shape

In [None]:
cohort_metrics_df.describe()

In [None]:
# Inlined, step-by-step version of the coverage split, used to inspect how many series
# span the whole training period.
time_series_df = cohort_metrics_df
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['training_period_end']
id_column='coin_id'
drop_outside_date_range=True

# Convert params to datetime
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

# Drop all rows with any NaN values. dropna() returns a new DataFrame, so cohort_metrics_df
# is left untouched without needing an explicit copy first.
time_series_df = time_series_df.dropna()

def has_full_coverage(min_date, max_date):
    """
    Returns True when [min_date, max_date] spans the whole [start_date, end_date] range.

    start_date and end_date are read from the enclosing cell's variables.
    """
    return (min_date <= start_date) and (max_date >= end_date)

if id_column:
    # Multi-series data: compute each series' first and last date, then compare whole
    # columns at once instead of calling a Python function per row
    series_data_range = time_series_df.groupby(id_column, observed=True)['date'].agg(['min', 'max'])
    covers_full_range = (series_data_range['min'] <= start_date) & (series_data_range['max'] >= end_date)
    full_duration_series = series_data_range[covers_full_range].index
else:
    # Single-series data: a one-element list stands in for "the series has full coverage"
    series_data_range = time_series_df['date'].agg(['min', 'max'])
    full_duration_series = [0] if has_full_coverage(series_data_range['min'], series_data_range['max']) else []

# Calculate coverage statistics
full_coverage_count = len(full_duration_series)

# # Split the dataframe
# if id_column:
#     # Convert id column to categorical to reduce memory usage
#     time_series_df[id_column] = time_series_df[id_column].astype('category')
#     full_coverage_df = time_series_df[time_series_df[id_column].isin(full_duration_series)]
#     partial_coverage_df = time_series_df[~time_series_df[id_column].isin(full_duration_series)]
# else:
#     full_coverage_df = time_series_df if full_coverage_count else pd.DataFrame(columns=time_series_df.columns)
#     partial_coverage_df = time_series_df if not full_coverage_count else pd.DataFrame(columns=time_series_df.columns)

# logger.info("Split df with dimensions %s into %s full coverage records and %s partial coverage records.",
#             time_series_df.shape,
#             len(full_coverage_df),
#             len(partial_coverage_df))

# if drop_outside_date_range:
#     # Remove rows outside the date range for both dataframes
#     full_coverage_df = (full_coverage_df[(full_coverage_df['date'] >= start_date) &
#                                             (full_coverage_df['date'] <= end_date)])
#     partial_coverage_df = (partial_coverage_df[(partial_coverage_df['date'] >= start_date) &
#                                                 (partial_coverage_df['date'] <= end_date)])

#     # Log the number of remaining records
#     total_remaining = len(full_coverage_df) + len(partial_coverage_df)
#     logger.info("After removing records outside the date range, %s records remain.",
#                 total_remaining)

# # return full_coverage_df, partial_coverage_df

# Only a cell's last expression is rendered, so show both values together as one tuple
time_series_df.shape, full_coverage_count

In [None]:
end_date

In [None]:
series_data_range = time_series_df.groupby(id_column, observed=True)['date'].agg(['min', 'max'])
series_data_range

In [None]:


cohort_metrics_df2.shape

In [None]:
cohort_metrics_df2

In [None]:

# If no cohort members were identified, stop the cell. This code was lifted out of a loop body;
# a bare `continue` is a SyntaxError at cell level, so the guard raises instead.
if len(cohort_wallets) == 0:
    raise ValueError(f"No wallets identified as members of cohort '{cohort_name}'")

# Generate cohort buysell_metrics
cohort_metrics_df = cwm.generate_buysell_metrics_df(profits_df,
                                                    config['training_data']['training_period_end'],
                                                    cohort_wallets)

# Generate cohort indicator metrics
cohort_metrics_df = ind.generate_time_series_indicators(cohort_metrics_df,
                                                        metrics_config['wallet_cohorts'][cohort_name],
                                                        'coin_id')

# Keep the full-coverage df (first return value) and discard the second
# NOTE(review): the range passed here runs from training_period_start to modeling_period_end,
# not training_period_end — confirm which end date is intended
cohort_metrics_df,_ = cwm.split_dataframe_by_coverage(cohort_metrics_df,
                                                    config['training_data']['training_period_start'],
                                                    config['training_data']['modeling_period_end'],
                                                    id_column='coin_id',
                                                    drop_outside_date_range=True)

cohort_metrics_df.shape

In [None]:

# # Add time window modeling period start
# cohort_metrics_df.loc[:,'time_window'] = config['training_data']['modeling_period_start']

cohort_metrics_df.columns

In [None]:
cohort_metrics_df.shape

In [None]:
# Market data: generate window-specific flattened metrics
flattened_market_data_df, flattened_market_data_filepath = fe.generate_window_time_series_features(
    market_data_df,
    config,
    metrics_config,
    modeling_config
)

In [None]:
dataset_metrics_config['total_bought']

In [None]:
cohort_metrics_df.columns

In [None]:
metrics_config['wallet_cohorts']['whales']

In [None]:
u.df_mem(profits_df)

In [None]:
"""
IN WINDOW FUNCTIONS

market_data_df: just filter to window
macro_trends_df: just filter to window

profits_df
1. identify all dates needed
    all cohort lookback window starts
    training_period_start
    training_period_end
    modeling_period_start
    modeling_period_end
2. impute them
3. filter df to only dates between earliest and latest dates


4. wallet cohorts and buysell metrics
5. indicators
6. filter to window
"""

# def build_time_window_model_input(n, window, config, metrics_config, modeling_config):
#     """
#     Generates training data for each of the config.training_data.additional_windows.

#     Params:
#         n (int): The lookback number of the time window (e.g 0,1,2)
#         window (Dict): The config override dict with the window's modeling_period_start
#         config: config.yaml
#         metrics_config: metrics_config.yaml
#         modeling_config: modeling_config.yaml

#     Returns:
#         model_data (Dict): Dictionary containing all of the modeling features and variables:
#             X_train, X_test (DataFrame): Model training features
#             y_train, y_test (pd.Series): Model target variables
#             returns_test (DataFrame): The actual returns of each coin_id in each time_window.
#                 - coin_id: Index (str)
#                 - time_window: Index (int)
#                 - returns: value column (float)
#     """

# Prepare the full configuration by applying overrides from the current trial config
config, metrics_config, modeling_config = prepare_configs(modeling_config['modeling']['config_folder'], window)

# Define window start and end dates
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Rebuild market data
market_data_df = dr.retrieve_market_data()
market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
prices_df = market_data_df[['coin_id','date','price']].copy()

# Retrieve macro trends data
macro_trends_df = dr.retrieve_macro_trends_data()
macro_trends_df = cwm.generate_macro_trends_features(macro_trends_df, config)

# Rebuild profits_df
if 'profits_df' not in locals():
    profits_df = None
profits_df = rebuild_profits_df_if_necessary(config, prices_df, profits_df)

# Build the configured model input data for the nth window
X_train, X_test, y_train, y_test, returns_test = build_configured_model_input(
                                    profits_df,
                                    market_data_df,
                                    macro_trends_df,
                                    config,
                                    metrics_config,
                                    modeling_config)

# Add time window indices to dfs with coin_ids
X_train['time_window'] = n
X_train.set_index('time_window', append=True, inplace=True)
X_test['time_window'] = n
X_test.set_index('time_window', append=True, inplace=True)
returns_test['time_window'] = n
returns_test.set_index('time_window', append=True, inplace=True)

model_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'returns_test': returns_test
}

# return model_data


## Modeling Sequence

In [None]:


# Build the per-window config overrides that drive each iteration below
time_windows = mif.generate_time_windows(config)

# One accumulator list per model_data entry; each receives one element per time window
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []
returns_test_list = []
accumulators = {
    'X_train': X_train_list,
    'X_test': X_test_list,
    'y_train': y_train_list,
    'y_test': y_test_list,
    'returns_test': returns_test_list,
}

for n, window in enumerate(time_windows):

    model_data = mif.build_time_window_model_input(n, window, config, metrics_config, modeling_config)

    # File each piece of this window's output under its matching accumulator
    for key, collected in accumulators.items():
        collected.append(model_data[key])


# Stack the per-window pieces row-wise into the final modeling datasets
X_train, X_test, y_train, y_test, returns_test = (
    pd.concat(collected, axis=0) for collected in accumulators.values()
)

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs
logger.setLevel(logging.INFO)


# 3.4 Train the model using the current configuration and log the results
model, model_id = m.train_model(
                    X_train,
                    y_train,
                    modeling_config)

# 3.5 Evaluate and save the model performance on the test set to a CSV
metrics_dict, y_pred, y_pred_prob = m.evaluate_model(model, X_test, y_test, model_id, returns_test, modeling_config)

metrics_dict

In [None]:
# Pair each training column with the importance reported by the fitted model
feature_importances = model.feature_importances_
features = X_train.columns  # Feature names

# Build the importance table already ranked from most to least important
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

importance_df.head(20)

In [None]:
importance_df

In [None]:

for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# The check must be an explicit `is not None`: `y_pred_prob or y_pred` asks for the truth value
# of y_pred_prob, which raises ValueError for an array or Series with more than one element.
# (presumably y_pred_prob is None when the model is a regressor — verify against m.evaluate_model)
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

## Time Window Sequencing

In [None]:
market_data_df.shape

### checking profits_df downcasts

In [None]:
start_date = config['training_data']['earliest_window_start']
end_date = config['training_data']['modeling_period_end']
minimum_wallet_inflows = config['data_cleaning']['minimum_wallet_inflows']

# SQL query to retrieve profits data
query_sql = f"""
    -- STEP 1: retrieve profits data and apply USD inflows filter
    -------------------------------------------------------------
    with profits_base as (
        select coin_id
        ,date
        ,wallet_address
        ,profits_cumulative
        ,usd_balance
        ,usd_net_transfers
        ,usd_inflows
        ,usd_inflows_cumulative
        from core.coin_wallet_profits
        where date <= '{end_date}'
    ),

    usd_inflows_filter as (
        select coin_id
        ,wallet_address
        ,max(usd_inflows_cumulative) as total_usd_inflows
        from profits_base
        -- we don't need to include coin-wallet pairs that have no transactions between
        -- the start and end dates
        group by 1,2
    ),

    profits_base_filtered as (
        select pb.*
        from profits_base pb
        join usd_inflows_filter f on f.coin_id = pb.coin_id
            and f.wallet_address = pb.wallet_address
        where f.total_usd_inflows >= {minimum_wallet_inflows}
    ),


    -- STEP 2: create new records for all coin-wallet pairs as of the training_period_start
    ---------------------------------------------------------------------------------------
    -- compute the starting profits and balances as of the training_period_start
    training_start_existing_rows as (
        -- identify coin-wallet pairs that already have a balance as of the period end
        select *
        from profits_base_filtered
        where date = '{start_date}'
    ),
    training_start_needs_rows as (
        -- for coin-wallet pairs that don't have existing records, identify the row closest to the period end date
        select t.*
        ,cmd_previous.price as price_previous
        ,cmd_training.price as price_current
        ,row_number() over (partition by t.coin_id,t.wallet_address order by t.date desc) as rn
        from profits_base_filtered t
        left join training_start_existing_rows e on e.coin_id = t.coin_id
            and e.wallet_address = t.wallet_address

        -- obtain the last price used to compute the balance and profits data
        join core.coin_market_data cmd_previous on cmd_previous.coin_id = t.coin_id and cmd_previous.date = t.date

        -- obtain the training_period_start price so we can update the calculations
        join core.coin_market_data cmd_training on cmd_training.coin_id = t.coin_id and cmd_training.date = '{start_date}'
        where t.date < '{start_date}'
        and e.coin_id is null
    ),
    training_start_new_rows as (
        -- create a new row for the period end date by carrying the balance from the closest existing record
        select t.coin_id
        ,cast('{start_date}' as datetime) as date
        ,t.wallet_address
        -- profits_cumulative is the previous profits_cumulative + the change in profits up to the start_date
        ,((t.price_current / t.price_previous) - 1) * t.usd_balance + t.profits_cumulative as profits_cumulative
        -- usd_balance is previous balance * (1 + % change in price)
        ,(t.price_current / t.price_previous) * t.usd_balance as usd_balance
        -- there were no transfers
        ,0 as usd_net_transfers
        -- there were no inflows
        ,0 as usd_inflows
        -- no change since there were no inflows
        ,usd_inflows_cumulative as usd_inflows_cumulative

        from training_start_needs_rows t
        where rn=1

    ),

    -- STEP 3: merge all records together
    -------------------------------------
    profits_merged as (
        select * from profits_base_filtered
        -- transfers prior to the training period are summarized in training_start_new_rows
        where date >= '{start_date}'

        union all

        select * from training_start_new_rows
    )

    select coin_id
    ,date

    -- replace the memory-intensive address strings with integers
    ,DENSE_RANK() OVER (ORDER BY wallet_address) as wallet_address

    ,profits_cumulative
    ,usd_balance
    ,usd_net_transfers
    ,usd_inflows
    -- set a floor of $0.01 to avoid divide by 0 errors caused by rounding
    ,greatest(0.01,usd_inflows_cumulative) as usd_inflows_cumulative
    from profits_merged
"""

# Run the SQL query using dgc's run_sql method
profits_df = dgc().run_sql(query_sql)

logger.info('Converting columns to memory-optimized formats...')

# Convert coin_id to categorical and date to datetime
profits_df['coin_id'] = profits_df['coin_id'].astype('category')
profits_df['date'] = pd.to_datetime(profits_df['date'])

# Add total_return column: cumulative profits divided by cumulative USD inflows.
# The query above floors usd_inflows_cumulative at 0.01, which keeps this denominator
# away from zero.
profits_df['total_return'] = (profits_df['profits_cumulative']
                                / profits_df['usd_inflows_cumulative'])



In [None]:
profits_df = safe_downcast(profits_df, 'wallet_address', 'int32')

In [None]:
market_data_df.head()

In [None]:
np.can_cast(market_data_df['market_cap'].dtype, 'int32', casting='safe')


In [None]:
market_data_df = safe_downcast(market_data_df, 'volume', 'int32')

### Market Data resequencing

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# Generate time_windows config overrides that will modify each window's config settings
time_windows = mif.generate_time_windows(config)
n = 0
window = time_windows[n]

# Prepare the full configuration by applying overrides from the current trial config
config, metrics_config, modeling_config = mif.prepare_configs(modeling_config['modeling']['config_folder'], window)

# NOTE(review): the next three lines repeat the window generation above, this time against the
# config returned by prepare_configs — confirm whether regenerating the windows from the
# overridden config is intended or a leftover duplicate
time_windows = mif.generate_time_windows(config)
n = 0
window = time_windows[n]


# market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
# prices_df = market_data_df[['coin_id','date','price']].copy()
# Keep an untouched copy so later cells can reset market_data_df from it
market_data_df_full = market_data_df.copy()
market_data_df.shape

In [None]:
market_data_df_full.head()

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


market_data_df = market_data_df_full.copy()
print(market_data_df.columns)
market_data_df = ind.generate_time_series_indicators('market_data', market_data_df, metrics_config)
print(market_data_df.columns)


In [None]:
isinstance(time_series_df.index, pd.RangeIndex)

In [None]:
market_data_df.tail()

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs




market_data_df = market_data_df_full.copy()
value_column = 'price'
value_column_indicators_config = metrics_config['time_series']['market_data'][value_column]['indicators']
id_column = 'coin_id'
market_data_df = ind.generate_column_time_series_indicators(
    market_data_df,
    value_column,
    value_column_indicators_config,
    id_column
)

market_data_df.columns

In [None]:
id_column = None
if not id_column:
    print('x')

In [None]:
time_series_df = time_series_df.reset_index()

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


# time_series_df = market_data_df[['date','coin_id','price']].copy()
time_series_df = market_data_df_full.copy()
config = config
value_column_indicators_config = metrics_config['time_series']['market_data']['price']['indicators']
value_column = 'price'
id_column='coin_id'

time_series_df = time_series_df.set_index(['coin_id','date'])

# Data Quality Checks and Formatting
if value_column not in time_series_df.columns:
    raise KeyError(f"Input DataFrame does not include column '{value_column}'.")

if time_series_df[value_column].isnull().any():
    raise ValueError(f"The '{value_column}' column contains null values, which are not allowed.")

# Indicator Calculations
# ----------------------
# If there is an id_column, group on it
if id_column:
    groupby_column = id_column
# If there isn't, create a dummy_column for grouping and remove it later
else:
    time_series_df['dummy_group'] = 1
    groupby_column = 'dummy_group'

# Indicators implemented in this cell, keyed by their config name. Each function is called
# as fn(series, window).
indicator_functions = {
    'sma': ind.calculate_sma,
    'ema': ind.calculate_ema,
}

# For each indicator, loop through all options and add the appropriate column
for indicator, indicator_config in value_column_indicators_config.items():
    indicator_function = indicator_functions.get(indicator)
    if indicator_function is None:
        # Indicators without an implementation here add no columns
        continue

    windows = indicator_config['parameters']['window']
    for w in windows:
        # Group by name rather than `level=`: a name passed as the grouping key resolves to
        # either an index level (the id_column case) or a column (the dummy_group case),
        # whereas `level='dummy_group'` fails because dummy_group is not an index level.
        ind_series = time_series_df.groupby(groupby_column, observed=True)[value_column].transform(
            lambda x, fn=indicator_function, window=w: fn(x, window))
        time_series_df[f"{value_column}_{indicator}_{w}"] = ind_series

    # elif indicator == 'rsi':
    #     windows = indicator_config['parameters']['window']
    #     for w in windows:
    #         ind_series = time_series_df.groupby(level=groupby_column, observed=True)['price'].transform(
    #             lambda x: calculate_rsi(x, w))
    #         time_series_df[f"{value_column}_{indicator}_{w}"] = ind_series

    # elif indicator == 'bollinger_bands_upper':
    #     windows = indicator_config['parameters']['window']
    #     num_std = indicator_config['parameters'].get('num_std', None)
    #     for w in windows:
    #         ind_series = time_series_df.groupby(level=groupby_column, observed=True)['price'].transform(
    #             lambda x: calculate_bollinger_bands(x, 'upper', w, num_std))
    #         time_series_df[f"{value_column}_{indicator}_{w}"] = ind_series

    # elif indicator == 'bollinger_bands_lower':
    #     windows = indicator_config['parameters']['window']
    #     num_std = indicator_config['parameters'].get('num_std', None)
    #     for w in windows:
    #         ind_series = time_series_df.groupby(level=groupby_column, observed=True)['price'].transform(
    #             lambda x: calculate_bollinger_bands(x, 'lower', w, num_std))
    #         time_series_df[f"{value_column}_{indicator}_{w}"] = ind_series

# Remove the dummy column if it was created
if groupby_column == 'dummy_group':
    time_series_df = time_series_df.drop('dummy_group', axis=1)

logger.info("Generated indicators for column '%s' :%s",
            value_column,
            list(value_column_indicators_config.keys()))


In [None]:
list(value_column_indicators_config.keys())

In [None]:
logger.info("%s",value_column_indicators_config.keys())

In [None]:
market_data_df.xs('9d6619f4-b44b-4ff4-9f68-1f563f57e060',level='coin_id').tail()

In [None]:
market_data_df.sample(15)

In [None]:
x = indicator_config['parameters'].get('num_std', None)
x

In [None]:
market_data_df.groupby(level='coin_id', observed=True)['price'].transform(

### indicators implementation

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs


df = market_data_df.copy()
df = market_data_df.set_index(['coin_id','date'])


# Add Relative Strength Index (RSI)
df['rsi'] = df.groupby(level='coin_id', observed=True)['price'].transform(
    lambda x: ind.calculate_rsi(x, 14))
# Add Money Flow Index (MFI)
df = ind.add_mfi_column(df)

# Calculate MACD with EMAs
df['ema_12'] = df.groupby(level='coin_id', observed=True)['price'].transform(lambda x: ind.calculate_ema(x, 12))
df['ema_26'] = df.groupby(level='coin_id', observed=True)['price'].transform(lambda x: ind.calculate_ema(x, 26))
df = ind.add_crossover_column(df, 'ema_12', 'ema_26', drop_col1=True, drop_col2=True)

# Add Bollinger Bands
df = ind.add_bollinger_bands(df, include_middle=False)
# Add crossover for price and upper band
df = ind.add_crossover_column(df, 'price', 'bollinger_band_upper', drop_col1=False, drop_col2=True)
# Add crossover for price and lower band
df = ind.add_crossover_column(df, 'price', 'bollinger_band_lower', drop_col1=False, drop_col2=True)

# Calculate OBV
df['obv_price_volume'] = ind.generalized_obv(df['price'],df['volume'])


df.head()

## Junkyard

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Define a function to calculate MFI within each group, similar to the crossovers function
def apply_mfi(group):
    """
    Adds an 'mfi' column to one group's rows and returns them indexed on (coin_id, date).

    The value comes from ind.calculate_mfi applied to the group's 'price' and 'volume' columns.
    """
    # Work on a flat frame so the multi-index doesn't get in the way of the column math
    flat_group = group.reset_index()
    flat_group['mfi'] = ind.calculate_mfi(flat_group['price'], flat_group['volume'])

    # Restore the (coin_id, date) multi-index before handing the rows back
    return flat_group.set_index(['coin_id', 'date'])

# Apply the function within each 'coin_id' group
df = df.groupby('coin_id', observed=True, group_keys=False).apply(apply_mfi)

# Display the updated DataFrame with the MFI column
df.head()

In [None]:
df.head(20)

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

df2 = df[['ema_12','ema_26']].copy()

df2 = ind.add_crossover_column(df2, 'ema_12', 'ema_26', drop_col1=True, drop_col2=True)
df2.head()

In [None]:
def identify_crossovers(series1, series2):
    """
    Identify crossovers between two time series.

    This function calculates the points where series1 crosses over series2.
    It handles NaN values by converting the NaN differences to 0.

    Both inputs are converted to float arrays and compared position by position, so any
    array-like of equal length works (lists, numpy arrays, pandas Series).

    Note: a position is marked whenever the sign of (series1 - series2) differs from the
    previous position. A difference of exactly 0 (including NaN-filled positions) has sign 0,
    so moving onto 0 is marked -1 and moving from 0 to a positive difference is marked 1.

    Parameters:
    series1 (array-like): The first time series
    series2 (array-like): The second time series

    Returns:
    numpy.ndarray: An array of the same length as the input series, where:
        0 indicates no crossover
        1 indicates an upward crossover (series1 crosses above series2)
        -1 indicates a downward crossover (series1 crosses below series2)
    """
    diff = np.asarray(series1, dtype=float) - np.asarray(series2, dtype=float)

    # Handle NaN values
    diff = np.nan_to_num(diff, nan=0.0)

    # Initialize crossovers array
    crossovers = np.zeros(diff.size)

    # Identify crossovers: positions whose sign differs from the position before them
    signs = np.sign(diff)
    sign_changes = signs[1:] != signs[:-1]
    crossover_indices = np.where(sign_changes)[0] + 1

    # Assign 1 for upward crossovers, -1 for downward crossovers
    crossovers[crossover_indices] = np.where(signs[crossover_indices] > 0, 1, -1)

    # Return the result; without this the function returned None
    return crossovers


In [None]:
df[['ema_12','ema_26']]

In [None]:

# Assuming `df` is your DataFrame with multi-index (coin_id, date) and ema_12, ema_26 columns

# Define a function that applies identify_crossovers to a group
def apply_crossovers(group):
    """
    Stores the result of identify_crossovers for the group's 'ema_12' and 'ema_26' columns
    in a 'crossovers' column on the group, then returns the group.
    """
    fast_ema, slow_ema = group['ema_12'], group['ema_26']
    group['crossovers'] = identify_crossovers(fast_ema, slow_ema)
    return group

# Apply the function within each 'coin_id' group
df = df.groupby('coin_id', group_keys=False).apply(apply_crossovers)

# Display the resulting DataFrame with the new 'crossovers' column
df

In [None]:
import pandas as pd



# Display the resulting DataFrame with the new 'crossovers' column
df

## Tests failing

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

In [None]:
[importlib.reload(module) for module in modules]  # Reload all modules
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# @pytest.fixture
def rolling_metrics_config():
    """
    Build a metrics config for the whales cohort's total_volume metric that has both a
    plain 'last' aggregation and a rolling 'mean' aggregation, each with log scaling.
    """
    # 10-day rolling windows, looking back over 3 periods
    rolling_section = {
        "aggregations": {"mean": {"scaling": "log"}},
        "window_duration": 10,
        "lookback_periods": 3,
    }
    total_volume = {
        "aggregations": {"last": {"scaling": "log"}},
        "rolling": rolling_section,
    }
    return {"wallet_cohorts": {"whales": {"total_volume": total_volume}}}

# Stand-in for pytest fixture injection: rebind the name to the built config
rolling_metrics_config = rolling_metrics_config()
# @pytest.fixture
def dummy_rolling_dataframe():
    """
    Build a four-row frame indexed by (time_window, coin_id) holding one 'last' column
    and three rolling-mean period columns for the whales total_volume metric.
    """
    time_windows = pd.to_datetime(['2023-01-01', '2023-01-02'])
    coin_ids = ['bitcoin', 'ethereum']

    # Cartesian product: every coin appears once under each time window
    window_coin_index = pd.MultiIndex.from_product(
        [time_windows, coin_ids],
        names=['time_window', 'coin_id'],
    )

    column_values = {}
    column_values['wallet_cohorts_whales_total_volume_last'] = [1000, 2000, 3000, 4000]
    column_values['wallet_cohorts_whales_total_volume_mean_10d_period_1'] = [500, 600, 700, 800]
    column_values['wallet_cohorts_whales_total_volume_mean_10d_period_2'] = [400, 500, 600, 700]
    column_values['wallet_cohorts_whales_total_volume_mean_10d_period_3'] = [300, 400, 500, 600]

    return pd.DataFrame(column_values, index=window_coin_index)

# Stand-in for pytest fixture injection: rebind the name to the built frame
dummy_rolling_dataframe = dummy_rolling_dataframe()

# @pytest.mark.unit
# def test_scaling_processor_with_rolling_metrics(rolling_metrics_config, dummy_rolling_dataframe):
"""
Test the ScalingProcessor class for correct column mapping and scaling application with rolling metrics.
"""
# Instantiate the ScalingProcessor with the provided rolling_metrics_config
processor = prp.ScalingProcessor(rolling_metrics_config)

# Expected column_scaling_map based on the rolling_metrics_config
expected_column_scaling_map = {
    'wallet_cohorts_whales_total_volume_last': 'log',
    'wallet_cohorts_whales_total_volume_mean_10d_period_1': 'log',
    'wallet_cohorts_whales_total_volume_mean_10d_period_2': 'log',
    'wallet_cohorts_whales_total_volume_mean_10d_period_3': 'log',
}

# Assert that the column_scaling_map is as expected
assert processor.column_scaling_map == expected_column_scaling_map, (
    "Column scaling map does not match expected mapping."
)

# Every column is configured with 'log' scaling, so the expected result is np.log1p of
# the raw values. The expectations are computed BEFORE apply_scaling runs so they cannot
# be skewed if apply_scaling modifies the frame it receives (its implementation is not
# visible from this notebook).
expected_scaled_values = {
    column: np.log1p(dummy_rolling_dataframe[column].values)
    for column in expected_column_scaling_map
}

# Apply scaling to the dummy_rolling_dataframe (as training data)
scaled_df = processor.apply_scaling(dummy_rolling_dataframe, is_train=True)

# Compare each scaled column against its precomputed log-transformed expectation
for column, expected_values in expected_scaled_values.items():
    np.testing.assert_allclose(
        scaled_df[column].values,
        expected_values,
        atol=1e-4,
        err_msg=(
            f"Scaled values for '{column}' do not match expected log-transformed values."
        )
    )

In [None]:
# Inspect the column -> scaling-method map held by the current `processor`
processor.column_scaling_map

In [None]:
# NOTE(review): this is a column label written as a bare identifier; unless a variable with
# this exact name exists in the kernel, running the cell raises NameError. Presumably a
# scratch leftover — confirm and remove.
wallet_cohorts_whales_total_volume_mean_10d_period_3

In [None]:
# Re-import every project module so code edits are picked up by the running kernel
for module in modules:
    importlib.reload(module)

# Re-read all four configs from the shared config directory
(
    config,
    metrics_config,
    modeling_config,
    experiments_config,
) = u.load_all_configs('../config')

def scaling_1_metrics_config():
    """
    Build a time_series/market_data metrics config with one metric per scaling option:
    price std ('none'), volume sum ('standard') and market_cap last ('log').
    """
    def single_aggregation(agg_name, scaling_method):
        # One metric entry: a lone aggregation carrying its scaling method
        return {'aggregations': {agg_name: {'scaling': scaling_method}}}

    market_data = {
        'price': single_aggregation('std', 'none'),
        'volume': single_aggregation('sum', 'standard'),
        'market_cap': single_aggregation('last', 'log'),
    }
    return {'time_series': {'market_data': market_data}}

# Stand-in for pytest fixture injection: rebind the name to the built config
scaling_1_metrics_config = scaling_1_metrics_config()
def scaling_1_dummy_dataframe():
    """
    Build a four-row frame indexed by (time_window, coin_id) with one column for each
    metric in the market_data scaling config.
    """
    time_windows = pd.to_datetime(['2023-01-01', '2023-01-02'])
    coin_ids = ['bitcoin', 'ethereum']

    # Cartesian product: every coin appears once under each time window
    window_coin_index = pd.MultiIndex.from_product(
        [time_windows, coin_ids],
        names=['time_window', 'coin_id'],
    )

    column_values = {}
    column_values['time_series_market_data_price_std'] = [1.0, 2.0, 3.0, 4.0]
    column_values['time_series_market_data_volume_sum'] = [100, 200, 300, 400]
    column_values['time_series_market_data_market_cap_last'] = [1000, 2000, 3000, 4000]

    return pd.DataFrame(column_values, index=window_coin_index)

# Stand-in for pytest fixture injection: rebind the name to the built frame
scaling_1_dummy_dataframe = scaling_1_dummy_dataframe()
# @pytest.mark.unit
# def test_scaling_processor(scaling_1_metrics_config, scaling_1_dummy_dataframe):
"""
Test the ScalingProcessor class for correct column mapping and scaling application.
"""
# Instantiate the ScalingProcessor with the provided metrics_config
processor = prp.ScalingProcessor(scaling_1_metrics_config)

# Expected column_scaling_map based on the metrics_config
expected_column_scaling_map = {
    'time_series_market_data_price_std': 'none',
    'time_series_market_data_volume_sum': 'standard',
    'time_series_market_data_market_cap_last': 'log'
}

# Assert that the column_scaling_map is as expected
assert processor.column_scaling_map == expected_column_scaling_map, (
    "Column scaling map does not match expected mapping."
)

# Prepare expected scaled values for each column.
# They are computed BEFORE apply_scaling runs so they cannot be skewed if apply_scaling
# modifies the frame it receives (its implementation is not visible from this notebook).

# For 'time_series_market_data_price_std', scaling is 'none',
# so values should remain the same as in the original dataframe.
# .copy() detaches the expectation from the frame's underlying buffer.
expected_price_std = scaling_1_dummy_dataframe['time_series_market_data_price_std'].values.copy()

# For 'time_series_market_data_volume_sum', scaling is 'standard':
# subtract the mean and divide by the (population, ddof=0) standard deviation.
volume_values = scaling_1_dummy_dataframe['time_series_market_data_volume_sum'].values.reshape(-1, 1)
volume_mean = volume_values.mean()
volume_std = volume_values.std()
expected_volume_sum = ((volume_values - volume_mean) / volume_std).flatten()

# For 'time_series_market_data_market_cap_last', scaling is 'log'.
# np.log1p computes ln(1 + x), which stays finite for zero values.
market_cap_values = scaling_1_dummy_dataframe['time_series_market_data_market_cap_last'].values
expected_market_cap_last = np.log1p(market_cap_values)

# Apply scaling to the dummy_dataframe (as training data)
scaled_df = processor.apply_scaling(scaling_1_dummy_dataframe, is_train=True)

# Now, we compare the scaled values in scaled_df to the expected values calculated above.

# Compare 'time_series_market_data_price_std' values
np.testing.assert_allclose(
    scaled_df['time_series_market_data_price_std'].values,
    expected_price_std,
    atol=1e-4,
    err_msg="Scaled values for 'price_std' do not match expected values."
)

# Compare 'time_series_market_data_volume_sum' values
np.testing.assert_allclose(
    scaled_df['time_series_market_data_volume_sum'].values,
    expected_volume_sum,
    atol=1e-4,
    err_msg="Scaled values for 'volume_sum' do not match expected standardized values."
)

# Compare 'time_series_market_data_market_cap_last' values
np.testing.assert_allclose(
    scaled_df['time_series_market_data_market_cap_last'].values,
    expected_market_cap_last,
    atol=1e-4,
    err_msg="Scaled values for 'market_cap_last' do not match expected log-transformed values."
)

In [None]:
# Inspect the column -> scaling-method map held by the current `processor`
processor.column_scaling_map

In [None]:
# List the column labels of whatever frame is currently bound to `df`
df.columns

In [None]:
# Re-import every project module so code edits are picked up by the running kernel
for module in modules:
    importlib.reload(module)

# Re-read all four configs from the shared config directory
(
    config,
    metrics_config,
    modeling_config,
    experiments_config,
) = u.load_all_configs('../config')


In [None]:
# Run promote_indicators_to_metrics on the 'macro_trends' section of the loaded
# metrics config and display whatever it returns
flt.promote_indicators_to_metrics(metrics_config['macro_trends'])

In [None]:
def complex_metrics_config():
    """
    Build a metrics config for the whales cohort's total_volume metric that combines a
    plain aggregation, a rolling aggregation and an EMA indicator with its own
    aggregation and rolling sections.
    """
    # EMA indicator with a single window of 7, aggregated directly and over rolling windows
    ema_indicator = {
        "parameters": {"window": [7]},
        "aggregations": {"last": {"scaling": "none"}},
        "rolling": {
            "aggregations": {"last": {"scaling": "standard"}},
            "window_duration": 7,
            "lookback_periods": 3,
        },
    }

    total_volume = {
        "aggregations": {"last": {"scaling": "log"}},
        "rolling": {
            "aggregations": {"mean": {"scaling": "log"}},
            "window_duration": 10,
            "lookback_periods": 3,
        },
        "indicators": {"ema": ema_indicator},
    }

    return {"wallet_cohorts": {"whales": {"total_volume": total_volume}}}

# Rebind the name to the built config so later cells can use it directly
complex_metrics_config = complex_metrics_config()

In [None]:
# Re-import every project module so code edits are picked up by the running kernel
for module in modules:
    importlib.reload(module)

# Re-read all four configs from the shared config directory
(
    config,
    metrics_config,
    modeling_config,
    experiments_config,
) = u.load_all_configs('../config')

# Build a processor from the nested test config using the freshly reloaded code
processor = prp.ScalingProcessor(complex_metrics_config)

In [None]:
# Pull out the 'rolling' section of the whales total_volume metric; the next cell walks
# this dict to prototype the rolling column -> scaling mapping
rolling_config = complex_metrics_config['wallet_cohorts']['whales']['total_volume']['rolling']
rolling_config

In [None]:
new_prefix = 'total_volume'

mapping = {}

# 'aggregations' and 'comparisons' share one layout ({name: {'scaling': ...}}), so a
# single pass handles both, in that order; a section absent from the config is skipped
for section_name in ('aggregations', 'comparisons'):
    if section_name not in rolling_config:
        continue
    for entry_name, entry_config in rolling_config[section_name].items():
        has_scaling = isinstance(entry_config, dict) and 'scaling' in entry_config
        if has_scaling:
            mapping[f"{new_prefix}_rolling_{entry_name}"] = entry_config['scaling']

mapping

In [None]:
# Re-import every project module so code edits are picked up by the running kernel
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')  # Reload all configs

# Create a ScalingProcessor instance with the complex configuration
processor = prp.ScalingProcessor(complex_metrics_config)

# Column names the scaling map is expected to contain: the plain aggregation, the three
# rolling-mean periods, and the EMA indicator's aggregation plus its three rolling periods.
# NOTE(review): these keys start at 'whales_', whereas the rolling-metrics check earlier in
# this notebook expects 'wallet_cohorts_whales_...' keys for the same config nesting —
# confirm which prefix ScalingProcessor actually produces.
expected_keys = [
    'whales_total_volume_last',
    'whales_total_volume_mean_10d_period_1',
    'whales_total_volume_mean_10d_period_2',
    'whales_total_volume_mean_10d_period_3',
    'whales_total_volume_ema_7_last',
    'whales_total_volume_ema_7_last_7d_period_1',
    'whales_total_volume_ema_7_last_7d_period_2',
    'whales_total_volume_ema_7_last_7d_period_3'
]

# Print the actual keys first so they are visible even when an assertion below fails
print("Actual keys in column_scaling_map:", list(processor.column_scaling_map.keys()))

# Verify that the column_scaling_map contains all expected keys;
# the assertion fails on the first key that is missing
for key in expected_keys:
    assert key in processor.column_scaling_map, f"Expected key '{key}' not found in column_scaling_map"

# Verify that the number of keys in column_scaling_map matches the expected count
# This ensures that there are no unexpected additional keys
assert len(processor.column_scaling_map) == len(expected_keys), (
    f"Expected {len(expected_keys)} keys, but found {len(processor.column_scaling_map)} keys "
    f"in column_scaling_map"
)





In [None]:
# Rebuild the processor from the nested test config and display the resulting
# column -> scaling-method map
processor = prp.ScalingProcessor(complex_metrics_config)

processor.column_scaling_map

In [None]:
# Repeat inspection of the current processor's column -> scaling-method map
processor.column_scaling_map

In [None]:
# Repeat inspection of the current processor's column -> scaling-method map
processor.column_scaling_map

In [None]:
# Inspect the scaled 'metric1' column.
# NOTE(review): neither dummy frame built in the scaling cells above has a 'metric1' column,
# so this presumably relies on a `scaled_df` produced by a different cell — confirm.
scaled_df["metric1"]

0    -1.41421356237
1   -0.707106781187
2                 0
3    0.707106781187
4     1.41421356237
Name: metric1, dtype: float64

In [None]:
# Inspect the `scalers` attribute of the current processor
# (presumably the fitted scaler objects — confirm in prp.ScalingProcessor)
processor.scalers

In [None]:
# Print each top-level value of `config`. The loop variables stay bound after the loop,
# and the next cell reads `value` (the last one printed), so their names matter.
for key, value in config.items():
    print(value)

In [None]:
# Keys of the last config value left bound to `value` by the loop in the previous cell
value.keys()

In [None]:
# Display `result` from position 2 onward, skipping the first two entries
result[2:]

In [None]:
# Display the 'price_bollinger_bands_upper_2' entry of `result`
result['price_bollinger_bands_upper_2']

In [None]:
import pandas as pd  # redundant: already imported in the notebook's first cell
import numpy as np  # redundant: already imported in the notebook's first cell

# Six-point price series with a jump between the third and fourth values
sample_data = pd.DataFrame({
    'price': [100, 110, 105, 200, 220, 210]
})

# Request the 'upper' band with window=2 and num_std=2
# (the band formula itself lives in ind.calculate_bollinger_bands, not shown here)
upper_band = ind.calculate_bollinger_bands(sample_data['price'], 'upper', window=2, num_std=2)
print(upper_band)

In [None]:
# Display the expected RSI values for comparison against the computed ones
expected_rsi

In [None]:
# Display the 'price_rsi_2' entry of `result`
result['price_rsi_2']

In [None]:
# Display the expected RSI values again
expected_rsi

In [None]:
# Show computed vs. expected RSI side by side. The second positional argument of
# pd.DataFrame is `index`, so passing expected_rsi there relabelled the rows with the
# expected values instead of adding them as a comparison column.
pd.DataFrame({
    'result_rsi': result['price_rsi_2'],
    'expected_rsi': expected_rsi,
})

In [None]:
# Display the expected column list
expected_columns

In [None]:
# Display the frame currently bound to `result_df`
result_df

In [None]:
# Element-wise equality of coin1's computed 'price_sma_2' values against the expectation.
# NOTE(review): list equality treats NaN as unequal to NaN, so if both sides contain NaN
# (e.g. a warm-up row of a 2-period average — confirm) this is False even when the
# values otherwise match.
list(result_df.loc[result_df['coin_id'] == 'coin1', 'price_sma_2'].values) == list(expected_sma_2_coin1)

In [None]:
# Display the expected 2-period SMA values for coin1 as a plain list
list(expected_sma_2_coin1)

In [None]:
# Display the value currently bound to `result_mfi`
result_mfi

In [None]:
# Display the raw values behind `expected_rsi`, without its index
expected_rsi.values