In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
import datetime
import json
from datetime import datetime, timedelta
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# Load variables from a .env file via python-dotenv
load_dotenv()

# Custom format function for displaying numbers (up to 12 significant digits)
pd.set_option('display.float_format', '{:.12g}'.format)
# pd.reset_option('display.float_format')

# Dark mode charts: dark gray backgrounds with a muted green for text, labels, ticks, and titles
plt.rcParams.update({
    'figure.facecolor': '#181818',  # Custom background color (dark gray in this case)
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp


# Reload all local modules so recent source edits take effect in this kernel
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp]
for module in modules:
    importlib.reload(module)

# Load all configs from the config folder
(
    config,
    metrics_config,
    modeling_config,
    experiments_config,
) = u.load_all_configs('../config')

# Configure the logger at INFO level
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Aggregate function

In [None]:
# Reload all modules and configs so the call below uses the latest code and settings
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Run the all-time-windows orchestration and unpack its four outputs
(
    training_data_df,
    target_variable_df,
    returns_df,
    join_logs_df,
) = tw.generate_all_time_windows_model_inputs(config, metrics_config, modeling_config)

In [None]:
# Reload all modules and configs so the steps below use the latest code and settings
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# 1. Retrieve base datasets used by all windows
# ---------------------------------------------
macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(
    config,
    metrics_config,
)


# 2. Generate flattened features for each dataset in each window
# --------------------------------------------------------------
# Generate time_windows config overrides that will modify each window's config settings
time_windows = tw.generate_time_windows(config)

all_flattened_dfs = []
all_flattened_filepaths = []

# Iterate the windows directly: the previous `for _, ... in enumerate(...)` never used the
# index and rebound `_`, which IPython uses for the last cell output.
for time_window in time_windows:

    # Prepare time window config files
    window_config, window_metrics_config, window_modeling_config = exp.prepare_configs(
        modeling_config['modeling']['config_folder'],
        time_window,
    )

    # Generate flattened feature dfs for all datasets for the window
    window_flattened_dfs, window_flattened_filepaths = tw.generate_window_flattened_dfs(
        market_data_df,
        macro_trends_df,
        profits_df,
        prices_df,
        window_config,
        window_metrics_config,
        window_modeling_config,
    )

    # Store window's flattened features
    all_flattened_dfs.extend(window_flattened_dfs)
    all_flattened_filepaths.extend(window_flattened_filepaths)


# 3. Combine features from all datasets in all time windows
# ---------------------------------------------------------
# Combine all time windows for each dataset, then join the datasets together
concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths, modeling_config)
training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)


In [None]:
# Reload all modules and configs, and switch to verbose logging for this cell onward
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
logger.setLevel(logging.DEBUG)

# Split target variables into the train/test/validation/future sets
sets_X_y_dict = ds.perform_train_test_validation_future_splits(
    training_data_df,
    target_variable_df,
    modeling_config,
)

# Preprocess X data for all sets
preprocessed_sets_X_y_dict = prp.preprocess_sets_X_y(
    sets_X_y_dict,
    config,
    metrics_config,
    modeling_config,
)



In [None]:
# Build an example filename for a preprocessed output file.
# The set name is stored in `set_key` rather than `set` so the builtin `set` type is not
# shadowed for cells that run afterwards (e.g. code that calls `set()`).
set_key = "train"
model_period = config['training_data']['modeling_period_start']
windows = config['training_data']['additional_windows']
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M')
filename = f"preprocessed_X_{set_key}_{timestamp}_model_period_{model_period}_{windows}_windows.csv"

In [None]:
# Report each set's name, feature-matrix shape, and target length.
# The loop variable is `set_name` (not `set`) so the builtin `set` type stays usable afterwards.
for set_name, (X, y) in preprocessed_sets_X_y_dict.items():
    print(set_name)
    print(X.shape)
    print(len(y))

In [None]:
def save_preprocessed_outputs(set_key, preprocessed_X, y_data, config, modeling_config):
    """
    Joins a set's preprocessed features with its target data and saves the result as a CSV.

    The file is written to <modeling_folder>/outputs/preprocessed_outputs, which is created
    if it does not already exist.

    Params:
    - set_key (str): The name of set (e.g. 'train', 'test', etc); used in the filename
    - preprocessed_X (pd.DataFrame): DataFrame containing all preprocessed features for the set
    - y_data (pd.DataFrame): DataFrame containing target variables for the set; it is joined
        onto preprocessed_X by index
    - config (dict): config.yaml; reads ['training_data']['modeling_period_start'] and
        ['training_data']['additional_windows']
    - modeling_config (dict): modeling_config.yaml; reads ['modeling']['modeling_folder']

    Returns:
    none
    """

    # Define filename from the set name, current timestamp, modeling period and windows setting
    model_period = config['training_data']['modeling_period_start']
    windows = config['training_data']['additional_windows']
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M')
    filename = f"preprocessed_X_{set_key}_{timestamp}_model_period_{model_period}_{windows}_windows.csv"

    # Join X and y on their shared index
    preprocessed_df = preprocessed_X.join(y_data)

    # Make sure the output folder exists; to_csv fails if the directory is missing
    output_dir = os.path.join(modeling_config['modeling']['modeling_folder'],
                            'outputs/preprocessed_outputs')
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)

    # NOTE(review): index=False omits the index used for the join above from the saved file.
    # Confirm the row identifiers are intentionally excluded.
    preprocessed_df.to_csv(output_path, index=False)

    logger.debug("Saved preprocessed outputs to %s", output_path)




## Preprocessing

In [None]:
# Take the first element of the 'train_set' entry and show its columns
# (presumably the X features of an (X, y) pair — TODO confirm)
df = sets_X_y_dict['train_set'][0]
df.columns

In [None]:
# Display the columns of the current `df`
df.columns

In [None]:
# Reload all modules and configs so the preprocessor below uses the latest code and settings
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# Datasets to preprocess, keyed by set name (replace this with your actual data loading code)
datasets = {'train': df}

# Initialize and run the preprocessor
preprocessor = prp.DataPreprocessor(config, metrics_config, modeling_config)
preprocessed_datasets = preprocessor.preprocess(datasets)

# Print results
# NOTE: this loop rebinds `df`, so after the cell runs `df` refers to the last preprocessed
# dataset rather than the frame placed in `datasets` above.
for dataset_name, df in preprocessed_datasets.items():
    print(f"Columns in {dataset_name} set: {df.columns.tolist()}")

In [None]:
# Preview the first rows of the current `df`
df.head()

In [None]:
# Summary statistics for the current `df`
df.describe()

In [None]:
# Reload all modules and configs so the steps below use the latest code and settings
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Confirm there are no null values
if df.isnull().values.any():
    raise ValueError("Missing values detected in the DataFrame.")

# Convert categorical and boolean columns via the preprocessing module
df = prp.preprocess_categorical_and_boolean(df)

# Feature Selection
# Drop features specified in modeling_config['preprocessing']['drop_features']
drop_features = modeling_config['preprocessing'].get('drop_features', [])
if drop_features:
    # DataFrame.drop only accepts errors='raise' or errors='ignore'; any other value (such as
    # the previous 'warn') raises KeyError on a missing label. Warn explicitly about missing
    # columns, then drop the ones that are present.
    missing_features = [feature for feature in drop_features if feature not in df.columns]
    if missing_features:
        logger.warning("drop_features not found in the DataFrame and skipped: %s",
                        missing_features)
    df = df.drop(columns=drop_features, errors='ignore')

# # Apply feature selection based on sameness_threshold and retain_columns from dataset_config
# sameness_threshold = dataset_config.get('sameness_threshold', 1.0)
# retain_columns = dataset_config.get('retain_columns', [])

# # Drop columns with more than `sameness_threshold` of the same value, unless in retain_columns
# for column in df.columns:
#     if column not in retain_columns:
#         max_value_ratio = df[column].value_counts(normalize=True).max()
#         if max_value_ratio > sameness_threshold:
#             df = df.drop(columns=[column])
#             logger.debug("Dropped column %s due to sameness_threshold", column)


# # Step 4: Scaling and Transformation
# # ----------------------------------------------------
# # Apply scaling if df_metrics_config is provided
# if df_metrics_config:
#     df = apply_scaling(df, df_metrics_config)




In [None]:
from typing import Dict, Any
# Reload all modules and configs
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')



def calculate_sameness_percentage(column: pd.Series) -> float:
    """
    Calculate the share of rows taken up by the most common value in a column.

    Missing values are not counted as a value (value_counts drops them), but they are
    included in the row count used as the denominator.

    Parameters:
    column (pd.Series): The column to analyze.

    Returns:
    float: The share (0 to 1) of the most common non-null value in the column. An empty
        column returns 0.0, and a non-empty column containing only missing values returns
        1.0 because every entry is identical.
    """
    total_rows = len(column)
    if total_rows == 0:
        # Nothing to compare; avoids indexing into an empty value_counts result
        return 0.0

    value_counts = column.value_counts()
    if value_counts.empty:
        # Every entry is missing, so the column holds a single repeated (null) state
        return 1.0

    # value_counts sorts descending, so the first entry is the most common value
    return float(value_counts.iloc[0] / total_rows)

def create_prefix_mapping(config: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Create a mapping of column prefixes to their config paths and sameness thresholds.

    Walks config['datasets'] up to two levels below each dataset type. A category dict that
    defines 'sameness_threshold' is registered under the prefix "<category>_"; otherwise each
    of its subcategory dicts that defines 'sameness_threshold' is registered under
    "<subcategory>_". Non-dict entries at any level are skipped.

    Parameters:
    config (Dict[str, Any]): The configuration dictionary containing a 'datasets' key.

    Returns:
    Dict[str, Dict[str, Any]]: A dictionary where keys are column prefixes and values are
    dictionaries containing 'path' (str, dotted config path) and 'threshold' (the configured
    sameness_threshold) for each prefix.
    """
    mapping: Dict[str, Dict[str, Any]] = {}

    for dataset_type, dataset_config in config['datasets'].items():
        if not isinstance(dataset_config, dict):
            # Scalar/None dataset entries have no categories to inspect
            continue

        for category, category_config in dataset_config.items():
            if not isinstance(category_config, dict):
                continue

            # Threshold defined directly on the category
            if 'sameness_threshold' in category_config:
                mapping[f"{category}_"] = {
                    'path': f"datasets.{dataset_type}.{category}",
                    'threshold': category_config['sameness_threshold']
                }
                continue

            # Otherwise look one level deeper. Only dict subcategories can define a threshold;
            # checking `in` on a str would be a substring match and on a number/None a TypeError.
            for subcategory, subcategory_config in category_config.items():
                if isinstance(subcategory_config, dict) and 'sameness_threshold' in subcategory_config:
                    mapping[f"{subcategory}_"] = {
                        'path': f"datasets.{dataset_type}.{category}.{subcategory}",
                        'threshold': subcategory_config['sameness_threshold']
                    }

    return mapping

def check_and_drop_columns(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
    """
    Check column sameness and drop columns exceeding the threshold.

    Each column is matched to the first prefix from create_prefix_mapping(config) that its
    name starts with. The column's sameness percentage is then compared against that prefix's
    threshold, and columns strictly above the threshold are dropped.

    Parameters:
    df (pd.DataFrame): The input DataFrame to process. It is not modified.
    config (Dict[str, Any]): The configuration dictionary containing sameness thresholds.

    Returns:
    pd.DataFrame: A new DataFrame with columns dropped based on the sameness criteria.

    Raises:
    ValueError: If any columns can't be mapped to a sameness threshold or if any config keys
                can't be mapped to columns.
    """
    prefix_mapping = create_prefix_mapping(config)
    columns_to_drop = []
    unmapped_columns = []
    used_config_keys = set()

    for column in df.columns:
        # First prefix (in mapping order) that the column name starts with, if any
        matched_prefix = next(
            (prefix for prefix in prefix_mapping if column.startswith(prefix)),
            None,
        )
        if matched_prefix is None:
            unmapped_columns.append(column)
            continue

        used_config_keys.add(matched_prefix)
        sameness = calculate_sameness_percentage(df[column])
        if sameness > prefix_mapping[matched_prefix]['threshold']:
            columns_to_drop.append(column)

    unused_config_keys = set(prefix_mapping.keys()) - used_config_keys

    if unmapped_columns:
        raise ValueError(f"The following columns could not be mapped to a sameness threshold: {unmapped_columns}")

    if unused_config_keys:
        raise ValueError(f"The following config keys could not be mapped to columns: {unused_config_keys}")

    # Drop the columns. DataFrame.drop returns a new frame, so the result must be captured;
    # previously it was discarded and the unfiltered input was returned.
    filtered_df = df.drop(columns=columns_to_drop)
    logger.info("Dropped %s columns %s due to sameness thresholds.", len(columns_to_drop), columns_to_drop)

    return filtered_df




# Apply the sameness-threshold column filter to the current `df`
df_cleaned = check_and_drop_columns(df, config)

In [None]:
# Display the result of the sameness filter
df_cleaned

In [None]:
# Step 2: Convert categorical and boolean columns to integers
# ---------------------------------------------------------------
# One-hot encode every object/category column (get_dummies), one column at a time,
# dropping the first level of each
categorical_columns = list(df.select_dtypes(include=['object', 'category']).columns)
for col in categorical_columns:
    num_categories = df[col].nunique()
    # Flag high-cardinality columns, since each category becomes its own feature
    if num_categories > 8:
        logger.warning("Column '%s' has %s categories, consider reducing categories.",
                        col, num_categories)
    df = pd.get_dummies(df, columns=[col], drop_first=True)


# Cast boolean columns to integers and leave every other column untouched
df = df.apply(lambda col: col if col.dtype != bool else col.astype(int))
df

## All windows datasets

In [None]:
# Reload all modules and configs
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


def generate_all_time_windows_model_inputs(config,metrics_config,modeling_config):
    """
    Generates the X and y splits for all sets across all time windows.

    Sequence:
    1. Retrieve the base datasets that contain records across all windows
    2. Loop through each time window and generate flattened features for the window
    3a. Concat each dataset's window dfs, then join all the dataset dfs with the target variable to
        create a comprehensive feature set keyed on coin_id.
    3b. Split the full feature set into train/test/validation/future sets.

    Params:
    - config, metrics_config, modeling_config: loaded config yaml files

    Returns:
    - sets_X_y_dict (dict[pd.DataFrame, pd.Series]): Dict with keys for each set type (e.g. train_set,
        future_set, etc) that contains the X and y data for the set.
    - returns_df (pd.DataFrame): DataFrame with MultiIndex on time_window,coin_id that contains a
        'returns' column showing actual returns during the each time_window's modeling period.
    - join_logs_df (pd.DataFrame): DataFrame showing the outcomes of each dataset's join and fill
        methods
    """

    # 1. Retrieve base datasets used by all windows
    # ---------------------------------------------
    macro_trends_df, market_data_df, profits_df, prices_df = tw.prepare_all_windows_base_data(
        config,
        metrics_config,
    )


    # 2. Generate flattened features for each dataset in each window
    # --------------------------------------------------------------
    # Generate time_windows config overrides that will modify each window's config settings
    time_windows = tw.generate_time_windows(config)

    # Only the saved filepaths are consumed in step 3, so the in-memory window dfs are not kept
    all_flattened_filepaths = []

    for time_window in time_windows:

        # Prepare time window config files
        window_config, window_metrics_config, window_modeling_config = (
            exp.prepare_configs(modeling_config['modeling']['config_folder'], time_window))

        # Generate flattened feature dfs for all datasets for the window
        _, window_flattened_filepaths = tw.generate_window_flattened_dfs(
            market_data_df,
            macro_trends_df,
            profits_df,
            prices_df,
            window_config,
            window_metrics_config,
            window_modeling_config
        )

        # Store the filepaths of the window's flattened features
        all_flattened_filepaths.extend(window_flattened_filepaths)


    # 3. Combine features from all datasets in all time windows with target variables
    # -------------------------------------------------------------------------------
    # Combine all time windows for each dataset, then join the datasets together
    concatenated_dfs = tw.concat_dataset_time_windows_dfs(all_flattened_filepaths,modeling_config)
    training_data_df, join_logs_df = tw.join_dataset_all_windows_dfs(concatenated_dfs)

    # Create target variables for all time windows
    target_variable_df, returns_df = tw.create_target_variables_for_all_time_windows(
        training_data_df,
        prices_df,
        config,
        modeling_config,
    )

    # Split target variables into the train/test/validation/future sets.
    # Called through `ds` (feature_engineering.data_splitting) to match the notebook's other
    # call to this split function; this block previously used `prp`.
    # NOTE(review): confirm the function is not also expected on the preprocessing module.
    sets_X_y_dict = ds.perform_train_test_validation_future_splits(training_data_df,
                                                                   target_variable_df,
                                                                   modeling_config)

    return sets_X_y_dict, returns_df, join_logs_df

In [None]:
# Preview the first rows of returns_df
returns_df.head()

## Modeling Sequence

In [None]:


# NOTE(review): `mif` is not imported anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel. Another cell calls `tw.generate_time_windows(config)`; confirm
# which module is intended here and for `build_time_window_model_input`.

# Generate time_windows config overrides that will modify each window's config settings
time_windows = mif.generate_time_windows(config)

# Initialize empty lists to hold concatenated data
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []
returns_test_list = []

# Build the model inputs for each window; `n` is the window's position in time_windows
for n, window in enumerate(time_windows):

    model_data = mif.build_time_window_model_input(n, window, config, metrics_config, modeling_config)

    # Append the current window's data to the lists
    X_train_list.append(model_data['X_train'])
    X_test_list.append(model_data['X_test'])
    y_train_list.append(model_data['y_train'])
    y_test_list.append(model_data['y_test'])
    returns_test_list.append(model_data['returns_test'])


# Concatenate all the data for each part (row-wise, one window after another)
X_train = pd.concat(X_train_list, axis=0)
X_test = pd.concat(X_test_list, axis=0)
y_train = pd.concat(y_train_list, axis=0)
y_test = pd.concat(y_test_list, axis=0)
returns_test = pd.concat(returns_test_list, axis=0)

In [None]:
# Reload all modules and configs, and return logging to INFO
for module in modules:
    importlib.reload(module)
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')
logger.setLevel(logging.INFO)


# 3.4 Train the model using the current configuration and log the results
model, model_id = m.train_model(X_train, y_train, modeling_config)

# 3.5 Evaluate and save the model performance on the test set to a CSV
metrics_dict, y_pred, y_pred_prob = m.evaluate_model(
    model,
    X_test,
    y_test,
    model_id,
    returns_test,
    modeling_config,
)

metrics_dict

In [None]:
# Pair each training feature name with the fitted model's importance score
features = X_train.columns  # Feature names
feature_importances = model.feature_importances_

# Build the feature/importance table, sorted by importance in descending order
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Show the 20 most important features
importance_df.head(20)

In [None]:
# Display the full feature importance table
importance_df

In [None]:

# Reload all modules so the latest analysis code is used
for module in modules:
    importlib.reload(module)


# Select y_pred_prob from the classifier, or y_pred from a regressor.
# An explicit None check is required: `y_pred_prob or y_pred` evaluates the truthiness of
# y_pred_prob, which raises ValueError for a multi-element numpy array or pandas Series.
# Assumes y_pred_prob is None when no probabilities are available — TODO confirm against
# m.evaluate_model.
predictions = y_pred_prob if y_pred_prob is not None else y_pred
returns = returns_test['returns']
winsorization_cutoff = modeling_config["evaluation"]["winsorization_cutoff"]


ia.generate_profitability_curves(predictions, returns, winsorization_cutoff)

## Junkyard

## Tests failing