In [None]:
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)

# load configs
# Each YAML file under ../config is read into its own module-level variable;
# later cells re-run these same calls to pick up edits to the files.
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# configure logger
# The logger comes from dreams_core's setup helper and is set to INFO level.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
# Floats are rendered with up to 12 significant digits ('.12g' general format).
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [None]:
# Reload the local project modules and re-read every config file so that edits made
# since the setup cell take effect without restarting the kernel.
# NOTE: `i` must still be bound to the `insights` module here; importlib.reload()
# raises TypeError if `i` has been rebound to a non-module object.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


## Training Data (profits_df) Generation

In [None]:
# Pick up any edits to training_data before using it
importlib.reload(td)


# retrieve and clean prices data
# fill_prices_gaps returns a (prices_df, log) pair; the gap limit comes from
# config['data_cleaning']['max_gap_days'].
# NOTE(review): presumably gaps longer than max_gap_days are not filled - confirm in td.
prices_df = td.retrieve_prices_data()
prices_df,prices_log = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])

## Metrics and Feature Engineering

In [None]:
# Reload the local project modules and re-read the configs used by this section.
# Unlike the other reload cells, experiments_config is not re-read here.
# NOTE: `i` must still be bound to the `insights` module; importlib.reload()
# raises TypeError if `i` has been rebound to a non-module object.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')


# # retrieve transfers data
# transfers_df = td.retrieve_transfers_data(
#     config['training_data']['training_period_start'],
#     config['training_data']['modeling_period_start'],
#     config['training_data']['modeling_period_end']
#     )

# # compile profits_df
# profits_df = td.prepare_profits_data(transfers_df, prices_df)
# profits_df = td.calculate_wallet_profitability(profits_df)
# profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])


# # cohort configurations
# cohort_name = list(config['wallet_cohorts'].keys())[0]
# metric_description = f"{cohort_name}_cohort"
# cohort_metrics_config = metrics_config['wallet_cohorts'][cohort_name]

# # identify wallets in the cohort
# cohort_summary_df = td.classify_wallet_cohort(profits_df, config['wallet_cohorts'][cohort_name])
# cohort_wallets = cohort_summary_df[cohort_summary_df['in_cohort']==True]['wallet_address']

# # generate and flatten buysell_metrics
# buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets)

# # flatten, save, and preprocess the flattened df
# flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs')

# flattened_buysell_metrics_df = fe.flatten_coin_date_df(
#     buysell_metrics_df,
#     cohort_metrics_config,
#     config['training_data']['training_period_end']
# )
# flattened_df, flattened_filepath = fe.save_flattened_outputs(
#     flattened_buysell_metrics_df,
#     flattened_output_directory,
#     metric_description,
#     config['training_data']['modeling_period_start']
#     )
# preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(flattened_filepath, modeling_config, cohort_metrics_config)

# # create the training data df
# input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
# input_filenames = [
#     preprocessed_filepath.split('preprocessed_outputs/')[1]
# ]
# training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# # create the target variable df
# target_variable_df,_ = fe.create_target_variables_mooncrater(prices_df, config['training_data'], modeling_config)

# # merge the two into the final model input df
# model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

# # split the df into train and test sets
# X_train, X_test, y_train, y_test = m.split_model_input(
#     model_input_df,
#     modeling_config['modeling']['target_column'],
#     modeling_config['modeling']['train_test_split'],
#     modeling_config['modeling']['random_state']
# )

# # 3.4 Train the model using the current configuration and log the results
# modeling_folder = modeling_config['modeling']['modeling_folder']
# model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# # 3.5 Evaluate the model's performance on the test set
# metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# # 3.6 Log the experiment results for this configuration
# m.log_trial_results(modeling_folder, model_id)

In [None]:
def sample_time_series_df():
    """
    Build a small two-coin price history for exercising the time series metrics.

    :return: DataFrame with columns coin_id, date and price holding three consecutive
        daily rows for coin_id 1 followed by three for coin_id 2. Dates are left as
        strings; prices are integers.
    """
    daily_dates = ['2023-01-01', '2023-01-02', '2023-01-03']
    rows_per_coin = len(daily_dates)

    return pd.DataFrame({
        'coin_id': [1] * rows_per_coin + [2] * rows_per_coin,
        'date': daily_dates * 2,
        'price': [100, 110, 120, 200, 210, 220],
    })


def sample_metrics_config():
    """
    Build a sample metrics configuration for time series analysis.

    :return: Nested dict of the form
        {'time_series': {'prices': {<indicator>: {'parameters': {'period': 2}}}}}
        with one entry each for the 'sma' and 'ema' indicators.
    """
    indicator_names = ('sma', 'ema')

    # Each indicator gets its own freshly built parameters dict
    prices_indicators = {
        name: {'parameters': {'period': 2}}
        for name in indicator_names
    }

    return {'time_series': {'prices': prices_indicators}}

# Materialise the fixtures for interactive use.
# NOTE: each assignment rebinds the fixture function's name to the object it returns,
# so after this point the names refer to the DataFrame / dict rather than the functions.
# Running these two lines again without re-running the definitions above will fail,
# because the rebound objects are not callable.
sample_time_series_df = sample_time_series_df()
sample_metrics_config = sample_metrics_config()

In [None]:
# Check whether prices_df has a 'price' column (the colname passed to
# cwm.generate_time_series_metrics further down)
'price' in prices_df.columns

In [None]:
# Notebook transcription of a pytest unit test. The decorator and signature are kept
# for reference; the body runs at cell level against the notebook's fixture objects.
# Comparisons use np.testing rather than pytest.approx because pytest is not imported
# in this notebook.
# @pytest.mark.unit
# def test_generate_time_series_metrics_different_periods(sample_time_series_df):
"""
Test the functionality of generate_time_series_metrics with different periods for SMA and EMA.
"""
# Adjust the sample_metrics_config for different periods
sample_metrics_config = {
    'time_series': {
        'prices': {
            'sma': {
                'parameters': {
                    'period': 3  # Different period for SMA
                }
            },
            'ema': {
                'parameters': {
                    'period': 2  # Different period for EMA
                }
            }
        }
    }
}

# Convert the date to datetime in the sample data
sample_time_series_df['date'] = pd.to_datetime(sample_time_series_df['date'])

# Run the generate_time_series_metrics function
result_df = cwm.generate_time_series_metrics(
    time_series_df=sample_time_series_df,
    metrics_config=sample_metrics_config,
    dataset_key='prices',
    colname='price'
)

# Expected columns in the result
expected_columns = ['coin_id', 'date', 'price', 'prices_sma_3', 'prices_ema_2']

# Assert that the columns exist in the result
assert all(col in result_df.columns for col in expected_columns), "Missing expected columns in the result."

# Expected SMA and EMA values for coin_id=1
expected_sma_1 = [float('nan'), float('nan'), 110.0]  # SMA for coin_id=1 with period=3
expected_ema_1 = [100.0, 106.666667, 115.555556]  # EMA for coin_id=1 with period=2

# Expected SMA and EMA values for coin_id=2
expected_sma_2 = [float('nan'), float('nan'), 210.0]  # SMA for coin_id=2 with period=3
expected_ema_2 = [200.0, 206.666667, 215.555556]  # EMA for coin_id=2 with period=2

expected_by_coin = {
    1: (expected_sma_1, expected_ema_1),
    2: (expected_sma_2, expected_ema_2),
}

# Run the same checks for every coin. The loop variable is deliberately not named `i`,
# because `i` is the alias of the imported insights module and must stay reloadable.
for test_coin_id, (expected_sma, expected_ema) in expected_by_coin.items():
    coin_rows = result_df[result_df['coin_id'] == test_coin_id]

    # Confirm that the SMA result matches the expected; equal_nan treats the leading
    # NaNs (window not yet full) as equal, and a length mismatch raises as well
    np.testing.assert_allclose(
        coin_rows['prices_sma_3'].tolist(),
        expected_sma,
        equal_nan=True,
        err_msg=f"SMA calculation incorrect for coin_id={test_coin_id}"
    )

    # Confirm that the EMA result matches the expected within an absolute tolerance
    # of 1e-2 (rtol=0 so that only the absolute tolerance applies)
    np.testing.assert_allclose(
        coin_rows['prices_ema_2'].tolist(),
        expected_ema,
        rtol=0,
        atol=1e-2,
        err_msg=f"EMA calculation incorrect for coin_id={test_coin_id}"
    )

In [None]:
# Display the expected EMA values for coin_id=1 defined in the test cell
expected_ema_1

In [None]:
# Row count of the metrics output produced in the test cell
len(result_df)

## Codespace

In [None]:
# Display pandas' NA singleton (scratch exploration)
pd.NA

In [None]:
# SMA values for coin_id=1 taken from the test cell's result_df. That cell configures
# an SMA period of 3 and expects the column 'prices_sma_3', so that is the column to
# read here; these values are compared against expected_sma_1 in the cells below.
output = result_df[result_df['coin_id'] == 1]['prices_sma_3'].tolist()
output

In [None]:
# Confirm the container type of `output` (it is built with .tolist())
type(output)

In [None]:
# Position of the element to compare. Named `idx` rather than `i` because `i` is the
# alias of the imported insights module; rebinding it to an int would make the
# importlib.reload(i) calls elsewhere in the notebook raise TypeError.
idx = 0

# Show type and value of the expected vs. actual entry, then their direct equality
print(type(expected_sma_1[idx]))
print(type(output[idx]))
print(expected_sma_1[idx])
print(output[idx])
print(expected_sma_1[idx] == output[idx])

In [None]:
# Direct list equality between the actual and expected SMA values.
# NOTE(review): NaN never compares equal to NaN, so this is likely False whenever the
# lists contain NaN entries even if all other values match; use a NaN-aware comparison
# for a real check.
output == expected_sma_1

In [None]:
# Restrict prices_df to the rows of one hard-coded coin_id and show the (rows, columns)
# shape of the filtered frame
coin_id = '004cb3d0-0803-4208-a9e1-c3457567ea3f'
coin_df = prices_df[prices_df['coin_id']==coin_id]
coin_df.shape

In [None]:
# Reload the local project modules and re-read every config file before generating metrics.
# NOTE: `i` must still be bound to the `insights` module; importlib.reload() raises
# TypeError if `i` has been rebound (e.g. used as a loop index or counter) since import.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# Generate the configured time series metrics for the 'price' column of prices_df,
# using the 'prices' dataset key of metrics_config, then display the result
prices_metrics_df = cwm.generate_time_series_metrics(prices_df, metrics_config, dataset_key='prices', colname='price')
prices_metrics_df

In [None]:
# Re-display the time series metrics generated for prices_df
prices_metrics_df

In [None]:


def generate_time_series_metrics(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """
    Generates specified time series metrics based on the config file.

    The input DataFrame is not modified: the date conversion and sorting are applied
    to a new frame, so the caller's `date` column keeps its original dtype.

    :param df: DataFrame with columns for coin_id (optional), date, and value.
    :param config: Dictionary specifying which metrics to calculate (e.g., SMA, EMA, Bollinger Bands).
        It is passed through unchanged to apply_metrics.
    :return: New DataFrame sorted by date with metrics appended as columns. When a
        coin_id column is present, metrics are computed per coin and the index is reset.
    """
    # Ensure date is in datetime format and sorted; assign() returns a new frame so
    # the caller's DataFrame is left untouched
    df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

    # Check if coin_id exists
    if 'coin_id' in df.columns:
        # Apply metrics group by coin_id so windows never span two different coins
        df = (
            df.groupby('coin_id')
            .apply(lambda group: apply_metrics(group, config))
            .reset_index(drop=True)
        )
    else:
        # Apply metrics directly (no coin_id)
        df = apply_metrics(df, config)

    return df


def apply_metrics(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """
    Apply the required metrics to the DataFrame based on the config.

    :param df: DataFrame containing a 'value' column to compute the metrics on.
    :param config: Dictionary with a 'metrics' collection naming the metrics to apply
        and an optional 'sma_period' (defaults to 20).
    :return: DataFrame with the metric columns joined on. When 'sma' is requested,
        both the 'sma' and 'ema' columns from calculate_sma_ema are added; the EMA
        uses that function's default period.
    :raises KeyError: if config has no 'metrics' key or df has no 'value' column.
    """
    # Apply each metric based on config
    if 'sma' in config['metrics']:
        moving_averages = calculate_sma_ema(
            df['value'],
            sma_period=config.get('sma_period', 20)
        )
        # calculate_sma_ema echoes the input series back as a 'value' column; drop it
        # before joining, otherwise DataFrame.join raises on the overlapping column name
        df = df.join(moving_averages.drop(columns='value'))

    # if 'bollinger_bands' in config['metrics']:
    #     df = df.join(calculate_bollinger_bands(df['value'], config.get('bb_period', 20)))

    # Add more metrics as needed

    return df


def calculate_sma_ema(timeseries: pd.Series, sma_period: int = 20, ema_period: int = 20) -> pd.DataFrame:
    """
    Computes a Simple Moving Average (SMA) and an Exponential Moving Average (EMA)
    over a single series of values.

    :param timeseries: Pandas Series representing the values over time (indexed by date).
    :param sma_period: Rolling window length used for the SMA; positions before the
        window is full are NaN.
    :param ema_period: Span used for the EMA (computed with adjust=False).
    :return: DataFrame sharing the input's index with columns 'value' (the input
        series), 'sma' and 'ema'.
    """
    # Start from the input values under the 'value' column, then attach each average
    return timeseries.to_frame(name='value').assign(
        sma=timeseries.rolling(window=sma_period).mean(),
        ema=timeseries.ewm(span=ema_period, adjust=False).mean(),
    )

# Example usage:
# coin_df is the single-coin slice of prices_df built in an earlier cell; both averages
# use the function's default period of 20.
timeseries = coin_df['price']  # assuming the df is already filtered for a single coin_id and sorted by date

metrics_df = calculate_sma_ema(timeseries)
metrics_df

In [None]:
# Reload the local project modules and re-read every config file.
# NOTE: `i` must still be bound to the `insights` module; importlib.reload() raises
# TypeError if `i` has been rebound (e.g. used as a loop index or counter) since import.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')


# Pull out the metric definitions configured for the 'prices' time series dataset
time_series_config = metrics_config['time_series']['prices']

In [None]:
# Display the 'prices' section of the time series metrics config
time_series_config