In [None]:
# pyright: reportMissingModuleSource=false
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)

# Read the four YAML configuration files used throughout the notebook.
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))

# Obtain the logger from dreams_core and set its level to INFO.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


def _format_float(value):
    """Format a float with up to 12 significant digits for pandas display."""
    return f'{value:.12g}'


# Use the formatter above whenever pandas renders floats.
pd.set_option('display.float_format', _format_float)
# To undo: pd.reset_option('display.float_format')

In [None]:
# Re-import the local modules so edits made on disk take effect in this kernel.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read the YAML configs so changes to the files are picked up as well.
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))


## Metrics Generation

In [None]:
# Re-import local modules and re-read configs so on-disk edits are picked up.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))


# Generate the prices time series metrics.
# NOTE(review): `prices_df` is only assigned in a later cell and `buysell_metrics_df`
# is not assigned anywhere in the visible notebook, so this cell appears to depend on
# leftover kernel state - confirm before relying on Restart & Run All.
prices_metrics_df, partial_prices_metrics_df = cwm.generate_time_series_metrics(
    prices_df,
    config,
    metrics_config,
    dataset_key='prices',
    colname='price',
)

# Show the (rows, columns) shape of each metrics dataframe.
for _metrics_df in (buysell_metrics_df, prices_metrics_df):
    print(_metrics_df.shape)

In [None]:
# Re-import local modules and re-read configs so on-disk edits are picked up.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))
logger.setLevel(logging.DEBUG)

# Retrieve transfers data for the configured training and modeling period boundaries.
training_data_config = config['training_data']
transfers_df = td.retrieve_transfers_data(
    training_data_config['training_period_start'],
    training_data_config['modeling_period_start'],
    training_data_config['modeling_period_end'],
)

# Retrieve prices, then pass them through the gap filler with the configured max_gap_days.
prices_df, prices_log = td.fill_prices_gaps(
    td.retrieve_prices_data(),
    config['data_cleaning']['max_gap_days'],
)

# Compile profits_df: prepare -> wallet profitability -> clean.
profits_df = td.calculate_wallet_profitability(
    td.prepare_profits_data(transfers_df, prices_df)
)
profits_df, _ = td.clean_profits_df(profits_df, config['data_cleaning'])


# Generate the prices time series metrics.
prices_metrics_df, partial_prices_metrics_df = cwm.generate_time_series_metrics(
    prices_df,
    config,
    metrics_config,
    dataset_key='prices',
    colname='price',
)

print(prices_metrics_df.shape)

In [None]:
# Number of distinct coin_id values in prices_metrics_df (NaN, if present, counts as one value).
prices_metrics_df['coin_id'].nunique(dropna=False)

Generated time series metrics. Out of 427 total coins, 230 had complete period coverage, 88 had partial coverage, and 109 had no coverage. 

In [None]:
# Parse both training period boundaries from the config as pandas timestamps.
training_period_start, training_period_end = (
    pd.to_datetime(config['training_data'][boundary_key])
    for boundary_key in ('training_period_start', 'training_period_end')
)

# Whole-day length of the training period, as an integer.
training_period_duration = (training_period_end - training_period_start).days

In [None]:
# Report how many distinct coin_id values each prices dataframe contains.
print(f"prices_df: {prices_df['coin_id'].nunique(dropna=False)}")
print(f"prices_metrics_df: {prices_metrics_df['coin_id'].nunique(dropna=False)}")
print(f"partial_prices_metrics_df: {partial_prices_metrics_df['coin_id'].nunique(dropna=False)}")


In [None]:
# Summary statistics for the columns of prices_metrics_df.
prices_metrics_df.describe()

In [None]:
# Summary statistics for the columns of partial_prices_metrics_df.
partial_prices_metrics_df.describe()

In [None]:
# Summary statistics for prices_metrics_df.
# NOTE(review): this expression also appears in an earlier cell - presumably kept to compare
# outputs between runs; confirm whether the duplicate is still needed.
prices_metrics_df.describe()

In [None]:
# Summary statistics for partial_prices_metrics_df.
# NOTE(review): this expression also appears in an earlier cell - presumably kept to compare
# outputs between runs; confirm whether the duplicate is still needed.
partial_prices_metrics_df.describe()

In [None]:
# Distinct coin_id values present in partial_prices_metrics_df.
partial_prices_metrics_df['coin_id'].unique()

In [None]:
# Display partial_prices_metrics_df itself for manual inspection.
partial_prices_metrics_df

## Preprocessing

In [None]:
# prices preprocessing
prices_metrics_config = metrics_config['time_series']['prices']
prices_description = 'prices_timeseries'

# flatten, save, and preprocess the flattened df
flattened_output_directory = os.path.join(modeling_config['modeling']['modeling_folder'],'outputs/flattened_outputs')

flattened_prices_metrics_df = fe.flatten_coin_date_df(
    prices_metrics_df,
    prices_metrics_config,
    config['training_data']['training_period_end']
)
flattened_prices_metrics_df, flattened_prices_metrics_filepath = fe.save_flattened_outputs(
    flattened_prices_metrics_df,
    flattened_output_directory,
    prices_description,
    config['training_data']['modeling_period_start']
)
prices_preprocessed_df, prices_preprocessed_filepath = fe.preprocess_coin_df(flattened_prices_metrics_filepath, modeling_config, prices_metrics_config)






### The Rest

In [None]:
# Re-import local modules and re-read configs so on-disk edits are picked up.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))

# Modeling settings that are reused several times below.
modeling_folder = modeling_config['modeling']['modeling_folder']
target_column = modeling_config['modeling']['target_column']

# Create the training data df from the preprocessed files (filenames are taken relative to
# the 'preprocessed_outputs/' path segment; both use the 'drop_records' strategy).
# NOTE(review): `buysell_preprocessed_filepath` is not assigned anywhere in the visible
# notebook - this appears to depend on leftover kernel state; confirm.
input_filenames = [
    (preprocessed_filepath.split('preprocessed_outputs/')[1], 'drop_records')
    for preprocessed_filepath in (buysell_preprocessed_filepath, prices_preprocessed_filepath)
]
training_data_df, merge_logs_df = fe.create_training_data_df(modeling_folder, input_filenames)

# Create the target variable df.
target_variable_df, _ = fe.create_target_variables_mooncrater(
    prices_df, config['training_data'], modeling_config)

# Merge the two into the final model input df.
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, target_column)

# Split the df into train and test sets.
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    target_column,
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state'],
)

# Train the model using the current configuration.
model, model_id = m.train_model(
    X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# Evaluate the model's performance on the test set.
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# Log the experiment results for this configuration.
m.log_trial_results(modeling_folder, model_id)

# Display the evaluation metrics as the cell output.
metrics

In [None]:
def mock_input_files_colnames(tmpdir):
    """
    Unit test data for scenario with many duplicate columns and similar filenames.

    Writes five small CSV files into `tmpdir`, each with a `coin_id` column ([1, 2]) and a
    `buyers_new` column holding file-specific values. The directory is created first when it
    does not exist yet, so the helper also works with a plain relative path on a fresh checkout.

    Params:
    - tmpdir (path-like): directory the mock CSV files are written to

    Returns:
    - tmpdir: the directory that was passed in, unchanged
    - input_files (list of tuples): one (filename, 'fill_zeros') tuple per CSV written
    """
    # Mock filenames; several differ only in timestamp, description, or version suffix
    filenames = [
        'buysell_metrics_2024-09-13_14-44_model_period_2024-05-01_v0.1.csv',
        'buysell_metrics_2024-09-13_14-45_model_period_2024-05-01_v0.1.csv',
        'buysell_metrics_megasharks_2024-09-13_14-45_model_period_2024-05-01_v0.1.csv',
        'buysell_metrics_megasharks_2024-09-13_14-45_model_period_2024-05-01_v0.2.csv',
        'price_metrics_2024-09-13_14-45_model_period_2024-05-01_v0.1.csv'
    ]

    # `buyers_new` values for each file, in the same order as `filenames`
    buyers_new_by_file = [
        [100, 200],
        [150, 250],
        [110, 210],
        [120, 220],
        [130, 230],
    ]

    # to_csv does not create missing parent folders, so make sure the target exists
    os.makedirs(tmpdir, exist_ok=True)

    # Save one CSV per filename
    for filename, buyers_new in zip(filenames, buyers_new_by_file):
        mock_df = pd.DataFrame({'coin_id': [1, 2], 'buyers_new': buyers_new})
        mock_df.to_csv(os.path.join(tmpdir, filename), index=False)

    # Pair every filename with the 'fill_zeros' strategy
    input_files = [(filename, 'fill_zeros') for filename in filenames]

    return tmpdir, input_files

# Write the mock CSVs under the relative 'temp/' path and keep the returned directory and file list.
tmpdir, input_files = mock_input_files_colnames('temp/')


In [None]:
# Re-import local modules and re-read configs so on-disk edits are picked up.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))


# Notebook version of test_create_training_data_df(mock_input_files_colnames):
# tests the column renaming logic for clarity when merging multiple files with similar
# filenames. `tmpdir` and `input_files` come from the mock_input_files_colnames cell.
print("Input files:", input_files)

# Merge the mock files
merged_df, _ = fe.create_training_data_df(tmpdir, input_files)

# Column names the merge is expected to produce, in order
expected_columns = [
    'coin_id',
    'buyers_new_buysell_metrics_2024-09-13_14-44',
    'buyers_new_buysell_metrics_2024-09-13_14-45',
    'buyers_new_buysell_metrics_megasharks_2024-09-13_14-45',
    'buyers_new_buysell_metrics_megasharks_2024-09-13_14-45_2',
    'buyers_new_price_metrics'
]

actual_columns = list(merged_df.columns)
assert actual_columns == expected_columns, \
    f"Expected columns: {expected_columns}, but got: {actual_columns}"

In [None]:
# Print the (filename, fill strategy) tuples currently held in input_files.
print("Input files:", input_files)


## Codespace

In [None]:
# Display input_files as the cell output.
input_files

In [None]:
import pytest
# Re-import local modules so on-disk edits are picked up.
for _module in (td, cwm, fe, m, i):
    importlib.reload(_module)

# Re-read the YAML configs.
config, metrics_config, modeling_config, experiments_config = map(load_config, (
    '../config/config.yaml',
    '../config/metrics_config.yaml',
    '../config/modeling_config.yaml',
    '../config/experiments_config.yaml',
))



def sample_time_series_df():
    """
    Fixture that provides a sample DataFrame for the time series with multiple coin_ids.

    Returns:
    - pd.DataFrame with columns coin_id, date (as strings) and price: three consecutive
        days for coin_id 1 followed by the same three days for coin_id 2
    """
    dates = ['2023-01-01', '2023-01-02', '2023-01-03']

    return pd.DataFrame({
        'coin_id': [1] * 3 + [2] * 3,
        'date': dates + dates,
        'price': [100, 110, 120] + [200, 210, 220],
    })


def sample_metrics_config():
    """
    Fixture that provides a sample metrics configuration for time series analysis.

    Returns:
    - dict with 'sma' and 'ema' entries under time_series -> prices, each with a
        'parameters' dict of period=2
    """
    # Each indicator gets its own parameters dict so the entries share no state
    prices_metrics = {
        indicator: {'parameters': {'period': 2}}
        for indicator in ('sma', 'ema')
    }

    return {'time_series': {'prices': prices_metrics}}

# Build the fixture data under their own names so the fixture functions are not
# overwritten by their return values.
sample_time_series_input_df = sample_time_series_df()
sample_metrics_config_dict = sample_metrics_config()

# Notebook version of
# test_generate_time_series_metrics_basic_functionality(sample_time_series_df, sample_metrics_config):
# tests the basic functionality of generate_time_series_metrics to ensure that SMA and EMA
# are calculated correctly for a simple DataFrame with multiple coin_ids.

# Convert the date to datetime in the sample data
sample_time_series_input_df['date'] = pd.to_datetime(sample_time_series_input_df['date'])

# Run the generate_time_series_metrics function
# NOTE(review): the other calls in this notebook pass `config` as the second positional
# argument; this call omits it - confirm against the current function signature.
result_df, _ = cwm.generate_time_series_metrics(
    time_series_df=sample_time_series_input_df,
    metrics_config=sample_metrics_config_dict,
    dataset_key='prices',
    colname='price'
)

# Assert that the expected columns exist in the result
expected_columns = ['coin_id', 'date', 'price', 'sma', 'ema']
assert all(col in result_df.columns for col in expected_columns), "Missing expected columns in the result."

# Expected SMA and EMA values per coin_id with period=2
expected_sma_1 = [100.0, 105.0, 115.0]
expected_ema_1 = [100.0, 106.666667, 115.555556]
expected_sma_2 = [200.0, 205.0, 215.0]
expected_ema_2 = [200.0, 206.666667, 215.555556]

for coin_id, expected_sma, expected_ema in (
    (1, expected_sma_1, expected_ema_1),
    (2, expected_sma_2, expected_ema_2),
):
    coin_result_df = result_df[result_df['coin_id'] == coin_id]

    # Confirm that the SMA result matches the expected, with special logic to handle NaNs.
    # The loop index is deliberately not called `i`: at top level that would overwrite the
    # `i` alias of the insights module and break later importlib.reload(i) calls.
    for row_idx, (expected, actual) in enumerate(zip(expected_sma, coin_result_df['sma'].tolist())):
        if np.isnan(expected) and np.isnan(actual):
            continue  # Both values are NaN, so this is considered equal
        assert expected == actual, f"Mismatch at index {row_idx}: expected {expected}, got {actual}"

    # Confirm that the EMA result matches the expected within an absolute tolerance
    assert coin_result_df['ema'].tolist() == pytest.approx(
        expected_ema,
        abs=1e-2
    ), f"EMA calculation incorrect for coin_id={coin_id}"

# Confirm that the output df has the same number of rows as the input df
assert len(result_df) == len(sample_time_series_input_df), "Output row count does not match input row count"



In [None]:
# Display the metrics dataframe returned by generate_time_series_metrics in the test cell.
result_df

In [None]:
# Define mock DataFrames
# Mock inputs: df2 has no row for coin_id 1
df1 = pd.DataFrame({
    'coin_id': [1, 2, 3],
    'metric_1': [10, 20, 30],
})
df2 = pd.DataFrame({
    'coin_id': [2, 3],
    'metric_2': [200, 300],
})

# (dataframe, fill strategy, label) tuples; both inputs use 'fill_zeros'
df_list = [
    (df1, 'fill_zeros', 'df1'),
    (df2, 'fill_zeros', 'df2'),
]

# Merge the inputs
merged_df, _ = fe.merge_and_fill_training_data(df_list)

# Expected output: coin_id 1 should have metric_2 filled with 0
expected_df = pd.DataFrame({
    'coin_id': [1, 2, 3],
    'metric_1': [10, 20, 30],
    'metric_2': [0, 200, 300],
})

# Strict comparison (values, dtypes, and ordering)
pd.testing.assert_frame_equal(merged_df, expected_df)

In [None]:
# Column dtypes of the expected dataframe.
expected_df.dtypes

In [None]:
# Column dtypes of the merged dataframe.
merged_df.dtypes

In [None]:
# Define mock DataFrames
# Mock inputs: df2 has no row for coin_id 1
df1 = pd.DataFrame({
    'coin_id': [1, 2, 3],
    'metric_1': [10, 20, 30],
})
df2 = pd.DataFrame({
    'coin_id': [2, 3],
    'metric_2': [200, 300],
})

# (dataframe, fill strategy, label) tuples; df1 uses 'fill_zeros', df2 uses 'drop_records'
df_list = [
    (df1, 'fill_zeros', 'df1'),
    (df2, 'drop_records', 'df2'),
]

# Merge the inputs and keep the merge logs for inspection
merged_df, merge_logs = fe.merge_and_fill_training_data(df_list)

# Expected output: coin_id 1 should be dropped
expected_df = pd.DataFrame({
    'coin_id': [2, 3],
    'metric_1': [20, 30],
    'metric_2': [200, 300],
})

In [None]:
# Display the logs returned alongside the merged dataframe.
merge_logs

In [None]:

# Mock DataFrames
df1 = pd.DataFrame({'coin_id': [1, 2, 3], 'metric_1': [10, 20, 30]})
df2 = pd.DataFrame({'coin_id': [2, 3], 'metric_2': [200, 300]})

# Expected output when drop_records is applied: rows for coin 1 should be dropped
expected_df = pd.DataFrame({'coin_id': [2, 3], 'metric_1': [20, 30], 'metric_2': [200, 300]})

# Both inputs use the 'drop_records' strategy
drop_records_inputs = [
    (df1, 'drop_records', 'df1'),
    (df2, 'drop_records', 'df2'),
]
merged_df, logs_df = fe.merge_and_fill_training_data(drop_records_inputs)

# Compare raw values only, so dtype and index differences are not part of this check
values_match = np.array_equal(merged_df.values, expected_df.values)
assert values_match, "Merged DataFrame values do not match the expected DataFrame."


# Expected logs: df1 should have no filled rows, and df2 should also have no filled rows
# (since we used drop_records)
expected_logs = pd.DataFrame({
    'file': ['df1', 'df2'],
    'original_count': [3, 2],
    'filled_count': [0, 0],
})

# Reset both indexes so only the log contents are compared
pd.testing.assert_frame_equal(
    logs_df.reset_index(drop=True),
    expected_logs.reset_index(drop=True),
)


# --------------------------

In [None]:
# True when the merged and expected dataframes hold the same underlying values.
np.array_equal(merged_df.values,expected_df.values)


In [None]:
# Compare the frames while ignoring dtype differences (check_dtype=False) and the order of
# index and columns (check_like=True).
pd.testing.assert_frame_equal(merged_df, expected_df, check_dtype=False, check_like=True)
