In [None]:
import sys
import os
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema
import progressbar


# load dotenv
# Loads variables from a .env file, if one is found, into os.environ.
load_dotenv()

# import local files if necessary
# Makes the project's src/ directory importable, relative to the notebook's
# working directory. NOTE: every re-run of this cell appends the same entry
# to sys.path again.
sys.path.append('..//src')
from utils import load_config, cw_filter_df, create_progress_bar
# Each local module is reloaded right after it is imported so that on-disk
# edits are picked up when this cell is re-run without restarting the kernel.
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
# NOTE: the names bound above by `from utils import ...` were bound before
# this reload and are not rebound by it.
importlib.reload(u)

# load configs
# Each config file is read with load_config (from utils) and kept in its own
# notebook-level variable; later cells index into these objects by key.
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')
experiments_config = load_config('../config/experiments_config.yaml')

# configure logger
# The logger object comes from dreams_core's setup_logger(); its threshold is
# then set to INFO.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
# '.12g' renders floats with up to 12 significant digits, drops trailing
# zeros, and falls back to exponent notation for very large or small values.
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [None]:
# Pick up on-disk edits to the local modules without restarting the kernel.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)

# Re-read all four config files so that edits to them take effect.
config, metrics_config, modeling_config, experiments_config = (
    load_config('../config/config.yaml'),
    load_config('../config/metrics_config.yaml'),
    load_config('../config/modeling_config.yaml'),
    load_config('../config/experiments_config.yaml'),
)


## Training Data (profits_df) Generation

In [None]:
# Show the max_gap_days setting from the data_cleaning section of config;
# the price-cleaning cell passes this value to td.fill_prices_gaps.
config['data_cleaning']['max_gap_days']

In [None]:
importlib.reload(td)


# Retrieve the prices data and fill its gaps using the configured max_gap_days.
# fill_prices_gaps returns two frames: the filled prices and a per-coin log
# whose 'outcome' column records how each coin was handled.
prices_df, prices_log = td.fill_prices_gaps(
    td.retrieve_prices_data(),
    config['data_cleaning']['max_gap_days'],
)

In [None]:
# (rows, columns) of prices_df as returned by td.fill_prices_gaps.
prices_df.shape

In [None]:
# Number of prices_log rows whose outcome is 'gaps above threshold'.
len(prices_log.loc[prices_log['outcome'] == 'gaps above threshold', 'coin_id'])

In [None]:
# Isolate one coin's price rows, ordered by date, for manual inspection.
coin_id = '1e8030b4-452a-43b1-a834-2bf90ae640b5'
coin_df = (
    prices_df
    .loc[prices_df['coin_id'] == coin_id]
    .sort_values(by='date')
    .copy()
)

In [None]:
# Step 1: Reindex to create rows for all missing dates
# Start from a fresh, independent copy of the selected coin's rows.
coin_df = prices_df.loc[prices_df['coin_id'] == coin_id].copy()

# Create the full date range
# One entry per calendar day between the coin's earliest and latest dates.
full_date_range = pd.date_range(
    coin_df['date'].min(),
    coin_df['date'].max(),
    freq='D',
)

# # Reindex to get all dates
# coin_df = coin_df.set_index('date').reindex(full_date_range).rename_axis('date').reset_index()
# coin_df['coin_id'] = coin_id  # Fills coin_id in the newly created rows

# # Step 2: Count the number of sequential missing dates
# missing_values = coin_df['price'].isnull().astype(int)
# consecutive_groups = coin_df['price'].notnull().cumsum()
# coin_df['missing_gap'] = missing_values.groupby(consecutive_groups).cumsum()

# # Check if there are no gaps at all
# if coin_df['missing_gap'].max() == 0:
#     outcomes.append({'coin_id': coin_id, 'outcome': 'no gaps'})
#     filled_results.append(coin_df)
#     continue

# # Check if any gaps exceed max_gap_days
# if coin_df['missing_gap'].max() > max_gap_days:
#     outcomes.append({'coin_id': coin_id, 'outcome': 'gaps above threshold'})
#     continue

# # Step 3: Forward-fill any gaps that are smaller than max_gap_days
# coin_df['price'] = coin_df['price'].ffill(limit=max_gap_days)

# # Remove rows with larger gaps that shouldn't be filled (already handled by check above)
# coin_df = coin_df[coin_df['missing_gap'] <= max_gap_days]


# # Append to the result list
# filled_results.append(coin_df)
# outcomes.append({'coin_id': coin_id, 'outcome': 'gaps below threshold'})

In [None]:

# Mixed-gaps check for td.fill_prices_gaps: one coin per outcome category.

# Gap threshold used by this check. It is assigned here so the cell runs on a
# fresh kernel without depending on any other cell having been executed.
max_gap_days = 2

# Input data
prices_df_mixed_gaps = pd.DataFrame({
    'coin_id': ['coin1', 'coin1', 'coin1', 'coin2', 'coin2', 'coin2', 'coin3', 'coin3'],
    'date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03',
                            '2024-01-01', '2024-01-04', '2024-01-05',
                            '2024-01-01', '2024-01-10']),
    'price': [100, 101, 102, 200, 204, 205, 300, 310]
})

# Expected data:
# - coin1 has no gaps.
# - coin2 has gaps below the threshold (filled for 2024-01-02, 2024-01-03).
# - coin3 has a gap too large (should be excluded).
expected_data = {
    'date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03',  # coin1
                            '2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']),  # coin2
    'coin_id': ['coin1', 'coin1', 'coin1',
                'coin2', 'coin2', 'coin2', 'coin2', 'coin2'],
    # The leading 100.0 makes the whole expected price column float.
    'price': [100.0, 101, 102,
                200, 200, 200, 204, 205]
}
expected_df = pd.DataFrame(expected_data)[['date', 'coin_id', 'price']]

# Run the function
prices_filled_df, outcomes_df = td.fill_prices_gaps(prices_df_mixed_gaps, max_gap_days)

# Reorder columns for comparison
prices_filled_df = prices_filled_df[['date', 'coin_id', 'price']]

# Assertions
pd.testing.assert_frame_equal(prices_filled_df, expected_df)
# All three outcome categories must appear in the outcomes frame.
assert 'no gaps' in outcomes_df['outcome'].values
assert 'gaps below threshold' in outcomes_df['outcome'].values
assert 'gaps above threshold' in outcomes_df['outcome'].values


In [None]:
# Display the expected frame for visual comparison with prices_filled_df.
expected_df

In [None]:
# Display the frame returned by td.fill_prices_gaps (after column reordering).
prices_filled_df

In [None]:
# NOTE(review): prices_df_gaps_above_max is only assigned inside commented-out
# test code in this notebook; on a fresh kernel this cell raises NameError
# unless the name is defined somewhere not visible here - confirm or remove.
prices_df_gaps_above_max

In [None]:
# Display the current expected_df (same inspection as the earlier cell).
expected_df

In [None]:
# Display the current prices_filled_df (same inspection as the earlier cell).
prices_filled_df

In [None]:
# Display the current expected_df once more for side-by-side comparison.
expected_df

In [None]:


# Test Case 2: Gaps below max_gap_days
# Both coins are missing only 2024-01-02, a one-day gap that is below the
# max_gap_days threshold of 2, so every gap should be filled.

data = {
    'coin_id': ['coin1', 'coin1', 'coin1', 'coin2', 'coin2'],
    'date': pd.to_datetime(['2024-01-01', '2024-01-03', '2024-01-04',
                            '2024-01-01', '2024-01-03']),
    'price': [100, 102, 103, 200, 202]
}
# Expected result: every input row is kept (including coin1's 2024-01-04) and
# the missing 2024-01-02 row of each coin carries the previous day's price.
expected_data = {
    'coin_id': ['coin1', 'coin1', 'coin1', 'coin1',
                'coin2', 'coin2', 'coin2'],
    'date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
                            '2024-01-01', '2024-01-02', '2024-01-03']),
    # Float prices, mirroring the expected data of the mixed-gaps check.
    # NOTE(review): assumes fill_prices_gaps returns a float price column - confirm.
    'price': [100.0, 100, 102, 103,
              200, 200, 202]
}
prices_df_gaps_below_max = pd.DataFrame(data)
max_gap_days = 2
prices_filled_df, outcomes_df = td.fill_prices_gaps(prices_df_gaps_below_max, max_gap_days)

# Select the same column order on both frames; assert_frame_equal treats a
# different column order as a mismatch by default.
expected_df = pd.DataFrame(expected_data)[['coin_id', 'date', 'price']]
prices_filled_df = prices_filled_df[['coin_id', 'date', 'price']]
pd.testing.assert_frame_equal(prices_filled_df, expected_df)

# Check the outcomes DataFrame
assert all(outcomes_df['outcome'] == 'gaps below threshold')

# # Test Case 2 (Version 2): Gaps exactly max_gap_days
# def test_gaps_at_max():
#     data = {
#         'coin_id': ['coin1', 'coin1', 'coin1', 'coin2', 'coin2'],
#         'date': pd.to_datetime(['2024-01-01', '2024-01-03', '2024-01-05',
#                                 '2024-01-01', '2024-01-03']),
#         'price': [100, 102, 103, 200, 202]
#     }
#     prices_df_gaps_at_max = pd.DataFrame(data)
#     max_gap_days = 2
#     prices_filled_df, outcomes_df = fill_prices_gaps(prices_df_gaps_at_max, max_gap_days)

#     print("Test 2 (Version 2): Gaps at Max")
#     print("Filled Prices DataFrame:\n", prices_filled_df)
#     print("Outcomes DataFrame:\n", outcomes_df)

# # Test Case 3: Gaps above max_gap_days
# def test_gaps_above_max():
#     data = {
#         'coin_id': ['coin1', 'coin1', 'coin1', 'coin2', 'coin2'],
#         'date': pd.to_datetime(['2024-01-01', '2024-01-05', '2024-01-06',
#                                 '2024-01-01', '2024-01-07']),
#         'price': [100, 105, 106, 200, 207]
#     }
#     prices_df_gaps_above_max = pd.DataFrame(data)
#     max_gap_days = 2
#     prices_filled_df, outcomes_df = fill_prices_gaps(prices_df_gaps_above_max, max_gap_days)

#     print("Test 3: Gaps Above Max")
#     print("Filled Prices DataFrame:\n", prices_filled_df)
#     print("Outcomes DataFrame:\n", outcomes_df)

# # Test Case 4: Mixed gaps scenario
# def test_mixed_gaps():
#     data = {
#         'coin_id': ['coin1', 'coin1', 'coin1', 'coin2', 'coin2', 'coin2', 'coin3', 'coin3'],
#         'date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03',
#                                 '2024-01-01', '2024-01-04', '2024-01-05',
#                                 '2024-01-01', '2024-01-10']),
#         'price': [100, 101, 102, 200, 204, 205, 300, 310]
#     }
#     prices_df_mixed_gaps = pd.DataFrame(data)
#     max_gap_days = 2
#     prices_filled_df, outcomes_df = fill_prices_gaps(prices_df_mixed_gaps, max_gap_days)

#     print("Test 4: Mixed Gaps")
#     print("Filled Prices DataFrame:\n", prices_filled_df)
#     print("Outcomes DataFrame:\n", outcomes_df)

# NOTE(review): this is the cell's displayed value. It checks for 'no gaps',
# while the assertion earlier in this cell requires every outcome to be
# 'gaps below threshold', so for a non-empty outcomes_df it evaluates to False
# whenever that assertion passes. It looks like a leftover - confirm or remove.
all(outcomes_df['outcome'] == 'no gaps')

In [None]:
# Display the most recently computed prices_filled_df.
prices_filled_df

In [None]:
# NOTE(review): prices_df_no_gaps is not assigned in any cell visible in this
# notebook; on a fresh kernel this raises NameError unless it is defined
# somewhere not shown here - confirm or remove.
prices_df_no_gaps

In [None]:
# Checks that the filled frame equals the no-gaps input frame.
# NOTE(review): prices_df_no_gaps is not assigned in any visible cell, and
# prices_filled_df here is whatever the last executed test cell produced -
# confirm both inputs before relying on this assertion.
pd.testing.assert_frame_equal(prices_filled_df, prices_df_no_gaps)

## Metrics and Feature Engineering

In [None]:
# Reload the local modules and re-read the configs so that on-disk edits are
# picked up before the pipeline below runs.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/metrics_config.yaml')
modeling_config = load_config('../config/modeling_config.yaml')

# Resolve the modeling folder once from the config; it is used for the
# flattened-output directory and passed to the train/evaluate/log calls below.
modeling_folder = modeling_config['modeling']['modeling_folder']

# identify cohort
# NOTE(review): profits_df is not created by any cell visible in this notebook -
# confirm it is loaded before this cell is run.
cohort_summary_df = td.classify_wallet_cohort(profits_df, config['wallet_cohorts']['sharks'])

# generate and flatten buysell_metrics
# Only wallets flagged in_cohort are passed on to the metrics generation.
cohort_wallets = cohort_summary_df[cohort_summary_df['in_cohort']==True]['wallet_address']
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets)

# flatten, save, and preprocess the flattened df
flattened_output_directory = os.path.join(modeling_folder,'outputs/flattened_outputs')
# NOTE(review): the cohort name is taken from the first key of
# config['wallet_cohorts'], while the cohort classified above is hard-coded to
# 'sharks'; the two only agree when 'sharks' is the first key - confirm.
cohort_name = list(config['wallet_cohorts'].keys())[0]
metric_description = f"{cohort_name}_cohort"

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])
flattened_df, flattened_filepath = fe.save_flattened_outputs(flattened_buysell_metrics_df, flattened_output_directory, metric_description, config['training_data']['modeling_period_start'])
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(flattened_filepath, modeling_config, metrics_config)

# create the training data df
# Split the preprocessed file path into its directory (up to and including
# 'preprocessed_outputs/') and the file name that follows it.
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [
    preprocessed_filepath.split('preprocessed_outputs/')[1]
]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# create the target variable df
# The second return value is discarded.
target_variable_df,_ = fe.create_target_variables_mooncrater(prices_df, config['training_data'], modeling_config)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

# split the df into train and test sets
X_train, X_test, y_train, y_test = m.split_model_input(
    model_input_df,
    modeling_config['modeling']['target_column'],
    modeling_config['modeling']['train_test_split'],
    modeling_config['modeling']['random_state']
)

# train the model using the current configuration
model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# evaluate the model's performance on the test set
metrics = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# log the results for this trial
# NOTE(review): experiment_id is not assigned in any cell visible in this
# notebook - define it before running this line.
m.log_trial_results(modeling_folder, model_id, experiment_id)