In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# Configure the notebook logger. DEBUG is only the starting verbosity; later cells
# switch between INFO and DEBUG as needed.
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Custom format function for displaying numbers: up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs as module-level constants.
# The former `global CONFIG, ...` statement was removed: at module scope every
# assignment is already global, so the statement had no effect and only tripped linters.
CONFIG = u.load_config('../config/config.yaml')
METRICS_CONFIG = u.load_config('../config/metrics_config.yaml')
MODELING_CONFIG = u.load_config('../config/modeling_config.yaml')
EXPERIMENTS_CONFIG = u.load_config('../config/experiments_config.yaml')

# Output folder handed to the model training/evaluation calls; kept under both the
# constant-style and the lowercase name because later cells use `modeling_folder`.
MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']
modeling_folder = MODELING_FOLDER

In [None]:
# Re-import the local modules so code edits are picked up without restarting the kernel
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
# Re-read the YAML configs into the lowercase names that the cells below use
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


## Overall Sequencing

In [None]:
# Refresh the local modules and YAML configs so this cell runs against the latest versions
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
# Drop verbosity to INFO for this cell
logger.setLevel(logging.INFO)


# Date window: start of the training period through the end of the modeling period
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Retrieve market data
# The second value returned by split_dataframe_by_coverage is discarded;
# presumably it holds the coins lacking coverage for the window -- TODO confirm.
market_data_df = td.retrieve_market_data()
market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
# Standalone price table. It is copied before the optional transfers filter below,
# so it is not narrowed by that filter.
prices_df = market_data_df[['coin_id','date','price']].copy()

# retrieve profits data, apply the same coverage split, then clean it using the
# `data_cleaning` section of the config (second return values are discarded)
profits_df = td.retrieve_profits_data(start_date, end_date)
profits_df, _ = cwm.split_dataframe_by_coverage(profits_df, start_date, end_date, id_column='coin_id')
profits_df, _ = td.clean_profits_df(profits_df, config['data_cleaning'])

# remove records from market_data_df that don't have transfers if configured to do so
# (a coin counts as having transfers when its coin_id appears in profits_df)
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]


In [None]:
# Dates for which profits rows will be imputed, read from the training_data config
dates_to_impute = [
    config['training_data'][period_key]
    for period_key in ('training_period_end', 'modeling_period_start', 'modeling_period_end')
]

# Impute rows for all of those dates with 24 threads.
# NOTE: profits_df is reassigned from itself, so re-running this cell feeds the
# already-imputed frame back in.
profits_df = td.impute_profits_for_multiple_dates(
    profits_df,
    prices_df,
    dates_to_impute,
    n_threads=24,
)

In [None]:
# Display the loaded metrics configuration for inspection
metrics_config

In [None]:
# Refresh the local modules and YAML configs so this cell runs against the latest versions
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
# Drop verbosity to INFO for this cell
logger.setLevel(logging.INFO)


# 3.3 Build the configured model input data (train/test data)
# Uses the profits/market frames prepared by earlier cells plus the configs reloaded above.
X_train, X_test, y_train, y_test = i.build_configured_model_input(profits_df, market_data_df, config, metrics_config, modeling_config)

# 3.4 Train the model using the current configuration and log the results
# `modeling_folder` is set in the first cell from the modeling config; the model
# parameters come from modeling_config['modeling']['model_params'].
model, model_id = m.train_model(X_train, y_train, modeling_folder, modeling_config['modeling']['model_params'])

# 3.5 Evaluate and save the model's performance on the test set to a CSV
evals = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

In [None]:
# Re-run the evaluation on the already-trained model and display the result.
# NOTE(review): the preceding cell already assigns `evals` with this exact call; if
# evaluate_model writes a CSV on every call (as that cell's comment suggests), this
# writes it a second time -- confirm whether displaying `evals` alone would suffice.
evals = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)
evals

In [None]:
# Display the `time_series` section of the metrics configuration
metrics_config['time_series']

In [None]:
# Keep a deep copy of profits_df so a later cell can restore it after experiments
full_profits_df = profits_df.copy(deep=True)

In [None]:
# Raise verbosity to DEBUG while running the single-date imputation
logger.setLevel(logging.DEBUG)

# Impute rows for one date only: the end of the training period.
# `target_date` stays in the kernel namespace and is read again by the Junkyard cells.
target_date = config['training_data']['training_period_end']
new_rows_df = td.impute_profits_df_rows(profits_df, prices_df, target_date)

In [None]:
# Re-import the local modules so code edits are picked up without restarting the kernel
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
# Re-read the YAML configs and switch logging to DEBUG
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.DEBUG)



In [None]:
# Restore profits_df from the saved deep copy and show its (rows, columns) shape
profits_df = full_profits_df.copy(deep=True)
profits_df.shape

In [None]:
# Refresh the local modules and YAML configs, with DEBUG logging for this experiment
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.DEBUG)

# Dates for which profits rows will be imputed, read from the training_data config
dates_to_impute = [
    config['training_data']['training_period_end'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end'],
]



def impute_profits_for_multiple_dates(profits_df, prices_df, dates, n_threads):
    """
    Wrapper function to impute profits for multiple dates using multithreaded processing.

    Each date is handled by td.multithreaded_impute_profits_rows, always against the
    original profits_df (rows generated for one date are not fed into the next date).
    The generated rows are then appended to profits_df in a single concatenation.

    Args:
        profits_df (pd.DataFrame): DataFrame containing dated profits data for coin-wallet pairs
        prices_df (pd.DataFrame): DataFrame containing price information
        dates (list): List of dates (str or datetime) for which to impute rows. May be
            empty, in which case no rows are added.
        n_threads (int): The number of threads to use for imputation

    Returns:
        pd.DataFrame: New DataFrame holding the rows of profits_df followed by the imputed
            rows for all specified dates, with a fresh 0..n-1 index. The input profits_df
            is not modified by this wrapper.
    """
    logger = logging.getLogger(__name__)

    start_time = time.time()
    logger.info("Starting profits_df imputation for %s dates...", len(dates))

    # One frame of new rows per requested date
    new_rows_list = [
        td.multithreaded_impute_profits_rows(profits_df, prices_df, date, n_threads)
        for date in dates
    ]

    # Append everything in one concat. Because profits_df is always part of the list,
    # an empty `dates` no longer raises "No objects to concatenate", and the
    # intermediate all-new-rows frame is no longer materialised.
    updated_profits_df = pd.concat([profits_df, *new_rows_list], ignore_index=True)

    logger.info("Completed new row generation after %.2f seconds. Total rows after imputation: %s",
                time.time() - start_time,
                updated_profits_df.shape[0])

    return updated_profits_df

# Run the notebook-local wrapper defined above with 24 threads. The result is stored
# under a separate name (update_profits_df), so profits_df itself is not reassigned.
update_profits_df = impute_profits_for_multiple_dates(profits_df, prices_df, dates_to_impute, n_threads=24)

In [None]:
# Number of logical CPUs reported by the OS, kept as a reference when choosing a
# thread count. Uses `os` (imported in the notebook's first cell) instead of a
# mid-notebook `import multiprocessing`. os.cpu_count() returns None when the count
# cannot be determined, hence the fallback to 1.
optimal_thread_count = os.cpu_count() or 1
optimal_thread_count


## Junkyard

In [None]:
# Create efficient columns: replace coin_id values with integer category codes and
# keep a code -> original coin_id mapping so the ids can be recovered.
# NOTE: this overwrites profits_df columns in place, so restore profits_df (e.g. from
# full_profits_df) before re-running the cell.
# NOTE(review): the int16 cast assumes fewer than 32,768 distinct coin_ids -- confirm.
profits_df['coin_id'] = profits_df['coin_id'].astype('category')
coin_id_mapping = dict(enumerate(profits_df['coin_id'].cat.categories))
profits_df['coin_id'] = profits_df['coin_id'].cat.codes.astype('int16')

# Convert date column to store the difference in days relative to target_date.
# target_date is read straight from the config in an earlier cell, so it is coerced
# to a Timestamp here (a no-op if it already is one). The original line was also
# missing its closing quote and parenthesis, which made the cell a SyntaxError.
profits_df['date'] = (profits_df['date'] - pd.to_datetime(target_date)).dt.days.astype('int16')

In [None]:
# # vars
# target_date = '2024-08-31'
# # new_rows_df = generate_new_row(profits_df, prices_df, target_date)

# target_date = pd.to_datetime(target_date)

# # # Create efficient indexes
# # profits_df = profits_df.set_index(['coin_id', 'wallet_address', 'date']).copy(deep=True)
# # prices_df = prices_df.set_index(['coin_id', 'date']).copy(deep=True)

# # # Identify pairs needing new rows
# # logger.debug('Identifying pairs that need a row for %s...', target_date)
# # all_pairs = profits_df.index.droplevel('date').unique()
# # existing_pairs = profits_df.loc(axis=0)[:, :, target_date].index.droplevel('date')
# # pairs_needing_rows = all_pairs.difference(existing_pairs)
# # logger.debug('Identified %s pairs that will need rows imputed.', len(pairs_needing_rows))


# new_rows = []

# logger.debug('Imputing new rows...')
# for coin_id, wallet_address in pairs_needing_rows:
#     # Get most recent record
#     recent_record = profits_df.loc[coin_id, wallet_address].loc[:target_date].iloc[-1]

#     # Get prices
#     price_previous = prices_df.loc[(coin_id, recent_record.name), 'price']
#     price_current = prices_df.loc[(coin_id, target_date), 'price']

#     # Calculate new values
#     price_ratio = price_current / price_previous
#     new_usd_balance = recent_record['usd_balance'] * price_ratio
#     profits_change = new_usd_balance - recent_record['usd_balance']
#     profits_cumulative = recent_record['profits_cumulative'] + profits_change

#     new_row = {
#         'coin_id': coin_id,
#         'wallet_address': wallet_address,
#         'date': target_date,
#         'profits_change': profits_change,
#         'profits_cumulative': profits_cumulative,
#         'usd_balance': new_usd_balance,
#         'usd_net_transfers': 0,
#         'usd_inflows': 0,
#         'usd_inflows_cumulative': recent_record['usd_inflows_cumulative'],
#         'total_return': profits_cumulative / max(recent_record['usd_inflows_cumulative'], 0.01)
#     }

#     new_rows.append(new_row)

# new_rows_df = pd.DataFrame(new_rows)

# logger.debug('Generated new_rows_df with shape %s.', new_rows_df.shape)


In [None]:
# Experimental, vectorised attempt at computing the price ratio needed to impute rows.
# NOTE(review): `pairs_needing_rows` is only assigned inside the commented-out cell above,
# and the index-based access below (groupby(level=...), prices_df.loc(axis=0)[:, target_date])
# expects the set_index(...) calls that are also commented out there. On a fresh kernel
# this cell cannot run as written.

# Get the most recent data for pairs needing rows
most_recent_data = profits_df.loc[pairs_needing_rows]
most_recent_data = most_recent_data.groupby(level=['coin_id', 'wallet_address']).last().reset_index()

# Ensure the date column is properly formatted
# NOTE(review): if 'date' is an index level at this point, the groupby above would not
# carry it through as a column -- confirm that a 'date' column exists here.
most_recent_data['date'] = pd.to_datetime(most_recent_data['date'])

# Reset index of prices_df for the merge operation
prices_df_reset = prices_df.reset_index()

# Perform asof merge to get the most recent price before or on the date of each record
# (both inputs are sorted by 'date' and matched within each coin_id)
merged_data = pd.merge_asof(most_recent_data.sort_values('date'),
                            prices_df_reset.sort_values('date'),
                            on='date',
                            by='coin_id',
                            direction='backward')

# Now get the price at the target date
target_prices = prices_df.loc(axis=0)[:, target_date].reset_index()
target_prices = target_prices.rename(columns={'price': 'target_price'})

# Merge the target prices
merged_data = pd.merge(merged_data, target_prices[['coin_id', 'target_price']], on='coin_id', how='left')

# Calculate price ratio (target-date price divided by the price matched by the asof merge)
merged_data['price_ratio'] = merged_data['target_price'] / merged_data['price']

logger.debug('Merged data shape: %s', merged_data.shape)
logger.debug('Merged data columns: %s', merged_data.columns.tolist())

In [None]:
# Second experimental variant; only the first statement is active, the rest is parked.
# NOTE(review): `pairs_needing_rows` is only assigned in commented-out code in this notebook,
# and pandas documents the `level` argument of MultiIndex.isin as a single level
# (int or str) -- passing a list of two levels may raise; confirm before reviving this cell.

# Get the most recent row for each pair needing a new row
most_recent_data = profits_df.loc[profits_df.index.isin(pairs_needing_rows, level=['coin_id', 'wallet_address'])]
# most_recent_data = most_recent_data.groupby(level=['coin_id', 'wallet_address']).last().reset_index()

# # Ensure the date column is properly formatted
# most_recent_data['date'] = pd.to_datetime(most_recent_data['date'])
# prices_df['date'] = pd.to_datetime(prices_df['date'])

# # Perform asof merge to get the most recent price before or on the date of each record
# merged_data = pd.merge_asof(most_recent_data.sort_values('date'),
#                             prices_df[['date', 'coin_id', 'price']].sort_values('date'),
#                             on='date',
#                             by='coin_id',
#                             direction='backward')

# # Now get the price at the target date
# target_prices = prices_df[prices_df['date'] == target_date][['coin_id', 'price']]
# target_prices = target_prices.rename(columns={'price': 'target_price'})

# # Merge the target prices
# merged_data = pd.merge(merged_data, target_prices, on='coin_id', how='left')

# # Calculate price ratio
# merged_data['price_ratio'] = merged_data['target_price'] / merged_data['price']


## Tests Failing