In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# Load environment variables via python-dotenv
load_dotenv()


# Import the local project modules from ../src. Each import is followed by a
# reload so that edits made on disk are picked up without restarting the kernel.
# pyright: reportMissingImports=false
if '..//src' not in sys.path:
    # Guarded so that re-running this cell does not keep appending duplicates
    sys.path.append('..//src')
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Custom format function for displaying numbers (up to 12 significant digits)
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs as notebook-level variables. Names assigned at the top level
# of a cell are already module globals, so no `global` statement is required.
CONFIG = u.load_config('../config/config.yaml')
METRICS_CONFIG = u.load_config('../config/metrics_config.yaml')
MODELING_CONFIG = u.load_config('../config/modeling_config.yaml')
EXPERIMENTS_CONFIG = u.load_config('../config/experiments_config.yaml')
MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']
# Lowercase alias of MODELING_FOLDER
modeling_folder = MODELING_FOLDER

In [None]:
# Refresh every project module, in the same order as before, so that on-disk
# edits take effect in the running kernel
for project_module in (td, cwm, fe, m, i, u):
    importlib.reload(project_module)

# Re-read the YAML configs into the lowercase notebook variables
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


## Overall Sequencing

In [None]:
# Refresh the project modules and configs before pulling data
for project_module in (td, cwm, fe, m, i, u):
    importlib.reload(project_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Full date range to retrieve, as configured under training_data
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Market data: retrieve, keep the first frame returned by the coverage split,
# then take a standalone copy of the price columns
market_data_df = td.retrieve_market_data()
market_data_df, _ = cwm.split_dataframe_by_coverage(
    market_data_df, start_date, end_date, id_column='coin_id'
)
prices_df = market_data_df.loc[:, ['coin_id', 'date', 'price']].copy()

# Profits data: retrieve, apply the same coverage split, then clean it using
# the data_cleaning section of the config
profits_df = td.retrieve_profits_data(start_date, end_date)
profits_df, _ = cwm.split_dataframe_by_coverage(
    profits_df, start_date, end_date, id_column='coin_id'
)
profits_df, _ = td.clean_profits_df(profits_df, config['data_cleaning'])


# Optionally drop market data for coins that never appear in profits_df
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df.loc[market_data_df['coin_id'].isin(profits_df['coin_id'])]


In [None]:
# Keep pristine snapshots of the retrieved frames so later cells can restore
# them without re-querying (DataFrame.copy() is a deep copy by default)
prices_df_full = prices_df.copy()
market_data_df_full = market_data_df.copy()
profits_df_full = profits_df.copy()

# To restore the working frames from the snapshots:
# profits_df = profits_df_full.copy(deep=True)
# market_data_df = market_data_df_full.copy(deep=True)
# prices_df = prices_df_full.copy(deep=True)

In [None]:
# TODO: work out why the price joins remove rows -- shouldn't prices be filled
# all the way for every coin?

# Date on which every coin-wallet pair should end up with a row
target_date = '2024-08-31'

# Start from fresh copies of the cached snapshots (copy() is deep by default)
prices_df = prices_df_full.copy()
profits_df = profits_df_full.copy()
# market_data_df = market_data_df_full.copy(deep=True)
# profits_df = partitions[7].copy(deep=True)

# Verbose logging so that debug-level messages are shown
logger.setLevel(logging.DEBUG)


def impute_profits_df_rows(profits_df, prices_df, target_date):
    """
    Impute rows for all coin-wallet pairs in profits_df on the target date using only
    vectorized functions, i.e. there are no groupby statements or for loops/lambda
    functions that iterate over each row. This is necessary due to the size and memory
    requirements of the input df.

    This function performs the following steps:
    1. Splits profits_df into records through and after the target date
    2. Filters for pairs needing new rows
    3. Identifies the last date for each coin-wallet pair
    4. Appends price columns for the last date and target date
    5. Calculates new values for pairs needing rows
    6. Concatenates the new rows with the original dataframe

    Args:
        profits_df (pd.DataFrame): DataFrame containing profit information. Must have
            the columns coin_id, wallet_address, date, usd_balance, profits_cumulative
            and usd_inflows_cumulative.
        prices_df (pd.DataFrame): DataFrame containing price information with the
            columns coin_id, date and price
        target_date (str or datetime): The date for which to impute rows

    Returns:
        profits_df_filled (pd.DataFrame): Every row of the input profits_df plus one
            imputed row on target_date for each coin-wallet pair that has activity on
            or before target_date but no row on target_date itself. The result is
            indexed on (coin_id, wallet_address, date) and is not re-sorted. The
            price helper columns used during the calculation are not included.

    Raises:
        ValueError: If joining prices_df changes the number of rows in profits_df,
            i.e. prices_df does not cover every coin-date pair that is needed
        KeyError: Raised by pandas if prices_df has no rows at all on target_date
    """
    start_time = time.time()
    logger.info('%s Imputing rows for all coin-wallet pairs in profits_df on %s...',
                profits_df.shape,
                target_date)


    # Convert date to datetime
    target_date = pd.to_datetime(target_date)

    # Store shape for logging purposes
    start_shape = profits_df.shape

    # Create indices so we can use vectorized operations. The fully indexed input is
    # kept untouched so the imputed rows can be appended to ALL original rows in Step 6.
    profits_df_indexed = profits_df.set_index(['coin_id', 'wallet_address', 'date'])
    prices_df = prices_df.set_index(['coin_id', 'date'])

    # Step 1: Split profits_df records before and after the target_date
    # -----------------------------------------------------------------
    # Only rows through the target_date can inform the imputation. Rows after it are
    # only counted for logging, so they are not materialized as a separate frame.
    through_target_mask = profits_df_indexed.index.get_level_values('date') <= target_date
    profits_df = profits_df_indexed[through_target_mask]
    rows_after_target = len(profits_df_indexed) - len(profits_df)

    logger.debug("%s <Step 1> Split profits_df into %s rows through the target_date and %s after target_date: %.2f seconds",
                    profits_df.shape,
                    len(profits_df),
                    rows_after_target,
                    time.time() - start_time)
    step_time = time.time()


    # Step 2: Filter profits_df to only pairs that need new rows
    # ----------------------------------------------------------
    # Create a boolean mask for rows at the target_date
    target_date_mask = profits_df.index.get_level_values('date') == target_date

    # Create a boolean mask for pairs that don't have a row at the target_date
    pairs_mask = ~profits_df.index.droplevel('date').isin(
        profits_df[target_date_mask].index.droplevel('date')
    )
    # Sorting groups each pair's rows together in date order, which Step 3 relies on
    profits_df = profits_df[pairs_mask].sort_index()

    logger.debug("%s <Step 2> Identified %s coin-wallet pairs that need imputed rows: %.2f seconds",
                    profits_df.shape,
                    len(profits_df),
                    time.time() - step_time)
    step_time = time.time()


    # Step 3: Identify the last date for each coin-wallet pair
    # ----------------------------------------------
    # The logic here is that every row that doesn't have the same coin_id-wallet_address
    # combination as the next row must be the last date of its coin-wallet pair. The
    # final row has no successor, so it is always a last date.
    coin_ids = profits_df.index.get_level_values('coin_id').to_numpy()
    wallet_addresses = profits_df.index.get_level_values('wallet_address').to_numpy()

    is_last_date = np.ones(len(profits_df), dtype=bool)
    is_last_date[:-1] = (
        (coin_ids[:-1] != coin_ids[1:])
        | (wallet_addresses[:-1] != wallet_addresses[1:])
    )

    # Filter for last dates
    profits_df = profits_df[is_last_date]

    logger.debug("%s <Step 3> Filtered profits_df to only the last dates for each coin-wallet pair: %.2f seconds",
                    profits_df.shape,
                    time.time() - step_time)
    step_time = time.time()


    # Step 4: Append columns for previous_price (as of the last date) and price (as of the target_date)
    # -------------------------------------------------------------------------------------------------
    # Add price_previous by joining the price as of the last date for each coin-wallet pair
    prejoin_size = len(profits_df)
    profits_df = profits_df.join(prices_df['price'], on=['coin_id', 'date'], how='inner')
    profits_df = profits_df.rename(columns={'price': 'price_previous'})

    # Add price by joining the price as of the target_date
    prices_target_date = prices_df.xs(target_date, level='date')
    profits_df = profits_df.join(prices_target_date['price'], on='coin_id', how='inner')

    # Exception constructors do not apply %-style arguments, so build the message here
    rows_removed = prejoin_size - len(profits_df)
    if rows_removed != 0:
        raise ValueError(
            f"Inner join to prices_df on coin_id-date removed {rows_removed} rows from profits_df with "
            f"original length {prejoin_size}. There should be complete coverage for all rows in profits_df."
        )

    logger.debug("%s <Step 4> Joined prices_df and added price and previous_price helper columns: %.2f seconds",
                    profits_df.shape,
                    time.time() - step_time)
    step_time = time.time()


    # Step 5: Calculate new values for pairs needing rows
    # ---------------------------------------------------
    # With no transfers between the last date and the target_date, the balance simply
    # scales with the price and the change in balance is the change in profits.
    new_rows_df = pd.DataFrame(index=profits_df.index)
    new_rows_df['profits_change'] = (profits_df['price'] / profits_df['price_previous'] - 1) * profits_df['usd_balance']
    new_rows_df['profits_cumulative'] = new_rows_df['profits_change'] + profits_df['profits_cumulative']
    new_rows_df['usd_balance'] = (profits_df['price'] / profits_df['price_previous']) * profits_df['usd_balance']
    new_rows_df['usd_net_transfers'] = 0
    new_rows_df['usd_inflows'] = 0
    new_rows_df['usd_inflows_cumulative'] = profits_df['usd_inflows_cumulative']
    # NOTE: a zero usd_inflows_cumulative yields inf/NaN here, as in the original calculation
    new_rows_df['total_return'] = new_rows_df['profits_cumulative'] / new_rows_df['usd_inflows_cumulative']

    logger.debug("%s <Step 5> Calculated %s new rows: %.2f seconds",
                    profits_df.shape,
                    len(new_rows_df),
                    time.time() - step_time)
    step_time = time.time()


    # Step 6: Reset MultiIndex and concatenate dfs
    # --------------------------------------------
    # Swap each pair's last date for the target_date in the index
    new_rows_df = new_rows_df.reset_index(level='date', drop=True)
    new_rows_df['date'] = target_date
    new_rows_df = new_rows_df.set_index('date', append=True)

    # Append to the complete original frame (including rows after the target_date),
    # not to the filtered working frame that only holds each pair's last row
    profits_df_filled = pd.concat([profits_df_indexed, new_rows_df])

    logger.debug("%s <Step 6> Reset indices and added new rows to profits_df: %.2f seconds",
                    profits_df.shape,
                    time.time() - step_time)
    logger.info("%s Successfully merged profits_df %s with new_rows_df %s to get profits_df_filled %s after %.2f total seconds.",
                profits_df_filled.shape,
                start_shape,
                new_rows_df.shape,
                profits_df_filled.shape,
                time.time() - start_time)

    return profits_df_filled


# Run the imputation on the working frames for the chosen target_date
df = impute_profits_df_rows(
    profits_df=profits_df,
    prices_df=prices_df,
    target_date=target_date,
)



In [None]:
# Snapshot the current profits_df for comparison against the joined version
# (DataFrame.copy() is a deep copy by default)
profits_df_3 = profits_df.copy()
profits_df_3.shape

In [None]:


# Step 4 (debug): join only the price as of each row's own date so that the
# result can be compared with the pre-join snapshot
# -------------------------------------------------------------------------
profits_df_4 = (
    profits_df
    .join(prices_df['price'], on=['coin_id', 'date'], how='inner')
    .rename(columns={'price': 'price_previous'})
)

# # Add price by joining the price as of the target_date
# prices_target_date = prices_df.xs(target_date, level='date')
# profits_df = profits_df.join(prices_target_date['price'], on='coin_id', how='inner')

# logger.debug("%s <Step 4> Joined prices_df and added price and previous_price helper columns: %.2f seconds",
#                 profits_df.shape,
#                 time.time() - step_time)
# step_time = time.time()

profits_df_4.shape

In [None]:
import pandas as pd

# Find the rows of profits_df_3 that were dropped by the inner join producing
# profits_df_4. Both frames are assumed to share the same MultiIndex -- TODO confirm.
#
# An anti-join on the index is used instead of a wide suffix join because:
#   * it keeps exactly the profits_df_3 columns (the suffix join also carried over
#     the unsuffixed price_previous column from profits_df_4), and
#   * it does not misreport rows whose own usd_balance happens to be NaN.

# Boolean mask for rows in profits_df_3 whose index entry is absent from profits_df_4
missing_mask = ~profits_df_3.index.isin(profits_df_4.index)

# Isolate the missing records with their original profits_df_3 columns
missing_records = profits_df_3[missing_mask]

print("Records in profits_df_3 but not in profits_df_4:")
print(missing_records.head())
print(f"Total missing records: {len(missing_records)}")

# Optional: If you want to reset the index to match the original structure
# missing_records = missing_records.reset_index()

In [None]:
# def create_partitions(profits_df, n_partitions):
#     """
#     Partition a DataFrame into multiple subsets based on unique coin_ids.

#     Parameters:
#     - profits_df (pd.DataFrame): The input DataFrame to be partitioned. Must contain
#         a 'coin_id' column.
#     - n_partitions (int): The number of partitions to create.

#     Returns:
#     - partition_dfs (List[pd.DataFrame]): A list of DataFrames, each representing
#         a partition of the original data.
#     """
#     # Get unique coin_ids and convert to a regular list
#     unique_coin_ids = profits_df['coin_id'].unique().tolist()

#     # Shuffle the list of coin_ids
#     np.random.shuffle(unique_coin_ids)

#     # Calculate the number of coin_ids per partition
#     coins_per_partition = len(unique_coin_ids) // n_partitions

#     # Create partitions
#     partition_dfs = []
#     for i in range(n_partitions):
#         start_idx = i * coins_per_partition
#         end_idx = start_idx + coins_per_partition if i < n_partitions - 1 else None
#         partition_coin_ids = unique_coin_ids[start_idx:end_idx]

#         # Create a boolean mask for the current partition
#         mask = profits_df['coin_id'].isin(partition_coin_ids)

#         # Add the partition to the list
#         partition_dfs.append(profits_df[mask])

#     return partition_dfs


# n_partitions = 8
# partitions = create_partitions(profits_df, n_partitions)

In [None]:
# Distinct coin_id-date combinations among the missing records
missing_prices_df = (
    missing_records
    .reset_index()
    .loc[:, ['coin_id', 'date']]
    .drop_duplicates()
)
# Show which coins are affected
missing_prices_df['coin_id'].unique()

In [None]:
# Preview the first five rows of prices_df (five is head()'s default)
prices_df.head(5)

In [None]:
# NOTE(review): `pr` is not defined in any visible cell, so running this cell
# presumably raises NameError. It looks like an unfinished expression (possibly
# `prices_df` or `profits_df`) -- confirm the intent or delete the cell.
pr

In [None]:
# Coin whose price history is inspected below
coin_id = '02785a31-24b2-403c-82d7-d3cb8783e1e6'
# prices_target_date = prices_df.xs(target_date, level='date')

# Cross-section of prices_df for that coin, selected on the coin_id index level
prices_df.xs(key=coin_id, level='coin_id')

## Junkyard

In [None]:
# Create efficient columns
# Create efficient columns
# Replace coin_id with compact int16 category codes. coin_id_mapping maps each
# code back to its original coin_id so that the encoding can be reversed.
profits_df['coin_id'] = profits_df['coin_id'].astype('category')
coin_id_mapping = dict(enumerate(profits_df['coin_id'].cat.categories))
profits_df['coin_id'] = profits_df['coin_id'].cat.codes.astype('int16')

# Convert date column to store the difference in days relative to target_date.
# target_date is parsed first because a plain string cannot be subtracted from
# a datetime column.
profits_df['date'] = (profits_df['date'] - pd.to_datetime(target_date)).dt.days.astype('int16')

In [None]:
# # vars
# target_date = '2024-08-31'
# # new_rows_df = generate_new_row(profits_df, prices_df, target_date)

# target_date = pd.to_datetime(target_date)

# # # Create efficient indexes
# # profits_df = profits_df.set_index(['coin_id', 'wallet_address', 'date']).copy(deep=True)
# # prices_df = prices_df.set_index(['coin_id', 'date']).copy(deep=True)

# # # Identify pairs needing new rows
# # logger.debug('Identifying pairs that need a row for %s...', target_date)
# # all_pairs = profits_df.index.droplevel('date').unique()
# # existing_pairs = profits_df.loc(axis=0)[:, :, target_date].index.droplevel('date')
# # pairs_needing_rows = all_pairs.difference(existing_pairs)
# # logger.debug('Identified %s pairs that will need rows imputed.', len(pairs_needing_rows))


# new_rows = []

# logger.debug('Imputing new rows...')
# for coin_id, wallet_address in pairs_needing_rows:
#     # Get most recent record
#     recent_record = profits_df.loc[coin_id, wallet_address].loc[:target_date].iloc[-1]

#     # Get prices
#     price_previous = prices_df.loc[(coin_id, recent_record.name), 'price']
#     price_current = prices_df.loc[(coin_id, target_date), 'price']

#     # Calculate new values
#     price_ratio = price_current / price_previous
#     new_usd_balance = recent_record['usd_balance'] * price_ratio
#     profits_change = new_usd_balance - recent_record['usd_balance']
#     profits_cumulative = recent_record['profits_cumulative'] + profits_change

#     new_row = {
#         'coin_id': coin_id,
#         'wallet_address': wallet_address,
#         'date': target_date,
#         'profits_change': profits_change,
#         'profits_cumulative': profits_cumulative,
#         'usd_balance': new_usd_balance,
#         'usd_net_transfers': 0,
#         'usd_inflows': 0,
#         'usd_inflows_cumulative': recent_record['usd_inflows_cumulative'],
#         'total_return': profits_cumulative / max(recent_record['usd_inflows_cumulative'], 0.01)
#     }

#     new_rows.append(new_row)

# new_rows_df = pd.DataFrame(new_rows)

# logger.debug('Generated new_rows_df with shape %s.', new_rows_df.shape)


In [None]:
# Junkyard cell: builds, for each coin-wallet pair needing a new row, its most
# recent record together with the latest price on or before that record's date,
# the price on target_date, and the ratio between the two prices.
# NOTE(review): `pairs_needing_rows` is only defined in commented-out code in the
# visible cells, and this cell presumably expects profits_df / prices_df to be
# indexed on (coin_id, wallet_address, date) / (coin_id, date) -- confirm before running.

# Get the most recent data for pairs needing rows
most_recent_data = profits_df.loc[pairs_needing_rows]
most_recent_data = most_recent_data.groupby(level=['coin_id', 'wallet_address']).last().reset_index()

# Ensure the date column is properly formatted
most_recent_data['date'] = pd.to_datetime(most_recent_data['date'])

# Reset index of prices_df for the merge operation
prices_df_reset = prices_df.reset_index()

# Perform asof merge to get the most recent price before or on the date of each record
# (merge_asof requires both inputs to be sorted by the `on` key, hence the sort_values calls)
merged_data = pd.merge_asof(most_recent_data.sort_values('date'),
                            prices_df_reset.sort_values('date'),
                            on='date',
                            by='coin_id',
                            direction='backward')

# Now get the price at the target date
target_prices = prices_df.loc(axis=0)[:, target_date].reset_index()
target_prices = target_prices.rename(columns={'price': 'target_price'})

# Merge the target prices
merged_data = pd.merge(merged_data, target_prices[['coin_id', 'target_price']], on='coin_id', how='left')

# Calculate price ratio (price on target_date relative to the as-of price)
merged_data['price_ratio'] = merged_data['target_price'] / merged_data['price']

logger.debug('Merged data shape: %s', merged_data.shape)
logger.debug('Merged data columns: %s', merged_data.columns.tolist())

In [None]:
# Get every row belonging to a coin-wallet pair that needs a new row.
# MultiIndex.isin() accepts a single `level`, not a list of levels, so the date
# level is dropped and membership is tested on the (coin_id, wallet_address) pairs.
# NOTE(review): assumes profits_df is indexed on (coin_id, wallet_address, date) and
# that pairs_needing_rows holds (coin_id, wallet_address) pairs -- confirm.
most_recent_data = profits_df.loc[profits_df.index.droplevel('date').isin(pairs_needing_rows)]
# most_recent_data = most_recent_data.groupby(level=['coin_id', 'wallet_address']).last().reset_index()

# # Ensure the date column is properly formatted
# most_recent_data['date'] = pd.to_datetime(most_recent_data['date'])
# prices_df['date'] = pd.to_datetime(prices_df['date'])

# # Perform asof merge to get the most recent price before or on the date of each record
# merged_data = pd.merge_asof(most_recent_data.sort_values('date'),
#                             prices_df[['date', 'coin_id', 'price']].sort_values('date'),
#                             on='date',
#                             by='coin_id',
#                             direction='backward')

# # Now get the price at the target date
# target_prices = prices_df[prices_df['date'] == target_date][['coin_id', 'price']]
# target_prices = target_prices.rename(columns={'price': 'target_price'})

# # Merge the target prices
# merged_data = pd.merge(merged_data, target_prices, on='coin_id', how='left')

# # Calculate price ratio
# merged_data['price_ratio'] = merged_data['target_price'] / merged_data['price']


## tests failing