In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
import datetime
import json
from datetime import datetime, timedelta
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar
from pyxirr import xirr


load_dotenv()

# Custom format function for displaying numbers (up to 12 significant digits)
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

# Dark mode charts: one shared dark gray background and one shared foreground color
plt.rcParams.update({
    'figure.facecolor': '#181818',  # Custom background color (dark gray in this case)
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# import local modules
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp


# reload all local modules so the versions currently on disk are the ones in use
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp]
for module in modules:
    # plain loop: reload is called only for its side effect, no result list is needed
    importlib.reload(module)

# load all configs
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Codespace

### Retrieve data

In [None]:
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')


# 1. Data Retrieval, Cleaning, Indicator Calculation
# --------------------------------------------------
# Market data: pull the full history and clean it in one step
market_data_df = dr.clean_market_data(dr.retrieve_market_data(), config)

# Profits: pull records from the training period start through the modeling period end,
# then clean them using the data_cleaning section of the config
profits_df = dr.retrieve_profits_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_end'],
    config['data_cleaning']['minimum_wallet_inflows'],
)
profits_df, _ = dr.clean_profits_df(profits_df, config['data_cleaning'])

profits_df.head()


# 2. Filtering based on dataset overlap
# -------------------------------------
# Optionally keep only the market data rows whose coin_id also appears in profits_df
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df.loc[market_data_df['coin_id'].isin(profits_df['coin_id'])]

# prices_df: lightweight coin/date/price reference for other functions
prices_df = market_data_df.loc[:, ['coin_id','date','price']].copy()

# Drop profits rows for coins that no longer appear in market_data_df
profits_df = profits_df.loc[profits_df['coin_id'].isin(market_data_df['coin_id'])]



In [None]:
# 1. Impute all required dates
# ----------------------------
# Identify all required imputation dates: the start and end of both the training
# and modeling periods
imputation_dates = [
    config['training_data']['training_period_start'],
    config['training_data']['training_period_end'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
]

# Impute all required dates
window_profits_df = pri.impute_profits_for_multiple_dates(profits_df, prices_df, imputation_dates, n_threads=24)

# Keep only rows between the earliest and latest imputation date (inclusive). Each
# config value is parsed to a timestamp before taking min/max so the bounds are ordered
# chronologically rather than by raw string comparison.
window_profits_df = window_profits_df[
    (window_profits_df['date'] >= min(pd.to_datetime(d) for d in imputation_dates)) &
    (window_profits_df['date'] <= max(pd.to_datetime(d) for d in imputation_dates))
]

window_profits_df.head()

In [None]:
# Convert period start and end balances to transfers for cash flows calculations
def adjust_end_transfers(df, target_date):
    """
    Converts the balance held on target_date into a transfer for cash flow calculations.

    For every row dated target_date, usd_balance is subtracted from usd_net_transfers and
    usd_balance is then set to 0. The input DataFrame is modified in place.

    Parameters:
    - df (pd.DataFrame): must contain the columns date, usd_net_transfers, and usd_balance
    - target_date: value compared for equality against the date column

    Returns:
    - df (pd.DataFrame): the same object that was passed in, after adjustment
    """
    on_target = df['date'] == target_date
    closing_balance = df.loc[on_target, 'usd_balance']
    df.loc[on_target, 'usd_net_transfers'] = df.loc[on_target, 'usd_net_transfers'] - closing_balance
    df.loc[on_target, 'usd_balance'] = 0
    return df

def adjust_start_transfers(df, target_date):
    """
    Treats the balance held on target_date as that day's transfer for cash flow calculations.

    For every row dated target_date, usd_net_transfers is overwritten with usd_balance.
    The input DataFrame is modified in place; usd_balance itself is left unchanged.

    Parameters:
    - df (pd.DataFrame): must contain the columns date, usd_net_transfers, and usd_balance
    - target_date: value compared for equality against the date column

    Returns:
    - df (pd.DataFrame): the same object that was passed in, after adjustment
    """
    on_target = df['date'] == target_date
    opening_balance = df.loc[on_target, 'usd_balance']
    df.loc[on_target, 'usd_net_transfers'] = opening_balance
    return df

# Work on a copy: the adjust_* helpers modify the DataFrame they are given in place,
# so this keeps window_profits_df unchanged
adj_profits_df = window_profits_df.copy()

# Apply the start/end adjustments for the training period, then the modeling period
adj_profits_df = adjust_start_transfers(adj_profits_df,config['training_data']['training_period_start'])
adj_profits_df = adjust_end_transfers(adj_profits_df,config['training_data']['training_period_end'])
adj_profits_df = adjust_start_transfers(adj_profits_df,config['training_data']['modeling_period_start'])
adj_profits_df = adjust_end_transfers(adj_profits_df,config['training_data']['modeling_period_end'])

# Truncate final values toward zero (np.trunc drops the fractional part; it does not round)
adj_profits_df['usd_net_transfers'] = np.trunc(adj_profits_df['usd_net_transfers'])

### Calculations

In [None]:
def calculate_wallets_xirr(profits_df, min_wallet_volume):
    """
    Calculates the XIRR of each wallet based on their cash flows across all coins they've
    interacted with in profits_df.

    Wallets are excluded before the calculation if their total absolute USD volume is below
    min_wallet_volume, or if their daily cash flows do not include both a positive and a
    negative value. Wallets whose XIRR comes back empty are reported with an XIRR of 0.

    Parameters:
    - profits_df (pd.DataFrame): shows daily coin-wallet transfers in USD; must contain the
        columns wallet_address, date, and usd_net_transfers
    - min_wallet_volume (int): wallets with less than this total USD volume will be excluded

    Returns:
    - xirr_df (pd.DataFrame): indexed on wallet_address with a single float column 'xirr'
        showing the XIRR of each remaining wallet over the provided transactions
    """
    logger.info('Beginning XIRR calculation sequence...')

    # 1. Summarize cash flows on a wallet level
    # -----------------------------------------
    # Sum cash flows across coins for each wallet-date pair
    wallet_flows = profits_df.groupby(['wallet_address','date'])['usd_net_transfers'].sum()


    # 2. Filter wallets on data quality
    # ---------------------------------
    # Identify wallets whose total absolute transfer volume is below the threshold.
    # The grouped sum is vectorized, avoiding a Python-level call per wallet.
    wallet_volume = wallet_flows.abs().groupby(level='wallet_address').sum()
    low_volume_wallets = wallet_volume.index[(wallet_volume < min_wallet_volume).to_numpy()]

    # Remove low volume wallets
    is_low_volume = wallet_flows.index.get_level_values('wallet_address').isin(low_volume_wallets)
    wallet_flows = wallet_flows[~is_low_volume]
    logger.info('Removed %s wallets with volume below $%s.', len(low_volume_wallets), min_wallet_volume)

    # Identify wallets that lack either a positive or a negative cash flow. The grouped
    # any() always yields booleans, so this also works when no wallets remain.
    has_positive = (wallet_flows > 0).groupby(level='wallet_address').any()
    has_negative = (wallet_flows < 0).groupby(level='wallet_address').any()
    wallets_missing_both = has_positive.index[~(has_positive & has_negative).to_numpy()]

    # Remove wallets that do not have both positive and negative transfers
    is_one_sided = wallet_flows.index.get_level_values('wallet_address').isin(wallets_missing_both)
    wallet_flows = wallet_flows[~is_one_sided]
    logger.info('Removed %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))


    # 3. Calculate XIRR
    # -----------------
    # Group by wallet_address (level of the MultiIndex) and calculate XIRR from each
    # wallet's dates and summed cash flows
    start_time = time.time()
    logger.info('Calculating XIRR values...')
    xirr_results = wallet_flows.groupby(level='wallet_address').apply(
        lambda flows: xirr(flows.index.get_level_values('date'), flows)
    )
    logger.info('XIRR calculations complete after %.2f seconds.', time.time() - start_time)

    # Convert to a single-column DataFrame. Casting to float turns any missing results
    # into NaN and keeps the column numeric even when there are no wallets left.
    xirr_df = xirr_results.astype(float).rename('xirr').to_frame()

    # Fill empty values with 0s
    xirr_df = xirr_df.fillna(0)


    return xirr_df

In [None]:
min_wallet_volume = 10000

# Restrict to rows dated within the training period (both boundary dates included)
training_df = adj_profits_df[adj_profits_df['date'].between(
    pd.to_datetime(config['training_data']['training_period_start']),
    pd.to_datetime(config['training_data']['training_period_end']),
)]
training_df = training_df.sort_values(['wallet_address','coin_id','date'])
training_df.shape


# XIRR per wallet over the training period, excluding wallets below the volume threshold
training_xirr_df = calculate_wallets_xirr(training_df,min_wallet_volume)

In [None]:
# Restrict to rows dated within the modeling period (both boundary dates included)
modeling_df = adj_profits_df[adj_profits_df['date'].between(
    pd.to_datetime(config['training_data']['modeling_period_start']),
    pd.to_datetime(config['training_data']['modeling_period_end']),
)]
modeling_df = modeling_df.sort_values(['wallet_address','coin_id','date'])
modeling_df.shape

# XIRR per wallet over the modeling period, with the volume threshold set to 0
modeling_xirr_df = calculate_wallets_xirr(modeling_df,min_wallet_volume=0)

In [None]:
# Left join modeling XIRR onto training XIRR so every training wallet is kept;
# wallets with no modeling XIRR get 0
xirr_df = training_xirr_df.rename(columns={'xirr': 'training_xirr'})
xirr_df = xirr_df.join(modeling_xirr_df.rename(columns={'xirr': 'modeling_xirr'}), how='left')
xirr_df = xirr_df.fillna({'modeling_xirr': 0})


# Calculate percentiles (ascending rank expressed as a fraction of all wallets)
xirr_df["training_xirr_percentile"] = xirr_df["training_xirr"].rank(pct=True)
xirr_df["modeling_xirr_percentile"] = xirr_df["modeling_xirr"].rank(pct=True)


xirr_df.head()

In [None]:
xirr_df['training_xirr_percentile'].corr(xirr_df['modeling_xirr_percentile'])

In [None]:
# Calculate year fractions from the first date
# NOTE(review): `dates` is not assigned in any earlier cell of this file, and the `.dt`
# accessor below needs a pandas Series; this cell appears to rely on a later cell
# having been run first — confirm
start_date = dates.min()  # Use the earliest date as the reference
date_fractions = (dates - start_date).dt.days / 365.0  # whole days elapsed, over a 365-day year
date_fractions = date_fractions.values

date_fractions

In [None]:

# Sum cash flows across coins for each wallet-date pair
wallets_df = training_df.groupby(['wallet_address','date'])['usd_net_transfers'].sum().to_frame()

# Identify wallets whose total absolute transfer volume is below min_wallet_volume
wallets_agg_df = wallets_df.groupby(level='wallet_address')['usd_net_transfers'].apply(
    lambda flows: flows.abs().sum()
)
low_volume_wallets = wallets_agg_df.index[(wallets_agg_df < min_wallet_volume).to_numpy()]

# Remove the low volume wallets
wallets_df_filtered = wallets_df.loc[
    ~wallets_df.index.get_level_values('wallet_address').isin(low_volume_wallets)
]

In [None]:
wallets_df_filtered.shape

In [None]:
# For each wallet, flag whether its cash flows include both a positive and a negative value
wallet_check = wallets_df_filtered.groupby(level='wallet_address')['usd_net_transfers'].apply(
    lambda flows: (flows > 0).any() and (flows < 0).any()
)

# Collect the wallets that fail the check
wallets_missing_both = wallet_check.loc[~wallet_check].index
logger.info('Found %s wallets missing either a positive or negative transaction.', len(wallets_missing_both))





In [None]:
# w = '0x036783df7aec54b5dfca9e1f870577bbcca95481'
# wallets_df.loc[w]

# profits_df[profits_df['wallet_address']==w]


### XIRR sequence

In [None]:
wallets_df_filtered.head()

In [None]:
w = '0x0000000000000000000000000000000000000014'

dates = wallets_df.loc[w].index.values
cash_flows = wallets_df.loc[w]['usd_net_transfers']

xirr(dates,cash_flows)

In [None]:
# Group by wallet_address (level of the MultiIndex) and calculate XIRR from each
# wallet's dates and usd_net_transfers values
xirr_results = wallets_df_filtered.groupby(level='wallet_address').apply(
    lambda df: xirr(df.index.get_level_values('date'), df['usd_net_transfers'])
)

# Convert to DataFrame with a single column named 'xirr'
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']

# Print the shape and show the first rows of the raw results (xirr_results, not xirr_df)
print(xirr_results.shape)
xirr_results.head()

In [None]:
xirr_df = pd.DataFrame(xirr_results)
xirr_df.columns = ['xirr']
xirr_df.head()

In [None]:
dates

In [None]:
cash_flows

In [None]:
x = xirr(dates,cash_flows)
x

In [None]:
# Coin id and wallet address to inspect
c = '77e2cf4b-d18a-4026-a2f2-f083f48fe1be'
w = '0xaff2943cfe3e95f66142a1729079418d78e42236'

# u.cw_filter_df(training_df,c,w)

# Rows returned by u.cw_filter_df for this coin and wallet, ordered by date
# (presumably the helper filters to the coin-wallet pair — confirm in utils)
df = u.cw_filter_df(training_df,c,w).sort_values('date')
df

In [None]:
dates = df['date']
cash_flows = df['usd_net_transfers']

In [None]:
from pyxirr import xirr

xirr(dates,cash_flows)

In [None]:
cash_flows.cumsum()

In [None]:
cash_flows

In [None]:
# Express each date as a fraction of a 365-day year elapsed since the earliest date
start_date = dates.min()  # earliest date is the reference point
date_fractions = (dates - start_date).dt.days
date_fractions = (date_fractions / 365.0).values

date_fractions

In [None]:
# NumPy version of the year-fraction calculation, measured from the first element of `dates`.
# np.asarray handles both a pandas Series and a plain datetime64 array, and indexing the
# resulting array with [0] is positional rather than label-based.
date_fractions = np.asarray(dates).astype('datetime64[D]')
# Convert the day offsets to floats first: NumPy cannot divide day-based timedeltas by a
# year-based timedelta64. A 365-day year matches the pandas version of this calculation.
date_fractions = (date_fractions - date_fractions[0]) / np.timedelta64(1, 'D') / 365.0
date_fractions

## Junkyard

In [None]:
# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     select cwp.wallet_address
#     ,cwp.coin_id
#     ,cwp.date
#     ,round(cwp.usd_net_transfers) as usd_net_transfers
#     ,round(cwp.usd_balance) as usd_balance
#     ,round(cwp.usd_net_transfers/cmd.price) as token_transfers
#     ,round(cwp.usd_balance/cmd.price) as token_balance
#     ,cmd.price
#     from wallets w
#     join wallet_coins wc on wc.wallet_address = w.wallet_address
#     join core.coin_wallet_profits cwp on cwp.wallet_address = wc.wallet_address
#         and cwp.coin_id = wc.coin_id
#     join core.coin_market_data cmd on cmd.coin_id = cwp.coin_id
#         and cmd.date = cwp.date
#     order by 1,2,3
#     '''
# transfers_df = dgc().run_sql(query_sql)

# # Convert wallet_address to categorical, store the mapping, and convert the column to int32
# wallet_address_categorical = transfers_df['wallet_address'].astype('category')
# # wallet_address_mapping = wallet_address_categorical.cat.categories
# # transfers_df['wallet_address'] = wallet_address_categorical.cat.codes.astype('uint32')


# # Convert coin_id to categorical (original strings are preserved)
# transfers_df['coin_id'] = transfers_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# transfers_df = u.safe_downcast(transfers_df, 'usd_net_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'usd_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'price', 'float32')

# print(transfers_df.info())
# print(u.df_mem(transfers_df))
# transfers_df.head()

In [None]:
# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     ,coins as (
#         select wc.coin_id
#         from wallets w
#         join wallet_coins wc on wc.wallet_address = w.wallet_address
#         group by 1
#     )

#     select cmd.coin_id
#     ,cmd.date
#     ,cmd.price
#     ,cmd.market_cap
#     from coins c
#     join core.coin_market_data cmd on cmd.coin_id = c.coin_id
#     order by 1,2
#     '''
# prices_df = dgc().run_sql(query_sql)

# # Convert coin_id to categorical (original strings are preserved)
# prices_df['coin_id'] = prices_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# prices_df = u.safe_downcast(prices_df, 'price', 'float32')
# prices_df = u.safe_downcast(prices_df, 'market_cap', 'int32')

# print(prices_df.info())
# print(u.df_mem(prices_df))
# prices_df.head()