In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import re
import pdb
import datetime
import json
from datetime import datetime, timedelta
import yaml
from typing import Dict,Union,List,Any,Tuple
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.model_selection import train_test_split
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar

# Load environment variables via python-dotenv
load_dotenv()

# Render floats in DataFrame output with up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# To go back to pandas' default float display:
# pd.reset_option('display.float_format')

# Dark mode charts: one shared dark gray background for figure and axes, and
# one shared light color for text, axis labels, ticks and titles
plt.rcParams.update({
    'figure.facecolor': '#181818',
    'axes.facecolor': '#181818',
    'text.color': '#afc6ba',
    'axes.labelcolor': '#afc6ba',
    'xtick.color': '#afc6ba',
    'ytick.color': '#afc6ba',
    'axes.titlecolor': '#afc6ba',
})

# Import local project modules.
# sys.path is extended first so the bare module names below can be resolved;
# the relative path presumably assumes the notebook is run from a directory
# that sits next to src (TODO confirm).
# pyright: reportMissingImports=false
sys.path.append('..//src')
import utils as u
import training_data.data_retrieval as dr
import training_data.profits_row_imputation as pri
import coin_wallet_metrics.coin_wallet_metrics as cwm
import coin_wallet_metrics.indicators as ind
import feature_engineering.feature_generation as fg
import feature_engineering.time_windows_orchestration as tw
import feature_engineering.flattening as flt
import feature_engineering.data_splitting as ds
import feature_engineering.target_variables as tv
import feature_engineering.preprocessing as prp
import modeling as m
import insights.analysis as ia
import insights.experiments as exp


# Reload every local module so that re-running this cell picks up source edits
# made since the module was first imported in this kernel
modules = [u, dr, pri, cwm, ind, fg, tw, flt, ds, tv, prp, m, ia, exp]
[importlib.reload(module) for module in modules]

# Load the four config objects from the ../config directory
config, metrics_config, modeling_config, experiments_config = u.load_all_configs('../config')

# Configure the logger and set it to INFO level
logger = dc.setup_logger()
logger.setLevel(logging.INFO)


## Codespace

### Retrieve data

In [None]:
# Market data: pull the full history and clean it in one step
market_data_df = dr.clean_market_data(dr.retrieve_market_data(), config)

# Profits: pull data from the earliest cohort lookback start through the end of
# the training period (passing the configured minimum wallet inflows), then
# clean it; the second value returned by the cleaner is discarded
profits_df, _ = dr.clean_profits_df(
    dr.retrieve_profits_data(
        config['training_data']['earliest_cohort_lookback_start'],
        config['training_data']['training_period_end'],
        config['data_cleaning']['minimum_wallet_inflows'],
    ),
    config['data_cleaning'],
)

profits_df.head()

In [None]:
# NOTE: this whole cell is commented out and does not run.
# When enabled it builds transfers_df: one row per wallet, coin and date with
# rounded USD net transfers / balances, price-derived token amounts and price,
# restricted to wallet-coin pairs with > 500 cumulative inflows belonging to
# wallets that have 3-50 such coins and < 20,000,000 total inflows.
# It then reduces memory use: wallet addresses become uint32 category codes
# (the code -> address lookup is kept in wallet_address_mapping), coin_id
# becomes categorical and the numeric columns are downcast to float32.

# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     select cwp.wallet_address
#     ,cwp.coin_id
#     ,cwp.date
#     ,round(cwp.usd_net_transfers) as usd_net_transfers
#     ,round(cwp.usd_balance) as usd_balance
#     ,round(cwp.usd_net_transfers/cmd.price) as token_transfers
#     ,round(cwp.usd_balance/cmd.price) as token_balance
#     ,cmd.price
#     from wallets w
#     join wallet_coins wc on wc.wallet_address = w.wallet_address
#     join core.coin_wallet_profits cwp on cwp.wallet_address = wc.wallet_address
#         and cwp.coin_id = wc.coin_id
#     join core.coin_market_data cmd on cmd.coin_id = cwp.coin_id
#         and cmd.date = cwp.date
#     order by 1,2,3
#     '''
# transfers_df = dgc().run_sql(query_sql)

# # Convert wallet_address to categorical, store the mapping, and convert the column to uint32 codes
# wallet_address_categorical = transfers_df['wallet_address'].astype('category')
# wallet_address_mapping = wallet_address_categorical.cat.categories
# transfers_df['wallet_address'] = wallet_address_categorical.cat.codes.astype('uint32')


# # Convert coin_id to categorical (original strings are preserved)
# transfers_df['coin_id'] = transfers_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# transfers_df = u.safe_downcast(transfers_df, 'usd_net_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'usd_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_transfers', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'token_balance', 'float32')
# transfers_df = u.safe_downcast(transfers_df, 'price', 'float32')

# print(transfers_df.info())
# print(u.df_mem(transfers_df))
# transfers_df.head()

In [None]:
# NOTE: this whole cell is commented out and does not run.
# When enabled it builds prices_df: daily price and market cap for every coin
# held by the filtered wallet set (wallet-coin pairs with > 500 cumulative
# inflows, wallets with 3-50 such coins and < 20,000,000 total inflows), then
# makes coin_id categorical and downcasts the numeric columns.
# NOTE(review): market_cap is downcast to int32, whose maximum is
# 2,147,483,647 - confirm how u.safe_downcast treats larger values before
# re-enabling this cell.

# query_sql = '''
#     with wallet_coins as (
#         select *
#         from (
#             select wallet_address
#             ,coin_id
#             ,max(usd_inflows_cumulative) as coin_inflows
#             from core.coin_wallet_profits
#             group by 1,2
#         )
#         where coin_inflows > 500
#     )

#     ,wallets as (
#         select *
#         from (
#             select wallet_address
#             ,count(coin_id) as total_tokens
#             ,sum(coin_inflows) as total_inflows
#             from wallet_coins wti
#             group by 1
#         )
#         where total_tokens between 3 and 50
#         and total_inflows < 20000000
#     )

#     ,coins as (
#         select wc.coin_id
#         from wallets w
#         join wallet_coins wc on wc.wallet_address = w.wallet_address
#         group by 1
#     )

#     select cmd.coin_id
#     ,cmd.date
#     ,cmd.price
#     ,cmd.market_cap
#     from coins c
#     join core.coin_market_data cmd on cmd.coin_id = c.coin_id
#     order by 1,2
#     '''
# prices_df = dgc().run_sql(query_sql)

# # Convert coin_id to categorical (original strings are preserved)
# prices_df['coin_id'] = prices_df['coin_id'].astype('category')

# # Convert all numerical columns to 32 bit, using safe_downcast to avoid overflow
# prices_df = u.safe_downcast(prices_df, 'price', 'float32')
# prices_df = u.safe_downcast(prices_df, 'market_cap', 'int32')

# print(prices_df.info())
# print(u.df_mem(prices_df))
# prices_df.head()

### Calculations

In [None]:
# Display the main config (first object returned by u.load_all_configs) for reference
config

In [None]:
# Preview the first rows of the cleaned profits data
profits_df.head()

In [None]:
# Inspect column dtypes; later cells compare 'date' against pd.to_datetime
# values and use .dt on it, so it presumably needs to be datetime-like
profits_df.dtypes

In [None]:
# Keep only the profits rows dated within the training period; between() is
# inclusive on both ends, so rows on the start and end dates are retained
training_df = profits_df[
    profits_df['date'].between(
        pd.to_datetime(config['training_data']['training_period_start']),
        pd.to_datetime(config['training_data']['training_period_end']),
    )
]

training_df.shape

In [None]:
# Number of distinct wallets in the training period (a missing address, if any,
# counts as one value, exactly as it would via unique())
training_df['wallet_address'].nunique(dropna=False)

In [None]:
# Preview the first rows of the training-period subset
training_df.head()

In [None]:
# Total USD net transfers per wallet per date (summed across all rows sharing
# that wallet and date), returned as a flat frame ordered by wallet, then date
grouped = (
    training_df
    .groupby(['wallet_address', 'date'])['usd_net_transfers']
    .sum()
    .reset_index()
    .sort_values(['wallet_address', 'date'])
)


In [None]:
    wallet_irrs = {}

    # Process each wallet separately
    for wallet in grouped_df['wallet_address'].unique():
        # Get wallet's cash flows
        wallet_data = grouped_df[grouped_df['wallet_address'] == wallet].copy()

        # Convert dates to days since first transaction
        wallet_data['days'] = (wallet_data['date'] - wallet_data['date'].min()).dt.days

        # Create cash flow series
        cash_flows = []
        days = []
a
        # Add each cash flow
        for _, row in wallet_data.iterrows():
            cash_flows.append(row['usd_net_transfers'])
            days.append(row['days'])

        try:
            # Calculate IRR (converting to annual rate)
            irr = np.irr(cash_flows)
            annual_irr = (1 + irr)**365 - 1
            wallet_irrs[wallet] = annual_irr
        except:
            # Handle cases where IRR calculation fails
            wallet_irrs[wallet] = Nonegrouped

## Junkyard