In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# Load environment variables via python-dotenv before anything reads them.
load_dotenv()


# Import the project's local modules from the sibling src directory.
# The path must be appended BEFORE the imports below, so statement order matters here.
# pyright: reportMissingImports=false
sys.path.append('..//src')
# Each import is immediately followed by a reload so that re-running this cell
# picks up edits made to the module files after the kernel first imported them.
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
# NOTE(review): the alias `i` is easy to clobber with a loop index; avoid `for i in ...` in this notebook.
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# Configure the shared logger (created by the dreams_core helper) at INFO level.
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Display floats with up to 12 significant digits instead of pandas' default format.
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs as notebook-level variables.
# No `global` statement is needed: names assigned at the top level of a cell are
# already module globals, so such a declaration has no effect.
CONFIG = u.load_config('../config/config.yaml')
METRICS_CONFIG = u.load_config('../config/metrics_config.yaml')
MODELING_CONFIG = u.load_config('../config/modeling_config.yaml')
EXPERIMENTS_CONFIG = u.load_config('../config/experiments_config.yaml')
MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']
# Lowercase alias: later cells pass `modeling_folder` to the insights/modeling helpers.
modeling_folder = MODELING_FOLDER

In [None]:
# Pick up on-disk edits to the local project modules without restarting the kernel.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config so edits to the config files take effect too.
# These lowercase names are the ones the rest of the notebook uses.
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


## Base Tables

In [None]:
# Refresh local modules and configs, then switch to verbose logging for this section.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.DEBUG)


# Retrieve market data and keep only the first of the three frames returned by the
# coverage split over the training_period_start -> modeling_period_end window
# (presumably the coins with full coverage of that window -- confirm in coin_wallet_metrics).
market_data_df, _, _ = cwm.split_dataframe_by_coverage(
    td.retrieve_market_data(),
    start_date=config['training_data']['training_period_start'],
    end_date=config['training_data']['modeling_period_end'],
    id_column='coin_id',
)
prices_df = market_data_df.loc[:, ['coin_id', 'date', 'price']].copy()

# # retrieve transfers data
# transfers_df = td.retrieve_transfers_data(
#     config['training_data']['training_period_start'],
#     config['training_data']['modeling_period_start'],
#     config['training_data']['modeling_period_end']
#     )

# # compile profits_df
# # profits_df = td.prepare_profits_data(transfers_df, prices_df)
# # profits_df = td.calculate_wallet_profitability(profits_df)

# NOTE(review): `profits_df2` is not assigned by any cell above this one (it is only
# created by the later "compile profits_df" cell), so on a fresh kernel this line
# raises NameError. It currently relies on leftover kernel state.
profits_df, _ = td.clean_profits_df(profits_df2, config['data_cleaning'])

# # remove records from market_data_df that don't have transfers if configured to do so
# if config['data_cleaning']['exclude_coins_without_transfers']:
#     market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]


In [None]:
# Refresh local modules and configs before pulling data.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


# Pull profits data for the whole training -> modeling window straight from the
# training_data module (replaces any profits_df built by earlier cells).
profits_df = td.retrieve_profits_data(
    start_date=config['training_data']['training_period_start'],
    end_date=config['training_data']['modeling_period_end'],
)

# Quick size check: (rows, columns).
print(profits_df.shape)


In [None]:
# NOTE(review): this cell repeats the previous cell's reload + profits retrieval
# verbatim; running both performs the retrieval twice.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


# Profits data for the training_period_start -> modeling_period_end window.
profits_df = td.retrieve_profits_data(
    start_date=config['training_data']['training_period_start'],
    end_date=config['training_data']['modeling_period_end'],
)

print(profits_df.shape)


In [None]:
# Only the two modules used in this cell are refreshed.
for _module in (td, cwm):
    importlib.reload(_module)

# Rebuild market_data_df from scratch: retrieve, then keep the first frame of the
# coverage split over the training -> modeling window.
market_data_df, _, _ = cwm.split_dataframe_by_coverage(
    td.retrieve_market_data(),
    start_date=config['training_data']['training_period_start'],
    end_date=config['training_data']['modeling_period_end'],
    id_column='coin_id',
)

# Baseline memory footprint (bytes) of the date column, for the dtype experiments below.
print(market_data_df['date'].memory_usage(deep=True))


In [None]:
# Column dtypes of the freshly retrieved profits_df (before cleaning).
profits_df.dtypes

In [None]:
# NOTE(review): `profits_df_cleaned` is first assigned two cells further down
# (by td.clean_profits_df); on a fresh top-to-bottom run this raises NameError.
profits_df_cleaned.dtypes

In [None]:

# Per-column memory of profits_df converted from bytes to GiB.
# memory_usage() is shallow by default (deep=False), so object columns are undercounted.
profits_df.memory_usage() / (1024 ** 3)

In [None]:
# Verbose logging while cleaning, then clean profits_df using the `data_cleaning`
# section of the config. The second return value is kept as `log`.
logger.setLevel(logging.DEBUG)
profits_df_cleaned,log = td.clean_profits_df(profits_df, config['data_cleaning'])


In [None]:
# First rows of the uncleaned frame, to compare against the cleaned one in the next cell.
profits_df.head()

In [None]:
# First rows after td.clean_profits_df.
profits_df_cleaned.head()

In [None]:
# Column dtypes after cleaning, to check whether cleaning changed any dtype.
profits_df_cleaned.dtypes

In [None]:
# Refresh local modules (in particular utils, whose helper is called below) and configs.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# Memory report for profits_df from the project helper; displayed as the cell's output.
u.df_memory_usage(profits_df)

In [None]:
# Bytes used by the date column before the day-resolution conversion in the next cell.
print(market_data_df['date'].memory_usage(deep=True))


In [None]:
# Experiment: truncate timestamps to midnight and cast the underlying NumPy array to
# day resolution, then print the resulting column size in bytes.
# NOTE(review): pandas does not store datetime64[D] columns (supported units are
# s/ms/us/ns), so the assignment is presumably converted back to a supported unit
# and the footprint may not shrink -- confirm against the printed value.
market_data_df['date'] = pd.to_datetime(market_data_df['date']).dt.floor('D').values.astype('datetime64[D]')
print(market_data_df['date'].memory_usage(deep=True))


In [None]:
# Experiment: compare the date column's size before and after converting to Python
# `datetime.date` objects. `.dt.date` yields an object-dtype column.
# NOTE(review): `transfers_df` is not created by any uncommented code in the cells
# above (its retrieval is commented out), so this depends on leftover kernel state.
# The cell is also not re-runnable: after the conversion the column no longer has `.dt`.
print(transfers_df['date'].memory_usage(deep=True))
transfers_df['date'] = transfers_df['date'].dt.date
print(transfers_df['date'].memory_usage(deep=True))


In [None]:
# Re-check the market date column's size in bytes after the experiments above.
print(market_data_df['date'].memory_usage(deep=True))


In [None]:

# Experiment: start from a pristine copy and store dates as a daily PeriodIndex,
# then print the column size in bytes.
# NOTE(review): `market_data_df_full` is not assigned in any visible cell; this
# relies on leftover kernel state and raises NameError on a fresh run.
market_data_df = market_data_df_full.copy(deep=True)
market_data_df['date'] = pd.PeriodIndex(market_data_df['date'], freq='D')
print(market_data_df['date'].memory_usage(deep=True))



In [None]:
# Reset market_data_df to a deep copy of the full frame before the next experiment.
# NOTE(review): `market_data_df_full` is not assigned in any visible cell.
market_data_df = market_data_df_full.copy(deep=True)


In [None]:

# Scratch probe: show a value cast to day resolution.
# NOTE(review): `dt` is not assigned in any visible cell (only the `datetime` module
# and class are imported), so this raises NameError on a fresh run.
print(dt.astype('datetime64[D]'))  # Day: 30


In [None]:
# Per-column bytes; deep=True also counts the contents of object-dtype columns.
market_data_df.memory_usage(deep=True)

In [None]:
# Display the frame (pandas truncates the rendering for large frames).
market_data_df

In [None]:
# Experiment: parse the date column with an explicit compact YYYYMMDD format and
# print the resulting size in bytes.
# NOTE(review): the dtype of 'date' at this point depends on which of the earlier
# experiment cells ran last (datetime, Period or date objects) -- confirm the
# format string actually matches the values being parsed.
market_data_df['date'] = pd.to_datetime(market_data_df['date'],format='%Y%m%d')
print(market_data_df['date'].memory_usage(deep=True))


In [None]:
# NOTE(review): this reads the frame's INDEX, not the 'date' column. No visible cell
# sets a datetime index on market_data_df, so this presumably raises AttributeError
# on the default integer index -- confirm, or use market_data_df['date'] instead.
market_data_df.index.date

In [None]:

# Experiment: store dates as Python `datetime.date` objects (object dtype), print the
# column size in bytes, then display all dtypes.
# Not re-runnable as-is: after the conversion the column no longer has the `.dt`
# accessor; the commented-out line below was the reset to a pristine copy.
# market_data_df = market_data_df_full.copy(deep=True)
market_data_df['date'] = market_data_df['date'].dt.date
print(market_data_df['date'].memory_usage(deep=True))

market_data_df.dtypes

In [None]:
# Inspect the first rows after the date conversion above.
market_data_df.head()

In [None]:

# Experiment: start from a pristine copy and normalize timestamps to midnight
# (dtype stays datetime), then print the column size in bytes.
# NOTE(review): `market_data_df_full` is not assigned in any visible cell.
market_data_df = market_data_df_full.copy(deep=True)
market_data_df['date'] = market_data_df['date'].dt.normalize()
print(market_data_df['date'].memory_usage(deep=True))

In [None]:
# Column dtypes after normalizing the date column.
market_data_df.dtypes

In [None]:
# Per-column bytes (shallow: object columns count pointers only).
market_data_df.memory_usage()


In [None]:
# Same dtype check as two cells above (repeated cell).
market_data_df.dtypes

In [None]:
# Same shallow per-column memory check as two cells above (repeated cell).
market_data_df.memory_usage()


In [None]:
# Refresh local modules and configs; verbose logging for the profits build.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.DEBUG)

# Build profits_df from the transfers and price tables.
# NOTE(review): `transfers_df` is not created by any uncommented code in the visible
# cells above, so this depends on leftover kernel state.
profits_df = td.prepare_profits_data(transfers_df, prices_df)


In [None]:
# Per-column bytes of the freshly prepared profits_df (shallow).
profits_df.memory_usage()

In [None]:
# Input dtypes on the transfers side of the profits build.
transfers_df.dtypes

In [None]:
# Input dtypes on the prices side of the profits build.
prices_df.dtypes

In [None]:
# Per-column bytes of transfers_df before the date cast in the next cell (shallow).
transfers_df.memory_usage()

In [None]:
# Experiment: cast the transfers date column to day resolution and re-check memory.
# NOTE(review): 'datetime64[D]' is not a unit pandas can store. Older pandas silently
# returned datetime64[ns]; pandas >= 2.0 is documented to raise for unsupported
# resolutions -- confirm against the installed version.
transfers_df['date'] = transfers_df['date'].astype('datetime64[D]')
transfers_df.memory_usage()

In [None]:
# Refresh local modules and configs, then rebuild profits_df after the transfers
# date-dtype change made in the preceding cells.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.DEBUG)

# Build profits_df from the transfers and price tables.
profits_df = td.prepare_profits_data(transfers_df, prices_df)

In [None]:
# Add wallet profitability to profits_df.
# NOTE(review): this overwrites its own input, so re-running the cell feeds the
# already-processed frame back into calculate_wallet_profitability.
profits_df = td.calculate_wallet_profitability(profits_df)

In [None]:
# Memory report (project helper) after the profitability calculation.
u.df_memory_usage(profits_df)

In [None]:
# Column dtypes after the profitability calculation.
profits_df.dtypes

In [None]:
# Disabled dtype experiments (integer codes for wallet_address, categorical coin_id);
# with both commented out this cell only re-displays the memory report.
# profits_df['wallet_address'] = profits_df['wallet_address'].cat.codes.astype('uint32')
# profits_df['coin_id'] = profits_df['coin_id'].astype('category')
u.df_memory_usage(profits_df)

In [None]:
# Store coin_id as a pandas categorical; compare memory in the next cell.
profits_df['coin_id'] = profits_df['coin_id'].astype('category')


In [None]:
# Per-column bytes including object/categorical contents (deep=True).
profits_df.memory_usage(deep=True)

In [None]:
# Project helper from utils; presumably reports overall memory use -- see utils for details.
u.memory_usage()

In [None]:
# Experiment: replace wallet addresses with their categorical integer codes, then
# display the memory report.
# Gotchas:
#   - requires wallet_address to already be categorical (uses the `.cat` accessor);
#   - the original address values are no longer present in this column afterwards;
#   - missing values have code -1, which does not fit an unsigned dtype;
#   - not re-runnable: after the first run the column is uint32 and has no `.cat`.
transfers_df['wallet_address'] = transfers_df['wallet_address'].cat.codes.astype('uint32')
u.df_memory_usage(transfers_df)

In [None]:
# Number of distinct wallet addresses (missing values are not counted by nunique),
# stored and then displayed as the cell output.
wallet_address_unique = transfers_df['wallet_address'].nunique()

wallet_address_unique

In [None]:
# Row count of transfers_df, for comparison with the distinct-wallet count above.
len(transfers_df)

In [None]:
# Column dtypes after the wallet_address conversion.
transfers_df.dtypes

In [None]:
# Memory report (project helper) for the prices table.
u.df_memory_usage(prices_df)

In [None]:
# Only the LAST bare expression of a cell is displayed, so the shape has to be
# printed explicitly to appear together with the memory report.
print(profits_df.shape)
u.df_memory_usage(profits_df)

In [None]:

# Step 3.2 -- reuse the current profits_df, or rebuild it if the config has changed.
profits_df = i.rebuild_profits_df_if_necessary(
    config,
    modeling_folder,
    prices_df,
    profits_df,
)

# Step 3.3 -- assemble the configured model inputs as a train/test split.
X_train, X_test, y_train, y_test = i.build_configured_model_input(
    profits_df,
    market_data_df,
    config,
    metrics_config,
    modeling_config,
)

# Step 3.4 -- train with the parameters from the modeling config; results are
# logged under modeling_folder and identified by model_id.
model, model_id = m.train_model(
    X_train,
    y_train,
    modeling_folder,
    modeling_config['modeling']['model_params'],
)

# Step 3.5 -- evaluate on the test set and save the performance to a CSV.
# The return value is bound to `_` so the cell shows no output.
_ = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)


In [None]:
# Pick up on-disk edits to the local project modules without restarting the kernel.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read the four YAML configs so config edits take effect as well.
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


In [None]:

# Compile profits data in three named stages so each intermediate frame can be
# inspected afterwards: prepared -> with profitability -> cleaned.
profits_df1 = td.prepare_profits_data(transfers_df, prices_df)
profits_df2 = td.calculate_wallet_profitability(profits_df1)
profits_df3, _ = td.clean_profits_df(profits_df2, config['data_cleaning'])

# When configured, keep only market data rows whose coin survives profits cleaning.
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df.loc[
        market_data_df['coin_id'].isin(profits_df3['coin_id'])
    ]




In [None]:
# Log the (rows, columns) shape of each profits stage to see how many rows each
# step adds or drops.
logger.info("profits_df1: %s",profits_df1.shape)
logger.info("profits_df2: %s",profits_df2.shape)
logger.info("profits_df3: %s",profits_df3.shape)


# Log the number of distinct coin_id values at each stage.
logger.info("profits_df1 coins: %s",len(set(profits_df1['coin_id'])))
logger.info("profits_df2 coins: %s",len(set(profits_df2['coin_id'])))
logger.info("profits_df3 coins: %s",len(set(profits_df3['coin_id'])))

In [None]:
# Distinct coins in each input table. The messages are labelled (matching the
# "<name> coins: %s" style used for the profits stages) so the two numbers can be
# told apart in the log output.
logger.info("transfers_df coins: %s", len(set(transfers_df['coin_id'])))
logger.info("prices_df coins: %s", len(set(prices_df['coin_id'])))


In [None]:
# Coins present in BOTH input tables. `intersection` accepts any iterable, so the
# second column does not need to be converted to a set first.
overlap = set(transfers_df['coin_id']).intersection(prices_df['coin_id'])
# Labelled so the number is identifiable in the log output.
logger.info("coins in both transfers_df and prices_df: %s", len(overlap))

In [None]:
# Refresh local modules and configs.
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# Initial steps for this model: restrict market data to coins that appear in
# profits_df, without modifying market_data_df itself.
filtered_market_data_df = market_data_df.loc[
    market_data_df['coin_id'].isin(profits_df['coin_id'])
]

# Empty accumulators (two distinct lists) to be filled by later steps.
training_data_tuples, training_data_dfs = [], []

## tests failing

In [None]:
def build_price_data_transfers_df():
    """
    Create a sample transfers DataFrame for testing interactions with price data.

    Returns:
        pd.DataFrame: four rows (one buy and one sell each for BTC/wallet1 and
        MYRO/wallet2) with columns coin_id (category), wallet_address,
        date (datetime), net_transfers and balance.
    """
    data = {
        'coin_id': ['BTC', 'BTC', 'MYRO', 'MYRO'],
        'wallet_address': ['wallet1', 'wallet1', 'wallet2', 'wallet2'],
        'date': [
            '2023-03-01', '2023-04-01',  # BTC wallet1: buy before price data starts, sell on its last day
            '2023-02-20', '2023-03-10'   # MYRO wallet2: buy and sell both before price data starts
        ],
        'net_transfers': [10.0, -10.0, 1000.0, -1000.0],  # Buys and sells
        'balance': [10.0, 0.0, 1000.0, 0.0]  # Balance after each buy and sell
    }
    df = pd.DataFrame(data)
    df['coin_id'] = df['coin_id'].astype('category')
    df['date'] = pd.to_datetime(df['date'])
    return df



def build_price_data_prices_df():
    """
    Create a sample prices DataFrame with daily records between 2023-03-15 and 2023-04-01
    for both BTC and MYRO coins.

    Returns:
        pd.DataFrame: one row per coin per day with columns date (datetime),
        coin_id (category) and price; prices rise linearly over the range.
    """
    date_range = pd.date_range(start='2023-03-15', end='2023-04-01', freq='D')

    # Define prices for each coin and date (simple linear increase for this example)
    btc_prices = np.linspace(22000.0, 23000.0, len(date_range))
    myro_prices = np.linspace(12, 15, len(date_range))

    data = {
        'date': list(date_range) * 2,  # Repeat for both BTC and MYRO
        'coin_id': ['BTC'] * len(date_range) + ['MYRO'] * len(date_range),
        'price': list(btc_prices) + list(myro_prices)
    }

    df = pd.DataFrame(data)
    df['coin_id'] = df['coin_id'].astype('category')
    df['date'] = pd.to_datetime(df['date'])
    return df

# Bind the fixture frames under the names the following cells use. The builder
# functions have their own names so these assignments do not overwrite them;
# otherwise re-running this cell would try to call a DataFrame and fail.
price_data_transfers_df = build_price_data_transfers_df()
price_data_prices_df = build_price_data_prices_df()



In [None]:
# Display the 4-row transfers fixture.
price_data_transfers_df

In [None]:
# Display the daily BTC/MYRO price fixture.
price_data_prices_df

In [None]:
# Run the profits preparation on the small fixtures and display the result.
# NOTE(review): this rebinds the notebook-wide `profits_df` to the fixture output,
# replacing the real profits data built earlier in the notebook.
profits_df = td.prepare_profits_data(price_data_transfers_df, price_data_prices_df)
profits_df

In [None]:

# Inline (unwrapped) body of test_price_data_interactions: checks how wallet
# transfers interact with the dates for which price data is available.
# NOTE(review): like the cell above, this rebinds the notebook-wide `profits_df`
# to the fixture output.
profits_df = td.prepare_profits_data(price_data_transfers_df, price_data_prices_df)
result = td.calculate_wallet_profitability(profits_df)

# Scenario 1: bought before price data starts, sold once price data exists.
wallet1_btc = result.loc[
    (result['wallet_address'] == 'wallet1') & (result['coin_id'] == 'BTC')
]
# Expected profit: BTC price moves from 22000 to 23000 while 10 coins are held.
wallet1_btc_profits = (23000 - 22000) * 10
# The first row should sit on the earliest date that has price data ...
assert wallet1_btc['date'].iloc[0] == pd.Timestamp('2023-03-15')
# ... with no profit booked on the initial transfer in ...
assert wallet1_btc['profits_cumulative'].iloc[0] == 0
# ... and the second row should carry the full expected profit.
assert wallet1_btc['profits_cumulative'].iloc[1] == wallet1_btc_profits

# Scenario 2: bought AND sold before any price data is available.
wallet2_myro = result.loc[
    (result['wallet_address'] == 'wallet2') & (result['coin_id'] == 'MYRO')
]
# No rows should exist, since no price data covered either transaction.
assert wallet2_myro.empty


