In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# Load environment variables from a .env file, if one is found
load_dotenv()


# Import the project's local modules from the src directory. Every import is
# followed by importlib.reload() so that re-running this cell picks up edits
# made to the module source without restarting the kernel.
# pyright: reportMissingImports=false
# NOTE(review): the double slash in '..//src' looks like a typo for '../src';
# presumably both resolve to the same directory -- confirm.
sys.path.append('..//src')
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# Configure the notebook logger through dreams_core and show DEBUG output
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Display floats with up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# To restore the pandas default: pd.reset_option('display.float_format')


# Load all configs once and bind each to both its lowercase working name and
# its uppercase alias. No `global` statement is needed here: names assigned at
# the top level of a cell are already module-level globals.
config = CONFIG = u.load_config('../config/config.yaml')
metrics_config = METRICS_CONFIG = u.load_config('../config/metrics_config.yaml')
modeling_config = MODELING_CONFIG = u.load_config('../config/modeling_config.yaml')
experiments_config = EXPERIMENTS_CONFIG = u.load_config('../config/experiments_config.yaml')

# Folder name read from the modeling config; passed to the project helpers below
modeling_folder = MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']

## Overall Sequencing

In [None]:
# Reload the local modules (same order as before) so source edits take effect
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read the YAML configs so edits on disk are picked up.
# NOTE(review): only the lowercase names are refreshed here; the uppercase
# aliases assigned in the setup cell keep pointing at the earlier objects.
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Date window spanning the start of training through the end of modeling
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Retrieve market data; keep the first frame returned by the coverage split
# and discard the second
market_data_df = td.retrieve_market_data()
market_data_df, _ = cwm.split_dataframe_by_coverage(
    market_data_df,
    start_date,
    end_date,
    id_column='coin_id',
)

# Independent copy of just the price columns
prices_df = market_data_df.loc[:, ['coin_id', 'date', 'price']].copy()


In [None]:
# Reuse a profits_df left in the kernel by an earlier run, or start from None
profits_df = globals().get('profits_df')

profits_df = i.rebuild_profits_df_if_necessary(
    config, modeling_folder, prices_df, profits_df
)

# When configured, keep only market data rows whose coin_id appears in profits_df
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df.loc[
        market_data_df['coin_id'].isin(profits_df['coin_id'])
    ]


In [None]:
# Build the train/test splits from the profits and market data, driven by the
# three loaded configs. NOTE(review): which features and target end up in the
# splits is decided inside i.build_configured_model_input (insights module).
X_train, X_test, y_train, y_test = i.build_configured_model_input(
                                    profits_df,
                                    market_data_df,
                                    config,
                                    metrics_config,
                                    modeling_config)


In [None]:
# 3.4 Train the model using the current configuration and log the results.
# The returned model_id is passed on to evaluate_model below.
model, model_id = m.train_model(
                    X_train,
                    y_train,
                    modeling_folder,
                    modeling_config['modeling']['model_params'])

# 3.5 Evaluate and save the model's performance on the test set to a CSV
metrics_df = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)
# Bare expression so the notebook renders metrics_df as the cell output
metrics_df

In [None]:
import pandas as pd
import logging

# Module-level logger used by the functions defined in this cell.
# NOTE(review): this rebinds the notebook-wide `logger` that the setup cell
# created with dc.setup_logger() -- confirm that replacing it is intended.
logger = logging.getLogger(__name__)

def prepare_and_compute_performance(prices_df, training_data_config):
    """
    Prepares the data and computes price performance for each coin.

    Performance is the relative price change over the modeling period:
    (price at period end - price at period start) / price at period start.
    If a coin has several rows for the same date, the first one is used.
    A start price of 0 is not rejected; the division then yields inf or NaN.

    Parameters:
    - prices_df: DataFrame containing price data with columns 'coin_id', 'date', and 'price'.
        The input is not modified.
    - training_data_config: Configuration with 'modeling_period_start' and
        'modeling_period_end' dates.

    Returns:
    - performance_df: DataFrame with columns 'coin_id' and 'performance', sorted by coin_id.
        Both columns are present even when no rows fall inside the modeling period.
    - outcomes_df: DataFrame with columns 'coin_id' and 'outcome' tracking the outcome
        for each coin.

    Raises:
    - ValueError: if any coin with rows inside the modeling period lacks a price on the
        period's start date or end date.
    """
    prices_df = prices_df.copy()
    prices_df['date'] = pd.to_datetime(prices_df['date'])
    modeling_period_start = pd.to_datetime(training_data_config['modeling_period_start'])
    modeling_period_end = pd.to_datetime(training_data_config['modeling_period_end'])

    modeling_period_df = prices_df.loc[
        prices_df['date'].between(modeling_period_start, modeling_period_end)
    ]

    def _first_price_on(day):
        # One price per coin for the given day, indexed by coin_id; duplicates
        # are resolved by keeping the first row in the original order.
        on_day = modeling_period_df.loc[modeling_period_df['date'] == day]
        return on_day.drop_duplicates(subset='coin_id').set_index('coin_id')['price']

    start_prices = _first_price_on(modeling_period_start)
    end_prices = _first_price_on(modeling_period_end)

    # Check for missing data: every coin in the window needs both boundary prices
    all_coins = set(modeling_period_df['coin_id'])
    coins_missing_price = all_coins - (set(start_prices.index) & set(end_prices.index))

    if coins_missing_price:
        # Sorted so the error message is deterministic across runs
        missing = ', '.join(sorted(map(str, coins_missing_price)))
        raise ValueError(f"Missing price for coins at start or end date: {missing}")

    # Compute performance for all coins at once; the two series share the same
    # coin_id index after the check above, so the arithmetic aligns per coin.
    performance = ((end_prices - start_prices) / start_prices).sort_index()

    # Explicit columns keep the schema stable even when there are no coins
    performance_df = pd.DataFrame({
        'coin_id': performance.index.to_numpy(),
        'performance': performance.to_numpy(),
    })
    outcomes_df = performance_df[['coin_id']].assign(outcome='performance calculated')

    return performance_df, outcomes_df

def _ensure_minimum_flagged(target_variables_df, flag_column, minimum_percent, pick_highest):
    """
    Tops up `flag_column` in place until at least int(rows * minimum_percent) rows are flagged.

    Extra rows are taken from the currently unflagged ones, choosing the highest
    'performance' values when pick_highest is True and the lowest otherwise.
    """
    total_coins = len(target_variables_df)
    already_flagged = int(target_variables_df[flag_column].sum())

    # Comparing counts instead of ratios avoids dividing by total_coins, so an
    # empty frame is simply a no-op.
    shortfall = int(total_coins * minimum_percent) - already_flagged
    if shortfall <= 0:
        return

    unflagged = target_variables_df[target_variables_df[flag_column] == 0]
    if pick_highest:
        candidates = unflagged.nlargest(shortfall, 'performance')
    else:
        candidates = unflagged.nsmallest(shortfall, 'performance')
    target_variables_df.loc[candidates.index, flag_column] = 1


def calculate_mooncrater_targets(performance_df, modeling_config):
    """
    Calculates 'is_moon' and 'is_crater' target variables based on performance.

    A coin is a moon when its performance is >= moon_threshold and a crater when it is
    <= crater_threshold. If fewer than int(coins * minimum percent) coins qualify for a
    class, the best-performing (moons) or worst-performing (craters) coins not yet in
    that class are added until the minimum count is reached.

    NOTE(review): the two top-ups are independent, so with few coins the same coin can
    end up flagged as both moon and crater -- confirm whether that is acceptable.

    Parameters:
    - performance_df: DataFrame with columns 'coin_id' and 'performance'. Not modified.
    - modeling_config: Configuration for modeling; reads 'moon_threshold',
        'crater_threshold', 'moon_minimum_percent' and 'crater_minimum_percent'
        from its 'target_variables' section.

    Returns:
    - target_variables_df: DataFrame with columns 'coin_id', 'is_moon', and 'is_crater'.
    """
    target_settings = modeling_config['target_variables']
    moon_threshold = target_settings['moon_threshold']
    crater_threshold = target_settings['crater_threshold']
    moon_minimum_percent = target_settings['moon_minimum_percent']
    crater_minimum_percent = target_settings['crater_minimum_percent']

    # Threshold-based flags, computed on a copy so the caller's frame is untouched
    target_variables_df = performance_df.copy()
    target_variables_df['is_moon'] = (target_variables_df['performance'] >= moon_threshold).astype(int)
    target_variables_df['is_crater'] = (target_variables_df['performance'] <= crater_threshold).astype(int)

    # Ensure minimum percentage for moons and craters
    _ensure_minimum_flagged(target_variables_df, 'is_moon', moon_minimum_percent, pick_highest=True)
    _ensure_minimum_flagged(target_variables_df, 'is_crater', crater_minimum_percent, pick_highest=False)

    return target_variables_df[['coin_id', 'is_moon', 'is_crater']]

def create_target_variables(prices_df, training_data_config, modeling_config):
    """
    Entry point that turns price data into model target variables.

    Price performance is computed first; the targets are then derived from it
    according to modeling_config's 'target_variable_type' (defaults to
    'mooncrater', the only supported value). A summary line is logged at INFO.

    Parameters:
    - prices_df: DataFrame containing price data with columns 'coin_id', 'date', and 'price'.
    - training_data_config: Configuration with modeling period dates.
    - modeling_config: Configuration for modeling with target variable settings.

    Returns:
    - target_variables_df: DataFrame with target variables.
    - performance_df: DataFrame with price performance data.
    - outcomes_df: DataFrame tracking outcomes for each coin.

    Raises:
    - ValueError: if the configured target variable type is not 'mooncrater'.
    """
    performance_df, outcomes_df = prepare_and_compute_performance(prices_df, training_data_config)

    # Guard clause: reject anything other than the one implemented scheme
    target_variable_type = modeling_config.get('target_variable_type', 'mooncrater')
    if target_variable_type != 'mooncrater':
        raise ValueError(f"Unsupported target variable type: {target_variable_type}")

    target_variables_df = calculate_mooncrater_targets(performance_df, modeling_config)

    # Summarise the class balance for the log
    coin_count = len(target_variables_df)
    moon_count = target_variables_df['is_moon'].sum()
    crater_count = target_variables_df['is_crater'].sum()
    moon_share = f"{moon_count/coin_count:.2%}"
    crater_share = f"{crater_count/coin_count:.2%}"

    logger.info(
        "Target variables created for %s coins with %s/%s (%s) moons and %s/%s (%s) craters.",
        coin_count,
        moon_count, coin_count, moon_share,
        crater_count, coin_count, crater_share,
    )

    return target_variables_df, performance_df, outcomes_df

## Junkyard

## tests failing

In [None]:
# NOTE(review): `asdf` is not defined anywhere in the visible notebook, so this
# line raises NameError -- presumably a deliberate stop so that "Run All" does
# not continue into the junkyard cells; confirm before removing.
asdf==None

In [None]:
# Ad-hoc check: evaluates to True when profits_df already exists in the kernel namespace
'profits_df' in globals()