In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs as global variables
# NOTE: a `global` statement at module (notebook cell) level has no effect; the names
# below are module-level simply because they are assigned at the top level.
global CONFIG, METRICS_CONFIG, MODELING_CONFIG, EXPERIMENTS_CONFIG, MODELING_FOLDER
# Read each config file through the project's utils.load_config helper
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
# Upper-case aliases bound to the same objects as the lower-case names above
CONFIG = config
METRICS_CONFIG = metrics_config
MODELING_CONFIG = modeling_config
EXPERIMENTS_CONFIG = experiments_config
# Modeling folder path, read from the 'modeling' section of the modeling config
MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']
modeling_folder = MODELING_FOLDER

## Overall Sequencing

In [None]:
# Re-import the local modules so that edits to their source files are picked up
# without restarting the kernel
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
# Re-read the config files so that changes on disk are reflected in this run
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Date range passed to the coverage split below: from the start of the training
# period to the end of the modeling period
start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Retrieve market data
market_data_df = td.retrieve_market_data()
# Keep only the first frame returned by the coverage split; the second is discarded
market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
prices_df = market_data_df[['coin_id','date','price']].copy()

# retrieve profits data if necessary
# (profits_df is initialised to None on the first run so it can always be passed below)
if 'profits_df' not in globals():
    profits_df = None
profits_df = i.rebuild_profits_df_if_necessary(
                config,
                modeling_folder,
                prices_df,
                profits_df)


# filter market_data rows without transfers if configured to do so
# (a coin is kept only if its coin_id appears in profits_df)
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]
    prices_df = market_data_df[['coin_id','date','price']].copy()


In [None]:
# Re-import the local modules so that edits to their source files are picked up
# without restarting the kernel
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
# Re-read the config files so that changes on disk are reflected in this run
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Build the train/test splits and the per-coin performance frame from the
# profits and market data loaded in the previous cell
X_train, X_test, y_train, y_test, performance_df = i.build_configured_model_input(
                                    profits_df,
                                    market_data_df,
                                    config,
                                    metrics_config,
                                    modeling_config)

# 3.4 Train the model using the current configuration and log the results
model, model_id = m.train_model(
                    X_train,
                    y_train,
                    modeling_folder,
                    modeling_config['modeling']['model_params'])

# 3.5 Evaluate and save the model's performance on the test set to a CSV
metrics_df = m.evaluate_model(model, X_test, y_test, model_id, modeling_config)

In [None]:
performance_df

In [None]:
# Scratch version of the evaluation step: checks the output folders and builds
# predictions_df for the trained `model` on X_test. The file-writing and metric
# calculations further down are commented out, so nothing is written to disk here.
modeling_folder = modeling_config['modeling']['modeling_folder']

# Construct the performance metrics folder path
evaluation_folder = os.path.join(modeling_folder, "outputs", "performance_metrics")
predictions_folder = os.path.join(modeling_folder, "outputs", "predictions")

# Ensure the evaluation and predictions folders exist
# (missing folders are reported as an error rather than created)
if not os.path.exists(evaluation_folder):
    raise FileNotFoundError(f"The evaluation folder '{evaluation_folder}' does not exist.")
if not os.path.exists(predictions_folder):
    raise FileNotFoundError(f"The predictions folder '{predictions_folder}' does not exist.")

# Predict the probabilities and the labels
y_pred_prob = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class
y_pred = model.predict(X_test)

# Save predictions to CSV with 'coin_id' as the index
# (only the DataFrame is built; the to_csv call below is commented out)
predictions_df = pd.DataFrame({
    "y_pred_prob": y_pred_prob,
    "y_pred": y_pred
}, index=X_test.index)  # Use the index which includes 'coin_id'
# predictions_filename = os.path.join(predictions_folder, f"predictions_{model_id}.csv")
# predictions_df.to_csv(predictions_filename, index=True)

# # Calculate metrics
# metrics_dict = {
#     "accuracy": accuracy_score(y_test, y_pred),
#     "precision": precision_score(y_test, y_pred),
#     "recall": recall_score(y_test, y_pred),
#     "f1_score": f1_score(y_test, y_pred),
#     "roc_auc": roc_auc_score(y_test, y_pred_prob),
#     "log_loss": log_loss(y_test, y_pred_prob),
#     "confusion_matrix": confusion_matrix(y_test, y_pred).tolist()  # stored as list
# }

# # Save metrics to a CSV
# metrics_df = pd.DataFrame([metrics_dict])
# metrics_filename = os.path.join(evaluation_folder, f"metrics_{model_id}.csv")
# metrics_df.to_csv(metrics_filename, index=False)


In [None]:
performance_df.set_index('coin_id')

In [None]:
# Attach the predictions to the performance rows: values in performance_df's 'coin_id'
# column are matched against predictions_df's index, and how='inner' keeps only the
# coins that appear in both frames
merged_df = performance_df.join(predictions_df, on='coin_id', how='inner')
# Same-length, same-index series used as inputs to the profitability score below
predictions = merged_df['y_pred_prob']
performances = merged_df['performance']

In [None]:
predictions

In [None]:
performances

In [None]:
def calculate_running_profitability_score(predictions, performances):
    """
    Calculates the running profitability score for the entire series.

    Args:
    - predictions (numpy.array or pandas.Series): The model's predictions (probabilities or values).
    - performances (numpy.array or pandas.Series): The actual performance values.

    Returns:
    - tuple: Two numpy arrays - (x_values, y_values)
        x_values: Percentage of total picks (0 to 1)
        y_values: Running profitability scores

    Raises:
    - ValueError: If predictions and performances have different lengths.
    """
    if len(predictions) != len(performances):
        raise ValueError("Predictions and performances must have the same length")

    # Create a DataFrame with predictions and performances
    df = pd.DataFrame({'predictions': predictions, 'performances': performances})

    # Sort by predictions in descending order
    df_sorted = df.sort_values('predictions', ascending=False)

    total_picks = len(df_sorted)
    cumulative_model_returns = np.cumsum(df_sorted['performances'])

    # Calculate best possible returns for each number of picks
    best_possible_returns = np.sort(performances)[::-1]  # Sort performances in descending order
    cumulative_best_returns = np.cumsum(best_possible_returns)

    # Calculate running profitability scores
    running_scores = np.divide(
        cumulative_model_returns,
        cumulative_best_returns,
        out=np.zeros_like(cumulative_model_returns),
        where=cumulative_best_returns != 0
    )

    # Create x-values (percentage of total picks)
    x_values = np.arange(1, total_picks + 1) / total_picks

    return x_values, running_scores


x_values,running_scores = calculate_running_profitability_score(predictions, performances)

In [None]:
# Create a DataFrame with predictions and performances
df = pd.DataFrame({'predictions': predictions, 'performances': performances})

# Sort by predictions in descending order
df_sorted = df.sort_values('predictions', ascending=False)

# cumulative_model_returns = np.cumsum(df_sorted['performances'])

# # Calculate best possible returns for each number of picks
# best_possible_returns = np.sort(performances)[::-1]  # Sort performances in descending order
# cumulative_best_returns = np.cumsum(best_possible_returns)

# # Calculate running profitability scores
# running_profitability_scores = np.divide(
#     cumulative_model_returns,
#     cumulative_best_returns,
#     out=np.zeros_like(cumulative_model_returns),
#     where=cumulative_best_returns != 0
# )

# return running_profitability_scores

df_sorted

In [None]:
# Inspect the running profitability scores computed by calculate_running_profitability_score
running_scores

In [None]:
# Plot the running profitability score against the share of picks taken.
# plt.plot accepts numpy arrays as well as pandas Series, whereas the `.plot`
# accessor only exists on pandas objects.
plt.plot(x_values, running_scores)
plt.show()

In [None]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

def normal_input_data():
    """
    Provide a typical (prediction, performance) sample for testing.

    Each pair couples a model prediction with the outcome that was actually
    observed for it.

    Returns:
        tuple: Containing predictions and performances as numpy arrays.
    """
    scored_outcomes = [
        (0.6, 1), (0.9, 0), (0.2, 1),
        (0.7, 1), (0.3, 0), (0.5, 1),
        (0.1, 0), (0.8, 1), (0.4, 0),
    ]
    scores, outcomes = zip(*scored_outcomes)
    return np.array(scores), np.array(outcomes)
normal_input_data=normal_input_data()
# @pytest.mark.unit
# def test_calculate_running_profitability_score_normal_case(normal_input_data):
"""
Test the calculate_running_profitability_score function with normal input.

This test verifies that the function correctly calculates running profitability
scores for a typical set of predictions and performances where predictions
are good but not perfect.
"""
predictions, performances = normal_input_data

x_values, y_values = m.calculate_running_profitability_score(predictions, performances)

expected_x = np.array([0.11111111, 0.22222222, 0.33333333, 0.44444444, 0.55555556,
                        0.66666667, 0.77777778, 0.88888889, 1.        ])
expected_y = np.array([0.        , 0.5       , 0.66666667, 0.75      , 0.6       ,
                        0.66666667, 0.71428571, 0.75      , 0.66666667])

assert np.allclose(x_values, expected_x, atol=1e-4)
assert np.allclose(y_values, expected_y, atol=1e-4)

# Additional assertions to check specific properties
assert y_values[0] == 0.0  # First value should be 0.0 in this case
assert np.all(y_values <= 1.0)  # All values should be <= 1.0
assert np.all(y_values >= 0.0)  # All values should be >= 0.0
assert np.all(np.diff(x_values) > 0)  # x_values should be strictly increasing

In [None]:
x_values

In [None]:
# Count the pairs of items whose prediction ordering agrees with their performance ordering.
# The loop indices are deliberately not named `i`: `i` is the alias of the imported
# `insights` module, and rebinding it to an int would break later `i.<function>(...)`
# and `importlib.reload(i)` calls.
correct_order = 0
num_items = len(predictions)
for first in range(num_items):
    for second in range(first + 1, num_items):
        if (predictions[first] > predictions[second] and performances[first] >= performances[second]) or \
           (predictions[first] < predictions[second] and performances[first] <= performances[second]):
            correct_order += 1

total_comparisons = num_items * (num_items - 1) // 2
# With fewer than two items there are no pairs to compare; report 0 instead of dividing by zero
correctness_percentage = correct_order / total_comparisons * 100 if total_comparisons else 0.0

print(f"Percentage of correct orderings: {correctness_percentage:.2f}%")

In [None]:
predictions = np.array([0.6, 0.9, 0.2, 0.7, 0.3, 0.5, 0.1, 0.8, 0.4])
performances = np.array([1, 0, 1, 1, 0, 1, 0, 1, 0])
# Create a DataFrame with predictions and performances
df = pd.DataFrame({'predictions': predictions, 'performances': performances})
df.sort_values('predictions')

In [None]:
performances

In [None]:
x_values

In [None]:
y_values

In [None]:
import pandas as pd
import logging

logger = logging.getLogger(__name__)



def calculate_mooncrater_targets(performance_df, modeling_config):
    """
    Calculates 'is_moon' and 'is_crater' target variables based on performance.

    A coin is a moon when its performance is at or above the moon threshold and a
    crater when it is at or below the crater threshold. If the share of moons (or
    craters) is below the configured minimum percent, the best (or worst) performing
    coins that are not yet flagged are promoted until floor(total * minimum percent)
    coins carry the flag.

    Parameters:
    - performance_df: DataFrame with columns 'coin_id' and 'performance'. It is not modified.
    - modeling_config: Configuration for modeling with target variable thresholds. Reads
        'moon_threshold', 'crater_threshold', 'moon_minimum_percent' and
        'crater_minimum_percent' from its 'target_variables' section.

    Returns:
    - target_variables_df: DataFrame with columns 'coin_id', 'is_moon', and 'is_crater'.
    """
    moon_threshold = modeling_config['target_variables']['moon_threshold']
    crater_threshold = modeling_config['target_variables']['crater_threshold']
    moon_minimum_percent = modeling_config['target_variables']['moon_minimum_percent']
    crater_minimum_percent = modeling_config['target_variables']['crater_minimum_percent']

    target_variables_df = performance_df.copy()
    target_variables_df['is_moon'] = (target_variables_df['performance'] >= moon_threshold).astype(int)
    target_variables_df['is_crater'] = (target_variables_df['performance'] <= crater_threshold).astype(int)

    total_coins = len(target_variables_df)

    # Nothing to top up for an empty frame; returning early also avoids a 0/0 division
    if total_coins == 0:
        return target_variables_df[['coin_id', 'is_moon', 'is_crater']]

    moons = int(target_variables_df['is_moon'].sum())
    craters = int(target_variables_df['is_crater'].sum())

    # Ensure minimum percentage for moons and craters.
    # The product is rounded before truncation so that float artifacts such as
    # 100 * 0.29 == 28.999999999999996 do not drop a coin from the target count.
    # NOTE(review): candidates are filtered only on their own flag, so a coin can end
    # up flagged as both moon and crater - confirm whether that is intended.
    if moons / total_coins < moon_minimum_percent:
        additional_moons_needed = int(round(total_coins * moon_minimum_percent, 9)) - moons
        moon_candidates = target_variables_df[target_variables_df['is_moon'] == 0].nlargest(additional_moons_needed, 'performance')
        target_variables_df.loc[moon_candidates.index, 'is_moon'] = 1

    if craters / total_coins < crater_minimum_percent:
        additional_craters_needed = int(round(total_coins * crater_minimum_percent, 9)) - craters
        crater_candidates = target_variables_df[target_variables_df['is_crater'] == 0].nsmallest(additional_craters_needed, 'performance')
        target_variables_df.loc[crater_candidates.index, 'is_crater'] = 1

    return target_variables_df[['coin_id', 'is_moon', 'is_crater']]


In [None]:
importlib.reload(fe)
target_variables_df, performance_df, outcomes_df = fe.create_target_variables(market_data_df, config['training_data'], modeling_config)

In [None]:
performance_df

## Junkyard

## tests failing

In [None]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)




def valid_prices_df():
    """
    Build a sample price DataFrame in which every coin has one price on each of two dates.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-12-31', [35000, 2500, 0.6]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })
valid_prices_df=valid_prices_df()

def valid_training_data_config():
    """
    Build a sample training data configuration holding the modeling period boundaries.
    """
    period_start, period_end = '2023-01-01', '2023-12-31'
    return dict(
        modeling_period_start=period_start,
        modeling_period_end=period_end,
    )
valid_training_data_config=valid_training_data_config()

def no_change_prices_df():
    """
    Build a sample price DataFrame in which some coins end at the price they started at.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-12-31', [30000, 2500, 0.5]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })
no_change_prices_df=no_change_prices_df()

def negative_performance_prices_df():
    """
    Build a sample price DataFrame in which some coins end below the price they started at.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-12-31', [25000, 2500, 0.4]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })
negative_performance_prices_df=negative_performance_prices_df()

def multiple_datapoints_prices_df():
    """
    Build a sample price DataFrame with intermediate data points between the first and last dates.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-06-15', [32000, 2200, 0.55]),
        ('2023-09-30', [34000, 2400, 0.58]),
        ('2023-12-31', [35000, 2500, 0.6]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })
multiple_datapoints_prices_df=multiple_datapoints_prices_df()


"""
Test prepare_and_compute_performance function with multiple data points between start and end dates.

This test ensures that the function correctly calculates performance using only start and end dates,
ignoring intermediate data points.
"""
performance_df, outcomes_df = fe.prepare_and_compute_performance(multiple_datapoints_prices_df, valid_training_data_config)

expected_performance = pd.DataFrame({
    'coin_id': ['BTC', 'ETH', 'XRP'],
    'performance': [0.1667, 0.25, 0.2]
})

assert (np.isclose(performance_df['performance'].values, expected_performance['performance'].values, rtol=1e-4, atol=1e-4)).all()

expected_outcomes = pd.DataFrame({
    'coin_id': ['BTC', 'ETH', 'XRP'],
    'outcome': ['performance calculated'] * 3
})

assert np.array_equal(outcomes_df.values, expected_outcomes.values)

In [None]:
assert np.all(np.isclose(performance_df['performance'].values, expected_performance['performance'].values, rtol=1e-4, atol=1e-4))


In [None]:
expected_performance

In [None]:

def negative_performance_prices_df():
    """
    Build a sample price DataFrame in which some coins end below the price they started at.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-12-31', [25000, 2500, 0.4]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })
negative_performance_prices_df=negative_performance_prices_df()

def test_prepare_and_compute_performance_negative(negative_performance_prices_df, valid_training_data_config):
    """
    Test prepare_and_compute_performance function with negative performance for some coins.

    This test ensures that the function correctly calculates negative performance values
    for coins with price decreases and correct performance for others.
    """
    performance_df, _ = fe.prepare_and_compute_performance(negative_performance_prices_df, valid_training_data_config)

    expected_performance = pd.DataFrame({
        'coin_id': ['BTC', 'ETH', 'XRP'],
        'performance': [-0.1667, 0.25, -0.2]
    })

    # Coin ids are compared exactly (this also pins the row count and order).
    # The performance column must NOT be compared exactly: the expected values are
    # rounded to 4 decimals, so an exact np.array_equal on .values can never pass.
    assert performance_df['coin_id'].tolist() == expected_performance['coin_id'].tolist()

    for actual, expected in zip(performance_df['performance'], expected_performance['performance']):
        assert actual == pytest.approx(expected, abs=1e-4)

@pytest.fixture
def multiple_datapoints_prices_df():
    """
    Fixture providing a sample price DataFrame with intermediate data points between the first and last dates.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-06-15', [32000, 2200, 0.55]),
        ('2023-09-30', [34000, 2400, 0.58]),
        ('2023-12-31', [35000, 2500, 0.6]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })

@pytest.mark.unit
def test_prepare_and_compute_performance_multiple_datapoints(multiple_datapoints_prices_df, valid_training_data_config):
    """
    Test prepare_and_compute_performance function with multiple data points between start and end dates.

    This test ensures that the function correctly calculates performance using only start and end dates,
    ignoring intermediate data points.
    """
    performance_df, _ = fe.prepare_and_compute_performance(multiple_datapoints_prices_df, valid_training_data_config)

    expected_performance = pd.DataFrame({
        'coin_id': ['BTC', 'ETH', 'XRP'],
        'performance': [0.1667, 0.25, 0.2]
    })

    # Coin ids are compared exactly (this also pins the row count and order).
    # The performance column must NOT be compared exactly: the expected values are
    # rounded to 4 decimals, so an exact np.array_equal on .values can never pass.
    assert performance_df['coin_id'].tolist() == expected_performance['coin_id'].tolist()

    for actual, expected in zip(performance_df['performance'], expected_performance['performance']):
        assert actual == pytest.approx(expected, abs=1e-4)

In [None]:
def test_prepare_and_compute_performance_negative(negative_performance_prices_df, valid_training_data_config):
    """
    Test prepare_and_compute_performance function with negative performance for some coins.

    This test ensures that the function correctly calculates negative performance values
    for coins with price decreases and correct performance for others.
    """
    performance_df, _ = fe.prepare_and_compute_performance(negative_performance_prices_df, valid_training_data_config)

    expected_performance = pd.DataFrame({
        'coin_id': ['BTC', 'ETH', 'XRP'],
        'performance': [-0.1667, 0.25, -0.2]
    })

    # Coin ids are compared exactly (this also pins the row count and order).
    # The performance column must NOT be compared exactly: the expected values are
    # rounded to 4 decimals, so an exact np.array_equal on .values can never pass.
    assert performance_df['coin_id'].tolist() == expected_performance['coin_id'].tolist()

    for actual, expected in zip(performance_df['performance'], expected_performance['performance']):
        assert actual == pytest.approx(expected, abs=1e-4)

@pytest.fixture
def multiple_datapoints_prices_df():
    """
    Fixture providing a sample price DataFrame with intermediate data points between the first and last dates.
    """
    coins = ['BTC', 'ETH', 'XRP']
    # One (date, prices-per-coin) entry per snapshot; prices follow the order of `coins`
    snapshots = [
        ('2023-01-01', [30000, 2000, 0.5]),
        ('2023-06-15', [32000, 2200, 0.55]),
        ('2023-09-30', [34000, 2400, 0.58]),
        ('2023-12-31', [35000, 2500, 0.6]),
    ]
    return pd.DataFrame({
        'coin_id': coins * len(snapshots),
        'date': [day for day, quotes in snapshots for _ in quotes],
        'price': [quote for _, quotes in snapshots for quote in quotes],
    })

@pytest.mark.unit
def test_prepare_and_compute_performance_multiple_datapoints(multiple_datapoints_prices_df, valid_training_data_config):
    """
    Test prepare_and_compute_performance function with multiple data points between start and end dates.

    This test ensures that the function correctly calculates performance using only start and end dates,
    ignoring intermediate data points.
    """
    performance_df, _ = fe.prepare_and_compute_performance(multiple_datapoints_prices_df, valid_training_data_config)

    expected_performance = pd.DataFrame({
        'coin_id': ['BTC', 'ETH', 'XRP'],
        'performance': [0.1667, 0.25, 0.2]
    })

    # Coin ids are compared exactly (this also pins the row count and order).
    # The performance column must NOT be compared exactly: the expected values are
    # rounded to 4 decimals, so an exact np.array_equal on .values can never pass.
    assert performance_df['coin_id'].tolist() == expected_performance['coin_id'].tolist()

    for actual, expected in zip(performance_df['performance'], expected_performance['performance']):
        assert actual == pytest.approx(expected, abs=1e-4)