In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# Configure the notebook logger; start verbose, later cells drop it to INFO after reloading.
logger = dc.setup_logger()
logger.setLevel(logging.DEBUG)

# Display floats with up to 12 significant digits.
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs. Names assigned at the top level of a cell are already module
# globals, so no `global` declaration is needed (it is a no-op outside a function).
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# Upper-case aliases kept for code that refers to the configs as constants.
CONFIG = config
METRICS_CONFIG = metrics_config
MODELING_CONFIG = modeling_config
EXPERIMENTS_CONFIG = experiments_config
MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']
modeling_folder = MODELING_FOLDER

## Overall Sequencing

In [None]:
# Pick up edits to the local modules and YAML configs without restarting the kernel.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


start_date = config['training_data']['training_period_start']
end_date = config['training_data']['modeling_period_end']

# Retrieve market data unless an earlier run left it in the kernel.
# Both frames are checked: market_data_df is read below and by the modeling cell,
# so a kernel holding only prices_df must not skip the retrieval.
if 'prices_df' not in globals() or 'market_data_df' not in globals():
    market_data_df = td.retrieve_market_data()
    market_data_df, _ = cwm.split_dataframe_by_coverage(market_data_df, start_date, end_date, id_column='coin_id')
    prices_df = market_data_df[['coin_id','date','price']].copy()

# Hand any in-memory profits_df (or None on a fresh kernel) to the rebuild helper,
# which decides whether it has to be regenerated.
profits_df = i.rebuild_profits_df_if_necessary(
                config,
                modeling_folder,
                prices_df,
                globals().get('profits_df'))

# filter market_data rows without transfers if configured to do so
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]
    prices_df = market_data_df[['coin_id','date','price']].copy()


In [None]:
modeling_config.get('target_variable_type', 'mooncrater')

In [None]:
# Reload local modules and configs so this cell always runs against the latest edits.
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)


# Build the train/test split plus the test-set returns from the frames prepared in the
# data-retrieval cell (profits_df and market_data_df must already exist in the kernel).
X_train, X_test, y_train, y_test, returns_test = i.build_configured_model_input(
                                    profits_df,
                                    market_data_df,
                                    config,
                                    metrics_config,
                                    modeling_config)

# Train the model using the current configuration and log the results
# (the "3.4"/"3.5" step numbers this cell used to carry refer to a pipeline outline not shown here)
model, model_id = m.train_model(
                    X_train,
                    y_train,
                    modeling_folder,
                    modeling_config)

# Evaluate and save the model performance on the test set to a CSV
metrics_dict = m.evaluate_model(model, X_test, y_test, model_id, returns_test, modeling_config)
metrics_dict

In [None]:
metrics_dict

In [None]:
# Profitability AUC of the predicted probabilities against the test-set returns
# (no inversion happens here; the inverted/downside variants are in the cells below).
# NOTE(review): y_pred_prob and returns_df_test are assigned in cells further down the
# notebook, so this cell fails on a fresh kernel unless those cells have been run first.
auc = m.calculate_profitability_auc(y_pred_prob,
                            returns_df_test['returns'],
                            modeling_config["evaluation"]["metrics"]["profitability_auc"]["top_percentage_filter"],
                            modeling_config["evaluation"]["winsorization_cutoff"])

auc

In [None]:
importlib.reload(m)
# Downside profitability AUC via the dedicated helper in the modeling module, using the
# same top-percentage filter and winsorization cutoff as the regular profitability AUC.
# NOTE(review): relies on y_pred_prob / returns_df_test from cells further down the notebook.
downside_auc = m.calculate_downside_profitability_auc(y_pred_prob,
                            returns_df_test['returns'],
                            modeling_config["evaluation"]["metrics"]["profitability_auc"]["top_percentage_filter"],
                            modeling_config["evaluation"]["winsorization_cutoff"])

downside_auc

In [None]:
# Manual downside AUC: flip the sign of returns and invert the predictions, then reuse
# the regular profitability AUC on the flipped inputs.
# NOTE(review): this negates the whole returns_df_test object, whereas the other AUC cells
# pass returns_df_test['returns']; and `predictions` is assigned in a later cell — confirm
# both are intended. An identical cell that also displays the result follows this one.

# make negative returns the highest values
returns_neg = returns_df_test * -1

# find the inverse of model predictions
predictions_neg = 1 - predictions

# calculate the normal AUC on inverted numbers
downside_auc = m.calculate_profitability_auc(predictions_neg,
                            returns_neg,
                            modeling_config["evaluation"]["metrics"]["profitability_auc"]["top_percentage_filter"],
                            modeling_config["evaluation"]["winsorization_cutoff"])


In [None]:
# Manual downside AUC: flip the sign of returns and invert the predictions, then reuse
# the regular profitability AUC on the flipped inputs and display the result.
# NOTE(review): this negates the whole returns_df_test object, whereas the other AUC cells
# pass returns_df_test['returns']; and `predictions` is assigned in a later cell — confirm
# both are intended.

# make negative returns the highest values
returns_neg = returns_df_test * -1

# find the inverse of model predictions
predictions_neg = 1 - predictions

# calculate the normal AUC on inverted numbers
downside_auc = m.calculate_profitability_auc(predictions_neg,
                            returns_neg,
                            modeling_config["evaluation"]["metrics"]["profitability_auc"]["top_percentage_filter"],
                            modeling_config["evaluation"]["winsorization_cutoff"])

downside_auc

In [None]:
# Alias used by the AUC and evaluation cells. NOTE(review): several cells above read
# returns_df_test, and returns_df itself is only assigned much further down the notebook.
returns_df_test = returns_df

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix, log_loss

modeling_folder = modeling_config['modeling']['modeling_folder']

# Output locations for the metrics and the per-coin predictions
evaluation_folder = os.path.join(modeling_folder, "outputs", "performance_metrics")
predictions_folder = os.path.join(modeling_folder, "outputs", "predictions")

# Both folders must already exist; nothing is created here
for folder_label, folder_path in (("evaluation", evaluation_folder),
                                  ("predictions", predictions_folder)):
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The {folder_label} folder '{folder_path}' does not exist.")

# Positive-class probabilities and hard labels for the test set
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Persist the predictions, keeping X_test's index (which includes 'coin_id') as the CSV index
predictions_df = pd.DataFrame(
    {"y_pred_prob": y_pred_prob, "y_pred": y_pred},
    index=X_test.index,
)
predictions_filename = os.path.join(predictions_folder, f"predictions_{model_id}.csv")
predictions_df.to_csv(predictions_filename, index=True)

# Metric name -> zero-argument calculator, listed in the order the results are stored
metrics_request = modeling_config['evaluation']['metrics']
metric_calculators = {
    "accuracy": lambda: accuracy_score(y_test, y_pred),
    "precision": lambda: precision_score(y_test, y_pred),
    "recall": lambda: recall_score(y_test, y_pred),
    "f1_score": lambda: f1_score(y_test, y_pred),
    "roc_auc": lambda: roc_auc_score(y_test, y_pred_prob),
    "log_loss": lambda: log_loss(y_test, y_pred_prob),
    # stored as a nested list
    "confusion_matrix": lambda: confusion_matrix(y_test, y_pred).tolist(),
    "profitability_auc": lambda: m.calculate_profitability_auc(
        y_pred_prob,
        returns_df_test['returns'],
        metrics_request["profitability_auc"]["top_percentage_filter"],
        modeling_config["evaluation"]["winsorization_cutoff"],
    ),
}

# Only compute the metrics named in the config
metrics_dict = {}
for metric_name, compute_metric in metric_calculators.items():
    if metric_name in metrics_request:
        metrics_dict[metric_name] = compute_metric()

In [None]:
metrics_dict

In [None]:
returns_df

In [None]:
y_test

In [None]:
'accuracy' in modeling_config['evaluation']['metrics']

In [None]:
# Calculate metrics
metrics_dict = {}
metrics_request = modeling_config['evaluation']['metrics']


# The draft stopped at a dangling assignment (a SyntaxError); completed with the same
# accuracy calculation the full evaluation cell uses. Requires y_test / y_pred and the
# sklearn metric imports from that cell to be present in the kernel.
if "accuracy" in metrics_request:
    metrics_dict['accuracy'] = accuracy_score(y_test, y_pred)

In [None]:
# Attach each coin's predictions to its returns: returns_df's 'coin_id' column is matched
# against predictions_df's index, and only coins present in both are kept (inner join).
merged_df = returns_df.join(predictions_df, on='coin_id', how='inner')
predictions = merged_df['y_pred_prob']
# NOTE(review): other cells read returns_df['returns']; confirm 'performance' is the
# column name returns_df actually carries.
performances = merged_df['performance']

In [None]:
running_profitability_scores = m.calculate_running_profitability_score(predictions, performances)

In [None]:
def calculate_summary_score(running_scores, top_n_percentile=1.0):
    """
    Summarizes the running profitability scores into a single score using
    trapezoidal integration (similar to AUC), focusing on the top_n_percentile.

    The x-axis is the fraction of picks made so far (1/N, 2/N, ..., 1), so the first
    int(N * top_n_percentile) scores are integrated over the matching x-values.

    Args:
    - running_scores (numpy.array, pandas.Series or list): The running profitability
        scores, ordered from the first pick to the last.
    - top_n_percentile (float, optional): The leading share of scores to consider
        (0 < value <= 1). Defaults to 1.0 (use 100% of the data).

    Returns:
    - float: The area under the running-score curve over the selected share. This is 0.0
        when fewer than two scores are selected, since there is no interval to integrate.

    Raises:
    - ValueError: If top_n_percentile is outside (0, 1].
    """
    # Ensure the top_n_percentile is between 0 and 1
    if not 0 < top_n_percentile <= 1:
        raise ValueError("top_n_percentile must be between 0 and 1")

    # Work positionally on a plain array so lists, arrays and Series behave alike
    scores = np.asarray(running_scores)
    total_picks = len(scores)
    top_n_count = int(total_picks * top_n_percentile)

    # Limit the x_values and scores to the top_n_percentile
    x_values = (np.arange(1, total_picks + 1) / total_picks)[:top_n_count]
    top_scores = scores[:top_n_count]

    # np.trapezoid only exists in NumPy >= 2.0; fall back to np.trapz on older releases
    trapezoid_rule = getattr(np, "trapezoid", None) or np.trapz

    return trapezoid_rule(top_scores, x=x_values)




In [None]:
# Print the summary score at each decile cut-off (10%, 20%, ..., 100% of the picks)
for decile in range(1, 11):
    print(calculate_summary_score(running_profitability_scores, decile / 10))

In [None]:
merged_df.sample(10)

In [None]:
import numpy as np
import pandas as pd
from scipy import integrate

def calculate_profitability_auc(predictions, performances, top_percentage_filter=1.0):
    """
    Calculates the Profitability AUC (Area Under the Curve) metric for the top percentage of predictions.

    The rows are ranked by prediction (highest first), the top share is kept, the running
    profitability scores for those rows are obtained from the modeling module, and the
    scores are integrated over an x-axis running from 0 to 1.

    Args:
    - predictions (numpy.array or pandas.Series): The model's predictions (probabilities or values).
    - performances (numpy.array or pandas.Series): The actual performance values.
    - top_percentage_filter (float): The top percentage of predictions to consider, 0 < value <= 1.

    Returns:
    - profitability_auc (float): The area under the running profitability curve for the
        filtered data. The value is returned as integrated, without any normalization.

    Raises:
    - ValueError: If top_percentage_filter is outside (0, 1], or if fewer than two rows
        remain after filtering.
    """
    if not 0 < top_percentage_filter <= 1:
        raise ValueError("top_percentage_filter must be between 0 and 1")

    # Sort predictions and performances
    df = pd.DataFrame({'predictions': predictions, 'performances': performances})
    df_sorted = df.sort_values('predictions', ascending=False)

    # Filter top percentage
    n_top = int(len(predictions) * top_percentage_filter)
    if n_top < 2:
        raise ValueError("Filtered dataset is too small for meaningful calculation")

    df_filtered = df_sorted.head(n_top)

    # The running-score helper lives in the modeling module (imported as `m`); the bare
    # name is not defined anywhere in this notebook and raised a NameError.
    running_scores = m.calculate_running_profitability_score(
        df_filtered['predictions'], df_filtered['performances'])

    # Create x-axis values (fraction of filtered predictions)
    x = np.linspace(0, 1, len(running_scores))

    # Trapezoidal rule; np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid,
    # which in turn does not exist before 2.0, so use whichever is available.
    trapezoid_rule = getattr(np, "trapezoid", None) or np.trapz
    auc = trapezoid_rule(running_scores, x)

    return auc  # Return AUC without normalization


In [None]:
# Worked example for m.calculate_profitability_auc: every intermediate value is recomputed
# by hand with NumPy and asserted, then the module function's output is compared with it.
importlib.reload(m)

predictions = np.array([0.55, 0.07, 0.14, 0.02, 0.07, 0.64, 0.04, 0.00, 0.02, 0.39])
performances = np.array([0.46, -0.1, -0.09, -0.09, -0.01, 0.57, -0.1, -0.01, -0.02, 2.62])
top_percentage_filter = 0.2

# Step 1: Sort predictions and performances
sorted_indices = np.argsort(predictions)[::-1]
sorted_predictions = predictions[sorted_indices]
sorted_performances = performances[sorted_indices]

# Step 2: Determine the number of top predictions to consider
n_top = int(len(predictions) * top_percentage_filter)
assert n_top == 2

# Step 3: Filter the top predictions and performances
top_predictions = np.round(sorted_predictions[:n_top], 3)
top_performances = np.round(sorted_performances[:n_top], 3)
np.testing.assert_array_equal(top_predictions, np.array([0.640, 0.550]))
np.testing.assert_array_equal(top_performances, np.array([0.570, 0.460]))

# Step 4: Calculate running profitability scores
# (cumulative return of the model's picks divided by the cumulative return of the best
# possible picks, where "best possible" is drawn from the full, unfiltered performances)
cumulative_model_returns = np.round(np.cumsum(top_performances), 3)
best_possible_returns = np.round(np.sort(performances)[::-1][:n_top], 3)
cumulative_best_returns = np.round(np.cumsum(best_possible_returns), 3)
running_profitability_scores = np.round(cumulative_model_returns / cumulative_best_returns, 3)
np.testing.assert_allclose(cumulative_model_returns, np.array([0.570, 1.030]))
np.testing.assert_allclose(best_possible_returns, np.array([2.620, 0.570]))
np.testing.assert_allclose(cumulative_best_returns, np.array([2.620, 3.190]))
np.testing.assert_allclose(running_profitability_scores, np.array([0.218, 0.323]))

# Step 5: Calculate the area under the curve
# NOTE: np.trapezoid requires NumPy >= 2.0
x = np.linspace(0, 1, n_top)
expected_auc = np.trapezoid(running_profitability_scores, x)
np.testing.assert_almost_equal(expected_auc, 0.27025, decimal=3)

# Final step: Compare with the function output
calculated_auc = m.calculate_profitability_auc(predictions, performances, top_percentage_filter)
np.testing.assert_almost_equal(calculated_auc, expected_auc, decimal=3,
                                err_msg="Calculated Profitability AUC doesn't match expected value")

In [None]:
predictions
returns

In [None]:
returns * -1

In [None]:
predictions

In [None]:
1 - predictions

In [None]:
importlib.reload(m)
m.calculate_running_profitability_score(predictions,returns)

In [None]:
predictions

In [None]:
calculate_profitability_auc(predictions, performances, top_percentage_filter=0.2)

In [None]:
predictions

In [None]:
performances

In [None]:
np.sort(performances)[::-1]  # Sort performances in descending order


In [None]:
df_sorted.sort_values('performances',ascending=False)['performances'].cumsum()

In [None]:
# # Calculate best possible returns by sorting by performance descending
df.sort_values('performances',ascending=False)['performances'].cumsum()


In [None]:
df_filtered

df_sorted = df_filtered.sort_values('predictions', ascending=False)


In [None]:
df_filtered['performances']

In [None]:
# Cumulative returns of the model's ranking versus the best possible ranking, both
# restricted to df_filtered.
# NOTE(review): in the cells shown, df_filtered is only ever assigned inside
# calculate_profitability_auc (as a local), so this cell depends on leftover kernel state.

# Calculate the cumulative profits of the model predictions
df_sorted = df_filtered.sort_values('predictions', ascending=False)
cumulative_model_returns = np.cumsum(df_sorted['performances'])

# Calculate best possible returns for each number of picks
best_possible_returns = np.sort(df_filtered['performances'])[::-1]  # Sort performances in descending order
cumulative_best_returns = np.cumsum(best_possible_returns)

# Disabled: element-wise ratio of the two cumulative series, with 0 where the
# denominator is 0 (an active copy of this calculation appears in a later cell).
# # Calculate running profitability scores
# running_profitability_scores = np.divide(
#     cumulative_model_returns,
#     cumulative_best_returns,
#     out=np.zeros_like(cumulative_model_returns),
#     where=cumulative_best_returns != 0
# )


In [None]:
cumulative_model_returns

In [None]:
cumulative_best_returns

In [None]:
# Inline re-derivation of the running profitability score over the full data set.
# NOTE(review): `returns` is first assigned in a later cell, and the model curve is built
# from `returns` while the best-possible curve is built from `performances`; the two only
# agree if those arrays hold the same values — confirm which one is intended.

# Create a DataFrame with predictions and performances
df = pd.DataFrame({'predictions': predictions, 'performances': returns})

# Calculate the cumulative profits of the model predictions
# (np.cumsum on a Series keeps the sorted row labels, unlike the plain array built below)
df_sorted = df.sort_values('predictions', ascending=False)
cumulative_model_returns = np.cumsum(df_sorted['performances'])

# Calculate best possible returns for each number of picks
best_possible_returns = np.sort(performances)[::-1]  # Sort performances in descending order
cumulative_best_returns = np.cumsum(best_possible_returns)

# Calculate running profitability scores
# (positions where the best-possible cumulative return is 0 are left at 0 instead of dividing)
running_profitability_scores = np.divide(
    cumulative_model_returns,
    cumulative_best_returns,
    out=np.zeros_like(cumulative_model_returns),
    where=cumulative_best_returns != 0
)


running_profitability_scores

In [None]:
# df_filtered['returns']=df_filtered['performances']
df_filtered

In [None]:
returns

In [None]:
performances==returns

In [None]:
# Inline version of the profitability AUC: compute the running scores over ALL rows first,
# then keep only the leading n_top scores for the integration.
importlib.reload(m)

predictions = np.array([0.55, 0.07, 0.14, 0.02, 0.07, 0.64, 0.04, 0.00, 0.02, 0.39])
returns = np.array([0.46, -0.1, -0.09, -0.09, -0.01, 0.57, -0.1, -0.01, -0.02, 2.62])


# Confirm percentage is between 0 and 1
# NOTE(review): top_percentage_filter is not assigned in this cell; it must already be in
# the kernel from an earlier cell.
if not 0 < top_percentage_filter <= 1:
    raise ValueError("top_percentage_filter must be between 0 and 1")

# Calculate the full range of profitability scores
running_scores = m.calculate_running_profitability_score(predictions, returns)

# Calculate how many scores to look at based on the percentage filter
n_top = int(len(predictions) * top_percentage_filter)
if n_top < 2:
    raise ValueError("Filtered dataset is too small for meaningful calculation")

# Limit the scores for AUC calculation to the percentile input
filtered_running_scores = running_scores[:n_top]

# Create x-axis values (fraction of filtered predictions)
x = np.linspace(0, 1, len(filtered_running_scores))

# Calculate the area under the curve using NumPy's trapezoidal rule
# (np.trapezoid requires NumPy >= 2.0)
auc = np.trapezoid(filtered_running_scores, x)

auc

In [None]:
running_scores[:n_top]

In [None]:
# Experiment: feed the top-n returns picked by the model and the top-n actual returns
# into the running-score helper, then integrate the result.
predictions = np.array([0.55, 0.07, 0.14, 0.02, 0.07, 0.64, 0.04, 0.00, 0.02, 0.39])
returns = np.array([0.46, -0.1, -0.09, -0.09, -0.01, 0.57, -0.1, -0.01, -0.02, 2.62])

# The helper lives in the modeling module; the unqualified name raised a NameError here.
m.calculate_running_profitability_score(predictions, returns)

# NOTE(review): top_percentage_filter must already be in the kernel from an earlier cell.
if not 0 < top_percentage_filter <= 1:
    raise ValueError("top_percentage_filter must be between 0 and 1")

# Pair predictions with returns
df = pd.DataFrame({'predictions': predictions, 'returns': returns})

# Filter top percentage
n_top = int(len(predictions) * top_percentage_filter)
if n_top < 2:
    raise ValueError("Filtered dataset is too small for meaningful calculation")


# Sort by model predictions and take the n_top returns
# (previously hard-coded to 2, which only matched n_top for a 0.2 filter on 10 rows)
model_returns = (df.sort_values('predictions', ascending=False)
                      ['returns'].head(n_top))

# Sort by actual returns and take the n_top returns
actual_returns = (df.sort_values('returns', ascending=False)
                      ['returns'].head(n_top))

# Calculate running profitability scores for filtered data
# NOTE(review): other cells call this helper with (predictions, returns); passing two
# return series is specific to this experiment — confirm it is intended.
running_scores = m.calculate_running_profitability_score(model_returns,actual_returns)

# Create x-axis values (fraction of filtered predictions)
x = np.linspace(0, 1, len(running_scores))

# Trapezoidal rule; np.trapezoid only exists in NumPy >= 2.0, so fall back to np.trapz
trapezoid_rule = getattr(np, "trapezoid", None) or np.trapz
auc = trapezoid_rule(running_scores, x)

running_scores

In [None]:
actual_returns

In [None]:
df_filtered

In [None]:
df_filtered

In [None]:
running_scores

In [None]:
running_profitability_scores.plot(kind='line')
plt.show()

## Junkyard

In [None]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe)
importlib.reload(m)
importlib.reload(i)
importlib.reload(u)
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')
logger.setLevel(logging.INFO)

def normal_input_data():
    """
    Sample data for the running-profitability test: nine predictions paired with
    binary performances.

    Returns:
        tuple: (predictions, performances) as numpy arrays of equal length.
    """
    # (prediction, performance) pairs, in the order the arrays are returned
    samples = [
        (0.6, 1), (0.9, 0), (0.2, 1),
        (0.7, 1), (0.3, 0), (0.5, 1),
        (0.1, 0), (0.8, 1), (0.4, 0),
    ]
    predictions = np.array([prediction for prediction, _ in samples])
    performances = np.array([performance for _, performance in samples])
    return predictions, performances
# Body of a pytest test pasted inline so it can be stepped through in the notebook.
# NOTE: this rebinds the name normal_input_data from the function to its return value
# (mimicking how pytest injects a fixture), so the function is no longer callable afterwards.
normal_input_data=normal_input_data()
# @pytest.mark.unit
# def test_calculate_running_profitability_score_normal_case(normal_input_data):
"""
Test the calculate_running_profitability_score function with normal input.

This test verifies that the function correctly calculates running profitability
scores for a typical set of predictions and performances where predictions
are good but not perfect.
"""
predictions, performances = normal_input_data

# NOTE(review): this unpacks an (x_values, y_values) pair, while other cells in this
# notebook treat the helper's result as a single array of scores — confirm which
# version of the helper this test targets.
x_values, y_values = m.calculate_running_profitability_score(predictions, performances)

expected_x = np.array([0.11111111, 0.22222222, 0.33333333, 0.44444444, 0.55555556,
                        0.66666667, 0.77777778, 0.88888889, 1.        ])
expected_y = np.array([0.        , 0.5       , 0.66666667, 0.75      , 0.6       ,
                        0.66666667, 0.71428571, 0.75      , 0.66666667])

assert np.allclose(x_values, expected_x, atol=1e-4)
assert np.allclose(y_values, expected_y, atol=1e-4)

# Additional assertions to check specific properties
assert y_values[0] == 0.0  # First value should be 0.0 in this case
assert np.all(y_values <= 1.0)  # All values should be <= 1.0
assert np.all(y_values >= 0.0)  # All values should be >= 0.0
assert np.all(np.diff(x_values) > 0)  # x_values should be strictly increasing

In [None]:
x_values

In [None]:
# Share of prediction pairs whose ordering agrees with the ordering of their performances.
# A pair counts as correct when the higher prediction has a performance at least as high;
# pairs with tied predictions are never counted as correct.
correct_order = 0
n_items = len(predictions)

# Loop variables deliberately avoid `i`, which is the alias of the insights module in this
# notebook; overwriting it breaks every later `importlib.reload(i)` / `i.<function>` call.
for left in range(n_items):
    for right in range(left + 1, n_items):
        if (predictions[left] > predictions[right] and performances[left] >= performances[right]) or \
           (predictions[left] < predictions[right] and performances[left] <= performances[right]):
            correct_order += 1

total_comparisons = n_items * (n_items - 1) // 2
# With fewer than two items there is nothing to compare; report NaN instead of dividing by zero
correctness_percentage = (correct_order / total_comparisons * 100
                          if total_comparisons else float('nan'))

print(f"Percentage of correct orderings: {correctness_percentage:.2f}%")

In [None]:
predictions = np.array([0.6, 0.9, 0.2, 0.7, 0.3, 0.5, 0.1, 0.8, 0.4])
performances = np.array([1, 0, 1, 1, 0, 1, 0, 1, 0])
# Create a DataFrame with predictions and performances
df = pd.DataFrame({'predictions': predictions, 'performances': performances})
df.sort_values('predictions')

In [None]:
performances

In [None]:
x_values

In [None]:
y_values

In [None]:
import pandas as pd
import logging

logger = logging.getLogger(__name__)



def calculate_mooncrater_targets(returns_df, modeling_config):
    """
    Calculates 'is_moon' and 'is_crater' target variables based on performance.

    A coin is a moon when its performance is >= moon_threshold and a crater when it is
    <= crater_threshold. If the share of moons (or craters) is below the configured
    minimum, the best (or worst) performing remaining coins are promoted until
    int(total_coins * minimum_percent) coins carry the flag. The two flags are computed
    independently of each other.

    Parameters:
    - returns_df: DataFrame with columns 'coin_id' and 'performance'. It is not modified.
    - modeling_config: Configuration for modeling; reads 'moon_threshold',
        'crater_threshold', 'moon_minimum_percent' and 'crater_minimum_percent' from
        modeling_config['target_variables'].

    Returns:
    - target_variables_df: DataFrame with columns 'coin_id', 'is_moon', and 'is_crater'
        (flags are 0/1 integers). An empty input yields an empty frame with these columns.
    """
    target_config = modeling_config['target_variables']
    moon_threshold = target_config['moon_threshold']
    crater_threshold = target_config['crater_threshold']
    moon_minimum_percent = target_config['moon_minimum_percent']
    crater_minimum_percent = target_config['crater_minimum_percent']

    target_variables_df = returns_df.copy()
    target_variables_df['is_moon'] = (target_variables_df['performance'] >= moon_threshold).astype(int)
    target_variables_df['is_crater'] = (target_variables_df['performance'] <= crater_threshold).astype(int)

    total_coins = len(target_variables_df)
    if total_coins == 0:
        # Nothing to top up, and the share calculations below would divide by zero
        return target_variables_df[['coin_id', 'is_moon', 'is_crater']]

    moons = int(target_variables_df['is_moon'].sum())
    craters = int(target_variables_df['is_crater'].sum())

    # Ensure minimum percentage for moons: promote the best performers not yet flagged
    if moons / total_coins < moon_minimum_percent:
        additional_moons_needed = max(int(total_coins * moon_minimum_percent) - moons, 0)
        moon_candidates = (target_variables_df[target_variables_df['is_moon'] == 0]
                           .nlargest(additional_moons_needed, 'performance'))
        target_variables_df.loc[moon_candidates.index, 'is_moon'] = 1

    # Ensure minimum percentage for craters: promote the worst performers not yet flagged
    if craters / total_coins < crater_minimum_percent:
        additional_craters_needed = max(int(total_coins * crater_minimum_percent) - craters, 0)
        crater_candidates = (target_variables_df[target_variables_df['is_crater'] == 0]
                             .nsmallest(additional_craters_needed, 'performance'))
        target_variables_df.loc[crater_candidates.index, 'is_crater'] = 1

    return target_variables_df[['coin_id', 'is_moon', 'is_crater']]


In [None]:
importlib.reload(fe)
target_variables_df, returns_df, outcomes_df = fe.create_target_variables(market_data_df, config['training_data'], modeling_config)

In [None]:
profits_df[profits_df['wallet_address']==6217496]

## Failing tests

In [None]:
def make_sample_profits_df():
    """
    Build a small profits fixture: one wallet (6217496) holding three coins, with one row
    per coin on each of four dates. Transfers and inflows are zero on every row.

    Returns:
        pd.DataFrame: 12 rows with columns coin_id, date, wallet_address,
        profits_cumulative, usd_balance, usd_net_transfers, usd_inflows,
        usd_inflows_cumulative and total_return.
    """
    coin_a = 'eeccf0b6-aaaa-464c-a23e-f2fc9e73a350'
    coin_b = '3941a874-dbdf-4f53-a38e-a1f4a80855f9'
    coin_c = '1d05fab6-0fc3-4caa-9859-81a5bdf2a7c6'

    varying_columns = [
        'coin_id', 'date', 'profits_cumulative', 'usd_balance',
        'usd_inflows_cumulative', 'total_return',
    ]
    # One tuple per row, in the column order listed above
    rows = [
        (coin_a, '2024-07-18', 494.1894, 0.0, 10150.54, 0.048686),
        (coin_b, '2024-07-18', -3.555027, 12.06798, 15.62301, -0.227551),
        (coin_c, '2024-07-18', 23462700.0, 3.426631e-09, 23047870.0, 1.017998),
        (coin_c, '2024-08-31', 23462700.0, 2.643053e-09, 23047870.0, 1.017998),
        (coin_b, '2024-08-31', -5.156542, 10.46647, 15.62301, -0.330061),
        (coin_a, '2024-08-31', 494.1894, 0.0, 10150.54, 0.048686),
        (coin_c, '2024-09-01', 23462700.0, 2.568300e-09, 23047870.0, 1.017998),
        (coin_b, '2024-09-01', -5.554576, 10.06843, 15.62301, -0.355538),
        (coin_a, '2024-09-01', 494.1894, 0.0, 10150.54, 0.048686),
        (coin_c, '2024-09-15', 23462700.0, 2.637883e-09, 23047870.0, 1.017998),
        (coin_b, '2024-09-15', -5.694496, 9.928511, 15.62301, -0.364494),
        (coin_a, '2024-09-15', 494.1894, 0.0, 10150.54, 0.048686),
    ]
    sample_df = pd.DataFrame(rows, columns=varying_columns)

    # Constant columns, inserted at the positions they occupy in the final layout
    sample_df.insert(2, 'wallet_address', 6217496)
    sample_df.insert(5, 'usd_net_transfers', 0.0)
    sample_df.insert(6, 'usd_inflows', 0.0)

    return sample_df

sample_profits_df = make_sample_profits_df()
sample_profits_df

In [None]:
# Inline copy of the profits-cleaning steps, run against the sample fixture so the
# intermediate frames can be inspected.
data_cleaning_config = config['data_cleaning']
profits_df = make_sample_profits_df()

# 1. Remove wallets with higher or lower total profits than the profitability_filter
# ----------------------------------------------------------------------------------
# Take each wallet's profits_cumulative from its last row after sorting by date.
# NOTE(review): when a wallet holds several coins, that last row belongs to a single coin
# and rows sharing the final date have no defined order — confirm this is the intended
# definition of a wallet's "total" profitability.
wallet_profits_agg_df = profits_df.sort_values('date').groupby(
    'wallet_address', observed=True)['profits_cumulative'].last().reset_index()

# Identify wallet_addresses with total profitability that exceeds the threshold.
# .copy() makes this an independent frame so the flag column added in step 3 is not
# written onto a slice of wallet_profits_agg_df.
# pylint: disable=C0301
exclusions_profits_df = wallet_profits_agg_df[
    (wallet_profits_agg_df['profits_cumulative'] >= data_cleaning_config['profitability_filter']) |
    (wallet_profits_agg_df['profits_cumulative'] <= -data_cleaning_config['profitability_filter'])
][['wallet_address']].copy()

# Anti-join: keep only the records whose wallet is NOT in the exclusion list
profits_cleaned_df = profits_df.merge(
    exclusions_profits_df, on='wallet_address', how='left', indicator=True)
profits_cleaned_df = (profits_cleaned_df[profits_cleaned_df['_merge'] == 'left_only']
                      .drop(columns=['_merge']))

# 2. Remove wallets with higher total inflows than the inflows_filter
# -------------------------------------------------------------------
# Group by wallet_address and calculate the total inflows
wallet_inflows_agg_df = profits_df.groupby(
    'wallet_address', observed=True)['usd_inflows'].sum().reset_index()

# Identify wallet addresses where total inflows exceed the threshold
exclusions_inflows_df = wallet_inflows_agg_df[
    wallet_inflows_agg_df['usd_inflows'] >= data_cleaning_config['inflows_filter']
][['wallet_address']].copy()

# Anti-join again for the inflows exclusions
profits_cleaned_df = profits_cleaned_df.merge(
    exclusions_inflows_df, on='wallet_address', how='left', indicator=True)
profits_cleaned_df = (profits_cleaned_df[profits_cleaned_df['_merge'] == 'left_only']
                      .drop(columns=['_merge']))

# Convert coin_id to categorical
# NOTE(review): this converts the input frame, not profits_cleaned_df — confirm which
# frame was meant to carry the categorical column.
profits_df['coin_id'] = profits_df['coin_id'].astype('category')

# 3. Prepare exclusions_df and output logs
# ----------------------------------------
# prepare exclusions_logs_df
exclusions_profits_df['profits_exclusion'] = True
exclusions_inflows_df['inflows_exclusion'] = True
exclusions_logs_df = exclusions_profits_df.merge(
    exclusions_inflows_df, on='wallet_address', how='outer')

# Fill NaN values with False for missing exclusions.
# The fill must happen BEFORE the bool cast: casting first turns NaN into True, which
# marked every wallet as excluded by both filters.
exclusions_logs_df['profits_exclusion'] = (exclusions_logs_df['profits_exclusion']
                                            .fillna(False).astype(bool))

exclusions_logs_df['inflows_exclusion'] = (exclusions_logs_df['inflows_exclusion']
                                            .fillna(False).astype(bool))
# log outputs (the two literals are concatenated, so the first one needs its trailing space)
logger.debug("Identified %s coin-wallet pairs beyond profit threshold of $%s and %s pairs "
                "beyond inflows filter of %s.",
                exclusions_profits_df.shape[0],
                data_cleaning_config['profitability_filter'],
                exclusions_inflows_df.shape[0],
                data_cleaning_config['inflows_filter'])


In [None]:
exclusions_logs_df

In [None]:
# Rebuild the inputs of the failing test from real data: retrieve profits, keep the coins
# returned as the first item of the coverage split, then clean them.
profits_df = td.retrieve_profits_data(config['training_data']['training_period_start'],
                                        config['training_data']['modeling_period_end'],
                                        config['data_cleaning']['minimum_wallet_inflows'])
split_profits_df, _ = cwm.split_dataframe_by_coverage(profits_df,
                                                config['training_data']['training_period_start'],
                                                config['training_data']['modeling_period_end'],
                                                id_column='coin_id')
# Keep the complete two-item result: the next cell unpacks it into
# (cleaned_df, exclusions_df). Unpacking here as well discarded the exclusions and left
# the next cell iterating over a single DataFrame.
cleaned_profits_df = td.clean_profits_df(split_profits_df, config['data_cleaning'])

In [None]:
# Unpacks cleaned_profits_df as a (cleaned_df, exclusions_df) pair, so it must hold the
# full two-item result of the cleaning step rather than a single DataFrame.
cleaned_df, exclusions_df = cleaned_profits_df

# Check that every excluded wallet breached at least one threshold
# (inner merge keeps only the profits rows that belong to excluded wallets)
exclusions_with_breaches = exclusions_df.merge(profits_df, on='wallet_address', how='inner')

# Calculate the total profits and inflows per wallet:
# profits come from the wallet's last row after sorting by date, inflows are summed
wallet_agg_df = exclusions_with_breaches.sort_values('date').groupby('wallet_address', observed=True).agg({
    'profits_cumulative': 'last',
    'usd_inflows': 'sum'
}).reset_index()