In [1]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()

# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)

# load configs
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')
experiments_config = load_config('../config/config_experiments.yaml')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [2]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

### Training Data Generation

In [3]:
logger.setLevel(logging.INFO)

# retrieve and clean prices data
prices_df = td.retrieve_prices_data()
prices_df,_ = td.fill_prices_gaps(prices_df,config['data_cleaning']['max_gap_days'])

# retrieve transfers data
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])



[14/Sep/2024 12:25:53] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[14/Sep/2024 12:25:53] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[14/Sep/2024 12:27:08] INFO [dreams_core.core.retrieve_transfers_data:333] retrieved transfers_df with shape (35657183, 5) after 74.9 seconds.
[14/Sep/2024 12:27:08] INFO [dreams_core.core.prepare_profits_data:374] Preparing profits_df data...
[14/Sep/2024 12:28:31] INFO [dreams_core.core.calculate_wallet_profitability:554] Generated profits df after 36.06 seconds
[14/Sep/2024 12:29:09] INFO [dreams_core.core.clean_profits_df:625] Finished cleaning profits_df after 38.51 seconds.


### Metrics and Feature Engineering

In [24]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
importlib.reload(m) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')


# identify sharks
shark_coins_df = td.classify_shark_coins(profits_df, config['training_data'])
shark_wallets_df = td.classify_shark_wallets(shark_coins_df,config['training_data'])

# generate and flatten buysell_metrics
cohort_wallets = shark_wallets_df[shark_wallets_df['is_shark']==True]['wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
buysell_metrics_df = cwm.generate_buysell_metrics_df(profits_df,config['training_data']['training_period_end'],cohort_wallets,cohort_coins)

# flatten, save, and preprocess the flattened df
output_directory = '..//modeling/outputs/flattened_outputs/'
metric_description = 'buysell_metrics'
modeling_period_start = config['training_data']['modeling_period_start']
version = '0.1'

flattened_buysell_metrics_df = fe.flatten_coin_date_df(buysell_metrics_df,metrics_config,config['training_data']['training_period_end'])
flattened_df, flattened_filepath = fe.save_flattened_outputs(flattened_buysell_metrics_df, output_directory, metric_description, modeling_period_start, version)
preprocessed_df, preprocessed_filepath = fe.preprocess_coin_df(flattened_filepath, modeling_config, metrics_config)

# create the training data df
input_directory = f"{preprocessed_filepath.split('preprocessed_outputs/')[0]}preprocessed_outputs/"
input_filenames = [
    preprocessed_filepath.split('preprocessed_outputs/')[1]
]
training_data_df = fe.create_training_data_df(input_directory, input_filenames)

# create the target variable df
target_variable_df,_ = fe.create_target_variables_mooncrater(prices_df, config['training_data'], modeling_config)

# merge the two into the final model input df
model_input_df = fe.prepare_model_input_df(training_data_df, target_variable_df, modeling_config['modeling']['target_column'])

[14/Sep/2024 12:50:11] INFO [dreams_core.core.classify_shark_coins:691] creation of shark_coins_df complete.
[14/Sep/2024 12:50:11] INFO [dreams_core.core.generate_buysell_metrics_df:33] Preparing buysell_metrics_df...
[14/Sep/2024 12:50:15] INFO [dreams_core.core.generate_buysell_metrics_df:99] Generated buysell_metrics_df after 3.49 seconds.
[14/Sep/2024 12:50:15] INFO [dreams_core.core.flatten_coin_date_df:84] Flattening columns ['buyers_new', 'total_bought', 'total_sold', 'total_buyers'] into coin-level features...
[14/Sep/2024 12:50:15] INFO [dreams_core.core.flatten_coin_date_df:100] Flattened input df into coin-level features with shape (89, 57) after 0.10 seconds.
[14/Sep/2024 12:50:15] INFO [dreams_core.core.preprocess_coin_df:423] Preprocessed file saved at: ..//modeling/outputs/preprocessed_outputs/buysell_metrics_0.1_2024-09-14_12-50_model_period_2024-03-01_preprocessed.csv
[14/Sep/2024 12:50:15] INFO [dreams_core.core.preprocess_coin_df:424] Dropped 1 columns: buyers_new_m

### Modeling

In [26]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
importlib.reload(m) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')
modeling_folder = modeling_config['modeling']['modeling_folder']

model_input_df.shape

# split train/test sets
X_train,X_test,y_train,y_test = m.split_model_input(model_input_df, modeling_config['modeling']['target_column'], test_size=0.2, random_state=42)

# # train model
model, model_id = m.train_model(X_train, y_train, modeling_folder, model_params=None)

# evaluate model
metrics_dict = m.evaluate_model(model, X_test, y_test, model_id, modeling_folder)

# log experiment results
experiment_log = m.log_experiment_results(modeling_folder, model_id)

metrics_dict

{'accuracy': 0.6111111111111112,
 'precision': np.float64(0.5833333333333334),
 'recall': np.float64(0.7777777777777778),
 'f1_score': np.float64(0.6666666666666666),
 'roc_auc': np.float64(0.6419753086419753),
 'log_loss': 0.6697705414644598}

In [23]:
import pytest
importlib.reload(m) 

# Create a DataFrame where the target has only one class and 10 rows
data = create_dataframe({'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'feature2': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}, [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

with pytest.raises(ValueError, match="y_train or y_test contains only one class"):
    m.split_model_input(data, 'target')

AssertionError: Regex pattern did not match.
 Regex: 'y_train or y_test contains only one class'
 Input: 'Target is heavily imbalanced. Consider rebalancing or using specialized techniques.'

In [14]:
data

Unnamed: 0,feature1,feature2,target
0,1,4,0
1,2,5,1
2,3,6,0


In [None]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# load dotenv
load_dotenv()

# import local files if necessary
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)

# load configs
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

# configure logger
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')

In [102]:
importlib.reload(td)
importlib.reload(cwm)
importlib.reload(fe) 
importlib.reload(m) 
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')
modeling_folder = modeling_config['modeling']['modeling_folder']


m.log_experiment_results(modeling_folder, model_id)

ValueError: too many values to unpack (expected 2)

In [34]:
def mock_input_df():
    """
    Creates a mock DataFrame and saves it as a CSV for testing.
    The CSV file is saved in the 'tests/test_modeling/outputs/flattened_outputs' directory.
    
    Returns:
    - input_path: Path to the CSV file.
    - df: Original mock DataFrame.
    """
    data = {
        'feature_1_sum': [1, 2, 3],
        'feature_to_drop': [10, 20, 30],
        'feature_3': [100, 200, 300]
    }
    df = pd.DataFrame(data)
    input_path = 'temp/mock_input.csv'
    df.to_csv(input_path, index=False)
    return input_path, df



def mock_modeling_config():
    """
    Returns a mock modeling configuration dictionary.
    The configuration includes preprocessing options such as features to drop.
    """
    return {
        'preprocessing': {
            'drop_features': ['feature_to_drop']
        }
    }


def mock_metrics_config():
    """
    Returns a mock metrics configuration dictionary.
    This configuration includes settings for scaling different features.
    """
    return {
        'metrics': {
            'feature_1': {
                'aggregations': {
                    'sum': {'scaling': 'standard'}
                }
            }
        }
    }

input_path, df  = mock_input_df()
mock_modeling_config = mock_modeling_config()
mock_metrics_config = mock_metrics_config()

In [36]:
output_df

Unnamed: 0,feature_1_sum,feature_3
0,-1.22474487139,100
1,0.0,200
2,1.22474487139,300


In [57]:
importlib.reload(fe) 

# def test_preprocess_coin_df_scaling(mock_modeling_config, mock_metrics_config, mock_input_df):
"""
Tests that the preprocess_coin_df function correctly applies scaling to the specified features.

Steps:
- Preprocesses the mock DataFrame by applying standard scaling to 'feature_1'.
- Asserts that the column is scaled correctly.
- Cleans up the test files after execution.
"""
input_path, original_df = mock_input_df()

# Call the function
output_df, output_path = fe.preprocess_coin_df(input_path, mock_modeling_config, mock_metrics_config)


# Check that 'feature_1' is scaled (mean should be near 0 and std should be near 1)
scaled_column = output_df['feature_1_sum']
assert abs(scaled_column.mean()) < 1e-6, "Standard scaling not applied correctly to 'feature_1_sum'."
assert abs(np.std(scaled_column) - 1) < 1e-6, "Standard scaling not applied correctly to 'feature_1_sum'."

# # Cleanup (remove the test files)
# os.remove(output_path)
# os.remove(input_path)

[13/Sep/2024 22:43:46] INFO [dreams_core.core.preprocess_coin_df:422] Preprocessed file saved at: temp/mock_input_preprocessed.csv
[13/Sep/2024 22:43:46] INFO [dreams_core.core.preprocess_coin_df:423] Dropped 1 columns: feature_to_drop


In [41]:
scaled_column = output_df['feature_1_sum']
# abs(scaled_column.mean())
abs(scaled_column.std() - 1)


np.float64(0.22474487139158894)

In [51]:
scaled_column

0   -1.22474487139
1                0
2    1.22474487139
Name: feature_1_sum, dtype: float64

In [46]:
output_df.describe()

Unnamed: 0,feature_1_sum,feature_3
count,3.0,3
mean,0.0,200
std,1.22474487139,100
min,-1.22474487139,100
25%,-0.612372435696,150
50%,0.0,200
75%,0.612372435696,250
max,1.22474487139,300


In [44]:
scaled_column.std() - 1

np.float64(0.22474487139158894)

In [45]:
from sklearn.preprocessing import StandardScaler
scaled_array = StandardScaler().fit_transform([[1], [2], [3]]).flatten()
scaled_array

array([-1.22474487,  0.        ,  1.22474487])

In [12]:
for metric, settings in metrics_config.items():
    print(f'metric: {metric}')
    print(f'setting: {settings}')

metric: metrics
setting: {'buyers_new': {'aggregations': {'sum': {'scaling': 'standard'}, 'mean': {'scaling': 'minmax'}, 'std': {'scaling': 'standard'}}, 'rolling': {'stats': {'sum': {'scaling': 'minmax'}, 'max': {'scaling': 'standard'}}, 'comparisons': {'change': {'scaling': 'standard'}, 'pct_change': {'scaling': 'None'}}, 'window_duration': 7, 'lookback_periods': 8}}, 'total_bought': {'aggregations': {'sum': {'scaling': 'standard'}, 'median': {'scaling': 'None'}}, 'rolling': {'stats': {'sum': {'scaling': 'minmax'}}, 'comparisons': {'change': {'scaling': 'standard'}}, 'window_duration': 7, 'lookback_periods': 8}}, 'total_sold': {'aggregations': {'sum': {'scaling': 'standard'}}}, 'total_buyers': {'aggregations': {'sum': {'scaling': 'None'}}}}


In [13]:
metrics_config.items()

dict_items([('metrics', {'buyers_new': {'aggregations': {'sum': {'scaling': 'standard'}, 'mean': {'scaling': 'minmax'}, 'std': {'scaling': 'standard'}}, 'rolling': {'stats': {'sum': {'scaling': 'minmax'}, 'max': {'scaling': 'standard'}}, 'comparisons': {'change': {'scaling': 'standard'}, 'pct_change': {'scaling': 'None'}}, 'window_duration': 7, 'lookback_periods': 8}}, 'total_bought': {'aggregations': {'sum': {'scaling': 'standard'}, 'median': {'scaling': 'None'}}, 'rolling': {'stats': {'sum': {'scaling': 'minmax'}}, 'comparisons': {'change': {'scaling': 'standard'}}, 'window_duration': 7, 'lookback_periods': 8}}, 'total_sold': {'aggregations': {'sum': {'scaling': 'standard'}}}, 'total_buyers': {'aggregations': {'sum': {'scaling': 'None'}}}})])

In [47]:
import numpy as np

# Original array [1, 2, 3]
std_original = np.std([1, 2, 3])

# Scaled array [-1.2247, 0, 1.2247]
scaled_array = [-1.22474487, 0, 1.22474487]
std_scaled = np.std(scaled_array)

std_original

np.float64(0.816496580927726)

In [56]:
print(np.std(scaled_column))

print(scaled_column.std())

0.9999999999999999
1.224744871391589


In [55]:
scaled_array.std()

AttributeError: 'list' object has no attribute 'std'