In [2]:
import sys
import os
import time
import logging
import datetime
from datetime import datetime, timedelta
import yaml
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

# Load environment variables from a .env file (python-dotenv) before anything
# else reads them
load_dotenv()


# Make the project's src directory importable (path is relative to the notebook's
# working directory), then import the local modules. Each module is reloaded
# immediately after import so that edits to the .py files are picked up when this
# cell is re-run without restarting the kernel.
sys.path.append('..//src')
from utils import load_config, cw_filter_df
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)


# Create the logger through dreams_core's setup_logger and show INFO and above
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Render floats in pandas output with up to 12 significant digits ('g' format)
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# Uncomment to restore pandas' default float display:
# pd.reset_option('display.float_format')

In [11]:
# Refresh the local modules so recent source edits take effect in this kernel
for module in (td, cwm, fe):
    importlib.reload(module)

# Read the three YAML configuration files
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')

#### Load the datasets

In [3]:
# Config sections used repeatedly below
cleaning_config = config['data_cleaning']
training_config = config['training_data']

# --- Prices: retrieve, then fill gaps up to the configured maximum length ---
prices_df = td.retrieve_prices_data()
prices_df, _ = td.fill_prices_gaps(prices_df, cleaning_config['max_gap_days'])
logger.info(f"Prices data shape: {prices_df.shape}")

# --- Transfers: bounded by the training/modeling period dates from config ---
transfers_df = td.retrieve_transfers_data(
    training_config['training_period_start'],
    training_config['modeling_period_start'],
    training_config['modeling_period_end'],
)

# --- Profits: combine transfers with prices, compute profitability, clean ---
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df, _ = td.clean_profits_df(profits_df, cleaning_config)

# --- Sharks: classify at the coin level first, then at the wallet level ---
shark_coins_df = td.classify_shark_coins(profits_df, training_config)
shark_wallets_df = td.classify_shark_wallets(shark_coins_df, training_config)

# --- Buy/sell metrics for the shark cohort, then flatten to one row per coin ---
is_shark_wallet = shark_wallets_df['is_shark'] == True
cohort_wallets = shark_wallets_df.loc[is_shark_wallet, 'wallet_address'].unique()
cohort_coins = shark_coins_df['coin_id'].unique()
training_period_end = training_config['training_period_end']
buysell_metrics_df = cwm.generate_buysell_metrics_df(
    profits_df,
    training_period_end,
    cohort_wallets,
    cohort_coins,
)
flattened_buysell_metrics_df = fe.flatten_coin_date_df(
    buysell_metrics_df,
    metrics_config,
    training_period_end,
)


[12/Sep/2024 23:04:29] INFO [dreams_core.core.retrieve_prices_data:42] retrieved prices data with shape (120763, 3)
[12/Sep/2024 23:04:29] INFO [dreams_core.core.fill_prices_gaps:126] 382 coins had no gaps, 19 coins had gaps filled, and 38 coins were dropped due to large gaps.
[12/Sep/2024 23:04:29] INFO [dreams_core.core.<module>:11] Prices data shape: (110929, 3)
[12/Sep/2024 23:04:58] INFO [dreams_core.core.retrieve_transfers_data:414] retrieved transfers_df with shape (23823401, 5) after 28.1 seconds.
[12/Sep/2024 23:04:58] INFO [dreams_core.core.prepare_profits_data:455] Preparing profits_df data...
[12/Sep/2024 23:05:49] INFO [dreams_core.core.calculate_wallet_profitability:635] Generated profits df after 20.17 seconds
[12/Sep/2024 23:06:12] INFO [dreams_core.core.clean_profits_df:706] Finished cleaning profits_df after 22.70 seconds.
[12/Sep/2024 23:06:21] INFO [dreams_core.core.classify_shark_coins:772] creation of shark_coins_df complete.
[12/Sep/2024 23:06:21] INFO [dreams_co

In [14]:
# Path to a previously saved, timestamped flattened-features CSV under
# modeling/outputs/flattened_outputs (relative to the notebook's working directory)
input_path = '..//modeling/outputs/flattened_outputs/flattened_output_2024-09-13_00-36_trainingstart_2024-05-01.csv'

# Load the file and display its column index to inspect the available features
flattened_df = pd.read_csv(input_path)
flattened_df.columns

Index(['coin_id', 'buyers_new_sum', 'buyers_new_mean', 'buyers_new_median',
       'buyers_new_std', 'buyers_new_sum_7d_period_1',
       'buyers_new_max_7d_period_1', 'buyers_new_median_7d_period_1',
       'buyers_new_change_7d_period_1', 'buyers_new_pct_change_7d_period_1',
       'buyers_new_sum_7d_period_2', 'buyers_new_max_7d_period_2',
       'buyers_new_median_7d_period_2', 'buyers_new_change_7d_period_2',
       'buyers_new_pct_change_7d_period_2', 'buyers_new_sum_7d_period_3',
       'buyers_new_max_7d_period_3', 'buyers_new_median_7d_period_3',
       'buyers_new_change_7d_period_3', 'buyers_new_pct_change_7d_period_3',
       'buyers_new_sum_7d_period_4', 'buyers_new_max_7d_period_4',
       'buyers_new_median_7d_period_4', 'buyers_new_change_7d_period_4',
       'buyers_new_pct_change_7d_period_4', 'buyers_new_sum_7d_period_5',
       'buyers_new_max_7d_period_5', 'buyers_new_median_7d_period_5',
       'buyers_new_change_7d_period_5', 'buyers_new_pct_change_7d_period_5',


# NOTE(review): '..' is the parent directory, not a CSV file, so this call cannot
# return a DataFrame. Looks like an unfinished scratch line — supply a real file
# path or remove it.
pd.read_csv('..')

In [18]:
# Pick up any source edits made to the local modules since the last run
for module in (td, cwm, fe):
    importlib.reload(module)

# Re-read the configuration files so changes on disk are reflected here
config = load_config('../config/config.yaml')
metrics_config = load_config('../config/config_metrics.yaml')
modeling_config = load_config('../config/config_modeling.yaml')


# Lower the logger threshold so DEBUG messages are emitted from here on
logger.setLevel(logging.DEBUG)

def preprocess_coin_df(input_path, modeling_config):
    """
    Preprocess the flattened coin DataFrame by applying feature selection.

    Loads the flattened CSV, refuses to continue if it contains any missing
    values, drops the columns listed in
    modeling_config['preprocessing']['drop_features'] (names that are not present
    in the file are ignored), and saves the result as a new CSV.

    The output file is named '<input file name without .csv>_preprocessed.csv'. It
    is written to the input file's directory path with 'flattened_outputs' replaced
    by 'preprocessed_outputs'; that directory is created if it does not exist.

    Params:
    - input_path (str): Path to the flattened CSV file.
    - modeling_config (dict): Configuration with modeling-specific parameters. Must
        contain a 'preprocessing' mapping; its optional 'drop_features' entry lists
        the columns to remove (missing or empty/null means nothing is dropped).

    Returns:
    - output_path (str): The path to the saved preprocessed CSV file.

    Raises:
    - ValueError: If the loaded DataFrame contains any missing values.
    """
    # Step 1: Load the flattened data
    df = pd.read_csv(input_path)

    # Step 2: Check for missing values and raise an error if any are found
    if df.isnull().values.any():
        raise ValueError("Missing values detected in the DataFrame.")

    # Step 3: Apply feature selection (using drop_features).
    # `or []` covers a YAML key that is present but null, which would otherwise
    # reach DataFrame.drop as columns=None and raise.
    drop_features = modeling_config['preprocessing'].get('drop_features') or []
    initial_columns = list(df.columns)
    df = df.drop(columns=drop_features, errors='ignore')
    remaining_columns = set(df.columns)
    # Keep the original column order so the log message is deterministic
    dropped_columns = [col for col in initial_columns if col not in remaining_columns]

    # Step 4: Generate output path and filename based on input.
    # Strip only a trailing ".csv" rather than every occurrence in the name.
    input_dir, input_filename = os.path.split(input_path)
    if input_filename.endswith(".csv"):
        base_filename = input_filename[:-len(".csv")]
    else:
        base_filename = input_filename
    output_filename = f"{base_filename}_preprocessed.csv"
    output_dir = input_dir.replace("flattened_outputs", "preprocessed_outputs")
    output_path = os.path.join(output_dir, output_filename)

    # Step 5: Save the preprocessed data, creating the target directory if needed
    # (an empty output_dir means the current directory, which already exists)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_path, index=False)

    # Log the changes made
    logger.debug(f"Preprocessed file saved at: {output_path}")
    logger.debug(f"Dropped {len(dropped_columns)} columns: {', '.join(dropped_columns)}")

    return output_path