In [None]:
# pyright: reportMissingImports=false
# pyright: reportMissingModuleSource=false

import uuid
import random
import hashlib
import os
import sys
import time
import logging
import datetime
import json
from datetime import datetime, timedelta
import yaml
import pytest
import importlib
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import requests
import pandas_gbq
from sklearn.model_selection import ParameterGrid, ParameterSampler
from scipy.signal import argrelextrema
from dreams_core.googlecloud import GoogleCloud as dgc
from dreams_core import core as dc
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar


# load dotenv
load_dotenv()


# import local files if necessary
# pyright: reportMissingImports=false
sys.path.append('..//src')
import training_data as td
importlib.reload(td)
import feature_engineering as fe
importlib.reload(fe)
import coin_wallet_metrics as cwm
importlib.reload(cwm)
import modeling as m
importlib.reload(m)
import insights as i
importlib.reload(i)
import utils as u
importlib.reload(u)


# configure logger (INFO and above)
logger = dc.setup_logger()
logger.setLevel(logging.INFO)

# Custom format function for displaying numbers: up to 12 significant digits
pd.set_option('display.float_format', lambda x: f'{x:.12g}')
# pd.reset_option('display.float_format')


# Load all configs as notebook-level variables.
# No `global` statement is needed here: names assigned at the top level of a cell
# are already module-level globals, so the declaration was a no-op.
CONFIG = u.load_config('../config/config.yaml')
METRICS_CONFIG = u.load_config('../config/metrics_config.yaml')
MODELING_CONFIG = u.load_config('../config/modeling_config.yaml')
EXPERIMENTS_CONFIG = u.load_config('../config/experiments_config.yaml')

# Folder used by the insights helpers below to locate experiment outputs
MODELING_FOLDER = MODELING_CONFIG['modeling']['modeling_folder']

In [None]:
# Pick up any edits made to the local project modules since they were imported.
# (The loop variable is deliberately not called `i`, which is the insights module.)
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read every YAML config so on-disk changes take effect
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


## Experiment Setup

In [None]:
# Refresh the local project modules before launching the experiment
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Reload the four YAML configs from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


# Launch the experiment with the freshly loaded modeling config and keep the returned id
experiment_id = i.run_experiment(modeling_config)

In [None]:
# Pin the analysis to a specific, hard-coded experiment id.
# NOTE(review): when run top to bottom this overwrites the id returned by
# i.run_experiment(...) in the previous cell — confirm that is intended.
experiment_id = '0926_whale_cohort_cutoffs_844325ab-00f6-4da9-8584-bf9026340770'

In [None]:
# Build the trial results table for `experiment_id` from the configured modeling folder
# (the later cells read metric columns such as 'accuracy' and 'log_loss' from it).
trial_df = i.generate_trial_df(modeling_config['modeling']['modeling_folder'], experiment_id)

In [None]:
# Plot feature importance for the experiment, limited to top_n=10
# (presumably the ten most important features — see insights.plot_top_feature_importance).
i.plot_top_feature_importance(MODELING_FOLDER, experiment_id, top_n=10)

In [None]:
# Preview the first 10 rows of the trial results
trial_df.head(10)

In [None]:
# List the column labels of the trial results (metrics plus every other column)
trial_df.columns

In [None]:
# Metric columns that will be unpivoted into (metric, value) pairs
value_vars = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'log_loss']

# Identifier columns: everything in trial_df that is neither a metric nor the confusion matrix
id_vars = [
    col
    for col in trial_df.columns
    if col != 'confusion_matrix' and col not in value_vars
]


In [None]:
# Melt trial_df into long format: one row per (identifier columns, metric) pair.
# trial_df is the frame id_vars/value_vars were derived from; the previous `df`
# is not defined yet at this point of a top-to-bottom run.
pd.melt(trial_df, id_vars=id_vars, value_vars=value_vars,
        var_name='metric', value_name='value')

In [None]:
def summarize_feature_performance(trial_df):
    """
    Summarizes the performance of the values for each feature that was experimented on.

    Every column of trial_df other than the six metric columns and 'confusion_matrix'
    is treated as an experimented feature. For each distinct value of each feature, the
    metrics are averaged over all trials that used that value. Rows whose feature value
    is missing are left out of that feature's groups (pandas groupby default).

    Params:
    - trial_df (pd.DataFrame): one row per trial; must contain the columns 'accuracy',
        'precision', 'recall', 'f1_score', 'roc_auc' and 'log_loss'.

    Returns:
    - final_df (pd.DataFrame): one row per (feature, value) pair with the columns
        'feature', 'value', 'model_count' (number of trials using that value) and
        'avg_<metric>' for each metric. Rows are ordered by feature name, then by value.
        Empty (columns only) when trial_df has no feature columns.
    """
    # List of performance metrics
    value_vars = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'log_loss']
    avg_columns = [f'avg_{metric}' for metric in value_vars]
    column_order = ['feature', 'value', 'model_count'] + avg_columns

    # Feature columns: everything except the metrics and the confusion matrix
    id_vars = [col for col in trial_df.columns if col not in (value_vars + ['confusion_matrix'])]
    if not id_vars:
        # pd.concat([]) would raise; return a well-formed empty summary instead
        return pd.DataFrame(columns=column_order)

    results = []

    # Features are visited in sorted order and groupby returns each feature's values in
    # sorted order, so no cross-feature sort is needed afterwards. This avoids comparing
    # values of different types (e.g. a numeric feature against a string feature).
    for feature in sorted(id_vars):
        by_value = trial_df.groupby(feature)

        # Mean of every metric for each distinct value of this feature
        grouped = by_value[value_vars].mean()
        grouped.columns = avg_columns

        # Number of trials (models) behind each average; aligned on the group index
        grouped['model_count'] = by_value.size()

        # Move the feature values out of the index into a generic 'value' column
        grouped = grouped.rename_axis('value').reset_index()
        grouped['feature'] = feature

        results.append(grouped[column_order])

    # Concatenate all per-feature summaries with a fresh 0..n-1 index
    final_df = pd.concat(results, ignore_index=True)

    return final_df

In [None]:
# Reload the local modules so the latest insights code is used
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Refresh the configs from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# Summarize the trial results with the project (insights module) implementation
feature_performance_df = i.summarize_feature_performance(trial_df)


In [None]:
# Alias the summary table as `df`; the styling cells further down read it under that name.
df = feature_performance_df
# Display the full summary table
feature_performance_df

In [None]:
# Averaged metric columns of the summary table. Defined here so this cell also works on
# a fresh kernel (previously the name was only assigned by a later cell).
performance_columns = [col for col in feature_performance_df.columns if col.startswith('avg')]
performance_columns

In [None]:
import pandas as pd

# Averaged metric columns: every column whose name begins with 'avg'
columns_to_format = list(
    filter(lambda name: name.startswith('avg'), feature_performance_df.columns)
)

# Shade those columns with a red -> yellow -> green gradient.
# NOTE(review): the same direction is used for avg_log_loss, where lower is presumably
# better — confirm whether that column should use a reversed colormap.
feature_performance_df_styled = (
    feature_performance_df
    .style
    .background_gradient(cmap='RdYlGn', subset=columns_to_format)
)

# Render the styled table as the cell output
feature_performance_df_styled

In [None]:
%matplotlib inline


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# This cell styles `df`, which must already exist: it is the alias of
# feature_performance_df assigned in an earlier cell.

# Columns that receive conditional formatting: the six averaged metric columns
performance_columns = ['avg_accuracy', 'avg_precision', 'avg_recall', 'avg_f1_score', 'avg_roc_auc', 'avg_log_loss']

def color_scale(series, cmap='RdYlGn', low=0, high=1):
    """
    Build one CSS background-color declaration per entry of a column.

    Each value is mapped linearly from [low, high] onto the named Matplotlib colormap and
    emitted as a hex colour, which is valid CSS (formatting the raw RGBA tuple is not).

    Params:
    - series (pd.Series): numeric column handed over by Styler.apply.
    - cmap (str): Matplotlib colormap name.
    - low, high (float): values mapped to the bottom / top of the colormap.

    Returns:
    - list[str]: one style string per entry; '' for missing (NaN) values.
    """
    # Local import: the notebook only imports matplotlib.pyplot at the top.
    from matplotlib.colors import to_hex

    # plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap is the stable spelling.
    colormap = plt.get_cmap(cmap)
    values = series.astype(float).to_numpy()
    positions = np.ma.getdata(plt.Normalize(low, high)(values))

    return [
        '' if np.isnan(value) else f'background-color: {to_hex(colormap(float(position)))}'
        for value, position in zip(values, positions)
    ]

def highlight_max(s, props=''):
    """Return `props` for entries equal to the NaN-ignoring maximum of `s`, '' elsewhere."""
    peak = np.nanmax(s.values)
    return np.where(s == peak, props, '')

def highlight_min(s, props=''):
    """Return `props` for entries equal to the NaN-ignoring minimum of `s`, '' elsewhere."""
    trough = np.nanmin(s.values)
    return np.where(s == trough, props, '')

# Build the styled table in one chain. Styler.apply works column by column (axis=0), so
# passing the whole metric list as `subset` styles each metric column independently:
#   1. background colour from color_scale
#   2. bold green text on each column's maximum
#   3. bold red text on each column's minimum
#   4. three decimal places for every metric column
styled_df = (
    df.style
    .apply(color_scale, subset=performance_columns)
    .apply(highlight_max, props='color: green; font-weight: bold', subset=performance_columns)
    .apply(highlight_min, props='color: red; font-weight: bold', subset=performance_columns)
    .format(dict.fromkeys(performance_columns, '{:.3f}'))
)

# Render the styled table as the cell output
styled_df

In [None]:
import pandas as pd
import numpy as np

# This cell styles `df`, the alias of feature_performance_df assigned in an earlier cell;
# it also relies on `plt` imported in the first cell.

# Columns that receive conditional formatting: the six averaged metric columns
performance_columns = ['avg_accuracy', 'avg_precision', 'avg_recall', 'avg_f1_score', 'avg_roc_auc', 'avg_log_loss']

def color_scale(series, cmap='RdYlGn', low=0, high=1):
    """
    Return one CSS background-color declaration per entry of `series`.

    The colour now depends on each entry's VALUE, mapped linearly from [low, high] onto
    the colormap. Previously colours were spread evenly by row position, regardless of
    the values, and written as a numpy array repr that browsers do not accept as a colour.

    Params:
    - series (pd.Series): numeric column handed over by Styler.apply.
    - cmap (str): Matplotlib colormap name.
    - low, high (float): values mapped to the two ends of the colormap.

    Returns:
    - list[str]: one style string per entry; '' for missing (NaN) values.
    """
    # Local import: the notebook only imports matplotlib.pyplot at the top.
    from matplotlib.colors import to_hex

    # plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap is the stable spelling.
    colormap = plt.get_cmap(cmap)
    normalize = plt.Normalize(low, high)

    styles = []
    for value in series.astype(float).to_numpy():
        if np.isnan(value):
            styles.append('')
            continue
        # float(...) makes the colormap treat the argument as a 0-1 position
        position = float(normalize(value))
        styles.append(f'background-color: {to_hex(colormap(position))}')
    return styles

def highlight_max(s, props=''):
    """Style array for a column: `props` where the entry equals the maximum (NaNs ignored)."""
    is_peak = s == np.nanmax(s.values)
    return np.where(is_peak, props, '')

def highlight_min(s, props=''):
    """Style array for a column: `props` where the entry equals the minimum (NaNs ignored)."""
    is_trough = s == np.nanmin(s.values)
    return np.where(is_trough, props, '')

# Background colours first
styled_df = df.style.apply(color_scale, subset=performance_columns)

# Then the text highlights: maximum of each column in green, minimum in red.
# Styler.apply runs per column (axis=0), so one call per highlighter covers every metric.
for _highlighter, _css in (
    (highlight_max, 'color: green; font-weight: bold'),
    (highlight_min, 'color: red; font-weight: bold'),
):
    styled_df = styled_df.apply(_highlighter, props=_css, subset=performance_columns)

# Show the metric columns with 3 decimal places
styled_df = styled_df.format(dict.fromkeys(performance_columns, '{:.3f}'))

# Render the styled table as the cell output
styled_df

In [None]:
import pandas as pd
import numpy as np

# The styling at the end of this cell is applied to feature_performance_df,
# which must already exist from an earlier cell.

# Columns that receive conditional formatting: the six averaged metric columns
performance_columns = ['avg_accuracy', 'avg_precision', 'avg_recall', 'avg_f1_score', 'avg_roc_auc', 'avg_log_loss']

def color_scale(val, min_val, max_val):
    """
    Returns a CSS background colour on a linear scale from red (min_val) to green (max_val).

    Params:
    - val: the cell value; a missing value (NaN/None) yields no styling.
    - min_val, max_val: the values mapped to pure red and pure green respectively.

    Returns:
    - str: 'background-color: rgb(r,g,0)', or '' when val is missing.

    When min_val equals max_val there is no range to scale over (e.g. a single-row or
    constant column); the midpoint colour is used instead of dividing by zero. Values
    outside [min_val, max_val] are clamped so the rgb components stay within 0-255.
    """
    if pd.isna(val):
        return ''

    span = max_val - min_val
    if span == 0:
        normalized = 0.5
    else:
        normalized = (val - min_val) / span
        normalized = min(1.0, max(0.0, normalized))

    r = int(255 * (1 - normalized))
    g = int(255 * normalized)
    b = 0
    return f'background-color: rgb({r},{g},{b})'

def highlight_max(s, props=''):
    """Mark the largest entries of `s` (ignoring NaN) with `props`; all others get ''."""
    largest = np.nanmax(s.values)
    mask = s == largest
    return np.where(mask, props, '')

def highlight_min(s, props=''):
    """Mark the smallest entries of `s` (ignoring NaN) with `props`; all others get ''."""
    smallest = np.nanmin(s.values)
    mask = s == smallest
    return np.where(mask, props, '')

# Apply styling
def style_dataframe(df):
    """
    Return a pandas Styler for `df` with conditional formatting on the metric columns.

    For every column listed in the notebook-level `performance_columns`:
    - the background is coloured with color_scale, scaled to that column's own min/max;
    - the maximum is shown in bold #00FFFF text and the minimum in bold #202020 text;
    - numbers are displayed with 3 decimal places.

    Params:
    - df (pd.DataFrame): frame containing all of `performance_columns`.

    Returns:
    - styled (pandas Styler): the styled view of `df` (df itself is not modified).
    """
    def _color_column(column):
        # The range is computed from the column being styled at the moment it is styled.
        # Styler evaluates styling functions lazily at render time, so the earlier
        # per-column lambdas all ended up reading the min/max of the LAST metric column.
        column_min = column.min()
        column_max = column.max()
        return [color_scale(value, column_min, column_max) for value in column]

    # Style the frame that was passed in (not the notebook-level feature_performance_df)
    styled = df.style

    # Apply color scale to performance columns (Styler.apply runs once per column)
    styled = styled.apply(_color_column, subset=performance_columns)

    # Highlight the max and min values of each column with distinct bold text colours
    styled = styled.apply(highlight_max, props='color: #00FFFF; font-weight: bold', subset=performance_columns)
    styled = styled.apply(highlight_min, props='color: #202020; font-weight: bold', subset=performance_columns)

    # Format numeric columns to display 3 decimal places
    styled = styled.format({col: '{:.3f}' for col in performance_columns})

    return styled

# Build the styled view of the feature-performance summary
styled_df = style_dataframe(feature_performance_df)

# Render the styled table as the cell output
styled_df

In [None]:
import pandas as pd

# Melt the raw trial results (trial_df) rather than `df`: by this point `df` aliases the
# feature-performance summary, which has no 'accuracy', 'precision', ... columns.
# Assumes trial_df contains the two feature columns named below — TODO confirm for the
# experiment being analyzed.

# Melt the dataframe to reshape it for easier grouping
df_melted = pd.melt(trial_df, id_vars=["modeling_config.target_variables.moon_threshold",
                                       "config.datasets.wallet_cohorts.whales.wallet_minimum_inflows"],
                    value_vars=["accuracy", "precision", "recall", "f1_score", "roc_auc", "log_loss"],
                    var_name="metric", value_name="value")

df_melted

# # Group by each feature and compute the mean of the metrics
# df_avg = df_melted.groupby(["modeling_config.target_variables.moon_threshold",
#                             "config.datasets.wallet_cohorts.whales.wallet_minimum_inflows", "metric"]).mean().reset_index()

# # Pivot the dataframe to get desired format
# df_pivoted = df_avg.pivot(index=["modeling_config.target_variables.moon_threshold",
#                                  "config.datasets.wallet_cohorts.whales.wallet_minimum_inflows"],
#                           columns="metric", values="value").reset_index()

# # Rename the columns to indicate averages
# df_pivoted.columns = ['feature', 'value', 'avg_accuracy', 'avg_precision', 'avg_recall', 'avg_f1_score', 'avg_roc_auc', 'avg_log_loss']

# # Display the final structured dataframe
# df_pivoted

In [None]:
# Plot feature importance for the experiment again (same call as earlier in the notebook)
i.plot_top_feature_importance(MODELING_FOLDER, experiment_id, top_n=10)

In [None]:
# Run the insights module's experiment analysis for the same experiment with top_n=10
# (what is reported is defined in insights.analyze_experiment, not visible here).
i.analyze_experiment(MODELING_FOLDER, experiment_id, top_n=10)

## Base Tables

In [None]:
# Reload the local modules so the base tables are built with the latest code
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Reload the four YAML configs from disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')

# retrieve market data, fill gaps using the configured max_gap_days limit, then split by
# coverage of the training-start -> modeling-end window; only the first returned frame is kept
# (presumably the coins with full coverage of that window — TODO confirm against cwm)
market_data_df = td.retrieve_market_data()
market_data_df,_ = td.fill_market_data_gaps(market_data_df,config['data_cleaning']['max_gap_days'])
market_data_df,_,_ = cwm.split_dataframe_by_coverage(
    market_data_df,
    start_date=config['training_data']['training_period_start'],
    end_date=config['training_data']['modeling_period_end'],
    id_column='coin_id'
)
# independent copy of the price columns, so later changes to market_data_df do not affect it
prices_df = market_data_df[['coin_id','date','price']].copy()

# retrieve transfers data for the configured training and modeling period boundaries
transfers_df = td.retrieve_transfers_data(
    config['training_data']['training_period_start'],
    config['training_data']['modeling_period_start'],
    config['training_data']['modeling_period_end']
    )

# compile profits_df: combine transfers with prices, compute wallet profitability, then
# clean using the data_cleaning config (the second value returned by the cleaner is discarded)
profits_df = td.prepare_profits_data(transfers_df, prices_df)
profits_df = td.calculate_wallet_profitability(profits_df)
profits_df,_ = td.clean_profits_df(profits_df, config['data_cleaning'])

# remove records from market_data_df that don't have transfers if configured to do so
# (i.e. keep only coins whose coin_id still appears in the cleaned profits_df)
if config['data_cleaning']['exclude_coins_without_transfers']:
    market_data_df = market_data_df[market_data_df['coin_id'].isin(profits_df['coin_id'])]


In [None]:
# Reload every local project module (loop variable is not `i`, which is the insights module)
for _module in (td, cwm, fe, m, i, u):
    importlib.reload(_module)

# Re-read the YAML configs so the notebook reflects the files on disk
config = u.load_config('../config/config.yaml')
metrics_config = u.load_config('../config/metrics_config.yaml')
modeling_config = u.load_config('../config/modeling_config.yaml')
experiments_config = u.load_config('../config/experiments_config.yaml')


## Tests Failing