## Full Training Data Sequence

### retrieve datasets

In [None]:
# Re-import the project modules and re-read the config files
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Retrieve the profits and market data datasets
profits_df, market_data_df = wo.retrieve_datasets()

# Define the wallet cohort (and its training metrics) after cleaning
training_wallet_metrics_df, wallet_cohort = wo.define_wallet_cohort(profits_df, market_data_df)

# Split profits_df into the full training period, the individual training windows,
# the modeling period and the validation period
(
    training_profits_df,
    training_windows_profits_dfs,
    modeling_profits_df,
    validation_profits_df,
) = wo.split_profits_df(profits_df, market_data_df, wallet_cohort)


In [None]:
# Re-import the project modules and re-read the config files
for module in modules:
    importlib.reload(module)
wallets_config.reload()
wallets_metrics_config = u.load_config('../config/wallets_metrics_config.yaml')
wallets_features_config = yaml.safe_load(
    Path('../config/wallets_features_config.yaml').read_text(encoding='utf-8')
)


# Market data: add the time series indicators configured under time_series.market_data
market_indicators_data_df = ind.generate_time_series_indicators(
    market_data_df,
    wallets_metrics_config['time_series']['market_data'],
    'coin_id',
)


# Transfers data retrieval for the wallet_ids in temp.wallet_modeling_cohort
transfers_data_df = wcf.retrieve_transfers_data()

### generate features

In [None]:
# Re-import the project modules and re-read the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Features over the full training dataset; every column gets an "_all_windows" suffix
training_wallet_features_df = wf.calculate_wallet_features(
    training_profits_df, market_indicators_data_df, transfers_data_df, wallet_cohort
)
training_data_df = training_wallet_features_df.add_suffix("_all_windows")

# Features per training window: columns are suffixed _w1, _w2, ... (1-based) and
# left-joined onto the full-period features
for i, window_profits_df in enumerate(training_windows_profits_dfs, start=1):
    window_wallet_features_df = wf.calculate_wallet_features(
        window_profits_df, market_indicators_data_df, transfers_data_df, wallet_cohort
    ).add_suffix(f'_w{i}')
    training_data_df = training_data_df.join(window_wallet_features_df, how='left')


training_data_df.describe()

### join target variable to training data

In [None]:
# Re-import the project modules and re-read the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Clean inactive wallets from the modeling period data
modeling_wallets_df = wo.filter_modeling_period_wallets(modeling_profits_df)

# Generate the target variables from the filtered modeling-period wallets
target_vars_df = wm.generate_target_variables(modeling_wallets_df)

# Inner-join the configured target variable onto the training features, so only
# rows present in both frames are kept
modeling_df = training_data_df.join(
    target_vars_df[wallets_config['modeling']['target_variable']],
    how='inner',
)

modeling_df.describe()

## Crude Model

### drop columns specified in config

In [None]:
# Re-import the project modules and re-read the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Work on a copy so modeling_df itself is not modified
df = modeling_df.copy()

# Drop the columns listed in the config, skipping any that are absent from the frame
columns_to_drop = wallets_config['modeling']['drop_columns']
if columns_to_drop:
    existing_columns = [col for col in columns_to_drop if col in df.columns]
    if existing_columns:
        df = df.drop(columns=existing_columns)

df.shape

In [None]:

# Features are every column of df except the configured target variable
X = df.drop(columns=wallets_config['modeling']['target_variable'])
y = df[wallets_config['modeling']['target_variable']]

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: standard-scale every feature column
numeric_features = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)]
)

# Gradient boosting regressor with its key parameters
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    max_features=1.0,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    subsample=0.8,
    random_state=42,
)

# Chain preprocessing and the regressor, fit on the training split and
# predict the held-out test split
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gbr),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


In [None]:
# Re-import the project modules and re-read the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()

# Hand the evaluation the fitted regressor step rather than the whole pipeline
model = pipeline.named_steps['regressor']
# model=grid_search.best_estimator_.named_steps['regressor']

# Evaluate the test-set predictions against the true targets
evaluation = wm.evaluate_regression_model(
    y_test,
    y_pred,
    model=model,
    feature_names=X.columns.tolist(),
)

# Summary report followed by the headline R² metric
print(evaluation['summary_report'])
print(f"R² Score: {evaluation['r2']:.3f}")

# Show the figure when one was produced (it could instead be saved with
# evaluation['figures'].savefig('model_evaluation.png'))
if evaluation['figures'] is not None:
    plt.show()

In [None]:
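# Reference results (appears to be output pasted from an earlier evaluation run; confirm before relying on it):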
#     Model Performance Summary:
#     -------------------------
#     R² Score: 0.340
#     RMSE: 0.093
#     MAE: 0.062
#     MAPE: 26.0%

#     Residuals Analysis:
#     ------------------
#     Mean of Residuals: -0.001
#     Standard Deviation of Residuals: 0.093
#     95% Prediction Interval: ±0.183

# R² Score: 0.340


### profits validation

In [None]:
# Calculate validation period wallet metrics
# NOTE(review): validation_profits_df is reassigned from itself, so re-running this
# cell presumably applies the cash flow transfers logic a second time — confirm.
validation_profits_df = wcf.add_cash_flow_transfers_logic(validation_profits_df)
wallet_trading_features_df = wf.calculate_wallet_trading_features(validation_profits_df)
validation_wallets_df = wm.generate_target_variables(wallet_trading_features_df)

# One row per test-set wallet with its predicted score
validation_df = pd.DataFrame({
    'wallet_address': X_test.index.values,
    'score': y_pred,
})
# Bucket scores by rounding up to the next multiple of 0.05
validation_df['score_rounded'] = np.ceil(validation_df['score'] * 20) / 20
# Attach validation period performance to the modeling period scores
validation_df = (
    validation_df
    .set_index('wallet_address')
    .join(validation_wallets_df, how='left')
)

# Per score bucket: wallet count plus mean/median invested and net gain
grouped_val = validation_df.groupby('score_rounded').agg(
    wallets=('score', 'count'),
    mean_invested=('invested', 'mean'),
    mean_net_gain=('net_gain', 'mean'),
    median_invested=('invested', 'median'),
    median_net_gain=('net_gain', 'median'),
)
# Bucket-level returns are ratios of the aggregated gain to the aggregated investment
grouped_val['mean_return'] = grouped_val['mean_net_gain'] / grouped_val['mean_invested']
grouped_val['median_return'] = grouped_val['median_net_gain'] / grouped_val['median_invested']
grouped_val

## coin performance predictions

In [None]:
# Re-import the project modules and re-read the wallets config
for module in modules:
    importlib.reload(module)
wallets_config.reload()


# Consolidate wallet scores at the coin level; scores share validation_df's wallet index
wallet_scores_df = pd.DataFrame({'score': y_pred}, index=validation_df.index)
coin_wallet_metrics_df = cf.calculate_coin_metrics_from_wallet_scores(
    validation_profits_df, wallet_scores_df
)

# Coin performance between the configured validation period start and end
coin_performance_df = cf.calculate_coin_performance(
    market_data_df,
    wallets_config['training_data']['validation_period_start'],
    wallets_config['training_data']['validation_period_end'],
)

# Keep only coins that have both aggregated wallet metrics and actual performance
coin_forecasting_df = coin_wallet_metrics_df.join(coin_performance_df, how='inner')

# Run the analysis with the configured coin forecasting parameters
top_n = wallets_config['coin_forecasting']['top_n']
max_market_cap = wallets_config['coin_forecasting']['max_market_cap']
min_market_cap = wallets_config['coin_forecasting']['min_market_cap']

metric_top_coin_performance_df = cf.validate_coin_performance(
    coin_forecasting_df, top_n, max_market_cap, min_market_cap
)

metric_top_coin_performance_df

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

def analyze_top_performing_coins(df, return_percentile=75):
    """
    Analyzes how wallet and scoring metrics differ between top performing coins and others.

    Coins are split on 'coin_return': coins at or above the `return_percentile`
    percentile are the top performers and all coins below it are the others.
    Rows with a missing 'coin_return' fall into neither group and are ignored
    when the threshold is computed. Missing metric values are excluded from
    every statistic (means, t-test and effect size) so that all reported
    numbers describe the same observations.

    Args:
        df: DataFrame with a 'coin_return' column and the metric columns
            'weighted_avg_score', 'composite_score', 'top_wallet_balance_pct',
            'top_wallet_count_pct' and 'score_confidence'
        return_percentile: Percentile (0-100) of 'coin_return' used as the
            threshold for defining top performers (default 75)

    Returns:
        Dict keyed by metric name. Each value is a dict with:
            'top_quartile_mean': metric mean for coins at/above the threshold
            'other_mean': metric mean for coins below the threshold
            'abs_diff': top mean minus other mean
            'pct_diff': abs_diff as a percentage of other_mean (0 if other_mean is 0)
            'p_value': p-value of an independent two-sample t-test between the groups
            'effect_size': Cohen's d using the unweighted average of the group
                variances (0 if the pooled standard deviation is 0)
    """
    # np.percentile returns NaN as soon as one return is missing, which would leave
    # both groups empty, so the threshold is computed from the known returns only.
    known_returns = df['coin_return'].dropna()
    if known_returns.empty:
        return_threshold = np.nan
    else:
        return_threshold = np.percentile(known_returns, return_percentile)

    # Comparisons against NaN are False, so rows without a return land in neither group
    top_coins = df[df['coin_return'] >= return_threshold]
    other_coins = df[df['coin_return'] < return_threshold]

    # Keep original metric names for consistency
    metrics_to_analyze = [
        'weighted_avg_score',
        'composite_score',
        'top_wallet_balance_pct',
        'top_wallet_count_pct',
        'score_confidence'
    ]

    results = {}
    for metric in metrics_to_analyze:
        # Drop missing values once so every statistic below uses the same observations
        top_values = top_coins[metric].dropna()
        other_values = other_coins[metric].dropna()

        # Calculate group means
        top_mean = top_values.mean()
        other_mean = other_values.mean()

        # T-test between groups
        t_stat, p_value = stats.ttest_ind(top_values, other_values)

        # Effect size calculation
        pooled_std = np.sqrt((top_values.var() + other_values.var()) / 2)
        cohens_d = (top_mean - other_mean) / pooled_std if pooled_std != 0 else 0

        results[metric] = {
            'top_quartile_mean': top_mean,  # mean for coins with returns >= threshold
            'other_mean': other_mean,       # mean for coins with returns < threshold
            'abs_diff': top_mean - other_mean,
            'pct_diff': ((top_mean - other_mean) / other_mean * 100) if other_mean != 0 else 0,
            'p_value': p_value,
            'effect_size': cohens_d
        }

    return results

def print_performance_analysis(df, return_percentile=75):
    """
    Prints formatted comparison of metrics between top performing coins and others.

    Args:
        df: DataFrame passed through to analyze_top_performing_coins
        return_percentile: Percentile of returns that defines the top performing
            group (default 75). It is forwarded to analyze_top_performing_coins
            and shown in the printed header and labels. The "Top quartile" label
            text is kept for every value so the default output is unchanged.
    """
    results = analyze_top_performing_coins(df, return_percentile)

    print(f"\n=== Metric Analysis: Returns >= {return_percentile}th percentile vs Others ===")
    # The loop variable is not called `stats` so it does not shadow the scipy.stats module name
    for metric, metric_stats in results.items():
        print(f"\n{metric}:")
        print(f"  Top quartile mean (returns >= p{return_percentile}): {metric_stats['top_quartile_mean']:.4f}")
        print(f"  Other coins mean (returns < p{return_percentile}): {metric_stats['other_mean']:.4f}")
        print(f"  Absolute difference: {metric_stats['abs_diff']:.4f}")
        print(f"  Percent difference: {metric_stats['pct_diff']:.1f}%")
        print(f"  P-value: {metric_stats['p_value']:.4f}")
        print(f"  Effect size: {metric_stats['effect_size']:.4f}")

# Extension point: example of a wrapper where additional metrics could be added
def extended_analysis(df):
    """
    Return the basic top-performer analysis for df.

    Currently this only delegates to analyze_top_performing_coins with its
    default percentile. More specialized analyses can be added here as long as
    they keep the same naming as the basic results.
    """
    return analyze_top_performing_coins(df)

# Script-style entry point: when this code runs as the main module, read coin metrics
# from the relative path 'coin_wallet_metrics.csv' and print the comparison report.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so this
# presumably executes whenever the cell is run — confirm that is intended.
# NOTE(review): this rebinds the module-level name `df`, which the modeling cells
# earlier in this file also assign — confirm the overwrite is acceptable.
if __name__ == "__main__":
    df = pd.read_csv('coin_wallet_metrics.csv')
    print_performance_analysis(df)