In [6]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [17]:
# Read the processed CSV, parse its `timestamp` column as datetimes and use it
# as the row index; the cell ends with a preview of the first rows.
source_csv = '../data/processed/df_with_anomalies.csv'
df = pd.read_csv(source_csv, parse_dates=['timestamp']).set_index('timestamp')
df.head()

Unnamed: 0_level_0,btc_price,btc_volume,eth_price,eth_volume,fees,stablecoins_supply,defi_tvl,market_cap,volume,fng_value,is_panic_day,is_defi_stress_day,panic_in_next_days,is_pure_stress_day,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-08-18,6566.715163,6206194000.0,314.793253,2930410000.0,8.0,15127992.0,367401.0,229530500000.0,18060760000.0,24,0,0,0.0,0,0.0
2018-08-19,6382.060591,5616966000.0,293.176992,2806352000.0,26.0,15127992.0,365249.0,215235800000.0,15235210000.0,27,0,0,0.0,0,0.0
2018-08-20,6475.49402,5160365000.0,299.435123,2503897000.0,8.0,15127992.0,363340.0,221641100000.0,13795050000.0,26,0,0,0.0,0,0.0
2018-08-21,6242.882438,5578292000.0,271.061552,2209797000.0,35.0,15127992.0,358941.0,209226900000.0,13445270000.0,19,1,0,0.0,0,0.0
2018-08-22,6467.27173,5658517000.0,280.374717,2099628000.0,42.0,15127992.0,316470.0,215144100000.0,12395020000.0,21,0,0,0.0,0,0.0


In [18]:
# Order the rows by their index (ascending) before any shift/rolling feature is built.
df = df.sort_index()

def get_approx_btc_supply(date):
    """Approximate the circulating BTC supply on a given calendar date.

    The estimate is piecewise linear: each reward era is anchored at the
    supply already mined when the era started, and a fixed number of blocks
    per day is assumed to be mined at that era's block reward afterwards.

    Parameters
    ----------
    date : str, datetime-like or pandas.Timestamp
        Anything accepted by ``pd.to_datetime``. The time of day is ignored;
        only the calendar date is used.

    Returns
    -------
    float
        Approximate number of BTC in circulation on ``date``. Dates before
        the first era start return ``0.0``.

    Notes
    -----
    * Real block times deviate from the assumed rate, so the curve has a
      small upward jump at every era boundary (each era restarts from the
      exact supply at its halving).
    * No era after the 2024-04-20 halving is modelled, so dates beyond the
      following halving keep accruing at 3.125 BTC per block and are
      over-estimated.
    """
    blocks_per_day = 144  # one block roughly every 10 minutes

    # (era start date, supply already mined at that start, block reward in BTC)
    reward_eras = (
        (pd.Timestamp('2009-01-03'), 0, 50.0),          # genesis block
        (pd.Timestamp('2012-11-28'), 10500000, 25.0),   # halving 1
        (pd.Timestamp('2016-07-09'), 15750000, 12.5),   # halving 2
        (pd.Timestamp('2020-05-11'), 18375000, 6.25),   # halving 3
        (pd.Timestamp('2024-04-20'), 19687500, 3.125),  # halving 4
    )

    date = pd.to_datetime(date).normalize()

    # Scan newest-first so the first era whose start is not after `date` wins.
    for era_start, supply_at_start, block_reward in reversed(reward_eras):
        if date >= era_start:
            days_into_era = (date - era_start).days
            return float(supply_at_start + days_into_era * blocks_per_day * block_reward)

    # Earlier than the genesis block: nothing has been mined yet.
    return 0.0

# Evaluate the supply approximation once per row, keyed by the row's index value.
index_as_series = df.index.to_series()
df['btc_circulating_supply'] = index_as_series.apply(get_approx_btc_supply)

In [19]:
# Tech features
#
# Each statement adds one engineered column to `df`. Shift, pct_change, diff
# and rolling operations leave NaN in their first rows (the longest window
# used here is 90 rows).

# --- Returns -----------------------------------------------------------------
# Raw day-over-day returns are kept in local series so that the lag loop never
# reads a column it has just overwritten.
btc_daily_return = df['btc_price'].pct_change(1)
eth_daily_return = df['eth_price'].pct_change(1)

# `<asset>_return_{lag}d` is the daily return observed `lag` rows earlier.
# Shifting df['btc_return_1d'] itself (instead of the raw series) would replace
# that column on the lag=1 pass, so the 3d/7d columns would silently end up
# lagged by 4 and 8 rows.
for lag in [1, 3, 7]:
    df[f'btc_return_{lag}d'] = btc_daily_return.shift(lag)
    df[f'eth_return_{lag}d'] = eth_daily_return.shift(lag)

# --- Volatility --------------------------------------------------------------
# Rolling std of the 1-row-lagged return columns, i.e. each window ends at the
# previous row's return.
df['btc_volatility_7d'] = df['btc_return_1d'].rolling(window=7).std()
df['eth_volatility_7d'] = df['eth_return_1d'].rolling(window=7).std()
df['btc_volatility_30d'] = df['btc_return_1d'].rolling(window=30).std()
df['eth_volatility_30d'] = df['eth_return_1d'].rolling(window=30).std()

# --- Price momentum: 7-row mean relative to 30-row mean -----------------------
btc_ma_short = df['btc_price'].rolling(window=7).mean()
btc_ma_long = df['btc_price'].rolling(window=30).mean()
df['btc_price_momentum_signal'] = btc_ma_short / btc_ma_long

eth_ma_short = df['eth_price'].rolling(window=7).mean()
eth_ma_long = df['eth_price'].rolling(window=30).mean()
df['eth_price_momentum_signal'] = eth_ma_short / eth_ma_long

# --- Total market volume -----------------------------------------------------
volume_ma_30d = df['volume'].rolling(30).mean()
df['volume_change_1d'] = df['volume'].pct_change(1)
df['volume_change_7d'] = df['volume'].pct_change(7)
# Stored as a column, matching the btc/eth volume momentum signals below
# (previously it was only assigned to a variable and never reached `df`).
volume_momentum_signal = df['volume'].rolling(7).mean() / volume_ma_30d
df['volume_momentum_signal'] = volume_momentum_signal
df['volume_spike_signal'] = df['volume'] / volume_ma_30d

# --- Per-asset volume ---------------------------------------------------------
df['btc_volume_change_1d'] = df['btc_volume'].pct_change(1)
df['btc_volume_change_7d'] = df['btc_volume'].pct_change(7)
df['eth_volume_change_1d'] = df['eth_volume'].pct_change(1)
df['eth_volume_change_7d'] = df['eth_volume'].pct_change(7)
df['btc_volume_momentum_signal'] = df['btc_volume'].rolling(7).mean() / df['btc_volume'].rolling(30).mean()
df['eth_volume_momentum_signal'] = df['eth_volume'].rolling(7).mean() / df['eth_volume'].rolling(30).mean()

# --- DeFi TVL -----------------------------------------------------------------
df['tvl_change_1d'] = df['defi_tvl'].pct_change(1)
df['tvl_change_7d'] = df['defi_tvl'].pct_change(7)
df['tvl_change_30d'] = df['defi_tvl'].pct_change(30)
df['tvl_momentum_signal'] = df['defi_tvl'].rolling(30).mean() / df['defi_tvl'].rolling(90).mean()

# --- Fees ---------------------------------------------------------------------
fees_ma_30d = df['fees'].rolling(30).mean()
df['fees_ma_7d'] = df['fees'].rolling(window=7).mean()
df['fees_spike_signal'] = df['fees'] / fees_ma_30d

# --- Composite indicators -----------------------------------------------------
# BTC price times approximate supply, as a share of `market_cap`.
df['MarketConcentration_Proxy'] = (df['btc_price'] * df['btc_circulating_supply']) / df['market_cap']
# Same values as `eth_volatility_30d`; the column is kept under this name
# because the output schema already contains it.
df['ETH_Volatility_30d'] = df['eth_volatility_30d']
df['LiquidationPressureIndex'] = df['ETH_Volatility_30d'] * (100 - df['fng_value'])
df['SentimentMomentum_7d'] = df['fng_value'].diff(7)
# NOTE(review): a zero `defi_tvl` would yield inf here, and dropna() does not
# remove inf — confirm the column is never zero.
df['Capital_Efficiency'] = df['volume'] / df['defi_tvl']

In [20]:
# Keep only the rows in which every column has a value, then preview the result.
df = df.dropna()
df.head()

Unnamed: 0_level_0,btc_price,btc_volume,eth_price,eth_volume,fees,stablecoins_supply,defi_tvl,market_cap,volume,fng_value,...,tvl_change_7d,tvl_change_30d,tvl_momentum_signal,fees_ma_7d,fees_spike_signal,MarketConcentration_Proxy,ETH_Volatility_30d,LiquidationPressureIndex,SentimentMomentum_7d,Capital_Efficiency
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-11-15,5788.612494,5690410000.0,184.022559,2658769000.0,44.0,25140229.0,3683607.0,194166300000.0,20127100000.0,28,...,-0.161401,-0.237979,1.649111,55.0,1.264368,0.515646,0.022709,1.635012,-23.0,5463.964713
2018-11-16,5686.541131,4981979000.0,182.448243,2461870000.0,23.0,25140229.0,3916651.0,191272500000.0,16783630000.0,23,...,-0.066622,-0.038763,1.625543,43.428571,0.666667,0.51427,0.027453,2.113909,-24.0,4285.198059
2018-11-17,5609.956182,3236633000.0,176.022406,1724797000.0,20.0,25140229.0,3777996.0,186587400000.0,11587960000.0,24,...,-0.065863,-0.339936,1.583668,39.285714,0.626305,0.520138,0.027444,2.085708,-28.0,3067.223328
2018-11-18,5599.191596,2525819000.0,175.667416,1412768000.0,86.0,25140229.0,3619614.0,187809200000.0,11148420000.0,26,...,-0.132657,-0.335361,1.54501,45.142857,2.629969,0.515816,0.028005,2.072335,-28.0,3080.001423
2018-11-19,5638.861481,2422604000.0,177.72869,1419370000.0,35.0,25140229.0,14168001.0,188092200000.0,9776134000.0,28,...,2.469982,1.485601,1.559092,38.0,1.059536,0.518743,0.02756,1.984351,-24.0,690.015062


In [21]:
# Write the feature table to disk.
# NOTE(review): index=False means the row index is not written to the file.
# If the index still holds the timestamps at this point, the dates are lost —
# confirm downstream readers do not need them.
df.to_csv(
    '../data/processed/big_data_for_learning.csv',
    index=False,
)