# Feature Engineering

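In this step we create new features that may help models capture liquidity behavior. All features are computed separately for each coin, on rows sorted by date:
- Moving averages of price (3-day, 5-day; the windows count observations, so they equal days only when the data is daily without gaps)
- Price volatility (3-day rolling standard deviation)
- EMA of liquidity ratio (span 3)
- 1-day lags of liquidity ratio, price, 24h volume, and market cap, plus the 1-day relative changes derived from them
- Log-transformed price, 24h volume, and market cap (`log1p`)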


In [1]:
import pandas as pd
from pathlib import Path

# Locate the project root: when the kernel is started from inside the
# "notebooks" folder the root is one level up, otherwise it is the current
# working directory itself.
here = Path.cwd()
if here.name == "notebooks":
    BASE = here.parent
else:
    BASE = here

# Merged CoinGecko snapshot produced by an earlier step of the project.
DATA = BASE.joinpath("data", "processed", "merged_coin_gecko.csv")

# Parse "date" into datetimes on load so later sorting by date is chronological.
df = pd.read_csv(DATA, parse_dates=["date"])
df.head()

Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,date,source_file,liquidity_ratio
0,Bitcoin,BTC,40859.46,0.022,0.03,0.055,35390760000.0,770991500000.0,2022-03-16,coin_gecko_2022-03-16.csv,0.045903
1,Ethereum,ETH,2744.41,0.024,0.034,0.065,19748700000.0,327104400000.0,2022-03-16,coin_gecko_2022-03-16.csv,0.060374
2,Tether,USDT,1.0,-0.001,-0.001,0.0,57934970000.0,79965160000.0,2022-03-16,coin_gecko_2022-03-16.csv,0.724503
3,BNB,BNB,383.43,0.018,0.028,0.004,1395854000.0,64043820000.0,2022-03-16,coin_gecko_2022-03-16.csv,0.021795
4,USD Coin,USDC,0.999874,-0.001,0.0,-0.0,3872274000.0,52222140000.0,2022-03-16,coin_gecko_2022-03-16.csv,0.07415


### Add Engineered Features
We add moving averages, volatility, and EMA features per coin.

In [2]:
# Order rows chronologically within each coin; every window below relies on it.
df = df.sort_values(["coin", "date"]).reset_index(drop=True)

coin_groups = df.groupby("coin")
price_by_coin = coin_groups["price"]
liquidity_by_coin = coin_groups["liquidity_ratio"]

# Compute every feature first, then attach them in the listed order.
engineered = {
    # Rolling averages: min_periods=1 gives a value from the first row onward.
    "price_ma_3": price_by_coin.transform(
        lambda prices: prices.rolling(window=3, min_periods=1).mean()
    ),
    "price_ma_5": price_by_coin.transform(
        lambda prices: prices.rolling(window=5, min_periods=1).mean()
    ),
    # Volatility (3-day std): needs two observations, so a coin's first row is NaN.
    "vol_3d": price_by_coin.transform(
        lambda prices: prices.rolling(window=3, min_periods=2).std()
    ),
    # EMA of liquidity ratio, recursive form (adjust=False).
    "lr_ema_3": liquidity_by_coin.transform(
        lambda ratios: ratios.ewm(span=3, adjust=False).mean()
    ),
}
for feature_name, feature_values in engineered.items():
    df[feature_name] = feature_values

preview_cols = ["coin", "price", "price_ma_3", "price_ma_5", "vol_3d", "liquidity_ratio", "lr_ema_3"]
df[preview_cols].head(10)


Unnamed: 0,coin,price,price_ma_3,price_ma_5,vol_3d,liquidity_ratio,lr_ema_3
0,0x,0.509791,0.509791,0.509791,,0.06849,0.06849
1,0x,0.51816,0.513975,0.513975,0.005917777,0.055165,0.061828
2,1inch,1.5,1.5,1.5,,0.195501,0.195501
3,1inch,1.49,1.495,1.495,0.007071068,0.103878,0.14969
4,AIOZ Network,0.237396,0.237396,0.237396,,0.120092,0.120092
5,AIOZ Network,0.214561,0.225978,0.225978,0.01614678,0.053256,0.086674
6,APENFT,2e-06,2e-06,2e-06,,0.16376,0.16376
7,APENFT,2e-06,2e-06,2e-06,1.080459e-08,0.123973,0.143867
8,API3,4.7,4.7,4.7,,0.128719,0.128719
9,API3,4.64,4.67,4.67,0.04242641,0.133236,0.130977


In [3]:
# Persist the frame with the rolling/EMA features; the index is not written.
OUT = BASE.joinpath("data", "processed", "engineered_features.csv")
df.to_csv(path_or_buf=OUT, index=False)
print("Saved:", OUT)

Saved: C:\Users\krpra\Desktop\Project\crypto_liquidity_project\data\processed\engineered_features.csv


In [4]:
import pandas as pd

# Lag and change features. Sort again so that, within each coin, rows are in
# date order and shift(1) picks up the immediately preceding observation.
df = df.sort_values(["coin","date"]).reset_index(drop=True)

# 1-day lags per coin (strictly past). The first row of every coin has no
# predecessor, so its lag is NaN.
for col in ["liquidity_ratio","price","24h_volume","mkt_cap"]:
    if col in df.columns:
        df[f"{col}_lag1"] = df.groupby("coin")[col].shift(1)

# 1-day returns / changes from lags: (current - previous) / previous
df["price_ret_1d"] = (df["price"] - df["price_lag1"]) / df["price_lag1"]
df["vol_chg_1d"]   = (df["24h_volume"] - df["24h_volume_lag1"]) / df["24h_volume_lag1"]
df["mcap_chg_1d"]  = (df["mkt_cap"] - df["mkt_cap_lag1"]) / df["mkt_cap_lag1"]

# A zero lag value makes the division above yield +/-inf. Turn those into NaN
# so they go away with the other missing values when NaNs are dropped before
# modeling. The replacement is a float NaN rather than pd.NA, and only the two
# infinities are targeted, so the columns keep their float64 dtype instead of
# risking an upcast to object.
for c in ["price_ret_1d","vol_chg_1d","mcap_chg_1d"]:
    if c in df.columns:
        df[c] = df[c].replace([float("inf"), float("-inf")], float("nan"))

df.head(10)


Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,date,source_file,...,price_ma_5,vol_3d,lr_ema_3,liquidity_ratio_lag1,price_lag1,24h_volume_lag1,mkt_cap_lag1,price_ret_1d,vol_chg_1d,mcap_chg_1d
0,0x,ZRX,0.509791,0.025,0.035,0.028,29309302.0,427933388.0,2022-03-16,coin_gecko_2022-03-16.csv,...,0.509791,,0.06849,,,,,,,
1,0x,ZRX,0.51816,0.003,0.016,-0.01,24224308.0,439124277.0,2022-03-17,coin_gecko_2022-03-17.csv,...,0.513975,0.005917777,0.061828,0.06849,0.509791,29309302.0,427933388.0,0.016417,-0.173494,0.026151
2,1inch,1INCH,1.5,0.021,0.111,0.163,120457113.0,616145134.0,2022-03-16,coin_gecko_2022-03-16.csv,...,1.5,,0.195501,,,,,,,
3,1inch,1INCH,1.49,0.008,-0.003,0.087,64145152.0,617505356.0,2022-03-17,coin_gecko_2022-03-17.csv,...,1.495,0.007071068,0.14969,0.195501,1.5,120457113.0,616145134.0,-0.006667,-0.467486,0.002208
4,AIOZ Network,AIOZ,0.237396,0.047,0.287,0.241,13715452.0,114207956.0,2022-03-16,coin_gecko_2022-03-16.csv,...,0.237396,,0.120092,,,,,,,
5,AIOZ Network,AIOZ,0.214561,-0.012,-0.095,-0.107,5493212.0,103148149.0,2022-03-17,coin_gecko_2022-03-17.csv,...,0.225978,0.01614678,0.086674,0.120092,0.237396,13715452.0,114207956.0,-0.096189,-0.599487,-0.096839
6,APENFT,NFT,2e-06,0.004,0.005,-0.047,69002807.0,421365369.0,2022-03-16,coin_gecko_2022-03-16.csv,...,2e-06,,0.16376,,,,,,,
7,APENFT,NFT,2e-06,0.001,0.01,-0.052,52887977.0,426608620.0,2022-03-17,coin_gecko_2022-03-17.csv,...,2e-06,1.080459e-08,0.143867,0.16376,2e-06,69002807.0,421365369.0,0.010029,-0.233539,0.012443
8,API3,API3,4.7,0.028,-0.008,-0.101,29731985.0,230984150.0,2022-03-16,coin_gecko_2022-03-16.csv,...,4.7,,0.128719,,,,,,,
9,API3,API3,4.64,0.003,-0.012,-0.146,30688251.0,230330587.0,2022-03-17,coin_gecko_2022-03-17.csv,...,4.67,0.04242641,0.130977,0.128719,4.7,29731985.0,230984150.0,-0.012766,0.032163,-0.002829


In [6]:
import numpy as np

# log(1 + x) transforms of price, 24h volume and market cap, stored under the
# new column names on the left of each pair.
for log_name, raw_name in (
    ("log_price", "price"),
    ("log_vol", "24h_volume"),
    ("log_mcap", "mkt_cap"),
):
    df[log_name] = np.log1p(df[raw_name])


In [7]:
# Persist the frame including the lag, change and log features; the index is
# not written.
OUT_LAG = BASE.joinpath("data", "processed", "engineered_features_lag.csv")
df.to_csv(path_or_buf=OUT_LAG, index=False)
print("Saved:", OUT_LAG)


Saved: C:\Users\krpra\Desktop\Project\crypto_liquidity_project\data\processed\engineered_features_lag.csv
