### 🔹 Modeling Setup
Define features (X) and target (y). Remove NaNs due to lag features.  
Split dataset into train and test for model evaluation.  


In [1]:
import pandas as pd
from pathlib import Path

BASE = Path.cwd().parent if Path.cwd().name=="notebooks" else Path.cwd()
DATA = BASE/"data"/"processed"/"engineered_features_lag.csv"

df = pd.read_csv(DATA, parse_dates=["date"])
print("Shape:", df.shape)
df.head()


Shape: (993, 25)


Unnamed: 0,coin,symbol,price,1h,24h,7d,24h_volume,mkt_cap,date,source_file,...,liquidity_ratio_lag1,price_lag1,24h_volume_lag1,mkt_cap_lag1,price_ret_1d,vol_chg_1d,mcap_chg_1d,log_price,log_vol,log_mcap
0,0x,ZRX,0.509791,0.025,0.035,0.028,29309302.0,427933388.0,2022-03-16,coin_gecko_2022-03-16.csv,...,,,,,,,,0.411971,17.193416,19.874478
1,0x,ZRX,0.51816,0.003,0.016,-0.01,24224308.0,439124277.0,2022-03-17,coin_gecko_2022-03-17.csv,...,0.06849,0.509791,29309302.0,427933388.0,0.016417,-0.173494,0.026151,0.417499,17.002867,19.900293
2,1inch,1INCH,1.5,0.021,0.111,0.163,120457113.0,616145134.0,2022-03-16,coin_gecko_2022-03-16.csv,...,,,,,,,,0.916291,18.606804,20.238993
3,1inch,1INCH,1.49,0.008,-0.003,0.087,64145152.0,617505356.0,2022-03-17,coin_gecko_2022-03-17.csv,...,0.195501,1.5,120457113.0,616145134.0,-0.006667,-0.467486,0.002208,0.912283,17.976659,20.241198
4,AIOZ Network,AIOZ,0.237396,0.047,0.287,0.241,13715452.0,114207956.0,2022-03-16,coin_gecko_2022-03-16.csv,...,,,,,,,,0.213009,16.434034,18.553532


In [2]:
import numpy as np

target = "liquidity_ratio"

# Base engineered features (no target-derived current EMA)
base_features = [
    "price","1h","24h","7d","24h_volume","mkt_cap",
    "price_ma_3","price_ma_5","vol_3d"
]

# Add lagged features (strictly past)
lag_features = [
    "liquidity_ratio_lag1","price_lag1","24h_volume_lag1","mkt_cap_lag1",
    "price_ret_1d","vol_chg_1d","mcap_chg_1d"
]

# Log features for skewed vars
df["log_price"] = np.log1p(df["price"])
df["log_vol"]   = np.log1p(df["24h_volume"])
df["log_mcap"]  = np.log1p(df["mkt_cap"])

features = base_features + lag_features + ["log_price","log_vol","log_mcap"]

# Drop rows where any lag-based feature is NaN (first day per coin)
use = df.dropna(subset=[c for c in lag_features if c in df.columns]).copy()

X = use[features].fillna(0)
y = use[target].astype(float)

print("Rows after dropping first-day lag NaNs:", len(use), "of", len(df))
print("Features:", features)


Rows after dropping first-day lag NaNs: 489 of 993
Features: ['price', '1h', '24h', '7d', '24h_volume', 'mkt_cap', 'price_ma_3', 'price_ma_5', 'vol_3d', 'liquidity_ratio_lag1', 'price_lag1', '24h_volume_lag1', 'mkt_cap_lag1', 'price_ret_1d', 'vol_chg_1d', 'mcap_chg_1d', 'log_price', 'log_vol', 'log_mcap']


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])

Train size: 391 Test size: 98


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results[name] = {
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "MAE": mean_absolute_error(y_test, preds),
        "R2": r2_score(y_test, preds),
    }

results


{'LinearRegression': {'RMSE': 0.3084661554072961,
  'MAE': 0.08100489099237934,
  'R2': 0.4951826881383419},
 'RandomForest': {'RMSE': 0.14232293339940055,
  'MAE': 0.03168710025033905,
  'R2': 0.8925345228445103}}

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Train/test split reused from Cell [3]
# Define helper to compute metrics in ORIGINAL target scale via log1p trick
def fit_predict_eval(model, Xtr, ytr, Xte, yte, log_target=True):
    if log_target:
        ytr_t = np.log1p(ytr)
        model.fit(Xtr, ytr_t)
        p_t = model.predict(Xte)
        p = np.expm1(p_t)
    else:
        model.fit(Xtr, ytr)
        p = model.predict(Xte)
    return {
        "RMSE": float(np.sqrt(mean_squared_error(yte, p))),
        "MAE": float(mean_absolute_error(yte, p)),
        "R2": float(r2_score(yte, p)),
    }

models_fixed = {
    "LinearRegression": Pipeline([("scaler", StandardScaler()), ("est", LinearRegression())]),
    "RidgeCV": Pipeline([("scaler", StandardScaler()), ("est", RidgeCV(alphas=[0.01,0.1,1.0,10.0,100.0]))]),
    "LassoCV": Pipeline([("scaler", StandardScaler()), ("est", LassoCV(alphas=[1e-4,1e-3,1e-2,1e-1], max_iter=20000, random_state=42))]),
    "RandomForest": RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(random_state=42),
    "XGB": XGBRegressor(n_estimators=800, max_depth=4, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, random_state=42, n_jobs=-1),
    "LGBM": LGBMRegressor(n_estimators=800, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, random_state=42),
}

fixed_results = {name: fit_predict_eval(mdl, X_train, y_train, X_test, y_test, log_target=True)
                 for name, mdl in models_fixed.items()}

pd.DataFrame(fixed_results).T.sort_values("RMSE")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 391, number of used features: 19
[LightGBM] [Info] Start training from score 0.072948


Unnamed: 0,RMSE,MAE,R2
RidgeCV,0.102312,0.03999,0.944464
LassoCV,0.10349,0.040574,0.943178
LinearRegression,0.140565,0.048404,0.895173
RandomForest,0.213276,0.038861,0.758674
XGB,0.263789,0.035559,0.630825
LGBM,0.364835,0.058946,0.293824
HistGB,0.372777,0.062691,0.262745


In [6]:
import joblib
from pathlib import Path

ART_DIR = (BASE/"artifacts"); (ART_DIR/"models").mkdir(parents=True, exist_ok=True)
(ART_DIR/"metrics").mkdir(parents=True, exist_ok=True)

df_results = pd.DataFrame(fixed_results).T.sort_values("RMSE")
df_results.to_csv(ART_DIR/"metrics"/"results.csv")

best_name = df_results["RMSE"].idxmin()
best_model = models_fixed[best_name]

# Refit on full data (train+test) in log-target space for deployment
y_full_t = np.log1p(y)
best_model.fit(X, y_full_t)
joblib.dump(best_model, ART_DIR/"models"/f"{best_name}_logtarget.joblib")

print("Saved metrics:", ART_DIR/"metrics"/"results.csv")
print("Best model:", best_name, "->", ART_DIR/'models'/f"{best_name}_logtarget.joblib")


Saved metrics: C:\Users\krpra\Desktop\Project\crypto_liquidity_project\artifacts\metrics\results.csv
Best model: RidgeCV -> C:\Users\krpra\Desktop\Project\crypto_liquidity_project\artifacts\models\RidgeCV_logtarget.joblib


In [7]:
# Model was trained on log1p(target), so inverse transform
y_pred = np.expm1(model.predict(X))

results_df = pd.DataFrame({
    "coin": use["coin"],
    "date": use["date"],
    "actual": y,
    "predicted": y_pred
})

results_df.head(10)


Unnamed: 0,coin,date,actual,predicted
1,0x,2022-03-17,0.055165,0.060824
3,1inch,2022-03-17,0.103878,0.110295
5,AIOZ Network,2022-03-17,0.053256,0.053501
7,APENFT,2022-03-17,0.123973,0.131143
9,API3,2022-03-17,0.133236,0.142159
11,ARPA Chain,2022-03-17,0.201198,0.227407
13,Aave,2022-03-17,0.202411,0.230862
15,Aave [OLD],2022-03-17,0.000158,0.000234
17,Aavegotchi,2022-03-17,0.246111,0.284048
19,Acala,2022-03-17,0.064324,0.060088
