In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import polars as pl
import numpy as np
from datetime import datetime, timedelta

from centimators.data_transformers import (
    RankTransformer,
    LagTransformer,
    MovingAverageTransformer,
    LogReturnTransformer,
)

## Mock price data

In [39]:
# Reproducibility: every random draw below comes from this seeded generator,
# so Restart & Run All regenerates exactly the same mock prices.
SEED = 42
N_DAYS = 90
N_TICKERS = 20
rng = np.random.default_rng(SEED)

# Daily timestamps, oldest first. The clock is read exactly once so that
# consecutive entries are precisely one day apart.
end_date = datetime.now()
dates = [end_date - timedelta(days=offset) for offset in reversed(range(N_DAYS))]

# Generate 20 tickers: "Ticker1" .. "Ticker20"
tickers = [f"Ticker{i}" for i in range(1, N_TICKERS + 1)]

# Generate random OHLCV data, stored column-wise (one list per column) so the
# same dict can be handed to both pandas and polars.
data = {
    "ticker": [],
    "date": [],
    "open": [],
    "high": [],
    "low": [],
    "close": [],
    "volume": [],
}

for ticker in tickers:
    # Start with random base price between 10 and 1000
    base_price = rng.uniform(10, 1000)
    for date in dates:
        # Generate daily price movements
        daily_return = rng.normal(0.005, 0.03)  # Mean 0.5%, std 3%
        close = base_price * (1 + daily_return)
        open_price = close * (1 + rng.normal(0, 0.005))

        # Anchor the daily range on BOTH open and close so every bar is a
        # valid candle: low <= min(open, close) and high >= max(open, close).
        # (round() is monotonic, so the ordering survives the rounding below.)
        high = max(open_price, close) * (1 + abs(rng.normal(0, 0.01)))
        low = min(open_price, close) * (1 - abs(rng.normal(0, 0.01)))
        volume = int(rng.lognormal(10, 1))

        data["ticker"].append(ticker)
        data["date"].append(date)
        data["open"].append(round(open_price, 2))
        data["high"].append(round(high, 2))
        data["low"].append(round(low, 2))
        data["close"].append(round(close, 2))
        data["volume"].append(volume)

        base_price = close  # Use today's close as tomorrow's base price

# Materialize the same column dict in both libraries: pandas first, then polars.
df_pandas, df_polars = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Line chart of the close price over time, one colored line per ticker.
(
    df_polars.plot.line(x="date", y="close", color="ticker")
    .properties(width=800, height=400, title="Stock Prices Over Time")
)

## Instantiate transformers

In [None]:
# Window settings are kept in named lists because later cells rebuild the
# transformers from the same values.
lag_windows = [0, 1, 2, 3, 4]
ma_windows = [5, 20, 50]

# One instance of each transformer, all annotated the same way.
log_return_transformer: LogReturnTransformer = LogReturnTransformer()
ranker: RankTransformer = RankTransformer()
lagger: LagTransformer = LagTransformer(windows=lag_windows)
ma_transformer: MovingAverageTransformer = MovingAverageTransformer(
    windows=ma_windows
)

# Show all four estimators with their rich reprs. An explicit display() is
# needed because a notebook cell only auto-renders its final expression.
display(log_return_transformer, ranker, lagger, ma_transformer)

## In a pipeline

In [59]:
from sklearn import set_config
from sklearn.pipeline import make_pipeline

# Switch on scikit-learn's metadata routing for the whole session. It is
# enabled here, before the pipeline cells, so that the `set_transform_request`
# calls and the extra keyword arguments passed to `fit_transform` further down
# are accepted and forwarded to the individual pipeline steps.
set_config(enable_metadata_routing=True)

In [None]:
# Use scikit-learn metadata routing API (i.e. set_transform_request)
lagger = LagTransformer(windows=lag_windows).set_transform_request(ticker_series=True)
ranker = RankTransformer().set_transform_request(date_series=True)
ma_transformer = MovingAverageTransformer(windows=ma_windows).set_transform_request(
    ticker_series=True
)
log_return_transformer = LogReturnTransformer().set_transform_request(
    ticker_series=True
)

lagged_ranker = make_pipeline(log_return_transformer, ranker, lagger, ma_transformer)
display(lagged_ranker)

In [None]:
# Columns fed through the pipeline; the date and ticker columns are passed
# separately as routed metadata rather than as features.
feature_names = ["open", "close", "volume"]
transformed_df = lagged_ranker.fit_transform(
    df_polars.select(feature_names),
    date_series=df_polars.get_column("date"),
    ticker_series=df_polars.get_column("ticker"),
)
transformed_df

In [None]:
# Visualization of the transformation into features
chart_df = pl.concat([df_polars, transformed_df], how="horizontal")
(
    chart_df.plot.line(x="date", y="close", color="ticker").properties(
        width=400, height=400, title="Stock Prices Over Time"
    )
    | chart_df.plot.line(
        x="date", y="close_log_return_rank_lag_0_ma20", color="ticker"
    ).properties(width=400, height=400, title="Normalized and Smoothed Average Features")
).interactive()