In [91]:
# data processing
import datetime
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, Union

import pandas as pd
import polars as pl


@dataclass(kw_only=True)
class Data:
    df: pl.DataFrame
    columns: Optional[list[str]] = None
    row_index: Optional[list[int]] = None
    date_partition_column: Optional[Union[str, list[datetime.datetime]]] = None
    partition_column: Optional[str] = None
    date_column: Optional[str] = None
    date_format: str = "%Y-%m-%d"
    target_column: str

    @staticmethod
    def load(load_path: Path, target_column: str, **kwargs) -> "Data":
        df = pl.read_csv(
            load_path,
        )
        return Data(df=df, target_column=target_column, **kwargs)

    @property
    def rendered_df(self) -> pl.DataFrame:
        df = self.df
        if self.columns is not None and len(self.columns):
            df = df.select(self.columns)
        if self.row_index is not None and len(self.row_index):
            df = df[self.row_index]
        if self.date_column is not None and not df.dtypes[df.columns.index(self.date_column)] == pl.Datetime:
            df = df.with_columns(
                pl.col(self.date_column).str.to_datetime(self.date_format)
            )
        return df

    def get_date_partitions(self):
        df = self.rendered_df

        if self.date_column is None:
            raise ValueError("date column is not set")
        start_date = df[self.date_column].min()
        end_date = df[self.date_column].max()
        if self.date_partition_column is None:
            return {"__all__": df}

        elif isinstance(self.date_partition_column, str):
            return {
                group: group_df
                for group, group_df in df.groupby([self.date_partition_column])
            }

        elif isinstance(self.date_partition_column, list) and isinstance(
            self.date_partition_column[0], datetime.datetime
        ):
            partition_dates = self.date_partition_column
            return {
                start_date: df.filter(
                    df[self.date_column].is_between(start_date, end_date)
                )
                for start_date, end_date in zip(
                    [start_date] + partition_dates, partition_dates + [end_date]
                )
            }

    def get_partitions(self):
        df = self.get_dataframe()
        if self.partition_column is not None:
            return {
                group: df_group
                for group, df_group in df.groupby([self.partition_column])
            }
        else:
            return {"__all__": df}


@dataclass(kw_only=True)
class TrainingData(Data): ...


class ExternalHoldoutData(Data): ...


class FeatureSelectionMethod: ...


def load(path: str) -> tuple[TrainingData, ExternalHoldoutData]: ...


def variable_downsampling(data: Data) -> Data: ...


def target_engineering(data: Data) -> Data: ...


def feature_selection(data: Data, method: FeatureSelectionMethod) -> Data: ...

In [92]:
pwd

[32m'/home/lukas/code/testing/test_x_flow/recipe-xflow'[0m

In [93]:
training_data = TrainingData.load(
    load_path=Path("include/x_flow/raw_data/DR_Demo_Bond_trading_RFQ_train.csv"),
    date_column="date",
    date_partition_column=[datetime.datetime(2018, 6, 3), datetime.datetime(2018, 6, 24)],
    date_format="%d/%m/%Y",
    target_column="Mid",
)
test_data = ExternalHoldoutData.load(
    load_path=Path("include/x_flow/raw_data/DR_Demo_Bond_trading_RFQ_test.csv"),
    date_column="date",
    date_format="%d/%m/%Y",
    target_column="Mid",
)

In [94]:
import datarobotx

In [95]:
from abc import ABC, abstractmethod

from utils.operator import Operator


class DataPreprocessor(ABC):
    def fit(self, df: Data) -> "DataPreprocessor":
        return self._fit(df)

    def transform(self, df: Data) -> Data:
        return self._transform(df)

    def fit_transform(self, df: Data) -> Data:
        return self._fit(df)._transform(df)

    @abstractmethod
    def _fit(self, df: Data) -> "DataPreprocessor": ...
    @abstractmethod
    def _transform(self, df: Data) -> Data: ...


class BinarizeData(DataPreprocessor):
    def __init__(
        self,
        threshold: float,
        operator: str,
        binarize_drop_regression_target=True,
        binarize_new_target_name="target_cat",
    ):
        self._threshold = threshold
        self._operator = operator
        self._binarize_drop_regression_target = binarize_drop_regression_target
        self._binarize_new_target_name = binarize_new_target_name

    def _fit(self, df: Data):
        return self

    def _transform(self, df: Data) -> pl.DataFrame:
        """helper function: binarize a target variable for classification"""
        categorical_data = df.rendered_df
        target_series = categorical_data[df.target_column]

        op_fun = Operator(operator=self._operator).apply_operation(self._threshold)

        categorical_data = categorical_data.with_columns(
            target_series.map_elements(op_fun, return_dtype=bool).alias(self._binarize_new_target_name)
        )
        if self._binarize_drop_regression_target:
            categorical_data.drop(df.target_column)

        df.df = categorical_data
        df.target_column = self._binarize_new_target_name

        return categorical_data

In [96]:
binarizer = BinarizeData(
    threshold=100, operator="<", binarize_drop_regression_target=True)

In [99]:
binarizer.fit_transform(training_data)
binarizer.fit_transform(test_data)

request_id,date,cusip,BidAsk,Mid,yield_to_maturity,years_to_maturity,years_since_issue,is_bench_02y,is_bench_05y,is_bench_10y,is_bench_30y,IssueName,AmountOut,issue_date,Coupon,Currency,maturity_date,counterparty_id,counterparty_type,name,counterparty_aum,ann_account_value_bln,tier,salesperson,salesperson_num_ac_covered,notional_EURm,notional_bps_total_issue,log_notional,resp_bidAsk_norm,response_revenue_EUR,num_brokers,trade_won,target_cat
i64,datetime[μs],str,f64,f64,f64,f64,f64,bool,bool,bool,bool,str,f64,str,f64,str,str,str,str,str,f64,f64,str,str,i64,i64,f64,f64,f64,f64,i64,bool,bool
1781,2018-04-25 00:00:00,"""D20659WR""",0.0242,106.5311,-0.492709,1.957604,7.685305,true,false,false,false,"""Government of Germany 2.25% 04…",17443.19954,"""18/08/2010""",2.25,"""Euro""","""09/04/2020""","""LO090""","""LO""","""Jackson Trust""",135.431575,50.701944,"""B""","""Louis""",3,228,130.709965,2.357935,2.299022,27923.75113,9,false,false
1782,2019-05-30 00:00:00,"""D2R8H4DN""",0.0266,102.6897,-0.606078,4.616111,0.840537,false,false,false,false,"""Government of Germany 0.0% 13-…",17443.19954,"""27/07/2018""",0.0,"""Euro""","""10/01/2024""","""BK057""","""BK""","""Offenbach am Mainer Sparkasse""",25.368607,9.650497,"""E""","""Charlotte""",30,104,59.622089,2.017033,2.42161,12932.74292,1,true,false
1784,2019-01-30 00:00:00,"""D2R8H4DE""",0.022,101.7326,-0.408307,5.013108,0.991122,false,true,false,false,"""Government of Germany 0.0% 14-…",17443.19954,"""02/02/2018""",0.0,"""Euro""","""04/02/2024""","""LO071""","""LO""","""Brownsville Investments""",185.46821,317.609109,"""A""","""Kate""",9,188,107.778392,2.274158,2.422428,23167.85569,10,false,false
1788,2019-01-23 00:00:00,"""D2R8H4CM""",0.03,101.753,-0.468608,3.460714,1.546917,false,false,false,false,"""Government of Germany 0.0% 07-…",18533.39951,"""07/07/2017""",0.0,"""Euro""","""10/07/2022""","""HF027""","""HF""","""Oceanus Capital Partners""",12.000902,164.476827,"""B""","""Philip""",8,695,374.998661,2.841985,2.230289,78872.79592,13,false,false
1789,2018-01-24 00:00:00,"""D20658UR""",0.13,158.2486,0.914769,16.200196,14.981827,false,false,false,false,"""Government of Germany 4.75% 04…",21803.99942,"""31/01/2003""",4.75,"""Euro""","""07/04/2034""","""HF035""","""HF""","""Atlas Alpha LLP""",8.816866,143.542231,"""C""","""Meghan""",41,261,119.70281,2.416641,7.537577,155725.778,14,false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
21979,2019-01-29 00:00:00,"""D2R8H4DJ""",0.02523,100.834215,-0.60674,1.853563,0.681739,false,false,false,false,"""Government of Germany 0.0% 12-…",13082.39965,"""25/05/2018""",0.0,"""Euro""","""06/12/2020""","""LO023""","""LO""","""New York City Global Investors""",167.298627,14.611218,"""B""","""Philip""",8,56,42.805603,1.748188,2.796823,7897.420481,10,false,false
21986,2018-10-30 00:00:00,"""D2R8H4CD""",0.02686,101.54061,-0.444156,3.761884,1.735833,false,false,false,false,"""Government of Germany 0.0% 08-…",19623.59948,"""03/02/2017""",0.0,"""Euro""","""04/08/2022""","""HF039""","""HF""","""Helios Partners""",9.575365,10.705201,"""E""","""Edward""",21,14,7.134267,1.146128,3.776311,2684.497671,13,false,false
21987,2019-07-29 00:00:00,"""D20659F3""",0.0239,107.83025,-0.759609,1.692027,8.249314,false,false,false,false,"""Government of Germany 3.25% 04…",20713.79945,"""29/04/2011""",3.25,"""Euro""","""07/04/2021""","""LO015""","""LO""","""Indianapolis Advisors""",26.797261,51.242625,"""D""","""Camilla""",17,41,19.793568,1.612784,2.785548,6158.191403,10,false,false
21989,2018-07-23 00:00:00,"""D20658KG""",0.1028,143.1976,0.328738,9.708618,20.052431,false,false,true,false,"""Government of Germany 4.75% 04…",12264.74968,"""04/07/1998""",4.75,"""Euro""","""07/04/2028""","""LO141""","""LO""","""Cleveland Capital""",240.47282,151.581871,"""A""","""Kate""",9,187,152.46948,2.271842,7.585086,101592.9891,4,false,false


In [88]:
import polars as pl

# Sample DataFrame
df = pl.DataFrame({
    "values": [10, 20, 30, 40, 50]
})

# Define the threshold
threshold = 25

# Applying a function to create a new boolean column
df = df.with_columns(
    df["values"].apply(lambda x: x < threshold).alias("is_under_threshold")
)

print(df)

shape: (5, 2)
┌────────┬────────────────────┐
│ values ┆ is_under_threshold │
│ ---    ┆ ---                │
│ i64    ┆ bool               │
╞════════╪════════════════════╡
│ 10     ┆ true               │
│ 20     ┆ true               │
│ 30     ┆ false              │
│ 40     ┆ false              │
│ 50     ┆ false              │
└────────┴────────────────────┘
