# Preprocessing demo

This is a demo about data preprocessing.
We take raw transactions and convert them to the `ptls` format (a list of feature dicts).

Each paragraph shows a small example.

# Colab setup

In [1]:
import sys
if "google.colab" in str(get_ipython()):
    ! {sys.executable} -m pip install pytorch-lifestream

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pytorch-lightning>=1.6.0 (from pytorch-lifestream)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.9.0 (from pytorch-lifestream)
  Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core>=1.1.2->pytorch-lifestream)
  Downloading antlr4-python3-runtime-4.

# Common tools for this demo

In [2]:
import pandas as pd

from IPython.display import display
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import FeatureDict

In [3]:
def load_trx(keep_original_trans_date=False, nrows=1_000_000):
    """
    Load the demo transactions as a pandas DataFrame.

    Only for demo purposes we:
        - convert `trans_date` to a date string (unless `keep_original_trans_date` is set).
          Usually a date-time field already has a datetime format.
        - load only the first `nrows` records (1M by default), so the demo is faster.

    Parameters
    ----------
    keep_original_trans_date : bool
        If True, keep `trans_date` exactly as read from the CSV.
        If False, treat it as a day offset from 2000-01-01 and replace it
        with the resulting date converted to a string.
    nrows : int or None
        Number of rows to read from the beginning of the file.
        `None` reads the whole file.

    Returns
    -------
    pd.DataFrame
        The transactions, with `amount_rur` cast to float32.
    """

    path = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true"

    # `read_csv` documents `nrows` as an int, so normalise values such as 1e6.
    row_limit = None if nrows is None else int(nrows)
    df = pd.read_csv(path, compression="gzip", nrows=row_limit)

    if not keep_original_trans_date:
        df["trans_date"] = (
            pd.to_datetime("2000-01-01") + pd.to_timedelta(df["trans_date"], unit="D")
        ).astype(str)

    df["amount_rur"] = df["amount_rur"].astype("float32")

    return df

In [4]:
def load_target():
    """
    Load the target table of the demo dataset.

    Returns the CSV content as a DataFrame, read with pandas defaults.
    """
    url = "https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true"
    return pd.read_csv(url)

In [5]:
def print_feature_dict(d):
    """
    Print every feature of a feature dict, one feature per line.

    Features that `FeatureDict.is_seq_feature` reports as sequences are cut
    to their first 10 values; any other value is printed in full.
    """
    lines = []
    for name, value in d.items():
        if FeatureDict.is_seq_feature(name, value):
            lines.append(f"{name}: {value[:10]} (first 10 values)")
        else:
            lines.append(f"{name}: {value}")
    print("\n".join(lines))

# Base Example

In [6]:
df_trx = load_trx()
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,2000-01-07,4,71.462997
1,33172,2000-01-07,35,45.016998
2,33172,2000-01-09,11,13.887000
3,33172,2000-01-10,11,15.983000
4,33172,2000-01-11,11,21.341000
...,...,...,...,...
999995,19189,2001-10-26,36,98.331001
999996,19189,2001-10-26,15,17.247000
999997,19189,2001-10-26,24,150.645004
999998,19189,2001-10-26,1,51.473999


In [7]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",  # the output has one record per unique client
    col_event_time="trans_date",  # appears as `event_time` (timestamp) in the output
    cols_category=["small_group"],  # encoded to embedding indexes
    cols_numerical=["amount_rur"],  # values are kept as they are
)

In [8]:
df_data = preprocessor.fit_transform(df_trx)

In [9]:
print(f"Unique clients count: {len(df_data)}")

Unique clients count: 1145


In [10]:
print_feature_dict(df_data[0])

client_id: 6
event_time: tensor([946684800, 947116800, 947548800, 947635200, 947980800, 947980800,
        948067200, 948067200, 948153600, 948240000]) (first 10 values)
small_group: tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]) (first 10 values)
amount_rur: tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]) (first 10 values)


In [11]:
preprocessor.get_category_dictionary_sizes()

{'small_group': 186}

Pay attention:
- `client_id` is presented as a scalar value
- `small_group` is converted to embedding indexes with frequency encoding. The dictionary size is 186, including the 0 padding token.
- `amount_rur` is not changed
- `event_time` is converted to a timestamp

# Join target

## before preprocessing

In [12]:
df_trx = load_trx()
df_target = load_target()

In [13]:
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,2000-01-07,4,71.462997
1,33172,2000-01-07,35,45.016998
2,33172,2000-01-09,11,13.887000
3,33172,2000-01-10,11,15.983000
4,33172,2000-01-11,11,21.341000
...,...,...,...,...
999995,19189,2001-10-26,36,98.331001
999996,19189,2001-10-26,15,17.247000
999997,19189,2001-10-26,24,150.645004
999998,19189,2001-10-26,1,51.473999


In [14]:
df_target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [15]:
# Attach the target to every transaction of the client.
# Dropping a previously merged `bins` column first keeps this cell re-runnable:
# without it, a second run would produce `bins_x` / `bins_y` instead of `bins`.
df_trx = pd.merge(
    df_trx.drop(columns=["bins"], errors="ignore"),
    df_target,
    on="client_id",
    how="inner",
)
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur,bins
0,33172,2000-01-07,4,71.462997,0
1,33172,2000-01-07,35,45.016998,0
2,33172,2000-01-09,11,13.887000,0
3,33172,2000-01-10,11,15.983000,0
4,33172,2000-01-11,11,21.341000,0
...,...,...,...,...,...
999995,19189,2001-10-26,36,98.331001,0
999996,19189,2001-10-26,15,17.247000,0
999997,19189,2001-10-26,24,150.645004,0
999998,19189,2001-10-26,1,51.473999,0


> **Note:**
> The target is duplicated: every row of a given `client_id` has the same `bins` value.

In [16]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    cols_first_item=["bins"],  # The target is the same for all of a client's transactions, so take it from the first record
)

In [17]:
df_data = preprocessor.fit_transform(df_trx)

In [18]:
print(f"Unique clients count: {len(df_data)}")

Unique clients count: 1145


In [19]:
print_feature_dict(df_data[0])

client_id: 6
bins: 1
event_time: tensor([946684800, 947116800, 947548800, 947635200, 947980800, 947980800,
        948067200, 948067200, 948153600, 948240000]) (first 10 values)
small_group: tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]) (first 10 values)
amount_rur: tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]) (first 10 values)


Pay attention:
- `client_id` is presented as a scalar value
- `small_group` is converted to embedding indexes with frequency encoding
- `amount_rur` is not changed
- `event_time` is converted to a timestamp
- the target value `bins` is in the dict

## after preprocessing

In [20]:
df_trx = load_trx()
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,2000-01-07,4,71.462997
1,33172,2000-01-07,35,45.016998
2,33172,2000-01-09,11,13.887000
3,33172,2000-01-10,11,15.983000
4,33172,2000-01-11,11,21.341000
...,...,...,...,...
999995,19189,2001-10-26,36,98.331001
999996,19189,2001-10-26,15,17.247000
999997,19189,2001-10-26,24,150.645004
999998,19189,2001-10-26,1,51.473999


In [21]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=False,  # change output type: return a DataFrame instead of a list of feature dicts
)

In [22]:
df_data = preprocessor.fit_transform(df_trx)

In [23]:
print(f"Unique clients count: {len(df_data)}")

Unique clients count: 1145


> **Note:**
> `df_data` is not a `list` here; it is a `DataFrame`.

In [24]:
df_target = load_target()
df_target

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [25]:
# Join the target onto the preprocessed frame, then turn every row into a dict.
df_data = (
    df_data
    .merge(df_target, on="client_id", how="inner")
    .to_dict(orient="records")
)

In [26]:
print(f"Unique clients count: {len(df_data)}")

Unique clients count: 1145


In [27]:
print_feature_dict(df_data[0])

client_id: 6
event_time: tensor([946684800, 947116800, 947548800, 947635200, 947980800, 947980800,
        948067200, 948067200, 948153600, 948240000]) (first 10 values)
small_group: tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]) (first 10 values)
amount_rur: tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]) (first 10 values)
bins: 1


Pay attention:
- `client_id` is presented as a scalar value
- `small_group` is converted to embedding indexes with frequency encoding
- `amount_rur` is not changed
- `event_time` is converted to a timestamp
- the target value `bins` is in the dict

# Without data preprocessing

The original `trans_date` in this dataset is not a datetime, but we can still use it for ordering transactions.

In [28]:
df_trx = load_trx(keep_original_trans_date=True)
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.462997
1,33172,6,35,45.016998
2,33172,8,11,13.887000
3,33172,9,11,15.983000
4,33172,10,11,21.341000
...,...,...,...,...
999995,19189,664,36,98.331001
999996,19189,664,15,17.247000
999997,19189,664,24,150.645004
999998,19189,664,1,51.473999


In [29]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",  # no transformation: `event_time` keeps the original `trans_date` values
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
)

In [30]:
df_data = preprocessor.fit_transform(df_trx)

In [31]:
print(f"Unique clients count: {len(df_data)}")

Unique clients count: 1145


In [32]:
print_feature_dict(df_data[0])

client_id: 6
trans_date: tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]) (first 10 values)
event_time: tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]) (first 10 values)
small_group: tensor([ 4,  3,  1,  3,  4,  1,  4,  3, 22,  2]) (first 10 values)
amount_rur: tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]) (first 10 values)


Pay attention:
- `client_id` is presented as a scalar value
- `small_group` is converted to embedding indexes with frequency encoding
- `amount_rur` is not changed
- `event_time` keeps the original value. Transactions are ordered

# Category identity encoding

`small_group` in our dataset is an encoded MCC code. It seems we don't need frequency encoding.

In [33]:
df_trx = load_trx(keep_original_trans_date=True)
df_trx

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.462997
1,33172,6,35,45.016998
2,33172,8,11,13.887000
3,33172,9,11,15.983000
4,33172,10,11,21.341000
...,...,...,...,...
999995,19189,664,36,98.331001
999996,19189,664,15,17.247000
999997,19189,664,24,150.645004
999998,19189,664,1,51.473999


In [34]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",  # no transformation: `event_time` keeps the original `trans_date` values
    cols_category=["small_group"],
    category_transformation="none",  # identity encoding: use the category values as indexes instead of frequency encoding
    cols_numerical=["amount_rur"],
)

In [35]:
df_data = preprocessor.fit_transform(df_trx)

  self.min_fit_index, self.max_fit_index = pd_col.agg([min, max])
  self.min_fit_index, self.max_fit_index = pd_col.agg([min, max])
  min_index, max_index = pd_col.agg([min, max])
  min_index, max_index = pd_col.agg([min, max])


> Oops. 0 is the padding token. Let's shift the values by 1.

In [36]:
# Shift the category codes up by 1 so that index 0 stays reserved for the padding token.
# NOTE: this mutates `df_trx` in place, so re-running this cell shifts the codes again.
df_trx["small_group"] = df_trx["small_group"] + 1

In [37]:
df_data = preprocessor.fit_transform(df_trx)

  self.min_fit_index, self.max_fit_index = pd_col.agg([min, max])
  self.min_fit_index, self.max_fit_index = pd_col.agg([min, max])
  min_index, max_index = pd_col.agg([min, max])
  min_index, max_index = pd_col.agg([min, max])


In [38]:
print(f"Unique clients count: {len(df_data)}")

Unique clients count: 1145


In [39]:
print_feature_dict(df_data[0])

client_id: 6
trans_date: tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]) (first 10 values)
event_time: tensor([ 0,  5, 10, 11, 15, 15, 16, 16, 17, 18]) (first 10 values)
small_group: tensor([16,  4,  2,  4, 16,  2, 16,  4, 38, 12]) (first 10 values)
amount_rur: tensor([ 4.0540, 13.7380, 20.7010, 21.5640, 13.4990, 23.7220,  4.3040,  8.6250,
        12.9380, 28.1620]) (first 10 values)


In [40]:
preprocessor.get_category_dictionary_sizes()

{'small_group': 189}

Pay attention:
- `client_id` is presented as a scalar value
- `small_group` holds embedding indexes, but the original values (shifted by 1) are used. The dictionary has 189 indexes including the 0 padding token, versus 186 with frequency encoding. This means that some indexes are not present in the dataset and are never used.
- `amount_rur` is not changed
- `event_time` keeps the original value. Transactions are ordered