## Colab install

In [None]:
import sys
# Install the notebook's dependencies only when the IPython shell reports
# that it is running inside Google Colab; outside Colab nothing is installed.
if "google.colab" in str(get_ipython()):
    # pip is invoked through the kernel's own interpreter (sys.executable) so
    # the packages are installed for the Python that runs this notebook.
    ! {sys.executable} -m pip install pytorch-lifestream
    ! {sys.executable} -m pip install pyspark

Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.6.0.tar.gz (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/163.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.4/163.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2 (from pytorch-lifestream)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting omegaconf (from pytorch-lifestream)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pytorch-lightning>=1.6.0 (from pytorch-lifestream)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (2

## Libraries

In [None]:
import os
import pyspark
import torchmetrics
import torch
import torch.nn as nn
import pytorch_lightning as pl

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, lpad
from pyspark.sql.functions import array, max, when
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.sql.window import Window

from ptls.frames import PtlsDataModule
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.data_load.datasets import PersistDataset
from ptls.data_load.datasets import ParquetDataset, ParquetFiles
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PysparkDataPreprocessor
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget

from tqdm import tqdm
from functools import partial
from urllib.request import urlretrieve

## Spark init


In [None]:
data_path = "data/"

# Resource and runtime settings for the Spark session, applied in order.
spark_settings = {
    "spark.driver.maxResultSize": "4g",
    "spark.executor.memory": "16g",
    "spark.executor.memoryOverhead": "4g",
    "spark.driver.memory": "16g",
    "spark.driver.memoryOverhead": "4g",
    "spark.cores.max": "24",
    "spark.sql.shuffle.partitions": "200",
    "spark.local.dir": "../../spark_local_dir",
}

spark_conf = pyspark.SparkConf()
spark_conf.setMaster("local[*]")
spark_conf.setAppName("PysparkDataPreprocessor")
for setting_name, setting_value in spark_settings.items():
    spark_conf.set(setting_name, setting_value)

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Last expression of the cell: display the configuration of the live context.
spark.sparkContext.getConf().getAll()

[('spark.driver.memoryOverhead', '4g'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.app.id', 'local-1723911551703'),
 ('spark.driver.port'

## Processing data

In [None]:
# Remote location of the gzip-compressed transactions CSV.
path = "https://huggingface.co/datasets/dllllb/transactions-gender/resolve/main/transactions.csv.gz?download=true"
archive_name = "transactions.csv.gz"

# Fetch the archive only when it is not on disk yet, so re-running the notebook
# does not download it again. The transfer is written to a temporary name and
# renamed on success, so an interrupted download is never mistaken for a
# complete archive on the next run.
if not os.path.exists(archive_name):
    unfinished_name = archive_name + ".part"
    urlretrieve(path, unfinished_name)
    os.replace(unfinished_name, archive_name)

# `data` is the local path of the archive that the CSV reader consumes.
data = archive_name

In [None]:
# Read the CSV with a header row and let Spark infer the column types.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
source_data = csv_reader.csv(data)

# Preview the first rows of the raw table.
source_data.show(n=5)

+-----------+-----------+--------+-------+---------+-------+
|customer_id|tr_datetime|mcc_code|tr_type|   amount|term_id|
+-----------+-----------+--------+-------+---------+-------+
|   39026145| 0 10:23:26|    4814|   1030| -2245.92|   NULL|
|   39026145| 1 10:19:29|    6011|   7010| 56147.89|   NULL|
|   39026145| 1 10:20:56|    4829|   2330|-56147.89|   NULL|
|   39026145| 1 10:39:54|    5499|   1010| -1392.47|   NULL|
|   39026145| 2 15:33:42|    5499|   1010|  -920.83|   NULL|
+-----------+-----------+--------+-------+---------+-------+
only showing top 5 rows



In [None]:
source_data = (
    source_data
    # Left-pad tr_datetime with "0" to a fixed width of 16 characters
    # (presumably so the strings order chronologically - confirm).
    .withColumn("tr_datetime", lpad(col("tr_datetime"), 16, "0"))
    # Drop by column *name*: this is a no-op when term_id is already gone,
    # so the cell can be re-run safely. Referencing `source_data.term_id`
    # instead raises AttributeError on a second run.
    .drop("term_id")
)
source_data.show(5)

+-----------+----------------+--------+-------+---------+
|customer_id|     tr_datetime|mcc_code|tr_type|   amount|
+-----------+----------------+--------+-------+---------+
|   39026145|0000000 10:23:26|    4814|   1030| -2245.92|
|   39026145|0000001 10:19:29|    6011|   7010| 56147.89|
|   39026145|0000001 10:20:56|    4829|   2330|-56147.89|
|   39026145|0000001 10:39:54|    5499|   1010| -1392.47|
|   39026145|0000002 15:33:42|    5499|   1010|  -920.83|
+-----------+----------------+--------+-------+---------+
only showing top 5 rows



Take the N=50 most frequent MCC codes; they define the labels of the multi-label target.

In [None]:
num_labels = 50

# Number of transactions per MCC code.
mcc_frequency = source_data.groupBy("mcc_code").count()

# Keep the `num_labels` codes with the highest counts, most frequent first.
top_mcc_rows = (
    mcc_frequency
    .orderBy(col("count").desc())
    .select("mcc_code")
    .limit(num_labels)
    .collect()
)

# Each collected row has a single field: the MCC code itself.
mcc_codes = [row[0] for row in top_mcc_rows]

In [None]:
def get_target(data, mcc_codes):
    """Build a per-customer multi-hot target over the given MCC codes.

    Args:
        data: Spark DataFrame with ``customer_id`` and ``mcc_code`` columns.
        mcc_codes: MCC code values; their order fixes the position of each
            code inside the resulting ``target`` array.

    Returns:
        Spark DataFrame with the columns ``customer_id`` and ``target``.
        ``target[i]`` is 1 when the customer has at least one row whose
        ``mcc_code`` equals ``mcc_codes[i]`` and 0 otherwise.
    """
    # Function-local alias so the aggregate `max` is unambiguous and does not
    # depend on how the notebook imported names from pyspark.sql.functions.
    from pyspark.sql import functions as F

    mcc_codes = list(mcc_codes)

    # One aggregate per code: max over a 0/1 indicator equals "code seen at
    # least once". Everything is computed in a single aggregation instead of
    # adding one column per code with repeated withColumn calls, which grows
    # the query plan with every iteration.
    indicators = [
        F.max(F.when(F.col("mcc_code") == code, 1).otherwise(0))
        for code in mcc_codes
    ]

    return data.groupBy("customer_id").agg(F.array(*indicators).alias("target"))

In [None]:
# Build the per-customer target from the raw transactions and preview it.
target = get_target(data=source_data, mcc_codes=mcc_codes)
target.show(n=5)

100%|██████████| 50/50 [00:02<00:00, 17.77it/s]


+-----------+--------------------+
|customer_id|              target|
+-----------+--------------------+
|    1804650|[1, 1, 1, 1, 1, 0...|
|   57210145|[1, 1, 1, 1, 0, 0...|
|   28064405|[1, 1, 1, 1, 1, 1...|
|   52646727|[1, 1, 1, 1, 1, 1...|
|    9459005|[1, 0, 1, 1, 1, 1...|
+-----------+--------------------+
only showing top 5 rows



In [None]:
# Column roles handed to the ptls preprocessor: the id column, the event-time
# column (with event_time_transformation set to "none"), and the categorical
# and numerical feature columns.
preprocessor_options = {
    "col_id": "customer_id",
    "col_event_time": "tr_datetime",
    "event_time_transformation": "none",
    "cols_category": ["tr_type", "mcc_code"],
    "cols_numerical": ["amount"],
}
preprocessor_pyspark = PysparkDataPreprocessor(**preprocessor_options)

In [None]:
# Fit the preprocessor on the transactions and mark the result for caching.
preprocessed = preprocessor_pyspark.fit_transform(source_data).persist()

# Remove the tr_datetime column before attaching the target.
features = preprocessed.drop(preprocessed.tr_datetime)

# Inner-join the per-customer target and drop the target's copy of the key so
# that a single customer_id column remains.
same_customer = features.customer_id == target.customer_id
data = features.join(target, same_customer).drop(target.customer_id)

## Preparing data for training

In [None]:
# Random 75% / 10% / 15% split into train / validation / test parts.
# A fixed seed makes the sampling repeatable between runs of the notebook.
data_train, data_valid, data_test = data.randomSplit([0.75, 0.1, 0.15], seed=42)

In [None]:
# Each split and the parquet location it is written to; existing output at
# the same location is overwritten.
split_outputs = [
    (data_train, "/content/train.parquet"),
    (data_valid, "/content/valid.parquet"),
    (data_test, "/content/test.parquet"),
]
for split_frame, parquet_path in split_outputs:
    split_frame.write.mode("overwrite").parquet(parquet_path)

In [None]:
# Training split: read through a SeqLenFilter configured with min_seq_len=15.
train_files = ParquetFiles("/content/train.parquet")
min_length_filter = SeqLenFilter(min_seq_len=15)
train_dataset = ParquetDataset(train_files, i_filters=[min_length_filter])

# Validation and test splits are read without any filter.
valid_files = ParquetFiles("/content/valid.parquet")
valid_dataset = ParquetDataset(valid_files)

test_files = ParquetFiles("/content/test.parquet")
test_dataset = ParquetDataset(test_files)

In [None]:
# Wrap every split in a PersistDataset, in train / valid / test order
# (presumably this materialises the records in memory - confirm in ptls docs).
train_dataset, valid_dataset, test_dataset = [
    PersistDataset(data=split)
    for split in (train_dataset, valid_dataset, test_dataset)
]

In [None]:
# Report how many records each split holds.
split_sizes = [
    len(dataset) for dataset in (train_dataset, valid_dataset, test_dataset)
]
size_report = "Train size: {} , Valid size: {}, Test size: {}".format(*split_sizes)
print(size_report)

Train size: 11121 , Valid size: 1480, Test size: 2272


## Define encoder

In [None]:
# Display the dictionary size the fitted preprocessor reports for each
# categorical column.
preprocessor_pyspark.get_category_dictionary_sizes()

{'tr_type': 79, 'mcc_code': 186}

In [None]:
# Read the tr_type vocabulary size from the fitted preprocessor instead of
# hard-coding it, so the embedding table stays in sync with the data.
tr_type_dict_size = preprocessor_pyspark.get_category_dictionary_sizes()["tr_type"]

# Only tr_type (embedded) and amount (passed through as "identity") are used
# as inputs; mcc_code is not an input feature here, and the target is built
# from it.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            "tr_type": {"in": tr_type_dict_size, "out": 31},
        },
        numeric_values={
            "amount": "identity",
        },
        embeddings_noise=0.001,
    ),
    hidden_size=32,
)

## Define classification model

In [None]:
# All three splits read the label from the "target" column as float tensors,
# so the shared arguments are bound once.
as_target_dataset = partial(
    SeqToTargetDataset,
    target_col_name="target",
    target_dtype=torch.float,
)

sup_data = PtlsDataModule(
    train_data=as_target_dataset(train_dataset),
    valid_data=as_target_dataset(valid_dataset),
    test_data=as_target_dataset(test_dataset),
    train_batch_size=16,
    valid_batch_size=16,
    train_num_workers=8,
)

In [None]:
# One output per MCC code in the target array. Deriving the value from
# `mcc_codes` keeps the head width equal to the target width even when the
# number of selected codes is changed.
num_labels = len(mcc_codes)

# Linear layer followed by a sigmoid, giving one value in (0, 1) per label.
# The input width is read from the encoder rather than repeating its
# hidden size as a literal.
classifier = nn.Sequential(
    nn.Linear(seq_encoder.embedding_size, num_labels),
    nn.Sigmoid(),
)

In [None]:
# Pieces of the supervised module, named before they are assembled.
multilabel_accuracy = torchmetrics.Accuracy(task="multilabel", num_labels=num_labels)
make_optimizer = partial(torch.optim.Adam)
make_lr_scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5)

sup_module = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=classifier,
    loss=nn.BCELoss(),
    metric_list=multilabel_accuracy,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_lr_scheduler,
)

## Training

In [None]:
# Train on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    accelerator_name = "cuda"
else:
    accelerator_name = "cpu"

trainer = pl.Trainer(
    max_epochs=10,
    accelerator=accelerator_name,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Fit the supervised module on the data module, then print the metrics the
# trainer has logged.
trainer.fit(sup_module, sup_data)
print(trainer.logged_metrics)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type          | Params | Mode 
--------------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 8.8 K  | train
1 | head          | Sequential    | 1.7 K  | train
2 | loss          | BCELoss       | 0      | train
3 | train_metrics | ModuleDict    | 0      | train
4 | valid_metrics | ModuleDict    | 0      | train
5 | test_metrics  | ModuleDict    | 0      | train
--------------------------------------------------------
10.5 K    Trainable params
0         Non-trainable params
10.5 K    Total params
0.042     Total estimated model params size (MB)
22        Modules in train mode
0         Modules in eval mode
  self.pid = os.fork()
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


{'loss': tensor(0.3683), 'seq_len': tensor(143.), 'y': tensor(11.), 'val_loss': tensor(0.3650), 'valid/MultilabelAccuracy': tensor(0.8232)}


In [None]:
# Evaluate on the test split using the checkpoint the trainer resolves as "best".
test_loader = sup_data.test_dataloader()
trainer.test(dataloaders=test_loader, ckpt_path="best")

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=9-step=6960.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=9-step=6960.ckpt
  self.pid = os.fork()


  self.pid = os.fork()


[{'test/MultilabelAccuracy': 0.8220246434211731}]