## Data load

In [1]:
import os

# Download and unpack the dataset only when the transactions file is missing,
# so re-running this cell does not fetch the archive again.
if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    # Fetch the archive into the current directory
    # (-O keeps the remote file name, -L follows redirects).
    ! curl -OL https://storage.yandexcloud.net/di-datasets/age-prediction-nti-sbebank-2019.zip
    # Extract only the CSV files matching 'data/*.csv' inside the archive into ./data
    # (-j drops the archived directory structure, -o overwrites without prompting).
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    # Keep the downloaded archive next to the extracted files.
    ! mv age-prediction-nti-sbebank-2019.zip data/

## Setup

In [2]:
%load_ext autoreload
%autoreload 2

import logging
import pytorch_lightning as pl
import warnings

# Keep the notebook output readable: drop every Python warning and only let
# error-level messages from the pytorch_lightning logger through.
warnings.filterwarnings(action='ignore')

pl_logger = logging.getLogger("pytorch_lightning")
pl_logger.setLevel(logging.ERROR)

## PySpark Data Preprocessing

In [4]:
import os
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import types as T


# Folder that holds the downloaded CSV files.
data_path = 'data/'

# Spark options for this notebook, applied to the SparkConf below in this order.
spark_options = {
    "spark.driver.maxResultSize": "4g",
    "spark.executor.memory": "16g",
    "spark.executor.memoryOverhead": "4g",
    "spark.driver.memory": "16g",
    "spark.driver.memoryOverhead": "4g",
    "spark.cores.max": "24",
    "spark.sql.shuffle.partitions": "200",
    "spark.local.dir": "../../spark_local_dir",
}

# Local master using all available cores.
spark_conf = pyspark.SparkConf()
spark_conf.setMaster("local[*]")
spark_conf.setAppName("PysparkDataPreprocessor")
for option_name, option_value in spark_options.items():
    spark_conf.set(option_name, option_value)

# Reuse an already running session if there is one, otherwise start a new one.
session_builder = SparkSession.builder.config(conf=spark_conf)
spark = session_builder.getOrCreate()

# Display the effective configuration of the running context.
spark.sparkContext.getConf().getAll()

[('spark.executor.memoryOverhead', '4g'),
 ('spark.driver.memoryOverhead', '4g'),
 ('spark.app.startTime', '1651943944680'),
 ('spark.local.dir', '../../spark_local_dir'),
 ('spark.executor.id', 'driver'),
 ('spark.sql.warehouse.dir',
  'file:/mnt/mikheev/pytorch-lifestream/demo/spark-warehouse'),
 ('spark.driver.memory', '16g'),
 ('spark.executor.memory', '16g'),
 ('spark.driver.host', 'tdl3.internal.cloudapp.net'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.sql.shuffle.partitions', '200'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1651943945630'),
 ('spark.app.name', 'PysparkDataPreprocessor'),
 ('spark.cores.max', '24'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '38697'),
 ('spark.driver.maxResultSize', '4g')]

In [5]:
# Read the raw transactions; the first line is the header and column types are inferred.
transactions_csv = os.path.join(data_path, 'transactions_train.csv')
csv_reader = spark.read.options(header=True, inferSchema=True)
source_data = csv_reader.csv(transactions_csv)

# Peek at the first two rows.
source_data.show(2)

                                                                                

+---------+----------+-----------+----------+
|client_id|trans_date|small_group|amount_rur|
+---------+----------+-----------+----------+
|    33172|         6|          4|    71.463|
|    33172|         6|         35|    45.017|
+---------+----------+-----------+----------+
only showing top 2 rows



In [6]:
from ptls.data_preprocessing import PysparkDataPreprocessor

# Column roles handed to the preprocessor.
# NOTE(review): `trans_date` is passed both as the event-time column and as a
# categorical feature -- presumably intentional; confirm against the ptls docs.
category_columns = ["trans_date", "small_group"]
log_norm_columns = ["amount_rur"]
identity_columns = []

preprocessor = PysparkDataPreprocessor(
    col_id='client_id',
    cols_event_time='trans_date',
    time_transformation='float',
    cols_category=category_columns,
    cols_log_norm=log_norm_columns,
    cols_identity=identity_columns,
    print_dataset_info=False,
)

In [10]:
%%time

# Fit the preprocessor on the raw transactions and transform them in one go.
dataset_pysparkdf = preprocessor.fit_transform(source_data)

# Mark the result for caching; the count() below is the action that actually
# runs the job, and its value is shown as the cell output.
dataset_pysparkdf = dataset_pysparkdf.persist()
dataset_pysparkdf.count()

In [11]:
# Preview the preprocessed data: each row holds a client_id plus array-valued feature columns.
dataset_pysparkdf.show()

+---------+--------------------+--------------------+--------------------+--------------------+
|client_id|          trans_date|         small_group|          amount_rur|          event_time|
+---------+--------------------+--------------------+--------------------+--------------------+
|      463|[726, 724, 725, 7...|[7, 1, 7, 3, 3, 2...|[0.32135927726085...|[1.0, 2.0, 5.0, 7...|
|      471|[730, 726, 726, 7...|[68, 5, 4, 7, 1, ...|[0.26422890225828...|[0.0, 1.0, 1.0, 1...|
|      496|[723, 723, 722, 7...|[3, 1, 1, 1, 1, 4...|[0.27119266729746...|[3.0, 3.0, 4.0, 5...|
|      833|[726, 726, 726, 7...|[17, 15, 1, 49, 3...|[0.28922101008224...|[1.0, 1.0, 1.0, 2...|
|     1238|[730, 723, 716, 7...|[3, 11, 3, 3, 1, ...|[0.13326234667635...|[0.0, 3.0, 9.0, 1...|
|     1342|[730, 722, 702, 7...|[14, 3, 3, 21, 37...|[0.11547227625749...|[0.0, 4.0, 11.0, ...|
|     1591|[724, 724, 723, 7...|[37, 20, 7, 6, 1,...|[0.35099852868535...|[2.0, 2.0, 3.0, 3...|
|     1645|[724, 723, 723, 7...|[2, 16, 

In [12]:
# Inspect the column types produced by the preprocessor.
dataset_pysparkdf.dtypes

[('client_id', 'int'),
 ('trans_date', 'array<int>'),
 ('small_group', 'array<int>'),
 ('amount_rur', 'array<double>'),
 ('event_time', 'array<float>')]

In [16]:
# Seed for the split so that train/valid/test contain the same rows on every run.
SPLIT_SEED = 42

# Same proportions as sampling 20% for test and then 10% of the remainder for
# validation: train = 0.8 * 0.9, valid = 0.8 * 0.1, test = 0.2.
# randomSplit yields disjoint parts in a single pass, so no `subtract` is needed.
train_df, valid_df, test_df = dataset_pysparkdf.randomSplit(
    [0.72, 0.08, 0.2], seed=SPLIT_SEED
)

print('Size of test dataset:', test_df.count())
print('Size of train dataset', train_df.count())
print('Size of valid dataset', valid_df.count())

# Overwrite earlier outputs; without an explicit mode the write raises as soon
# as the target directory exists, i.e. whenever this cell is run a second time.
test_df.write.mode('overwrite').parquet('test.parquet')
train_df.write.mode('overwrite').parquet('train.parquet')
valid_df.write.mode('overwrite').parquet('valid.parquet')

Size of test dataset: 5866


                                                                                

Size of train dataset 21856


                                                                                

Size of valid dataset 2278


                                                                                

## Inference 

### Load the SequenceEncoder obtained from `coles-emb.ipynb`

In [19]:
import torch
from ptls.seq_encoder import SequenceEncoder
from ptls.models import Head
from ptls.lightning_modules.emb_module import EmbModule

# Rebuild the model architecture so that the saved weights can be loaded into it.
# The category sizes come from the fitted preprocessor.
seq_encoder = SequenceEncoder(
    category_features=preprocessor.get_category_sizes(),
    numeric_features=["amount_rur"],
    trx_embedding_noize=0.003  # keyword spelled as the library expects it
)

head = Head(input_size=seq_encoder.embedding_size, use_norm_encoder=True)

model = EmbModule(seq_encoder=seq_encoder, head=head)

# Load the weights onto the CPU first: a checkpoint saved on a GPU would otherwise
# fail to deserialize on a CPU-only host. The Trainer moves the model to the GPU
# later when one is available.
state_dict = torch.load('coles-emb.pt', map_location='cpu')
model.load_state_dict(state_dict)

# Switch to evaluation mode for inference; the returned module is shown as the cell output.
model.eval()

In [20]:
import numpy as np

def _to_records_with_arrays(spark_df):
    """Collect a Spark DataFrame to the driver as a list of per-row dicts.

    Every value that arrives as a Python ``list`` is replaced by a ``numpy``
    array; all other values are left untouched.

    Args:
        spark_df: Spark DataFrame to collect.

    Returns:
        list of dicts, one per row, keyed by column name.
    """
    records = spark_df.toPandas().to_dict('records')
    for record in records:
        # Only existing keys are reassigned, so mutating while iterating is safe.
        for column, value in record.items():
            if isinstance(value, list):
                record[column] = np.array(value)
    return records


train = _to_records_with_arrays(train_df)
test = _to_records_with_arrays(test_df)

                                                                                

### Embedding inference

In [21]:
from ptls.data_load.data_module.emb_data_module import inference_data_loader


# Use a single GPU when CUDA is available, otherwise stay on the CPU.
n_gpus = 1 if torch.cuda.is_available() else 0
trainer = pl.Trainer(gpus=n_gpus)

# Both splits are fed through identically configured loaders.
loader_kwargs = dict(num_workers=0, batch_size=256)

# The per-batch predictions are stacked row-wise into one tensor per split.
train_dl = inference_data_loader(train, **loader_kwargs)
train_batches = trainer.predict(model, train_dl)
train_embeds = torch.vstack(train_batches)

test_dl = inference_data_loader(test, **loader_kwargs)
test_batches = trainer.predict(model, test_dl)
test_embeds = torch.vstack(test_batches)

train_embeds.shape, test_embeds.shape

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

(torch.Size([21856, 512]), torch.Size([5866, 512]))

In [22]:
import pandas as pd


# Join the target labels with the embeddings.

# Targets indexed by client_id; the label column `bins` is exposed as `target`.
target_csv = os.path.join(data_path, 'train_target.csv')
df_target = (
    pd.read_csv(target_csv)
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)

# One `embed_<i>` column per embedding dimension; the client ids are taken from the
# records that were fed to the model, in the same order.
train_embed_columns = [f'embed_{i}' for i in range(train_embeds.shape[1])]
train_df = pd.DataFrame(data=train_embeds, columns=train_embed_columns)
train_df['client_id'] = [record['client_id'] for record in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_embed_columns = [f'embed_{i}' for i in range(test_embeds.shape[1])]
test_df = pd.DataFrame(data=test_embeds, columns=test_embed_columns)
test_df['client_id'] = [record['client_id'] for record in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)
train_df.head(2)

(21856, 514) (5866, 514)


Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_504,embed_505,embed_506,embed_507,embed_508,embed_509,embed_510,embed_511,client_id,target
0,0.208858,-0.221062,-0.224066,-0.087435,-0.195247,0.014467,0.360516,0.706064,0.124512,0.07878,...,0.488976,-0.970627,0.778389,0.143021,0.521348,0.109524,-0.045446,0.140089,4497,2
1,0.256195,-0.181683,-0.223095,-0.068763,-0.159413,-0.020901,-0.075331,0.081187,0.036054,0.146683,...,0.310049,-0.880986,0.507568,0.116876,0.536976,0.121534,-0.061394,0.124128,15439,1


The obtained embeddings can be used as features for training a downstream model.

For example:

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Use only the embedding columns as features; `target` is the label.
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

# A fixed random_state makes the forest, and therefore the reported score,
# reproducible between runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)

# Mean accuracy on the held-out test split.
clf.score(x_test, y_test)

0.5744971019434026