In [1]:
import os
# Process-level runtime settings; they are assigned before `torch` is imported
# further down in this cell.
os.environ["OMP_NUM_THREADS"] = "4"  # OpenMP thread count setting
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # enumerate GPUs in PCI bus order
os.environ["CUDA_VISIBLE_DEVICES"] = "7"  # expose only GPU index 7 to this process

import pandas as pd
import numpy as np

from ptls.preprocessing import PandasDataPreprocessor
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder, AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, NoSplit
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
import lightgbm as ltb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the geo event logs for the train and the test client sets.
# NOTE(review): the commented-out lines below look like leftovers from reading
# a transactions parquet part; nothing in this cell uses them.
# data_path = "../data/Hackathon/trx_train.parquet"

# files = ['part-0.parquet'] 
# files = [os.path.join(data_path, f) for f in files]

geo_train = pd.read_parquet('../geo_train.parquet')
geo_test = pd.read_parquet('../geo_test.parquet')

In [3]:
# Append the test events to the train events; the preprocessor is later fitted
# on this combined frame.
# NOTE: this rebinds `geo_train`, so re-running the cell appends `geo_test` again.
geo_train = pd.concat([geo_train, geo_test])

In [4]:
# Keep only the events dated 2022 in the combined frame.
# NOTE(review): `geo_test` itself is not filtered by year — confirm this is intended.
geo_train = geo_train[geo_train['event_time'].dt.year == 2022]

In [5]:
# Derive hour-of-day and day-of-week columns from the event timestamp,
# for both the combined training frame and the test frame.
for events in (geo_train, geo_test):
    stamps = events['event_time'].dt
    events['hour'] = stamps.hour
    events['weekday'] = stamps.weekday

In [6]:
# Target table read from `test_target_b.parquet`; it is later concatenated with
# the train targets and also provides the client list for the saved test features.
test_target_b = pd.read_parquet("../test_target_b.parquet")

In [7]:
%%time

# Preprocessor for the geo events, keyed by `client_id` and timed by `event_time`.
# The three geohash columns are declared categorical; `hour` and `weekday` are
# listed under `cols_identity` (the commented-out entries show they were once
# tried as categorical columns).
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="event_time",
    event_time_transformation="dt_to_timestamp",
    cols_category=[
        "geohash_4",
        "geohash_5",
        "geohash_6",
#         "hour",
#         "weekday"
    ],
    cols_identity=["hour", "weekday"],
    return_records=True,
)

# Fit on the combined 2022 events, then apply the fitted preprocessor to the test events.
dataset_train = preprocessor.fit_transform(geo_train)
dataset_test = preprocessor.transform(geo_test)

# DataFrame versions of the returned records; used below when building the datasets.
dataset_train_df = pd.DataFrame(dataset_train)
dataset_test_df = pd.DataFrame(dataset_test)

CPU times: user 41min 16s, sys: 18min 23s, total: 59min 40s
Wall time: 1h 10s


In [8]:
# Client ids of the preprocessed training records.
# NOTE: `clients_train` is reassigned later from the month-split `train` dataset.
clients_train = [rec.get('client_id') for rec in dataset_train]

In [9]:
# Target table for the train clients, extended with the rows of `test_target_b`.
target_train = pd.read_parquet("../train_target.parquet")
target_train = pd.concat([target_train, test_target_b])

In [12]:
# Preprocess the target table per `client_id`, with `mon` as the time column and
# the four target columns listed under `cols_identity`.
# With return_records=False the result is used below as a DataFrame
# (`.drop(..., axis=1)` and `.merge(...)` are called on it).
# NOTE(review): presumably each target_k ends up as a per-client sequence ordered
# by month — GetSplit indexes it with `month - 1`; confirm.
target_preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="mon",
    event_time_transformation="dt_to_timestamp",
    cols_identity=["target_1", "target_2", "target_3", "target_4"],
    return_records=False,
)

processed_target = target_preprocessor.fit_transform(target_train)

In [13]:
from datetime import datetime

# Debugging this block

In [14]:
def get_min_max_month(rec):
    """Return the calendar months of the first and last event of a record.

    Timestamps are read as POSIX seconds and converted in UTC.

    Args:
        rec: mapping whose ``'event_time'`` entry is an indexable sequence of
            elements exposing ``.item()`` (e.g. a 1-D tensor or NumPy array).
            The first and last elements are taken as the earliest and latest
            event — presumably the sequence is sorted by time; confirm upstream.

    Returns:
        tuple ``(first_month, last_month, year)``: months are in ``1..12`` and
        ``year`` is the year of the first event.

    Raises:
        IndexError: if the event-time sequence is empty.
    """
    # Function-scope import: the notebook only imports `datetime` itself.
    from datetime import timezone

    event_times = rec.get('event_time')
    # Timezone-aware conversion replaces the deprecated `datetime.utcfromtimestamp`
    # and yields the same UTC calendar fields.
    first_event = datetime.fromtimestamp(event_times[0].item(), tz=timezone.utc)
    last_event = datetime.fromtimestamp(event_times[-1].item(), tz=timezone.utc)
    return first_event.month, last_event.month, first_event.year

class GetSplit(IterableProcessingDataset):
    """Expand each record into one sample per calendar month.

    For every month ``m`` in ``start_month..end_month`` that does not exceed the
    month of the record's last event, a shallow copy of the record is yielded
    in which:

    * every feature that is neither the id nor a ``target*`` key is cut to the
      events strictly before the first day of month ``m + 1`` (a cumulative
      history up to the end of month ``m``);
    * every ``target*`` feature is replaced by its element at index ``m - 1``
      (assumes the target sequence starts in January — TODO confirm);
    * the id gets the suffix ``'_month=<m>'``.

    Month boundaries are built in UTC so that they agree with
    ``get_min_max_month``, which also reads the timestamps as UTC.

    Args:
        start_month: first month (1-12) to emit.
        end_month: last month (1-12) to emit, inclusive.
        year: calendar year used to build the month boundaries.
        col_id: name of the id feature; its value must be a string because the
            month suffix is appended by string concatenation.
        col_time: name of the event-time feature (POSIX seconds).
    """

    def __init__(
        self,
        start_month,
        end_month,
        year=2022,
        col_id='client_id',
        col_time='event_time'
    ):
        super().__init__()
        self.start_month = start_month
        self.end_month = end_month
        self._year = year
        self._col_id = col_id
        self._col_time = col_time

    def __iter__(self):
        """Yield the month-wise samples for every record of the upstream source."""
        # Function-scope import: the notebook only imports `datetime` itself.
        from datetime import timezone

        for rec in self._src:
            # Upstream may hand over tuples; only their first element (the
            # feature dict) is used, both for the month lookup and the split.
            record = rec[0] if isinstance(rec, tuple) else rec
            _, max_month, _ = get_min_max_month(record)

            for month in range(self.start_month, self.end_month + 1):
                if month > max_month:
                    # The last event of the record lies before this month:
                    # stop expanding this record.
                    break

                features = record.copy()

                # Exclusive upper bound: first instant of the following month (UTC).
                if month == 12:
                    boundary = datetime(self._year + 1, 1, 1, tzinfo=timezone.utc)
                else:
                    boundary = datetime(self._year, month + 1, 1, tzinfo=timezone.utc)
                mask = features[self._col_time] < boundary.timestamp()

                # Only existing keys are reassigned, so iterating over the
                # dict while updating it is safe.
                for key, tensor in features.items():
                    if key.startswith('target'):
                        target_index = month - 1
                        if target_index < len(tensor):
                            features[key] = tensor[target_index].tolist()
                        else:
                            # No target value for this month: stop rewriting.
                            # Keys not visited yet keep their full values and
                            # the sample is still yielded below.
                            break
                    elif key != self._col_id:
                        features[key] = tensor[mask]

                features[self._col_id] += '_month=' + str(month)

                yield features



def collate_feature_dict_with_target(batch, col_id='client_id', targets=False):
    """Collate samples into a padded batch while carrying ids (and targets) alongside.

    The id and target entries are removed from what is passed to
    ``collate_feature_dict``. The input sample dicts themselves are not
    modified, so the same in-memory records can be collated more than once.

    Args:
        batch: iterable of feature dicts; each must contain ``col_id`` and,
            when ``targets`` is true, the keys ``target_1`` .. ``target_4``.
        col_id: key holding the sample id.
        targets: when true, also extract the four target values per sample.

    Returns:
        ``(padded_batch, batch_ids)`` or, with ``targets=True``,
        ``(padded_batch, batch_ids, target_cols)`` where ``target_cols`` holds
        one ``[target_1, target_2, target_3, target_4]`` list per sample.

    Raises:
        KeyError: if a sample lacks ``col_id`` or a required target key.
    """
    target_keys = [f'target_{i}' for i in range(1, 5)] if targets else []
    excluded = {col_id, *target_keys}

    batch_ids = []
    target_cols = []
    features_only = []
    for sample in batch:
        batch_ids.append(sample[col_id])
        if targets:
            target_cols.append([sample[key] for key in target_keys])
        # Build a filtered shallow copy instead of deleting keys in place.
        features_only.append({k: v for k, v in sample.items() if k not in excluded})

    padded_batch = collate_feature_dict(features_only)
    if targets:
        return padded_batch, batch_ids, target_cols
    return padded_batch, batch_ids


class InferenceModuleMultimodal(pl.LightningModule):
    """Inference wrapper that returns model outputs together with sample ids.

    Args:
        model: callable applied to the collated batch.
        pandas_output: when true, ``forward`` returns a ``pandas.DataFrame``
            built by ``to_pandas``; otherwise it returns the raw dict.
        drop_seq_features: stored on the instance; not read anywhere in this class.
        model_out_name: key (and column prefix) used for the model output.
    """

    def __init__(self, model, pandas_output=True, drop_seq_features=True, model_out_name='out'):
        super().__init__()

        self.model = model
        self.pandas_output = pandas_output
        self.drop_seq_features = drop_seq_features
        self.model_out_name = model_out_name

    def forward(self, x):
        """Run the model on one collated batch.

        Args:
            x: either ``(batch, batch_ids)`` or ``(batch, batch_ids, target_cols)``,
                i.e. the two shapes returned by ``collate_feature_dict_with_target``.
                ``target_cols`` must be convertible by ``torch.tensor`` into a
                2-D tensor with at least four columns.

        Returns:
            A dict with ``'client_id'``, optionally ``'target_1'`` .. ``'target_4'``,
            and the model output under ``self.model_out_name``; converted to a
            DataFrame when ``self.pandas_output`` is true.
        """
        x_len = len(x)
        if x_len == 3:
            x, batch_ids, target_cols = x
        else:
            x, batch_ids = x

        out = self.model(x)

        x_out = {'client_id': batch_ids}
        if x_len == 3:
            target_cols = torch.tensor(target_cols)
            for position in range(4):
                x_out[f'target_{position + 1}'] = target_cols[:, position]
        x_out[self.model_out_name] = out

        # Ask PyTorch to release its cached CUDA memory after each batch.
        torch.cuda.empty_cache()

        if self.pandas_output:
            return self.to_pandas(x_out)
        return x_out

    @staticmethod
    def to_pandas(x):
        """Convert a dict of per-sample outputs into a single DataFrame.

        Lists and 1-D arrays become ordinary columns. 2-D arrays are expanded
        into columns named ``<key>_0000``, ``<key>_0001``, ... Values of any
        other rank are stored as ``None``.

        Args:
            x: mapping from name to a list, a ``torch.Tensor`` or an object with
                a ``.shape`` attribute (e.g. a NumPy array).

        Returns:
            ``pandas.DataFrame`` with the scalar columns first, followed by the
            expanded 2-D features in their original key order.
        """
        scalar_features = {}
        matrix_features = {}

        for k, v in x.items():
            if isinstance(v, torch.Tensor):
                # detach() lets tensors that still carry gradients be converted too.
                v = v.detach().cpu().numpy()

            if isinstance(v, list) or len(v.shape) == 1:
                scalar_features[k] = v
            elif len(v.shape) == 2:
                # Keep the already-converted array so that 2-D NumPy inputs
                # work as well as tensors.
                matrix_features[k] = v
            else:
                scalar_features[k] = None

        dataframes = [pd.DataFrame(scalar_features)]
        for col, v in matrix_features.items():
            dataframes.append(pd.DataFrame(v, columns=[f'{col}_{i:04d}' for i in range(v.shape[1])]))

        return pd.concat(dataframes, axis=1)

In [15]:
%%time

train = MemoryMapDataset(
    data=dataset_train_df.merge(processed_target.drop("event_time", axis=1), on="client_id", how="inner").to_dict("records"),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        GetSplit(start_month=1, end_month=12),
        ToTorch(),
    ]
)

test = MemoryMapDataset(
    data=dataset_test_df.to_dict("records"),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        ToTorch(),
    ]
)

CPU times: user 22min 13s, sys: 1min 20s, total: 23min 33s
Wall time: 13min 25s


In [16]:
# Sample ids in dataset iteration order. For `train` these are the ids produced by
# the month split (GetSplit appends a '_month=<m>' suffix to each id).
clients_train = [record.get('client_id') for record in train]

# Ids of the test records, in dataset iteration order.
clients_test = [record.get('client_id') for record in test]

In [18]:
def _collect_target(samples, name):
    """Read `name` from every sample; entries that are tensors are replaced by 0."""
    return [
        0 if isinstance(value, torch.Tensor) else value
        for value in (sample.get(name) for sample in samples)
    ]


# One label list per target, in the iteration order of `train`.
# A target that is still a tensor (not reduced to a single value by the month
# split) is labelled 0.
y_target_1 = _collect_target(train, 'target_1')
y_target_2 = _collect_target(train, 'target_2')
y_target_3 = _collect_target(train, 'target_3')
y_target_4 = _collect_target(train, 'target_4')

In [20]:
%%time

# Configuration of the aggregate-feature encoder: `hour` and `weekday` are
# declared as numeric values, the three geohash columns as embeddings with "in": 50.
# NOTE(review): `{'identity'}` is a set literal, not the string 'identity' —
# confirm that AggFeatureSeqEncoder treats it the way you intend.
params = {
    'numeric_values': {
        'hour': {'identity'},
        'weekday': {'identity'}
    },
    'embeddings': {
        # 'event_time': {'in': 40},
        "geohash_4": {"in": 50},
        "geohash_5": {"in": 50},
        "geohash_6": {"in": 50}
    },
}

seq_encoder = AggFeatureSeqEncoder(**params)

# Wrap the encoder in a CoLES module with an Adam optimizer (lr=0.001) and a
# StepLR schedule (step_size=30, gamma=0.9).
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

CPU times: user 207 ms, sys: 22.6 ms, total: 230 ms
Wall time: 375 ms


In [21]:
# Lightning trainer with LR monitoring, step-based checkpointing and early stopping.
# NOTE(review): no `trainer.fit(...)` call is visible in this section; below the
# trainer is only used for `predict`, so the epoch/checkpoint/early-stopping
# settings have no visible effect here.
trainer = pl.Trainer(
    max_epochs=30,
    enable_progress_bar=True,
    limit_val_batches=5000,
    devices='auto',
#     gpus=1,
    gradient_clip_val=0.5,
    # logger=pl.loggers.TensorBoardLogger(
    #     save_dir='./logdir',
    #     name='baseline_result'
    # ),
    callbacks=[
        pl.callbacks.LearningRateMonitor(logging_interval='step'),
        pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
        pl.callbacks.EarlyStopping(monitor='valid/recall_top_k', mode="max")
    ]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# Run the model over every sample and stack the per-batch outputs into one matrix.
# NOTE(review): ids and targets are attached to these rows by position later on,
# which assumes the loader preserves dataset order — confirm for `inference_data_loader`.
train_dl = inference_data_loader(train, num_workers=0, batch_size=16)
train_embeds = torch.vstack(trainer.predict(model, train_dl))

# Same procedure for the test dataset.
test_dl = inference_data_loader(test, num_workers=0, batch_size=16)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-06-16 17:14:58.215723: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-16 17:14:58.215791: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-16 17:14:58.215811: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Predicting DataLoader 0: |          | 425810/? [1:11:09<00:00, 99.74it/s] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]


Predicting DataLoader 0: |          | 10422/? [02:30<00:00, 69.07it/s]


In [23]:
# Convert the stacked prediction tensors to NumPy arrays.
X_train = train_embeds.numpy() 

X_test = test_embeds.numpy()

In [24]:
# Wrap the arrays in DataFrames; no column names are given, so the feature
# columns get default labels.
# NOTE: `X_train` / `X_test` are rebound from arrays to DataFrames here.
X_train = pd.DataFrame(X_train)

X_test = pd.DataFrame(X_test)

In [26]:
# Attach sample ids and the four target columns to the training features.
# The assignment is positional: it relies on `clients_train`, the `y_target_*`
# lists and the embedding rows all following the same `train` iteration order.
X_train['client_id'] = clients_train

X_train['target_1'] = y_target_1
X_train['target_2'] = y_target_2
X_train['target_3'] = y_target_3
X_train['target_4'] = y_target_4

In [27]:
# Preview the assembled training table (features, ids, targets).
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,905,906,907,908,909,client_id,target_1,target_2,target_3,target_4
0,5.0,43.0,8.600000,7.668116,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.894427,1.0,1.0,1.0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0
1,6.0,52.0,8.666667,6.860515,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.264911,1.0,1.0,1.0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0
2,6.0,52.0,8.666667,6.860515,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.264911,1.0,1.0,1.0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0
3,6.0,52.0,8.666667,6.860515,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.264911,1.0,1.0,1.0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0
4,6.0,52.0,8.666667,6.860515,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.264911,1.0,1.0,1.0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6812952,1046.0,8742.0,8.357553,5.373182,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.029002,2.0,2.0,1.0,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,0,0,0,0
6812953,1292.0,10773.0,8.338235,5.440987,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.025403,2.0,2.0,1.0,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,0,0,0,0
6812954,1577.0,13169.0,8.350666,5.442657,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.015220,2.0,2.0,1.0,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,0,0,0,0
6812955,1795.0,14914.0,8.308635,5.398760,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.008750,2.0,2.0,1.0,fffff598cd1a947b8ce0b86d56fd356729ec7bacb7053a...,0,0,0,0


# Saving

In [28]:
# Attach ids to the test features (positional, following `clients_test`).
X_test['client_id'] = clients_test

# Align the features to the unique clients of `test_target_b`. The join key is
# given explicitly instead of relying on the implicit common-column merge.
# Clients without geo features end up with an all-zero row.
not_only_trx = (
    pd.DataFrame({"client_id": test_target_b["client_id"].unique()})
    .merge(X_test, on="client_id", how="left")
    .fillna(0)
)
# Integer column labels are turned into strings before the frame is written to parquet.
not_only_trx.columns = not_only_trx.columns.map(lambda x: str(x) if isinstance(x, int) else x)
not_only_trx.to_parquet("X_test_geo.parquet", index=False, engine="pyarrow", compression="snappy")

In [29]:
# Turn integer column labels into strings, then write the training table to parquet.
# NOTE: this renames the columns of `X_train` in place.
X_train.columns = X_train.columns.map(lambda x: str(x) if isinstance(x, int) else x)

X_train.to_parquet("X_train_geo.parquet", index=False, engine="pyarrow", compression="snappy")

In [30]:
# Preview the test feature table that was written to "X_test_geo.parquet".
not_only_trx

# NOTE(review): leftover of the column renaming that is already done in the saving cell.
# not_only_trx.columns = not_only_trx.columns.map(lambda x: str(x) if isinstance(x, int) else x)

Unnamed: 0,client_id,0,1,2,3,4,5,6,7,8,...,900,901,902,903,904,905,906,907,908,909
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,3489.0,41368.0,11.856692,4.319400,0.0,0.0,2.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.024468,6.0,2.0,1.0
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,391.0,4563.0,11.670076,4.492855,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.988939,1.0,1.0,1.0
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,407.0,5733.0,14.085995,4.771114,0.0,5.0,0.0,133.0,269.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.033801,3.0,3.0,1.0
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,1869.0,20135.0,10.773141,4.364042,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.001466,5.0,4.0,1.0
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,2547.0,21662.0,8.504908,4.536432,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.009462,7.0,3.0,3.0
