In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import logging
import torch
import pytorch_lightning as pl
from ptls.preprocessing import PandasDataPreprocessor
import warnings
import os
import pandas as pd

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader

import torch
import pytorch_lightning as pl
import logging
from datetime import datetime


warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)


libgomp: Invalid value for environment variable OMP_NUM_THREADS
  from .autonotebook import tqdm as notebook_tqdm

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [31]:
N = 1 # минимальная длина рассматриваемой последовательности

# Предобработка данных

In [3]:
source_data = pd.read_csv('data_to_coles.csv')
source_data.head(2)

Unnamed: 0.1,Unnamed: 0,event_timestamp,ua_device_type,ua_client_type,viewer_uid,category,duration,latitude,longitude,time_zone,time of day,Day Type,how_many_times_watche
0,0,2024-06-01 03:40:58+00:00,desktop,browser,10067243,Телепередачи,2456534,55.16444,61.436844,2,morning,weekend,0.766527
1,1,2024-06-01 16:33:24+00:00,smartphone,mobile app,10245341,Юмор,519211,54.733334,56.0,2,evening,weekend,0.986112


In [4]:
len(source_data.viewer_uid.unique())

180012

In [5]:
# приводим метку времени в числовой формат
source_data['event_timestamp'] = source_data['event_timestamp'].apply(lambda x: datetime.strptime(x.replace('+00:00', ''), '%Y-%m-%d %H:%M:%S').timestamp())

In [6]:
data_drop = ['Unnamed: 0']
category_features = ['ua_device_type', 'ua_client_type', 'category', 'time of day', 'Day Type']
float_features = ['duration', 'latitude', 'longitude', 'time_zone', 'how_many_times_watche']

In [7]:
source_data = source_data.drop(columns=data_drop)
source_data

Unnamed: 0,event_timestamp,ua_device_type,ua_client_type,viewer_uid,category,duration,latitude,longitude,time_zone,time of day,Day Type,how_many_times_watche
0,1.717202e+09,desktop,browser,10067243,Телепередачи,2456534,55.164440,61.436844,2,morning,weekend,0.766527
1,1.717249e+09,smartphone,mobile app,10245341,Юмор,519211,54.733334,56.000000,2,evening,weekend,0.986112
2,1.717256e+09,desktop,browser,10894333,Телепередачи,5518280,59.937500,30.308611,0,evening,weekend,1.023326
3,1.717261e+09,smartphone,mobile app,10029092,Разное,1522069,55.751244,37.618423,0,evening,weekend,0.999298
4,1.717260e+09,smartphone,mobile app,10452976,Путешествия,1249920,55.751244,37.618423,0,evening,weekend,0.056804
...,...,...,...,...,...,...,...,...,...,...,...,...
1759611,1.719746e+09,smartphone,mobile app,10026914,Сериалы,4480915,55.751244,37.618423,0,day,weekend,0.944004
1759612,1.719689e+09,smartphone,browser,10417567,Обучение,320134,55.751244,37.618423,0,evening,weekend,1.193250
1759613,1.719765e+09,desktop,browser,10009094,Телепередачи,3125675,59.937500,30.308611,0,evening,weekend,0.429347
1759614,1.719726e+09,smartphone,mobile app,10574374,Телепередачи,2401283,59.937500,30.308611,0,morning,weekend,0.333572


In [8]:
preprocessor = PandasDataPreprocessor( # инициализация предпроцессора данных
    col_id='viewer_uid',
    col_event_time='event_timestamp',
    event_time_transformation='none',
    cols_category=category_features,
    cols_numerical=float_features,
    return_records=True,
)

In [9]:
%%time

dataset = preprocessor.fit_transform(source_data)

CPU times: user 1min 52s, sys: 2.62 s, total: 1min 55s
Wall time: 1min 55s


In [10]:
import pickle

with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [11]:
dataset = sorted(dataset, key=lambda x: x['viewer_uid']) # сортируем по id юзеру для возможности сохранения взаимосвязи с id

# Архитектура модели

In [29]:
trx_encoder_params = dict( # инициализация энкодера фичей айтемов
    embeddings_noise=0.003,
    numeric_values={'how_many_times_watche': 'identity',
                    'duration': 'identity',
                    'latitude': 'identity',
                    'longitude': 'identity',
                    'time_zone': 'identity'},
    embeddings={
        'ua_device_type': {'in': 80, 'out': 16},
        'ua_client_type': {'in': 80, 'out': 16},
        'category': {'in': 80, 'out': 16},
        'duration': {'in': 80, 'out': 16},
        'latitude': {'in': 80, 'out': 16},
        'longitude': {'in': 80, 'out': 16},
        'event_timestamp': {'in': 80, 'out': 16},
        'time of day': {'in': 80, 'out': 16},
        'Day Type': {'in': 80, 'out': 16}
    },
)

seq_encoder = RnnSeqEncoder( # инициализация энкодера последовательности
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

model = CoLESModule( # инициализация CoLES
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [30]:
train_dl = PtlsDataModule( # dataloader
    train_data=ColesDataset(
        MemoryMapDataset(
            data=dataset,
            i_filters=[
                SeqLenFilter(min_seq_len=N),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

# Обучение модели

In [22]:
trainer = pl.Trainer(
    max_epochs=20,
    accelerator="cuda" if torch.cuda.is_available() else "cpu",
    devices=1 if torch.cuda.is_available() else "auto",
    enable_progress_bar=True,
)

In [23]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

logger.version = 13
Epoch 19: 100%|██████████| 99/99 [00:09<00:00, 10.63it/s, v_num=13, seq_len=31.60]
{'loss': tensor(28.1798), 'seq_len': tensor(31.5795)}
CPU times: user 2min 8s, sys: 39 s, total: 2min 47s
Wall time: 3min 3s


In [24]:
torch.save(seq_encoder.state_dict(), f"coles-emb_{N}.pt")

# Получение эмбеддингов активности пользователя

In [25]:
dl = inference_data_loader(dataset, batch_size=256)
embeds = torch.vstack(trainer.predict(model, dl))

Predicting DataLoader 0: |          | 704/? [00:20<00:00, 34.66it/s]


In [26]:
pd.DataFrame(embeds).shape

(180012, 256)

In [27]:
result = pd.concat([pd.DataFrame([i['viewer_uid'] for i in dataset], columns=['viewer_uid']), pd.DataFrame(embeds)], axis=1)
result

Unnamed: 0,viewer_uid,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,10000001,-0.079379,-0.645619,-0.574997,0.027316,-0.081550,-0.200424,0.030352,-0.324994,0.395032,...,-0.083376,-0.426868,-0.266104,0.208563,0.320169,0.475945,0.082858,0.202235,0.164373,0.101488
1,10000002,0.128869,0.727794,-0.421140,-0.042360,0.162606,-0.418398,0.534725,-0.585396,0.375854,...,-0.186962,-0.385147,-0.244132,0.215764,0.576363,-0.254286,0.113213,-0.238589,-0.626280,-0.075768
2,10000004,0.579511,-0.042496,-0.249056,-0.123967,0.131384,-0.182829,0.461624,-0.472861,0.198432,...,-0.532505,-0.151703,-0.223830,0.297315,0.516899,-0.022531,0.068474,-0.405091,-0.308587,0.457735
3,10000005,0.043953,0.058478,0.712583,-0.658882,0.001185,-0.087187,-0.137440,-0.641901,0.651410,...,0.370385,-0.772020,-0.183225,-0.012196,0.682627,-0.157202,-0.010767,-0.641666,-0.754198,-0.418630
4,10000006,0.001238,0.015289,0.178141,-0.647374,0.006929,-0.235073,0.028447,-0.640753,0.512778,...,0.427441,-0.850755,-0.174661,0.000601,0.684307,-0.205769,-0.023953,-0.253055,-0.756359,-0.224717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180007,11140828,0.060576,0.036286,0.351472,-0.539197,0.102528,-0.387113,-0.056368,-0.640522,0.516121,...,0.764847,-0.817152,-0.171262,-0.012067,0.684556,-0.136056,-0.054886,-0.181711,-0.755438,-0.002223
180008,11140869,0.073792,0.034852,0.360384,-0.534586,0.093691,-0.340066,-0.064027,-0.640840,0.535008,...,0.765736,-0.825546,-0.170678,-0.011828,0.684513,-0.138585,-0.052862,-0.179434,-0.755481,0.033273
180009,11140872,0.021156,0.031904,0.235382,-0.494798,-0.028934,-0.330044,-0.030157,-0.625192,0.264592,...,-0.259198,-0.870519,-0.239116,0.049962,0.659371,-0.161894,0.102529,0.500162,-0.709724,-0.348569
180010,11140875,0.143982,0.039419,0.307343,-0.561041,0.061871,-0.066092,-0.047869,-0.641366,0.601362,...,0.765952,-0.827400,-0.173300,-0.009983,0.684271,-0.141243,-0.043091,0.365486,-0.755420,-0.374357


In [28]:
result.to_csv(f'coles_embeds_{N}.csv', index=False)