# Обработка диалоговых эмбеддингов

In [1]:
import os
# os.environ["OMP_NUM_THREADS"] = "4"

import pandas as pd
import numpy as np
import torch
from functools import partial
import pytorch_lightning as pl
import warnings
warnings.filterwarnings("ignore")

from torch.utils.data import DataLoader

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from tqdm.auto import tqdm
import lightgbm as ltb

# EDA

In [14]:
# Start from an empty accumulator frame for the dialog embeddings.
# NOTE(review): this cell carries execution count 14, i.e. it was run after the
# train-loading cell (count 2) and just before the test-loading cell (count 15);
# in that order it would discard the already loaded train rows — confirm intent.
dialog_data = pd.DataFrame()

In [2]:
from tqdm import tqdm
import gc

# Load the 18 train parquet parts. Each part has a list-valued 'embedding'
# column that is expanded into one numeric column per embedding dimension
# (mean_dialog_emb_1 .. mean_dialog_emb_N).
#
# Parts are collected in a list and concatenated once at the end: concatenating
# onto the growing frame inside the loop re-copies all earlier rows every time.
train_parts = []

for i in tqdm(range(0, 18)):
    df = pd.read_parquet(f"D:/archive/Hackathon/dial_train.parquet/part-{i}.parquet")
    # Positional access: a label lookup with [0] only works if the index contains 0.
    emb_dim = len(df['embedding'].iloc[0])
    tmp_df = pd.DataFrame(
        df['embedding'].tolist(),
        columns=[f'mean_dialog_emb_{j+1}' for j in range(emb_dim)],
        # Reuse the part's own index so the column-wise concat below lines rows
        # up even when the part does not carry a default RangeIndex.
        index=df.index,
    )
    df = pd.concat([df.drop(columns=['embedding']), tmp_df], axis=1)
    train_parts.append(df)
    gc.collect()

dialog_data = pd.concat(train_parts, axis=0)
del train_parts

100%|██████████| 18/18 [05:16<00:00, 17.60s/it]


In [15]:
# Load the 3 test parquet parts, expand the list-valued 'embedding' column into
# mean_dialog_emb_* columns, and append the rows to the existing `dialog_data`.
#
# Parts are gathered first and appended with a single concat, so the rows
# already in `dialog_data` are copied once rather than once per part.
test_parts = []

for i in tqdm(range(0, 3)):
    df = pd.read_parquet(f"D:/archive/Hackathon/dial_test.parquet/part-{i}.parquet")
    # Positional access: a label lookup with [0] only works if the index contains 0.
    emb_dim = len(df['embedding'].iloc[0])
    tmp_df = pd.DataFrame(
        df['embedding'].tolist(),
        columns=[f'mean_dialog_emb_{j+1}' for j in range(emb_dim)],
        # Reuse the part's own index so the column-wise concat below lines rows
        # up even when the part does not carry a default RangeIndex.
        index=df.index,
    )
    df = pd.concat([df.drop(columns=['embedding']), tmp_df], axis=1)
    test_parts.append(df)
    gc.collect()

dialog_data = pd.concat([dialog_data, *test_parts], axis=0)
del test_parts

100%|██████████| 3/3 [01:14<00:00, 24.87s/it]


In [16]:
# Drop the per-part temporaries left over from the loading loops.
del tmp_df, df

In [17]:
# The concatenated parts repeat their index labels; replace them with a fresh
# 0..n-1 index and discard the old one.
dialog_data = dialog_data.reset_index(drop=True)

In [18]:
# Number of distinct client ids (a missing id, if any, counts as one value).
dialog_data['client_id'].nunique(dropna=False)

81425

In [19]:
# Collapse every timestamp to its calendar month, stored as a 'YYYY-MM' string.
event_timestamps = pd.to_datetime(dialog_data['event_time'])
dialog_data['event_time'] = event_timestamps.dt.strftime('%Y-%m')

In [20]:
# Average every embedding dimension per (client_id, event_time) group.
#
# The embedding columns are picked by their name prefix rather than by assuming
# that exactly two columns of the frame are non-embedding columns; an extra
# column would otherwise request a non-existent mean_dialog_emb_* name, and a
# missing one would silently drop the last embedding dimension.
embedding_columns = [
    col for col in dialog_data.columns
    if isinstance(col, str) and col.startswith('mean_dialog_emb_')
]
aggregation_functions = {col: 'mean' for col in embedding_columns}
aggregated_df = dialog_data.groupby(['client_id', 'event_time']).agg(aggregation_functions)

In [21]:
# Move the group keys (client_id, event_time) out of the index and back into
# ordinary columns.
aggregated_df = aggregated_df.reset_index()

In [22]:
# Display the aggregated frame.
# NOTE(review): the saved output below lists sum_dialog_emb_* columns, while the
# code in this notebook names them mean_dialog_emb_* — the output looks like it
# comes from an earlier run; re-execute to refresh it.
aggregated_df

Unnamed: 0,client_id,event_time,sum_dialog_emb_1,sum_dialog_emb_2,sum_dialog_emb_3,sum_dialog_emb_4,sum_dialog_emb_5,sum_dialog_emb_6,sum_dialog_emb_7,sum_dialog_emb_8,...,sum_dialog_emb_759,sum_dialog_emb_760,sum_dialog_emb_761,sum_dialog_emb_762,sum_dialog_emb_763,sum_dialog_emb_764,sum_dialog_emb_765,sum_dialog_emb_766,sum_dialog_emb_767,sum_dialog_emb_768
0,00006c6ed6d81e18051751b68f9cb0d4f31d13ef0ae7fb...,2022-02,0.130907,-0.194891,0.310927,-0.194666,-0.080639,0.256647,0.084057,0.236912,...,0.292817,0.174434,0.211314,0.781331,0.281405,0.046178,0.127985,-0.042661,0.296090,0.134088
1,00006c6ed6d81e18051751b68f9cb0d4f31d13ef0ae7fb...,2022-07,0.053377,-0.106471,0.338472,0.166533,-0.002339,0.232304,0.128242,0.065465,...,0.193704,0.020750,0.004487,0.556650,0.302507,0.031423,0.036292,-0.007353,0.093835,0.194562
2,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,2022-12,0.325750,-0.102299,0.584633,-0.301862,-0.321009,0.436904,0.244961,0.494768,...,0.524059,0.348781,0.245613,0.862081,0.567360,0.378372,0.386632,-0.228029,0.495043,0.206350
3,0001ac6446bf223a094d6514a6c890d82e9aa92104dee0...,2022-01,0.106713,0.108001,0.325686,0.043290,-0.151876,0.297966,0.022422,0.104224,...,0.110361,0.035899,0.165119,0.248349,0.190246,0.159030,0.197670,-0.051101,0.293387,-0.144627
4,0001ac6446bf223a094d6514a6c890d82e9aa92104dee0...,2022-07,0.373225,-0.076706,0.556706,-0.396113,-0.082214,0.528915,0.338543,0.471667,...,0.479426,0.235950,0.472678,0.981142,0.540747,0.262803,0.363959,-0.227680,0.509521,0.216208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171296,fffe8ed2b0c1cdf0992f01cdd4d071edfa2cdf60279dcb...,2022-06,0.392902,-0.045059,0.726924,-0.206968,-0.283472,0.519860,0.338938,0.247836,...,0.536575,0.383584,0.429522,0.983889,0.507465,0.097660,0.377414,-0.234115,0.502277,0.363152
171297,fffee74dcdcef5ef70419e38c6d1b20f807bf706431f54...,2022-08,0.307973,-0.233778,0.472195,-0.205287,-0.297054,0.518218,0.273719,0.413443,...,0.580869,0.379513,0.577191,0.899924,0.451241,0.345429,0.335675,-0.543611,0.453234,0.359812
171298,fffee74dcdcef5ef70419e38c6d1b20f807bf706431f54...,2022-09,0.496979,-0.219277,0.456451,-0.422763,-0.300284,0.479127,0.291044,0.590559,...,0.598580,0.538545,0.532814,0.925532,0.462797,0.336355,0.519994,-0.464084,0.416059,0.560979
171299,fffee74dcdcef5ef70419e38c6d1b20f807bf706431f54...,2022-10,0.364285,-0.018232,0.532640,-0.362312,-0.285379,0.519170,0.321172,0.236775,...,0.487734,0.394147,0.427134,0.979744,0.509605,0.092510,0.340311,-0.199094,0.493977,0.170497


In [23]:
# Make the id unique per month by appending '_month=<month number>' (no zero
# padding) taken from the 'YYYY-MM' event_time string.
# NOTE(review): only the month number is used, not the year — rows of the same
# client from the same month of different years would share an id; confirm the
# data covers a single year.
month_numbers = pd.to_datetime(aggregated_df['event_time']).dt.month.astype(str)
aggregated_df['client_id'] = aggregated_df['client_id'] + '_month=' + month_numbers

In [25]:
# Persist the per-client, per-month mean embeddings to a parquet file in the
# current working directory.
aggregated_df.to_parquet('dialog_embs_mean_train_test.parquet')