In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import sys
import json
import jsonlines
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from advsber.utils.data import write_jsonlines

In [10]:
NUM_DAYS = 7

TEST_RATIO = 0.1
SUBST_RATIO = 0.3
VALID_RATIO = 0.2
LM_RATIO = 0.1
DATASET_PATH = '../datasets/age'
NUM_WEEKS = 25

MIN_LEN = 3
MAX_LEN = 50

In [3]:
data = pd.read_csv('/notebook/morozov/data/age/original/transactions_train.csv')
target_data = pd.read_csv('/notebook/morozov/data/age/original/train_target.csv')

In [4]:
target_data_dict = dict(target_data.values)

In [5]:
data['week'] = data['trans_date'] // NUM_DAYS

In [6]:
data.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur,week
0,33172,6,4,71.463,0
1,33172,6,35,45.017,0
2,33172,8,11,13.887,1
3,33172,9,11,15.983,1
4,33172,10,11,21.341,1


In [7]:
transactions = data.groupby(['client_id', 'week']).agg(list)

In [8]:
my_lovely_data_raw = []

for idx, (_, row) in tqdm(enumerate(transactions.iterrows())):
    client_id, week = row.name

    my_lovely_data_raw.append(
        {
            'transactions': row['small_group'],
            'amounts': row.amount_rur,
            'client_id': client_id, 
            'week': week
        }
    )

3060107it [07:20, 6948.07it/s]


In [9]:
# my_lovely_data_raw = []

# for idx, (_, row) in tqdm(enumerate(transactions.iterrows())):
#     client_id, week = row.name

#     if (
#         idx > 0 and 
#         len(my_lovely_data_raw[-1]['transactions']) < MIN_LEN and
#         client_id == my_lovely_data_raw[-1]['client_id']
#     ):
#         my_lovely_data_raw[-1]['transactions'].extend(row['small_group'])
#         my_lovely_data_raw[-1]['amounts'].extend(row['amount_rur'])
#         my_lovely_data_raw[-1]['week'] = week
#     else:
#         my_lovely_data_raw.append(
#             {
#                 'transactions': row['small_group'],
#                 'amounts': row.amount_rur,
#                 'client_id': client_id, 
#                 'week': week
#             }
#         )

In [11]:
my_lovely_data = pd.DataFrame(my_lovely_data_raw)

In [12]:
my_lovely_data.shape

(3060107, 4)

In [13]:
my_lovely_data = my_lovely_data[(my_lovely_data['week'] < NUM_WEEKS)]

In [14]:
my_lovely_data.shape

(715574, 4)

In [15]:
my_lovely_data['label'] = my_lovely_data['client_id'].apply(lambda x: target_data_dict.get(x))
my_lovely_data = my_lovely_data[~my_lovely_data['label'].isna()]
my_lovely_data['label'] = my_lovely_data['label'].astype(int)
my_lovely_data = my_lovely_data[['transactions', 'amounts', 'client_id', 'label']]

In [16]:
my_lovely_data.shape

(715574, 4)

In [17]:
lens = my_lovely_data.transactions.apply(lambda x: len(x))

my_lovely_data = my_lovely_data[(lens >= MIN_LEN) & (lens <= MAX_LEN)]

In [18]:
my_lovely_data.shape

(640700, 4)

In [19]:
lm_train, lm_valid = train_test_split(
    my_lovely_data, 
    stratify=my_lovely_data['label'], 
    random_state=126663,
    test_size=LM_RATIO
)

other_data, test_data = train_test_split(
    my_lovely_data, 
    stratify=my_lovely_data['label'], 
    random_state=123,
    test_size=TEST_RATIO
)

target_data, subst_data = train_test_split(
    other_data, 
    stratify=other_data['label'], 
    random_state=123,
    test_size=SUBST_RATIO
)

target_data_tr, target_data_val = train_test_split(
    target_data, 
    stratify=target_data['label'], 
    random_state=123,
    test_size=VALID_RATIO
)

subst_data_tr, subst_data_val = train_test_split(
    subst_data, 
    stratify=subst_data['label'], 
    random_state=123,
    test_size=VALID_RATIO
)

In [20]:
test_data.shape, target_data_tr.shape, target_data_val.shape

((64070, 4), (322912, 4), (80729, 4))

In [21]:
subst_data_tr.shape, subst_data_val.shape

((138391, 4), (34598, 4))

In [22]:
lm_train.shape, lm_valid.shape

((576630, 4), (64070, 4))

In [23]:
write_jsonlines(test_data.to_dict('records'), f'{DATASET_PATH}/test.jsonl')

write_jsonlines(target_data_tr.to_dict('records'), f'{DATASET_PATH}/target_clf/train.jsonl')
write_jsonlines(target_data_val.to_dict('records'), f'{DATASET_PATH}/target_clf/valid.jsonl')

write_jsonlines(subst_data_tr.to_dict('records'), f'{DATASET_PATH}/substitute_clf/train.jsonl')
write_jsonlines(subst_data_val.to_dict('records'), f'{DATASET_PATH}/substitute_clf/valid.jsonl')

write_jsonlines(lm_train.to_dict('records'), f'{DATASET_PATH}/lm/train.jsonl')
write_jsonlines(lm_valid.to_dict('records'), f'{DATASET_PATH}/lm/valid.jsonl')