In [18]:
import pandas as pd
import numpy as np
import sys
import json
import jsonlines
from sklearn.model_selection import train_test_split

In [19]:
transactions = pd.read_csv('../data/gender/transactions.csv')
target_data = pd.read_csv('../data//gender/gender_train.csv')

In [21]:
transactions.term_id = transactions.term_id.fillna("UNK")
transactions['trans'] = "mcc" + transactions['mcc_code'].astype(str)

In [22]:
data = transactions.rename(columns={'customer_id': 'client_id', 'trans':'small_group', 'amount':'amount_rur'})
target_data = target_data.rename(columns={'customer_id':'client_id', 'gender':'bins'})
print(data.head())

   client_id tr_datetime  mcc_code  tr_type  amount_rur term_id small_group
0   39026145  0 10:23:26      4814     1030    -2245.92     UNK     mcc4814
1   39026145  1 10:19:29      6011     7010    56147.89     UNK     mcc6011
2   39026145  1 10:20:56      4829     2330   -56147.89     UNK     mcc4829
3   39026145  1 10:39:54      5499     1010    -1392.47     UNK     mcc5499
4   39026145  2 15:33:42      5499     1010     -920.83     UNK     mcc5499


In [23]:
#change transaction to numbers
keys = np.unique(data.small_group)
new_values = np.arange(0,len(keys), dtype=int)
dictionary = dict(zip(keys, new_values))
new_column = [dictionary[key] for key in list(data.small_group)]
data.small_group = new_column

### Data for language model (LM)

In [25]:
!mkdir ../../gender
!mkdir ../../gender/target_clf
!mkdir ../../gender/substitute_clf
!mkdir ../../gender/lm

In [26]:
def split_slice_subsample(sub_data, cnt_min, cnt_max, split_count):
    sub_datas = []
    cnt_min = cnt_min if len(sub_data) > cnt_max else int(cnt_min*len(sub_data)/cnt_max)
    cnt_max = cnt_max if len(sub_data) > cnt_max else len(sub_data)-1
    split_count = split_count if len(sub_data) > cnt_max else int(len(sub_data)/cnt_max*split_count)
    for i in range(0, split_count):
        if cnt_min < cnt_max: 
            T_i = np.random.randint(cnt_min, cnt_max)
            s = np.random.randint(0, len(sub_data)-T_i-1)
            S_i = sub_data[s:s+T_i-1]
            sub_datas.append(S_i)
            
    return sub_datas

In [27]:
def create_set(name, data, target):
    len_ = len(np.unique(target.client_id))
    dict_data = {}
    with jsonlines.open(name, "w") as writer:
        S = 0
        for index, client_id in enumerate(np.unique(target.client_id)):
            sys.stdout.write("\r %d out of %d" % (index, len_))

            sub_data = data[data['client_id']==client_id]
            sub_data_target = target[target['client_id']==client_id]

            sub_datas = split_slice_subsample(sub_data, 25, 150, 30)
             
            for loc_data in sub_datas:
                loc_dict = {"transactions": list(loc_data.small_group),
                            "amounts": list(loc_data.amount_rur),
                            "label": int(sub_data_target.bins),
                            "client_id": int(client_id)}
                S = S+ len(loc_data.small_group)
                writer.write(loc_dict) 
           
    print('mean length:', S/(len(sub_datas)* len((np.unique(target.client_id)))))
    return

In [28]:
def split_data_lm(data, target_data):
    target_data_train, target_data_valid = train_test_split(target_data, test_size=0.2, random_state=10, shuffle=True)
    print('Create train set...')
    create_set('../../gender0/lm/train.jsonl', data, target_data_train)
    print('Create valid set...')
    create_set('../../gender0/lm/valid.jsonl', data, target_data_valid)
    return

In [11]:
split_data_lm(data, target_data)

Create train set...
 6719 out of 6720mean length: 76.26872023809524
Create valid set...
 1679 out of 1680mean length: 76.90662698412699


### Data for classifiers

In [29]:
!mkdir gender_substitute_clf
!mkdir target_clf

mkdir: cannot create directory ‘gender_substitute_clf’: File exists
mkdir: cannot create directory ‘target_clf’: File exists


In [30]:
target_data_test_sub, target_data_targetclf = train_test_split(target_data, test_size=0.65, random_state=10, shuffle=True)
target_data_subclf, target_data_test = train_test_split(target_data_test_sub, test_size=2./7, random_state=10, shuffle=True)

In [31]:
def split_data(data, target_data, dir_):
    target_data_train, target_data_valid = train_test_split(target_data, test_size=0.2, random_state=10, shuffle=True)
    print('Create train set...')
    create_set(dir_+'/'+'train.jsonl', data, target_data_train)
    print('Create valid set...')
    create_set(str(dir_)+'/'+'valid.jsonl', data, target_data_valid)
    return

In [32]:
#create test set for both target and substitute classifiers
create_set('../../gender/test.jsonl', data, target_data_test)

 839 out of 840mean length: 76.39734126984126


In [33]:
#create valid and train data for substitute classifier
split_data(data, target_data_subclf, '../../gender/substitute_clf')

Create train set...
 1679 out of 1680mean length: 76.74390873015874
Create valid set...
 419 out of 420mean length: 77.74626984126984


In [34]:
#create valid and train data for target classifier
split_data(data, target_data_targetclf, '../../gender/target_clf')

Create train set...
 4367 out of 4368mean length: 76.0153540903541
Create valid set...
 1091 out of 1092mean length: 77.4495115995116
