In [8]:
from collections import defaultdict

import pandas as pd

from tqdm import tqdm
import random
import json
import os


In [9]:
NEG_SAMPLE_SIZE = 10
info = {'NEG_SAMPLE_SIZE': NEG_SAMPLE_SIZE}

if not (os.path.exists('data')):
    os.mkdir('data')
with open('data//info.txt', 'w') as info_file:
            json.dump(info, info_file)

In [10]:
df = pd.read_csv ('clear_data_realbank.csv')
df.drop('TRANS_DETAIL', inplace=True, axis=1)
df.rename(columns = {'RETAILER' : 'tail', 'CustomerKey' : 'head',
                        'MCC' : 'tail_type', 'AMOUNT_EQ' : 'relation'   }, 
                            inplace = True) 
# некоторые id магазинов имели разные категории. Здесь id c разл категорией приравниваются к -2 и затем удаляются
for i in (range(min(df.tail_type.unique()),max(df.tail_type.unique())+1)):
    for j in range(i+1,max(df.tail_type.unique())+1):
        for k in (set(df[df.tail_type==i]['tail'].unique())&set(df[df.tail_type==j]['tail'].unique())):
            df.loc[df['tail'] == k,'tail_type'] = -2
df = df[df.tail_type != -2].reset_index(drop=True)

# траты разбиваются на 8 категорий и в дальнейшем будут характеризировать отношения
df.relation = pd.qcut(df.relation, q=8, 
        labels=["small", "medium_small", "medium_small_2", 'medium_1', 'medium_2', 'medium_large_2', 'medium_large', 'large'])

# удаление непопулярных категорий магазинов
top_mcc = list(df.tail_type.value_counts()[:10].rename_axis('unique_values').reset_index(name='counts')['unique_values'])
df = df[df.tail_type.isin(top_mcc)].reset_index(drop=True)
df = df.sort_values('tail_type')

# присвоение уникальным id пользователей и магазинов чисел от 0 до len(уникальных id)
df['head'] = pd.factorize(df['head'])[0]
max_person = max(df['head'])
df['tail'] = pd.factorize(df['tail'])[0] + max_person + 1

# словарь с типами сущностей и списком их id
dict_id = {}
dict_id['person'] = list(range(0,max(df['head'])+1))
for tail_type in df['tail_type'].unique():
    dict_id[tail_type] = list(range(min(df[df.tail_type==tail_type]['tail']),(max(df[df.tail_type==tail_type]['tail'])+1)))

# присвоение чисел отношениям
dict_rel = {key: idx for idx,key in enumerate(pd.factorize(df.relation)[1].categories)}
df['relation'] = df['relation'].apply(lambda x: dict_rel[x])

# создание негативных сущностей 
# neg_head --> tail      neg_tail --> head
df['neg_head'] = [random.sample(dict_id['person'],NEG_SAMPLE_SIZE) for _ in range(len(df))]
df['neg_tail'] = [random.sample(dict_id[i], NEG_SAMPLE_SIZE) for i in df.tail_type]

# разбиение на выборки по времени
train = df[(df.tstmp >= '2018-01-01 00:00:00+03:00') & (df.tstmp < '2018-08-01 00:00:00+03:00')]
train.drop('tstmp', inplace=True, axis=1, errors='ignore')
valid = df[(df.tstmp >= '2018-08-01 00:00:00+03:00') & (df.tstmp < '2018-10-01 00:00:00+03:00')]
valid.drop('tstmp', inplace=True, axis=1, errors='ignore')
test = df[(df.tstmp >= '2018-10-01 00:00:00+03:00') & (df.tstmp < '2019-01-01 00:00:00+03:00')]
test.drop('tstmp', inplace=True, axis=1, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [11]:
train_count, train_true_head, train_true_tail = defaultdict(lambda: 4), defaultdict(list), defaultdict(list)
for i in tqdm(train.index):
    head, relation, tail = train.loc[i,'head'], train.loc[i,'relation'],  train.loc[i,'tail']
    train_count[(head, relation)] += 1
    train_count[(tail, -relation-1)] += 1
    train_true_head[(relation, tail)].append(head)
    train_true_tail[(head, relation)].append(tail)

train['subsampling_weight'] = [(1/(train_count[(train.loc[i,'head'], train.loc[i,'relation'])] 
                                + train_count[(train.loc[i,'tail'], -train.loc[i,'relation']-1)]))**(1/2)
                                                                                for i in train.index]


100%|██████████| 1795880/1795880 [00:37<00:00, 48407.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['subsampling_weight'] = [(1/(train_count[(train.loc[i,'head'], train.loc[i,'relation'])]


In [12]:
train.to_csv(r'data/train.csv', index=False)
valid.to_csv(r'data/train.csv', index=False)
test.to_csv(r'data/train.csv', index=False)