In [1]:
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd

from tqdm import tqdm
import random
import math

In [3]:
#from torch.utils.data import DataLoader
import sklearn
from sklearn import model_selection
import torch_geometric
from torch_geometric import utils


In [2]:
adversarial_temperature = 1
regularization = 0
test_batch_size = 4
learning_rate = 0.0001
cpu_num = 10
max_steps = 1
#save_checkpoint_steps = 10000
valid_steps = 10000
log_steps = 100
test_log_steps = 1000
nentity = 0
nrelation = 0
ntriples_eval_train = 200000
neg_size_eval_train = 500
model = 'TransE'
double_entity_embedding = True # True if RotatE, ComplEx
double_relation_embedding = True # True if ComplEx

negative_adversarial_sampling = True #In self-adversarial sampling, we do not apply back-propagation on the sampling weight
uni_weight = True

NEG_SAMPLE_SIZE = 10
EPOCH = 100
BATCH_SIZE = 4
VALIDATION_INTERVAL = 10
EMBEDDING_SIZE = 10
GAMMA = 12

#### Предобработка данных

In [None]:
df = pd.read_csv ('clear_data_realbank.csv')
df.drop('TRANS_DETAIL', inplace=True, axis=1)
df.rename(columns = {'RETAILER' : 'tail', 'CustomerKey' : 'head',
                        'MCC' : 'tail_type', 'AMOUNT_EQ' : 'relation'   }, 
                            inplace = True) 
# некоторые id магазинов имели разные категории. Здесь id c разл категорией приравниваются к -2 и затем удаляются
for i in (range(min(df.tail_type.unique()),max(df.tail_type.unique())+1)):
    for j in range(i+1,max(df.tail_type.unique())+1):
        for k in (set(df[df.tail_type==i]['tail'].unique())&set(df[df.tail_type==j]['tail'].unique())):
            df.loc[df['tail'] == k,'tail_type'] = -2
df = df[df.tail_type != -2].reset_index(drop=True)

# траты разбиваются на 8 категорий и в дальнейшем будут характеризировать отношения
df.relation = pd.qcut(df.relation, q=8, 
        labels=["small", "medium_small", "medium_small_2", 'medium_1', 'medium_2', 'medium_large_2', 'medium_large', 'large'])

# удаление непопулярных категорий магазинов
top_mcc = list(df.tail_type.value_counts()[:10].rename_axis('unique_values').reset_index(name='counts')['unique_values'])
df = df[df.tail_type.isin(top_mcc)].reset_index(drop=True)
df = df.sort_values('tail_type')

# присвоение уникальным id пользователей и магазинов чисел от 0 до len(уникальных id)
df['head'] = pd.factorize(df['head'])[0]
max_person = max(df['head'])
df['tail'] = pd.factorize(df['tail'])[0] + max_person + 1

# словарь с типами сущностей и списком их id
dict_id = {}
dict_id['person'] = list(range(0,max(df['head'])+1))
for tail_type in df['tail_type'].unique():
    dict_id[tail_type] = list(range(min(df[df.tail_type==tail_type]['tail']),(max(df[df.tail_type==tail_type]['tail'])+1)))

# присвоение чисел отношениям
dict_rel = {key: idx for idx,key in enumerate(pd.factorize(df.relation)[1].categories)}
df['relation'] = df['relation'].apply(lambda x: dict_rel[x])

# создание негативных сущностей 
# neg_head --> tail      neg_tail --> head
df['neg_head'] = [random.sample(dict_id['person'],NEG_SAMPLE_SIZE) for _ in range(len(df))]
df['neg_tail'] = [random.sample(dict_id[i], NEG_SAMPLE_SIZE) for i in df.tail_type]

# разбиение на выборки по времени
train = df[(df.tstmp >= '2018-01-01 00:00:00+03:00') & (df.tstmp < '2018-08-01 00:00:00+03:00')]
train.drop('tstmp', inplace=True, axis=1, errors='ignore')
valid = df[(df.tstmp >= '2018-08-01 00:00:00+03:00') & (df.tstmp < '2018-10-01 00:00:00+03:00')]
valid.drop('tstmp', inplace=True, axis=1, errors='ignore')
test = df[(df.tstmp >= '2018-10-01 00:00:00+03:00') & (df.tstmp < '2019-01-01 00:00:00+03:00')]
test.drop('tstmp', inplace=True, axis=1, errors='ignore')

In [4]:
print(f'TRAIN: {round(len(train)/len(df)*100)}%')
print(f'VALID: {round(len(valid)/len(df)*100)}%')
print(f'TEST: {round(len(test)/len(df)*100)}%')

TRAIN: 58%
VALID: 17%
TEST: 25%


In [5]:
train_count, train_true_head, train_true_tail = defaultdict(lambda: 4), defaultdict(list), defaultdict(list)
for i in tqdm(train.index):
    head, relation, tail = train.loc[i,'head'], train.loc[i,'relation'],  train.loc[i,'tail']
    train_count[(head, relation)] += 1
    train_count[(tail, -relation-1)] += 1
    train_true_head[(relation, tail)].append(head)
    train_true_tail[(head, relation)].append(tail)

train['subsampling_weight'] = [(1/(train_count[(train.loc[i,'head'], train.loc[i,'relation'])] 
                                + train_count[(train.loc[i,'tail'], -train.loc[i,'relation']-1)]))**(1/2)
                                                                                for i in train.index]

nentity = len(df['tail'].unique())+len(df['head'].unique())
nrelation = len(df['relation'].unique())

100%|██████████| 1795880/1795880 [00:50<00:00, 35735.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


#### Модель

In [6]:
class KGEModel(nn.Module):
    def __init__(self, nentity, nrelation, embedding_size, gamma, evaluator,
                 double_entity_embedding=False, double_relation_embedding=False, epsilon = 2.0):
        super(KGEModel, self).__init__()
        
        self.gamma = nn.Parameter(
            torch.Tensor([gamma]), 
            requires_grad=False
        )
        
        self.embedding_range = nn.Parameter(
            torch.Tensor([(self.gamma.item() + epsilon) / embedding_size]), 
            requires_grad=False
        )
        
        self.entity_dim = embedding_size*2 if double_entity_embedding else embedding_size
        self.relation_dim = embedding_size*2 if double_relation_embedding else embedding_size
        
        self.entity_embedding = nn.Parameter(torch.zeros(nentity, self.entity_dim))
        nn.init.uniform_(
            tensor=self.entity_embedding, 
            a=-self.embedding_range.item(), 
            b=self.embedding_range.item()
        )
        
        self.relation_embedding = nn.Parameter(torch.zeros(nrelation, self.relation_dim))
        nn.init.uniform_(
            tensor=self.relation_embedding, 
            a=-self.embedding_range.item(), 
            b=self.embedding_range.item()
        )

        self.evaluator = evaluator
        
    def forward(self, head, tail, relation, neg_head, neg_tail):
        head_E = torch.index_select(
                self.entity_embedding, 
                dim=0, 
                index=head
            ).unsqueeze(1)
        
        relation_E = torch.index_select(
                self.relation_embedding, 
                dim=0, 
                index=relation
            ).unsqueeze(1)

        tail_E = torch.index_select(
                self.entity_embedding, 
                dim=0, 
                index=tail
            ).unsqueeze(1)

        neg_head_E = torch.index_select(
                self.entity_embedding, 
                dim=0, 
                index=neg_head.view(-1)
            ).view(neg_head.size(0), neg_head.size(1), -1)  #batch_size, negative_sample_size
            
        neg_tail_E  = torch.index_select(
                self.entity_embedding, 
                dim=0, 
                index=neg_tail.view(-1)
            ).view(neg_tail.size(0), neg_tail.size(1), -1) 
            
        #TransE
        positive_score = (head_E + relation_E) - tail_E
        negative_tail_score = (head_E + relation_E) - neg_tail_E
        negative_head_score = neg_head_E + (relation_E - tail_E)

        positive_score = self.gamma.item() - torch.norm(positive_score, p=1, dim=2)
        negative_tail_score = self.gamma.item() - torch.norm(negative_tail_score, p=1, dim=2)
        negative_head_score = self.gamma.item() - torch.norm(negative_head_score, p=1, dim=2)
        
        return positive_score, negative_tail_score, negative_head_score

#### Метрика

In [7]:
class Evaluator:
    def eval(self, input_dict):
        y_pred_pos, y_pred_neg = input_dict['y_pred_pos'], input_dict['y_pred_neg']
        y_pred = torch.cat([y_pred_pos.view(-1,1), y_pred_neg], dim = 1)
        argsort = torch.argsort(y_pred, dim = 1, descending = True)
        ranking_list = torch.nonzero(argsort == 0, as_tuple=False)
        ranking_list = ranking_list[:, 1] + 1
        hits1_list = (ranking_list <= 1).to(torch.float)
        hits3_list = (ranking_list <= 3).to(torch.float)
        hits10_list = (ranking_list <= 10).to(torch.float)
        mrr_list = 1./ranking_list.to(torch.float)

        return {'hits@1_list': hits1_list,
                     'hits@3_list': hits3_list,
                     'hits@10_list': hits10_list,
                     'mrr_list': mrr_list}

In [28]:
train_exp = train[:100]
EPOCH = 2

#### TRAIN

In [29]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [31]:
evaluator = Evaluator()
kge_model = KGEModel(
        nentity=nentity,
        nrelation=nrelation,
        embedding_size=EMBEDDING_SIZE,
        gamma=GAMMA,
        double_entity_embedding=double_entity_embedding,
        double_relation_embedding=double_relation_embedding,
        evaluator=evaluator
    )
kge_model = kge_model.cuda()
optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()), 
            lr=learning_rate
        )

log = {}
for num_epoch in tqdm(range(EPOCH)):
    train_iter = chunks(train_exp, BATCH_SIZE)
    iteration = math.ceil(len(train_exp)/BATCH_SIZE) 
    for i in range(iteration):
        kge_model.train()
        optimizer.zero_grad()

        train_batch = next(train_iter)

        head = torch.tensor(np.array(train_batch['head'])).cuda()
        tail = torch.tensor(np.array(train_batch['tail'])).cuda()
        relation = torch.tensor(np.array(train_batch['relation'])).cuda()
        neg_head = torch.tensor(np.array(list(train_batch['neg_head']))).cuda()
        neg_tail = torch.tensor(np.array(list(train_batch['neg_tail']))).cuda()
        subsampling_weight = torch.tensor(np.array(train_batch['subsampling_weight'])).cuda()

        positive_score, negative_tail_score, negative_head_score = kge_model(head, tail, relation, neg_head, neg_tail)

        positive_score = F.logsigmoid(positive_score).squeeze(dim = 1)
        if negative_adversarial_sampling:
            #In self-adversarial sampling, we do not apply back-propagation on the sampling weight
            negative_tail_score = (F.softmax(negative_tail_score * adversarial_temperature, dim = 1).detach() 
                              * F.logsigmoid(-negative_tail_score)).sum(dim = 1)
            negative_head_score = (F.softmax(negative_head_score * adversarial_temperature, dim = 1).detach() 
                              * F.logsigmoid(-negative_head_score)).sum(dim = 1)
        else:
            negative_tail_score = F.logsigmoid(-negative_tail_score).mean(dim = 1)
            negative_head_score = F.logsigmoid(-negative_head_score).mean(dim = 1)

        if uni_weight:
            positive_sample_loss = - positive_score.mean()
            negative_sample_tail_loss = - negative_tail_score.mean()
            negative_sample_head_loss = - negative_head_score.mean()
        else:
            positive_sample_loss = - (subsampling_weight * positive_score).sum()/subsampling_weight.sum()
            negative_sample_tail_loss = - (subsampling_weight * negative_tail_score).sum()/subsampling_weight.sum()
            negative_sample_head_loss = - (subsampling_weight * negative_head_score).sum()/subsampling_weight.sum()

        loss = (2*positive_sample_loss + negative_sample_tail_loss+negative_sample_head_loss)/4

        if regularization != 0.0:
            #Use L3 regularization for ComplEx and DistMult
            regularization = regularization * (
                kge_model.entity_embedding.norm(p = 3)**3 + 
                kge_model.relation_embedding.norm(p = 3).norm(p = 3)**3
            )
            loss = loss + regularization
        
        loss.backward()

        optimizer.step()
    
    if (num_epoch+1)//VALIDATION_INTERVAL == 0:
        

100%|██████████| 2/2 [00:00<00:00,  5.76it/s]


In [30]:
positive_sample

tensor([[    695,       1, 2155224],
        [   3340,       1, 1255182],
        [   3468,       6,  936329],
        [   1758,       1,  467378]])

In [24]:
negative_sample

tensor([[2218828, 2304479, 2250254, 1846286, 1935662, 2151055, 2283020, 1977150,
         1825154, 2193574],
        [ 206160, 1580145, 1194749, 1480901, 1518416, 1333486,  300682, 1797609,
         1438722,  919222],
        [ 490480,  824455,  508149,  444770, 1686218,  881021,  170813, 1361398,
          363282,  670189],
        [1762836,  551054,  400965, 1467674,  275619, 1334130,  633564,  215826,
         1257242,  170830]])

In [25]:
subsampling_weight

tensor([0.1147, 0.1336, 0.0851, 0.0909])

In [52]:
positive_sample = torch.tensor([])
positive_sample = torch.cat((positive_sample, torch.LongTensor([0,1,2])),-1)
positive_sample = torch.cat((positive_sample, torch.LongTensor([0,1,2])),-1)
print(positive_sample)

tensor([0., 1., 2., 0., 1., 2.])


In [55]:
positive_sample = []
positive_sample.append([0,1,2])
positive_sample.append([0,1,2])
print(torch.LongTensor(positive_sample))


tensor([[0, 1, 2],
        [0, 1, 2]])


In [34]:
positive_sample

tensor([])

In [56]:
train_count[(head, relation, head_type)] + train_count[(tail, -relation-1, tail_type)]

52

In [57]:
(1/(train_count[(head, relation, head_type)] + train_count[(tail, -relation-1, tail_type)]))**(1/2)

0.1386750490563073

In [61]:
print(entity_dict[tail_type][0], entity_dict[tail_type][1], negative_sample_size)

4045 162173 10


In [62]:
import random
random.sample(range(entity_dict[tail_type][0], entity_dict[tail_type][1]), negative_sample_size)

[34857, 102533, 67879, 151864, 101192, 48360, 63725, 54264, 107219, 157469]

In [72]:
negative_sample.view(-1)

tensor([ 157851,   47495,   77106,   55592,  134623,  125030,   95340,  141292,
          89379,    7282,  599768,  675371,  802333,  719380,  929603, 1023578,
        1596766,  449713, 1185347, 1295857,  468832, 1229757,  312410,  645505,
        1542261, 1004376,  939976, 1681983, 1246682,  547799,  540114,  730751,
        1562784,  955217, 1149887, 1438719, 1130914,  963494,  699750, 1497028],
       device='cuda:0')

In [71]:
positive_sample

tensor([[      0,       0,    4045],
        [   1505,       0, 1377759],
        [   2172,       0, 1377762],
        [   3567,       0, 1377775]], device='cuda:0')

In [69]:
positive_sample

tensor([[      0,       0,    4045],
        [   1505,       0, 1377759],
        [   2172,       0, 1377762],
        [   3567,       0, 1377775]], device='cuda:0')

In [66]:
print(positive_sample)
print(negative_sample)
print(subsampling_weight)

tensor([[   2963,       2,  435157],
        [   2831,       2, 2327754],
        [   3708,       2, 2327753],
        [    173,       2,   10857]])
tensor([[1549108,  291924,  358954,  234002,  718396,  271483,  361656, 1540361,
         1097434,  743985],
        [2335794, 2237600, 2333249, 2004466, 2288563, 1956566, 2158608, 1837253,
         2247794, 2219773],
        [1844113, 2072521, 1967319, 1954988, 1832988, 2106742, 1981877, 1915342,
         1927755, 2172085],
        [  36823,   53804,  105826,   85847,  143019,  151635,  138683,  157103,
           19976,   83402]])
tensor([0.1091, 0.1348, 0.1125, 0.1387])
