# Graph recommendation

In [8]:
# Load Kedro's IPython extension into this kernel.
# NOTE(review): presumably this is what provides the `%reload_kedro` magic used in the next cell - confirm.
%load_ext kedro.extras.extensions.ipython

In [9]:
# (Re)load the Kedro project into the session.
# NOTE(review): presumably this defines the `context` object that is later used for catalog access - confirm.
%reload_kedro

In [83]:
from typing import Iterator, Tuple
import re

from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io.core import get_filepath_str
import numpy as np
import pandas as pd
import dgl
from dgl.sampling import sample_neighbors, select_topk
from dgl import save_graphs
import torch

import gid_ml_framework.pipelines.santander_preprocessing.nodes
from gid_ml_framework.extras.datasets.chunks_dataset import (
 _concat_chunks,
)

# Notebook-wide pandas settings.
# Turn off the chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
# Display limits, written with fully-qualified option names so each key is unambiguous.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Keep wide frames on a single line block. This option used to be set to True first
# and then overridden to False; only the final (effective) value is kept.
pd.set_option('display.expand_frame_repr', False)

In [110]:
# Dataset prefix; the catalog entry names in the next cell are derived from it.
dataset = "santander"

In [111]:
# Names built from the dataset prefix. `transactions_mapped_path` is passed to
# `context.catalog.load` below; the other three are not used in the visible cells.
# NOTE(review): the first name joins with "." while the other three join with "_" -
# confirm that this matches the actual catalog entry name.
transactions_graph_path = f"{dataset}.transactions_graph"
transactions_mapped_path = f"{dataset}_transactions_mapped"
users_mapping_path = f"{dataset}_users_mapping"
items_mapping_path = f"{dataset}_items_mapping"

In [112]:
# Load the mapped transactions through the Kedro catalog and merge them with `_concat_chunks`.
# NOTE(review): presumably the catalog entry yields chunks and the result is one DataFrame
# with `user_id`, `item_id` and `time` columns (those are the columns used below) - confirm.
transactions_mapped = _concat_chunks(context.catalog.load(transactions_mapped_path))

In [113]:
# Sanity check: the distinct item IDs present in the loaded transactions, in ascending order.
np.sort(transactions_mapped['item_id'].unique())

In [101]:
def cal_order(data):
    """Compute the relative order of the item sequence.

    Sorts the rows by ``time`` (stable mergesort, so rows with equal times keep
    their incoming order) and adds an ``order`` column numbering the sorted rows
    0, 1, 2, ...  The notebook applies it once per ``user_id`` group.

    Args:
        data: DataFrame with a ``time`` column.

    Returns:
        A new, time-sorted DataFrame with the ``order`` column; the input frame
        is not modified and the original index labels are kept.
    """
    ranked = data.sort_values(by='time', kind='mergesort')
    return ranked.assign(order=range(len(ranked)))

def cal_u_order(data):
    """Compute the relative order of the user sequence.

    Sorts the rows by ``time`` (stable mergesort, so rows with equal times keep
    their incoming order) and adds a ``u_order`` column numbering the sorted rows
    0, 1, 2, ...  The notebook applies it once per ``item_id`` group.

    Args:
        data: DataFrame with a ``time`` column.

    Returns:
        A new, time-sorted DataFrame with the ``u_order`` column; the input frame
        is not modified and the original index labels are kept.
    """
    ranked = data.sort_values(by='time', kind='mergesort')
    return ranked.assign(u_order=range(len(ranked)))

def refine_time(data):
    """Break timestamp ties so that one group's times become strictly increasing.

    Rows are sorted by ``time`` (stable mergesort). Walking the sorted values,
    whenever a value is not greater than its (already adjusted) predecessor it is
    bumped by a running gap that starts at 1 and grows by 1 after every bump.
    Because the gap only grows, every bumped value ends up above the value before
    it, so integer timestamps come out strictly increasing.

    Args:
        data: DataFrame with a numeric ``time`` column (the notebook passes one
            ``user_id`` group at a time).

    Returns:
        A new, time-sorted DataFrame whose ``time`` column holds the adjusted
        values; the input frame is not modified.
    """
    data = data.sort_values(['time'], kind='mergesort')
    # Work on a private copy: the array behind `.values` can be a view of the
    # frame and is read-only when pandas copy-on-write is active, in which case
    # the in-place writes below would raise.
    time_seq = data['time'].to_numpy(copy=True)
    time_gap = 1
    for i in range(len(time_seq) - 1):
        # Compare against the possibly already-bumped predecessor.
        if time_seq[i] >= time_seq[i + 1]:
            time_seq[i + 1] += time_gap
            time_gap += 1
    data['time'] = time_seq
    return data

In [18]:

# Build the user-item interaction graph directly from the loaded transactions.
# This cell used to read `data` before any cell above it had defined the name,
# so it failed with NameError on a fresh kernel.

# Integer timestamps first (torch.LongTensor below needs them), then per-user
# tie-breaking so each user's interactions have strictly increasing times.
data = transactions_mapped.assign(time=transactions_mapped['time'].astype('int64'))
data = data.groupby('user_id').apply(refine_time).reset_index(drop=True)
# Rank of every interaction within its user ('order') and within its item ('u_order').
data = data.groupby('user_id').apply(cal_order).reset_index(drop=True)
data = data.groupby('item_id').apply(cal_u_order).reset_index(drop=True)
user = data['user_id'].values
item = data['item_id'].values
time = data['time'].values

# Two mirrored relations over the same interactions: item -by-> user and user -pby-> item.
graph_data = {('item','by','user'):(torch.tensor(item), torch.tensor(user)),
            ('user','pby','item'):(torch.tensor(user), torch.tensor(item))}
graph = dgl.heterograph(graph_data)
# Edges are created in row order for both relations, so row i's time belongs to edge i.
graph.edges['by'].data['time'] = torch.LongTensor(time)
graph.edges['pby'].data['time'] = torch.LongTensor(time)
# The optional edge/node features 't', 'rt', 'last_user' and 'last_item' were disabled
# here; 'rt' and the 'last_*' ones read columns ('re_order', 'u_re_order', 'u_last',
# 'last') that no visible cell creates.

# Label every node with its own ID. dgl.heterograph sizes each node type as
# (largest ID seen + 1), so the feature must have exactly that many rows;
# np.unique(...) is shorter whenever the IDs are not contiguous from 0 and is then
# rejected. IDs that never occur in the data become isolated nodes.
graph.nodes['user'].data['user_id'] = torch.arange(graph.num_nodes('user'))
graph.nodes['item'].data['item_id'] = torch.arange(graph.num_nodes('item'))


In [114]:
# Per user: break timestamp ties with refine_time, flatten the group index again,
# then store the times as int64.
data = (
    transactions_mapped
    .groupby('user_id')
    .apply(refine_time)
    .reset_index(drop=True)
    .assign(time=lambda frame: frame['time'].astype('int64'))
)

In [115]:
# Re-run the per-user tie-breaking on the int64 times, then rank every interaction
# within its user ('order') and within its item ('u_order').
data = (
    data
    .groupby('user_id').apply(refine_time).reset_index(drop=True)
    .groupby('user_id').apply(cal_order).reset_index(drop=True)
    .groupby('item_id').apply(cal_u_order).reset_index(drop=True)
)
# Row-aligned arrays used to build the graph edges and their timestamps.
user, item, time = data['user_id'].values, data['item_id'].values, data['time'].values

In [117]:
# Sanity check: sorted distinct item IDs among the prepared interactions.
np.unique(item)

In [118]:
# Endpoints of every interaction, as tensors shared by the two mirrored relations.
item_nodes, user_nodes = torch.tensor(item), torch.tensor(user)
graph_data = {
    ('item', 'by', 'user'): (item_nodes, user_nodes),
    ('user', 'pby', 'item'): (user_nodes, item_nodes),
}
graph = dgl.heterograph(graph_data)
# Both relations list their edges in row order, so row i's time belongs to edge i.
for relation in ('by', 'pby'):
    graph.edges[relation].data['time'] = torch.LongTensor(time)

In [119]:
# Display the heterograph built in the previous cell.
graph

In [57]:
# Number of distinct user IDs among the interactions (to compare with the graph's user node count).
len(np.unique(torch.tensor(user)))

In [54]:
# Display the loaded transactions frame (pandas truncates the middle rows, as in the output below).
transactions_mapped

Unnamed: 0,user_id,item_id,time
0,15596,2,1425081600
1,73485,2,1425081600
2,42249,2,1425081600
3,73754,2,1425081600
4,15798,2,1425081600
...,...,...,...
57330,85158,20,1464393600
57331,17044,20,1464393600
57332,31896,20,1464393600
57333,85133,20,1464393600


In [51]:
# Display the heterograph again before the node ID features are attached.
graph

In [109]:
# Label every node with its own ID. dgl.heterograph sizes each node type as
# (largest ID seen + 1), so a node feature must have exactly that many rows.
# np.unique(user) / np.unique(item) only has one entry per ID that actually occurs,
# which is too short (and rejected by DGL) whenever the IDs are not contiguous
# from 0. For contiguous IDs the values are the same as before.
graph.nodes['user'].data['user_id'] = torch.arange(graph.num_nodes('user'))
graph.nodes['item'].data['item_id'] = torch.arange(graph.num_nodes('item'))