In [1]:
import math
import pickle
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data.dataset import random_split
from datetime import *
from sklearn.model_selection import KFold 
from torchmetrics import RetrievalMRR

import load_data
import utils
import model

In [2]:
# Dataset to use; both input paths below are derived from this name.
dname = 'nyc'
data_prefix = '../data/' + dname
filter_data = pd.read_csv(data_prefix + '_filter_data.csv')
seq_data_path = data_prefix + '_seq_data_pad'

print(len(filter_data))

# Vocabulary sizes: number of distinct POIs, categories and users in the check-ins.
poiNum = filter_data['venueid'].nunique()
catNum = filter_data['categid'].nunique()
userNum = filter_data['userid'].nunique()


def _pad_id(count):
    """Return the smallest multiple of 10 strictly greater than `count`."""
    return (count // 10 + 1) * 10


# Ids used to pad the POI, category and time sequences. They are rounded up to a
# multiple of 10 purely so the padding values are easy to recognise.
# 48 is the number of time slots (presumably half-hour bins of a day -- confirm).
poiPad, catPad, timePad = _pad_id(poiNum), _pad_id(catNum), _pad_id(48)

for count, pad in ((poiNum, poiPad), (catNum, catPad), (userNum, timePad)):
    print(count, pad)

147729
5130 5140
208 210
1083 50


In [3]:
# Compute device used by every later cell for tensors and models.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32
# Seed for the train/validation split so the same samples end up in the
# validation set on every "Restart & Run All".
SPLIT_SEED = 42

dataset = load_data.myDataset(seq_data_path, poiPad, catPad, timePad)
# Peek at one sample (index 1) to sanity-check the padded sequence layout.
first_data = dataset[1]
print(first_data)

# Split training and validation set (80% / 20%).
train_len = int(0.8 * len(dataset))
valid_len = len(dataset) - train_len
# A dedicated generator keeps the split reproducible without touching the
# global RNG state used for weight initialisation and dropout.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
TrainData, ValidationData = random_split(
    dataset, [train_len, valid_len], generator=split_generator
)

# Load into Iterator (each time get one batch).
# Only the training data is shuffled; shuffling the validation data does not
# change any metric and only makes runs harder to compare.
train_loader = data.DataLoader(TrainData, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = data.DataLoader(ValidationData, batch_size=batch_size, shuffle=False, num_workers=0)

# Print statistics
print("Total: ", len(dataset))
print("Training Set: ", len(TrainData))
print("Validation Set: ", len(ValidationData))
print('train batch:',len(train_loader))
print('validation batch:',len(test_loader))

data length: 15992


100%|██████████| 15992/15992 [00:00<00:00, 55631.70it/s]


(tensor([2815, 1211, 3602, 1502,  383, 2870, 5140, 5140, 5140, 5140, 5140, 5140,
        5140, 5140]), tensor([  4,  36, 105, 108,  36,  48, 210, 210, 210, 210, 210, 210, 210, 210]), tensor([19, 22,  1, 24, 40, 42, 50, 50, 50, 50, 50, 50, 50, 50]), 1, 1508, 36, 48)
Total:  15992
Training Set:  12793
Validation Set:  3199
train batch: 400
validation batch: 100


In [4]:
# Graph inputs. A and C are stacks of per-sample matrices (indexed by user id in
# the training cell); B is a single POI-by-POI matrix. The '*_Norm.npy' variants
# are used here; '../list/nyc_usercat_graph.npy' / '../list/nyc_utc_graph.npy'
# and '../list/nyc_usercat_Percent.npy' / '../list/nyc_utc_Percent.npy' are the
# alternative inputs that were tried.
A = np.load('../list/nyc_usercat_Norm.npy')
C = np.load('../list/nyc_utc_Norm.npy')
B = np.load('../list/nyc_dis_graph.npy')

# Replace every matrix in A by the output of utils.calculate_laplacian_matrix.
# The result is written back into the loaded array, so it keeps A's dtype.
for user_idx, user_adj in enumerate(A):
    A[user_idx] = utils.calculate_laplacian_matrix(user_adj)
print(A.shape)
A = torch.from_numpy(A).to(device=device, dtype=torch.float)

# Single distance graph shared by all users.
disGraph = utils.calculate_laplacian_matrix(B)
print(disGraph.shape)
disGraph = torch.from_numpy(disGraph).to(device=device, dtype=torch.float)

# Same treatment for C, using the time-aware variant that also takes catNum.
for user_idx, user_adj in enumerate(C):
    C[user_idx] = utils.calculate_laplacian_matrix_time(user_adj, catNum)
print(C.shape)
C = torch.from_numpy(C).to(device=device, dtype=torch.float)

(1084, 208, 208)
(5130, 5130)
(1084, 256, 256)


In [None]:
dropout = 0.4
epochs = 50

# Main network plus the three graph encoders feeding it. The time graph uses
# catNum + 48 nodes (presumably category nodes plus 48 time slots -- confirm).
DTTCG = model.DTTCG(poiPad, catPad, userNum).float().to(device)
catGCN = model.batchGCN(dropout, catNum).to(device)
timeGCN = model.batchGCN(dropout, catNum + 48).to(device)
disGCN = model.GCN(dropout, poiNum).to(device)
networks = (DTTCG, catGCN, timeGCN, disGCN)

criterion = nn.CrossEntropyLoss()    # POI prediction loss
criterionc = nn.CrossEntropyLoss()   # category prediction loss

# One optimizer over the parameters of all four networks.
optimizer = torch.optim.Adam(
    params=[param for net in networks for param in net.parameters()], lr=0.001
)

# Halve the learning rate at the listed epochs.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5,15,25,35,45], gamma=0.5)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.1)


def _forward_batch(batch):
    """Run one batch through the graph encoders and DTTCG.

    Args:
        batch: 7-tuple of tensors (p, c, t, u, targets, targetsc, lastc) as
            yielded by the data loaders.

    Returns:
        (preds, predsc, targets, targetsc, loss) where `loss` is the sum of the
        POI cross-entropy, the category cross-entropy and the auxiliary loss
        returned by DTTCG.
    """
    p, c, t, u, targets, targetsc, lastc = [d.to(device) for d in batch]

    # Category branch: per-user graph selected by user id.
    gcn_cat = catGCN(A[u], DTTCG.returnEmb('cat', catNum))
    gcn_cat_emb = utils.idx_to_emb(c, gcn_cat, catPad, device)

    # Time branch.
    gcn_time = timeGCN(C[u], DTTCG.returnEmb('time', catNum + 48))
    gcn_time_emb = utils.idx_to_emb(t, gcn_time, timePad, device)

    # POI branch: one distance graph shared by the whole batch.
    gcn_poi = disGCN(disGraph, DTTCG.returnEmb('poi', poiNum))
    gcn_poi_emb = utils.idx_to_emb(p, gcn_poi, poiPad, device)

    # For each sample pick the encoder output of its last visited category.
    # One indexing call replaces the per-sample Python loop + torch.stack;
    # lastc is a 1-D tensor of category ids (one per sample).
    rows = torch.arange(lastc.size(0), device=lastc.device)
    last_cat_emb = gcn_cat[rows, lastc.long()]

    predsc, preds, loss3 = DTTCG(u, gcn_cat_emb, gcn_time_emb, gcn_poi_emb, last_cat_emb)
    loss1 = criterion(preds, targets.long())
    loss2 = criterionc(predsc, targetsc.long())
    return preds, predsc, targets, targetsc, loss1 + loss2 + loss3


for epoch in range(epochs):
    # Learning rate actually used during this epoch (read before scheduler.step()).
    epoch_lr = optimizer.param_groups[0]['lr']

    # ---- training ----
    for net in networks:
        net.train()

    correct, correctc, loss_sum = 0, 0, 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        preds, predsc, targets, targetsc, loss = _forward_batch(batch)
        with torch.no_grad():
            correct += utils.topk(preds, targets, 1)
            correctc += utils.topk(predsc, targetsc, 1)

        loss.backward()
        optimizer.step()
        loss_sum += loss.item()

    # loss.item() is already a per-batch value, so average over the number of
    # batches (dividing by the number of samples shrinks it by ~batch_size).
    train_loss = loss_sum / len(train_loader)
    train_acc_cat = correctc*100/len(TrainData)
    train_acc_poi = correct*100/len(TrainData)

    # ---- validation ----
    for net in networks:
        net.eval()

    correct, correctc, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for batch in test_loader:
            preds, predsc, targets, targetsc, loss = _forward_batch(batch)
            correct += utils.topk(preds, targets, 1)
            correctc += utils.topk(predsc, targetsc, 1)
            loss_sum += loss.item()

    test_loss = loss_sum / len(test_loader)
    scheduler.step()
    test_acc_cat = correctc*100/len(ValidationData)
    test_acc_poi = correct*100/len(ValidationData)

    print("Epoch:{}/{}  Lr:{:.6f} Traing Loss:{:.3f} TestLoss:{:.3f} Training_Cat_Acc:{:.3f} Test_Cat_Acc:{:.3f}   Training_POI_Acc {:.3f} %  Test_POI_Acc {:.3f} %".format(
        epoch + 1, epochs, epoch_lr,
        train_loss, test_loss,
        train_acc_cat, test_acc_cat,
        train_acc_poi, test_acc_poi))
    

In [None]:
# category=GCN+att+lastc  time=GCN+att  BPRloss   fuse(disGCN+user)+fuse(crep+trep)
def evaluteTop(model, loader):
    """Print MRR and top-1/5/10/20 accuracy of `model` on `loader`.

    Besides its arguments, this function reads the notebook-level graph tensors
    (A, C, disGraph), the graph encoders (catGCN, timeGCN, disGCN), the
    vocabulary/pad sizes, `device` and `utils`.

    Args:
        model: network exposing `returnEmb(kind, n)` and callable as
            `model(u, cat_emb, time_emb, poi_emb, lastc)`, returning
            `(predsc, preds, loss3)`.
        loader: data loader yielding 7-tuples
            `(p, c, t, u, targets, targetsc, lastc)` of tensors.

    Returns:
        None. The metrics are printed as percentages of `len(loader.sampler)`.
    """
    # Switch every network used below to eval mode, not only `model`; otherwise
    # the result depends on whatever mode an earlier cell left the encoders in.
    for net in (model, catGCN, timeGCN, disGCN):
        net.eval()

    total = len(loader.sampler)
    toplist = [1, 5, 10, 20]
    correct = [0] * len(toplist)     # POI hits per cut-off in `toplist`
    correctc = [0] * len(toplist)    # category hits per cut-off in `toplist`
    mrr_acc = 0.0                    # running sum of reciprocal ranks (POI)

    with torch.no_grad():
        for batch in loader:
            p, c, t, u, targets, targetsc, lastc = [d.to(device) for d in batch]

            gcn_cat = catGCN(A[u], model.returnEmb('cat', catNum))
            gcn_cat_emb = utils.idx_to_emb(c, gcn_cat, catPad, device)

            gcn_time = timeGCN(C[u], model.returnEmb('time', catNum + 48))
            gcn_time_emb = utils.idx_to_emb(t, gcn_time, timePad, device)

            gcn_poi = disGCN(disGraph, model.returnEmb('poi', poiNum))
            gcn_poi_emb = utils.idx_to_emb(p, gcn_poi, poiPad, device)

            # Encoder output of each sample's last visited category, gathered
            # in one indexing call instead of a per-sample loop + torch.stack.
            rows = torch.arange(lastc.size(0), device=lastc.device)
            last_cat_emb = gcn_cat[rows, lastc.long()]

            predsc, preds, loss3 = model(u, gcn_cat_emb, gcn_time_emb, gcn_poi_emb, last_cat_emb)

            for slot, top_k in enumerate(toplist):
                correctc[slot] += utils.topk(predsc, targetsc, top_k)
                correct[slot] += utils.topk(preds, targets, top_k)

            # Reciprocal rank of the true POI in the descending score order.
            # hit_cols holds the 0-based rank of every target that appears in
            # its row; a target missing from the ranking contributes 0 instead
            # of corrupting the accumulator.
            _, ranking = torch.sort(preds, descending=True)
            hit_cols = (ranking == targets.long().view(-1, 1)).nonzero()[:, 1]
            mrr_acc += (1.0 / (hit_cols + 1).float()).sum().item()

    print('MRR accuracy : {} %'.format(mrr_acc*100/total))
    for tk, hits in zip(toplist, correctc):
        print('Top {topk} Accuracy of category rec: {acc} %'.format(topk=tk,acc=hits*100 / total))

    for tk, hits in zip(toplist, correct):
        print('Top {topk} Accuracy poi rec: {acc} %'.format(topk=tk,acc=hits*100 / total))
    
# Report MRR and top-1/5/10/20 accuracy of the trained DTTCG model on the validation loader.
evaluteTop(DTTCG,test_loader)