In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import datetime
import torch
from torch import nn, optim
import sys
import time
import random
import os
from tqdm import tqdm
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import scipy.stats

from util import *
from models import *
from optimization import *

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# `!export ...` runs in a short-lived subshell, so the variable never reaches this
# kernel process. Set it on the process itself instead; CUDA reads it when it is
# first initialised, so this has to happen before any tensor is moved to the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
cuda = torch.device('cuda')

In [4]:
'''
 -----------------------------------------------------------------
| Run the code in Prepare_dataset.ipynb when re-preparing dataset.|
 -----------------------------------------------------------------
'''

def train(config, dictBank, model_file):
    """Train the notebook-level ``Recmodel`` with periodic evaluation and early stopping.

    Every 10th epoch (starting with epoch 0, i.e. before any training) the model is
    scored with ``Test``. When the first NDCG entry beats the best value seen so far,
    the model's ``state_dict`` is written to disk and the patience counter is reset;
    otherwise patience drops by one, and training stops once 8 evaluations in a row
    have failed to improve.

    Relies on names defined elsewhere in the notebook: ``Recmodel``, ``source_name``,
    ``target_name``, ``n_users`` and ``m_items``.

    Note: when this function returns, the in-memory model holds the weights of the
    last trained epoch; the best-scoring weights exist only in the checkpoint file.

    Args:
        config: Settings mapping. ``config["epochs"]`` is the maximum number of
            epochs; the mapping is also forwarded to ``Test`` and ``Train_on_epoch``.
        dictBank: Data container forwarded to ``Test`` and ``Train_on_epoch``; its
            ``tr_u`` attribute is passed to ``Train_on_epoch`` as the last argument.
        model_file: String prefix (normally a directory path ending in ``/``) to
            which the checkpoint file name is appended.

    Returns:
        None. Progress is printed and the best checkpoint is saved as a side effect.
    """
    eval_every = 10    # evaluate on the test split every this many epochs
    max_patience = 8   # non-improving evaluations tolerated before stopping

    weight_file = model_file + f"{source_name}to{target_name}-SCDGN_retrained.pth.tar"
    # torch.save does not create missing directories, so make sure the target exists.
    weight_dir = os.path.dirname(weight_file)
    if weight_dir:
        os.makedirs(weight_dir, exist_ok=True)

    patience = max_patience
    best_ndcg = 0.
    for epoch in range(config["epochs"]):
        if epoch % eval_every == 0:
            print("")
            results = Test(Recmodel, config, dictBank)
            print("[TEST] hr:{0}, ndcg:{1}".format(results['hr'][0], results['ndcg'][0]))
            current_ndcg = results['ndcg'][0]
            if current_ndcg > best_ndcg:
                torch.save(Recmodel.state_dict(), weight_file)
                best_ndcg = current_ndcg
                patience = max_patience
                # Fall through on purpose: an evaluation epoch is still trained.
                # (Previously a `continue` here skipped training for this epoch.)
            else:
                patience -= 1
                if patience == 0:
                    break

        start = time.time()
        aver_loss, aver_pre_loss, aver_reg_loss, aver_rec_loss, aver_res_loss = Train_on_epoch(
            n_users, m_items, Recmodel, config, dictBank, dictBank.tr_u)
        end = time.time()
        # "\r" rewrites the same console line so per-epoch stats do not flood the output.
        sys.stdout.write(
            "\r ||epoch:{0}||loss:{1}||pre_loss:{2}||reg_loss:{3}||rec_loss:{4}||res_loss:{5}||time:{6}".format(
                epoch, aver_loss, aver_pre_loss, aver_reg_loss, aver_rec_loss, aver_res_loss,
                round(end - start, 2)))
        sys.stdout.flush()
    print("Training Done.")
    print(f"Saved model in: {weight_file}")
    
def get_experimental_result(model, my_dictBank):
    """Run 10 seeded ranking evaluations of ``model`` and print HR / NDCG summaries.

    For each run a pool of 200 items is drawn from the distinct ``df_T.deal_index``
    values. For every test user, 99 pool items the user has not interacted with are
    sampled and merged with the user's last interacted item, giving 100 candidates.
    The model scores all candidates, they are ranked by descending score, and the
    ranking is passed to ``Evaluation`` with cut-offs ``[1, 5]``.

    Relies on names defined elsewhere in the notebook: ``df_T``, ``dict_interactions``,
    ``Evaluation`` and ``output_result``.

    Args:
        model: Callable model taking ``(user_tensor, item_tensor)`` and returning one
            score per user; must also provide ``eval()``, ``LocalGCLayers()`` and
            ``GlobalGCLayers(1)``.
        my_dictBank: Data container exposing ``t_u`` (test users) and ``t_v``
            (forwarded unchanged to ``Evaluation``).

    Returns:
        None. Per-run and aggregated metrics are printed.

    Raises:
        ValueError: Propagated from ``np.random.choice`` if fewer than 99 pool items
            remain for a user after removing that user's interactions.
    """
    num_runs = 10                  # independent evaluation rounds
    pool_size = 200                # items drawn per round as the shared candidate pool
    num_negatives = 99             # sampled non-interacted items per user
    list_len = num_negatives + 1   # negatives plus the user's last interacted item
    cutoffs = [1, 5]

    HR = []
    NDCG = []
    model.eval()
    item_list = df_T.deal_index.unique()
    test_users = my_dictBank.t_u
    # The user tensor is identical for every forward pass, so build it once.
    user_tensor = torch.LongTensor(test_users)

    with torch.no_grad():
        # Return values are unused here; the calls are kept in case they have side
        # effects on the model (not visible from this notebook).
        model.LocalGCLayers()
        model.GlobalGCLayers(1)

    for run in range(num_runs):
        # Fixed per-run seed keeps the sampled candidates reproducible across executions.
        np.random.seed(run * 7)
        candidate_pool = np.random.choice(item_list, pool_size, replace=False)

        item_matrix = []
        for index in tqdm(range(len(test_users))):
            u = test_users[index]
            seen = dict_interactions[u]
            negatives = np.setdiff1d(candidate_pool, seen)
            negatives = np.random.choice(negatives, num_negatives, replace=False)
            item_matrix.append(np.union1d(negatives, seen[-1:]))

        # Shape (list_len, n_users): row j holds the j-th candidate of every user.
        v = torch.as_tensor(np.stack(item_matrix), dtype=torch.long).t()

        score_columns = []
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            for j in tqdm(range(list_len)):
                y = model(user_tensor, v[j])
                score_columns.append(y.detach().cpu().numpy().reshape(-1))
        # Shape (n_users, list_len), aligned with item_matrix.
        result = np.stack(score_columns, axis=1)

        pred_reclist = {}
        for h, u in enumerate(test_users):
            ranked = sorted(zip(item_matrix[h], result[h]), key=lambda pair: pair[1], reverse=True)
            pred_reclist[u] = [item for item, _ in ranked]

        hit_rite, ndcg = Evaluation(pred_reclist, my_dictBank.t_v, my_dictBank.t_u, list(cutoffs))
        HR.append(hit_rite)
        NDCG.append(ndcg)
        print("HR: {0}".format(hit_rite))
        print("NDCG: {0}".format(ndcg))

    print("HR@1, HR@5: ", end=" ")
    output_result(HR)
    print()
    print("NDCG@1, NDCG@5:", end=" ")
    output_result(NDCG)

In [5]:
# Root folders for the prepared rating files and for model checkpoints.
data_path = './Datasets/'
model_file = "./Models/"

# Parallel lists: position i of each list describes one transfer scenario
# (scenario label, source-domain rating file, target-domain rating file).
Expe_scenarios = ["MLtoAM", "AMtoML"]
source_file = ["Rating_MLasS.csv", "Rating_AmzMasS.csv"]
target_file = ["Rating_AmzMasT.csv", "Rating_MLasT.csv"]

# Position in the lists above that selects the scenario to run.
Expe_index = 0

print(f"Experiment scenario: {Expe_scenarios[Expe_index]}")
print(f"Source domain file: {source_file[Expe_index]}")
print(f"Target domain file:{target_file[Expe_index]}")

# Load the source- and target-domain rating tables of the selected scenario.
df_S = pd.read_csv(f"{data_path}{source_file[Expe_index]}")
df_T = pd.read_csv(f"{data_path}{target_file[Expe_index]}")

'''
---------------------------------------------------------------
|Add the following code when swapping source and target, e.g. (AM=>ML),
|so that the column names match the ones the rest of this cell uses:
|
|df_S = df_S[["account_id", "deal_id"]]
|df_S.columns =["userId", "movieId"]
|
|df_T = df_T[["userId", "movieId"]]
|df_T.columns =["account_id", "deal_id"]
---------------------------------------------------------------
'''

# Two-letter domain codes taken from the scenario label, e.g. "MLtoAM" -> "ML" / "AM".
source_name = Expe_scenarios[Expe_index][:2]
target_name = Expe_scenarios[Expe_index][-2:]

# Scenario-specific checkpoint location; used as a string prefix, hence the trailing "/".
model_file = f"./Models/{source_name}to{target_name}/"

dict_path = './Dictionary/'
scenario_dict_dir = dict_path + f"{source_name}to{target_name}/"


def _load_npy_dict(file_name):
    """Load a dict that was stored as a 0-d object array in ``scenario_dict_dir``."""
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
    return np.load(scenario_dict_dir + file_name, allow_pickle=True).item()


dict_itemId2Cluster_S = _load_npy_dict(f"Dict_item2cluster_{source_name}asS.npy")
dict_itemId2Cluster_T = _load_npy_dict(f"Dict_item2cluster_{target_name}asT.npy")
dict_cluster2vec = _load_npy_dict(f"Dict_cluster2vec_{source_name}to{target_name}.npy")
dict_item2vec_t = _load_npy_dict(f"Dict_item2vec_{target_name}asT.npy")

# Attach each interaction's item cluster; the lambda form raises KeyError for an item
# missing from the dictionary instead of silently producing NaN.
df_S["cluster"] = df_S.movieId.map(lambda x: dict_itemId2Cluster_S[x])
df_T["cluster"] = df_T.deal_id.map(lambda x: dict_itemId2Cluster_T[x])

# Distinct ids in order of first appearance; computed once and reused below.
target_item_ids = df_T.deal_id.unique()
target_user_ids = df_T.account_id.unique()
source_user_ids = df_S.userId.unique()
n_target_users = len(target_user_ids)

# Contiguous indices: target items and target users start at 0, while source users
# continue after the target users so both user sets share one index space.
dict_item_id2index = dict(zip(target_item_ids, np.arange(len(target_item_ids))))
dict_user_id2index = dict(zip(target_user_ids, np.arange(n_target_users)))
dict_user_id2index_S = dict(zip(source_user_ids,
                                np.arange(n_target_users, n_target_users + len(source_user_ids))))
dict_itemIndex2Cluster_T = {item_index: dict_itemId2Cluster_T[item_id]
                            for item_id, item_index in dict_item_id2index.items()}

# Item vectors listed in item-index order, plus the matching index -> vector lookup.
vec_matrix = [dict_item2vec_t[deal_id] for deal_id in target_item_ids]
dict_ItemIndex2vec = dict(zip(np.arange(len(target_item_ids)), vec_matrix))

df_T["account_index"] = df_T.account_id.map(lambda x: dict_user_id2index[x])
df_T["deal_index"] = df_T.deal_id.map(lambda x: dict_item_id2index[x])
df_S["account_index"] = df_S.userId.map(lambda x: dict_user_id2index_S[x])

# Per target user: interacted item indices and the clusters of those items.
target_by_user = df_T.groupby(df_T["account_index"])
dict_interactions = dict(target_by_user["deal_index"].apply(ToList))
dict_interactions_c = dict(target_by_user["cluster"].apply(ToList))


n_users = len(df_T.account_index.unique())
m_items = len(df_T.deal_index.unique())
n_inters = df_T.shape[0]
print(f"n_users:{n_users}, m_items:{m_items}, n=inter.:{df_T.shape[0]}")


my_dictBank = DataBank(dict_itemId2Cluster_S,
                dict_itemId2Cluster_T,
                dict_item_id2index,
                dict_user_id2index,
                dict_user_id2index_S,
                dict_itemIndex2Cluster_T,
                dict_cluster2vec,
                dict_item2vec_t,
                dict_ItemIndex2vec,
                dict_interactions,
                dict_interactions_c)

# User-item graph built from the edges exposed by the data bank as tr_u / tr_v.
UINet = csr_matrix((np.ones(len(my_dictBank.tr_u)), (my_dictBank.tr_u, my_dictBank.tr_v)), shape=(n_users, m_items))

# User-cluster graph: the same target edges with items replaced by their cluster,
# followed by every source-domain (user, cluster) interaction.
u_edge = np.concatenate([my_dictBank.tr_u, df_S.account_index.values])
i_edge = np.concatenate([[my_dictBank.dict_itemIndex2Cluster_T[i] for i in my_dictBank.tr_v], df_S.cluster.values])
# NOTE(review): the column count 200 is hard-coded while config["num_cluster"] is what
# is handed to the model as the cluster count below; confirm the two agree.
UCNet = csr_matrix((np.ones(len(u_edge)), (u_edge, i_edge)), shape=(u_edge.max()+1, 200))
# NOTE(review): n_u_g receives the whole u_edge array, whereas the other three values
# are counts; confirm SCDGN expects the array here rather than u_edge.max() + 1.
# `config` is not defined in this notebook; presumably it comes from one of the
# star imports at the top.
n_u_c, n_i_c, n_u_g, n_c_g = n_users, m_items, u_edge, config["num_cluster"]

Experiment scenario: MLtoAM
Source domain file: Rating_MLasS.csv
Target domain file:Rating_AmzMasT.csv
n_users:8566, m_items:6752, n=inter.:39696


In [6]:
'''
 -------------------------------------------------------------------------------
| Set config["pretrained_model"] = 0 to re-train the SCDGN model from scratch.  |
 -------------------------------------------------------------------------------
'''

# Truthy: load the stored checkpoint. Falsy: train a fresh model with train().
config["pretrained_model"] = 1

# The model is built the same way in both modes; only the origin of its weights differs.
# It must exist before train() is called because train() reads the global Recmodel.
Recmodel = SCDGN(n_u_c, n_i_c, n_u_g, n_c_g, UINet, UCNet, my_dictBank, config)
if config["pretrained_model"]:
    weight_file = model_file + f"{source_name}to{target_name}-SCDGN.pth.tar"
    # map_location="cpu" lets a checkpoint saved on a GPU be read on any host;
    # load_state_dict then copies the values into the model's existing parameters.
    Recmodel.load_state_dict(torch.load(weight_file, map_location="cpu"))
else:
    train(config, my_dictBank, model_file)

get_experimental_result(Recmodel, my_dictBank)

  d_inv = np.power(rowsum, -0.5).flatten()



[TEST] hr:0.019378939995330375, ndcg:0.009082842850428244
 ||epoch:9||loss:0.44730797021285346||pre_loss:0.12534848276687705||reg_loss:0.1791524835254835||rec_loss:0.13876207885534866||res_loss:0.004044931614771485||time:4.477
[TEST] hr:0.11370534671958907, ndcg:0.08134724570882965
 ||epoch:19||loss:0.37481281679609546||pre_loss:0.07880305952352026||reg_loss:0.21505822630032248||rec_loss:0.07618432411033174||res_loss:0.004767207064382408||time:4.47
[TEST] hr:0.12211067009105767, ndcg:0.09088321893057477
 ||epoch:29||loss:0.3681427020093669||pre_loss:0.06707359251120816||reg_loss:0.22105901655943497||rec_loss:0.07516724616289139||res_loss:0.004842845945740523||time:4.5754
[TEST] hr:0.13436843334111603, ndcg:0.09539948668684037
 ||epoch:39||loss:0.3626092607560365||pre_loss:0.06458504886730858||reg_loss:0.22314490053964697||rec_loss:0.06998750437860904||res_loss:0.00489180522930363||time:4.499
[TEST] hr:0.13425169273873452, ndcg:0.0976776627991439
 ||epoch:49||loss:0.3590261003245478||p

100%|██████████| 8566/8566 [00:00<00:00, 12739.57it/s]
100%|██████████| 100/100 [00:07<00:00, 13.60it/s]


HR: [0.1802474900770488, 0.2679196824655615]
NDCG: [0.16637396513232833, 0.20419099529630638]


100%|██████████| 8566/8566 [00:00<00:00, 12710.58it/s]
100%|██████████| 100/100 [00:06<00:00, 14.54it/s]


HR: [0.1873686668223208, 0.2676862012607985]
NDCG: [0.1720302355169777, 0.20634340136345652]


100%|██████████| 8566/8566 [00:00<00:00, 12702.77it/s]
100%|██████████| 100/100 [00:07<00:00, 14.25it/s]


HR: [0.17978052766752275, 0.25986458090123743]
NDCG: [0.16538997694846488, 0.19996339647034578]


100%|██████████| 8566/8566 [00:00<00:00, 12768.63it/s]
100%|██████████| 100/100 [00:07<00:00, 13.55it/s]


HR: [0.1898202194723325, 0.2646509455988793]
NDCG: [0.175946694527612, 0.20813321319245412]


100%|██████████| 8566/8566 [00:00<00:00, 12762.05it/s]
100%|██████████| 100/100 [00:07<00:00, 14.02it/s]


HR: [0.1772122344151296, 0.25881391547980387]
NDCG: [0.16239082888412387, 0.19756730507947062]


100%|██████████| 8566/8566 [00:00<00:00, 12793.08it/s]
100%|██████████| 100/100 [00:07<00:00, 14.22it/s]


HR: [0.176161568993696, 0.2563623628297922]
NDCG: [0.16142633442507986, 0.1955403838199682]


100%|██████████| 8566/8566 [00:00<00:00, 12673.22it/s]
100%|██████████| 100/100 [00:07<00:00, 14.22it/s]


HR: [0.17441045995797338, 0.2662853140322204]
NDCG: [0.15752095132961802, 0.19672059893482843]


100%|██████████| 8566/8566 [00:00<00:00, 12665.22it/s]
100%|██████████| 100/100 [00:07<00:00, 14.22it/s]


HR: [0.1887695540508989, 0.2661685734298389]
NDCG: [0.17304335341480273, 0.20615115345651625]


100%|██████████| 8566/8566 [00:00<00:00, 12682.09it/s]
100%|██████████| 100/100 [00:07<00:00, 13.87it/s]


HR: [0.18013074947466728, 0.2548447349988326]
NDCG: [0.16647265193592073, 0.1984678470389798]


100%|██████████| 8566/8566 [00:00<00:00, 12633.89it/s]
100%|██████████| 100/100 [00:07<00:00, 14.22it/s]


HR: [0.16647209899603083, 0.24725659584403456]
NDCG: [0.15264165953250514, 0.18733280345696643]
HR@1, HR@5:  0.18 ± 0.005   0.261 ± 0.005   
NDCG@1, NDCG@5: 0.165 ± 0.005   0.2 ± 0.005   