In [1]:
import torch as th 
import torch.nn as nn
import pandas as pd
import numpy as np
import json
import copy
import os
import warnings

from DGL_LFM1b.data_utils import *
from utils.utils import convert_to_gpu
from dgl.data.utils import load_graphs
from utils.LinkScorePredictor import LinkScorePredictor
from model.R_HGNN import R_HGNN
from tqdm import tqdm
from IPython.core.display import HTML
# Right-align PNG outputs in the rendered notebook.
# The rule has to be wrapped in <style> tags (otherwise the browser shows the
# CSS as plain text) and has to be passed to display() explicitly, because an
# HTML object is only auto-rendered when it is the last expression of a cell.
from IPython.display import display
display(HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: right;
    vertical-align: middle;
}
</style>
"""))

%load_ext autoreload
%autoreload 2

2022-07-31 21:03:19.603063: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-31 21:03:19.603088: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Setting Arguments and Initializing the Graph

# Recommendation with Link Prediction Models

In [7]:
from DGL_LFM1b.data_utils import get_fileSize, get_col_names, setType, isValid, get_preprocessed_ids

def get_ids_from_txtFile(path_txt_file, type, ids):
    """Read a tab-separated file in chunks and keep the rows whose first column is in ``ids``.

    Args:
        path_txt_file: Path of the tab-separated text file; its first line is
            treated as a header and replaced by the names from ``get_col_names(type)``.
        type: Entity kind forwarded to ``get_col_names`` to obtain the column names.
        ids: Collection of values; a row is kept when its first column is in it.

    Returns:
        A ``pandas.DataFrame`` with the matching rows. When the reader yields no
        chunk at all, an empty frame with the expected columns is returned
        instead of letting ``pd.concat`` fail on an empty list.
    """
    chunksize = 1000000
    size = get_fileSize(path_txt_file)
    col_names = get_col_names(type)
    # Round up so a trailing partial chunk is counted in the progress bar
    # (assumes get_fileSize returns the number of rows -- TODO confirm).
    n_chunks = (size + chunksize - 1) // chunksize
    df_chunks = pd.read_csv(path_txt_file, names=col_names, sep="\t", encoding='utf8', header=0, chunksize=chunksize)

    chunks = []
    for chunk in tqdm(df_chunks, total=n_chunks):
        for col in chunk.columns:
            try:
                chunk[col] = chunk[col].apply(lambda x: setType(x, col))
            except Exception:
                # Conversion failed on at least one value: pass the column
                # through isValid first, then retry the conversion.
                chunk[col] = chunk[col].apply(lambda x: isValid(x, col))
                chunk[col] = chunk[col].apply(lambda x: setType(x, col))
        # Filter per chunk so only matching rows are kept in memory.
        chunks.append(chunk[chunk[col_names[0]].isin(ids)])

    if not chunks:
        return pd.DataFrame(columns=col_names)
    return pd.concat(chunks)

def topK_playcounts(path_to_txt_data, series, type, k, user_id):
    """Print the ``k`` most frequent values of ``series`` together with their names.

    Args:
        path_to_txt_data: File passed to ``get_ids_from_txtFile`` to look up names.
        series: pandas Series of ids; frequencies come from ``value_counts()``.
        type: Entity kind; the columns ``<type>_id`` and ``<type>_name`` are read.
        k: Number of entries to print.
        user_id: Only used in the printed heading.

    Raises:
        ValueError: from ``.item()`` when an id does not match exactly one row.
    """
    top_counts = series.value_counts().head(k)
    type_df = get_ids_from_txtFile(path_to_txt_data, type, list(top_counts.keys()))
    id_col = type + '_id'
    name_col = type + '_name'

    print(f'Top {k} {type}s for user {user_id}')
    print('ID       PLAYCOUNT      NAME')
    for item_id, plays in top_counts.items():
        matches = type_df.loc[type_df[id_col] == item_id]
        item_name = matches[name_col].item()
        print(f'{item_id}        {plays}         {item_name} ')

def topK_recommendations(path_to_txt_data, ids, type, k, user_id, artist_path_to_txt_data=None):
    """Print the recommended items, optionally with the name of each item's artist.

    Args:
        path_to_txt_data: File passed to ``get_ids_from_txtFile`` to look up the items.
        ids: Recommended ids, printed in the given order.
        type: Entity kind; the columns ``<type>_id`` and ``<type>_name`` are read.
        k: Only used in the printed heading.
        user_id: Only used in the printed heading.
        artist_path_to_txt_data: Optional artists file. When given, the
            ``artist_id`` of every item is resolved to an artist name and an
            extra column is printed.

    Raises:
        ValueError: from ``.item()`` when an id does not match exactly one row.
    """
    type_df = get_ids_from_txtFile(path_to_txt_data, type, ids)
    id_col = type + '_id'
    name_col = type + '_name'

    # None means "no artist column requested"; the print loop branches on it
    # explicitly instead of relying on a NameError being swallowed.
    artist_name_mapping = None
    if artist_path_to_txt_data is not None:
        artist_ids = [type_df.loc[type_df[id_col] == item_id]['artist_id'].item() for item_id in ids]
        artist_df = get_ids_from_txtFile(artist_path_to_txt_data, type='artist', ids=artist_ids)
        artist_name_mapping = {
            artist_id: artist_df.loc[artist_df['artist_id'] == artist_id]['artist_name'].item()
            for artist_id in artist_ids
        }

    print(f'Top {k} {type} recommendations for user #{user_id}')
    if artist_name_mapping is not None:
        print('ID           ARTIST_NAME             NAME')
    else:
        print('ID             NAME')

    for item_id in ids:
        row = type_df.loc[type_df[id_col] == item_id]
        name = row[name_col].item()
        if artist_name_mapping is None:
            print(f'{item_id}  {name}')
        else:
            artist_id = row['artist_id'].item()
            print(f'{item_id}        {artist_name_mapping[artist_id]}        {name}')

def displayTopKRecommendations(graph, model, type, sampled_edge_type, k, user_id, typeFile_path, userFile_path, artists_path, albums_path, tracks_path, les_path, device, artist_path_to_txt_data=None):
    """Print a user's listening history summary and the top-``k`` new ``type`` recommendations.

    Node representations are obtained from ``model[0].inference``; the score of
    a (user, item) pair is the dot product of the two representations. Items
    the user is already connected to through ``sampled_edge_type`` are excluded.

    Args:
        graph: Heterogeneous DGL graph whose nodes carry a ``'feat'`` feature.
        model: Indexable model whose first element exposes
            ``inference(graph, features, device=...)``.
        type: Node type to recommend; key into the returned representations.
        sampled_edge_type: Edge type from ``'user'`` to ``type`` marking existing links.
        k: Number of history entries and recommendations to print.
        user_id: Index of the user node; also the key into the reverse user mapping.
        typeFile_path: File with the ids and names of the ``type`` entities.
        userFile_path: File with the user ids.
        artists_path, albums_path, tracks_path: Files used to name the user's top plays.
        les_path: Listening-events file.
        device: Device forwarded to ``inference``.
        artist_path_to_txt_data: When not ``None``, artist names are added to the
            printed recommendations. Note that ``artists_path`` (not this value)
            is what gets forwarded, exactly as before.
    """
    input_features = {(stype, etype, dtype): graph.srcnodes[dtype].data['feat'] for stype, etype, dtype in graph.canonical_etypes}

    # Inference only: switch off dropout and skip autograd bookkeeping.
    model.eval()
    with th.no_grad():
        nodes_representation, _ = model[0].inference(graph, copy.deepcopy(input_features), device=device)
    user_nodes_representation = nodes_representation['user']
    type_nodes_representation = nodes_representation[type]
    # C = torch.mm(A, B.T)  # same as C = A @ B.T
    listen_to_type_likelihood = th.mm(user_nodes_representation, type_nodes_representation.T)
    print(f'listen_to_{type}_likelihood',listen_to_type_likelihood.shape)

    # Only the requested user's row is needed. Existing links are read in one
    # call instead of probing every (user, item) pair and treating any
    # exception as "no edge".
    already_linked = set(graph.successors(user_id, etype=sampled_edge_type).tolist())
    user_scores = listen_to_type_likelihood[user_id].tolist()
    candidates = [(node_id, score) for node_id, score in enumerate(user_scores) if node_id not in already_linked]
    # Stable sort: ties keep ascending node-id order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_candidates = candidates[:k]

    rev_user_mapping = get_id_mapping(path_to_file=userFile_path, type='user', reverse=True)
    rev_type_mapping = get_id_mapping(path_to_file=typeFile_path, type=type, reverse=True)
    user_listens = get_ids_from_txtFile(path_txt_file=les_path, type='le', ids=[rev_user_mapping[user_id]])
    topK_playcounts(path_to_txt_data=artists_path, series=user_listens['artist_id'], type='artist', k=k, user_id=user_id)
    topK_playcounts(path_to_txt_data=albums_path, series=user_listens['album_id'], type='album', k=k, user_id=user_id)
    topK_playcounts(path_to_txt_data=tracks_path, series=user_listens['track_id'], type='track', k=k, user_id=user_id)

    # Translate graph node indices back to the ids used in the text files.
    recommended_ids = [rev_type_mapping[node_id] for node_id, _ in top_candidates]
    topK_recommendations(
        path_to_txt_data=typeFile_path,
        ids=recommended_ids,
        type=type,
        k=k,
        user_id=user_id,
        artist_path_to_txt_data=artists_path if artist_path_to_txt_data is not None else None,
    )

def get_result_folder_path(root, date, sample_edge_type):
    """Return the run folder ``<root>/lfm1b/<date>/<sample_edge_type>`` as a string."""
    return '{}/lfm1b/{}/{}'.format(root, date, sample_edge_type)

def get_result_folder_args(root, date, sample_edge_type):
    """Return the path of ``args.json`` inside the run folder."""
    run_dir = get_result_folder_path(root, date, sample_edge_type)
    return f'{run_dir}/args.json'

def get_result_folder_model_state(root, date, sample_edge_type):
    """Return the path of the ``<sample_edge_type>.pkl`` file inside the run folder."""
    run_dir = get_result_folder_path(root, date, sample_edge_type)
    return '{}/{}.pkl'.format(run_dir, sample_edge_type)

def get_result_folder_metrics(root, date, sample_edge_type):
    """Return the path of ``metrics.pkl`` inside the run folder."""
    run_dir = get_result_folder_path(root, date, sample_edge_type)
    return f'{run_dir}/metrics.pkl'


def get_id_mapping(path_to_file, type, reverse=False):
    """Build a lookup between the ids of a preprocessed file and their positions.

    Args:
        path_to_file: File handed to ``get_preprocessed_ids``.
        type: Entity kind; the ``<type>_id`` entry of the loaded ids is used.
        reverse: When falsy, map id -> position (for a repeated id the last
            position wins). When truthy, map position -> id.

    Returns:
        A dict in the direction selected by ``reverse``.
    """
    id_column = f'{type}_id'
    ids = get_preprocessed_ids(
        path_to_file,
        return_unique_ids=False,
        type=type,
        id_list=get_col_names(type),
    )[id_column]
    if not reverse:
        return {raw_id: position for position, raw_id in enumerate(ids)}
    return dict(enumerate(ids))


def build_model(data_post_path,  date, args, sample_edge_type, root='results/'):
    """Rebuild the R_HGNN + link scorer pipeline and load its saved weights.

    Args:
        data_post_path: Folder containing ``lastfm1b.bin``; the first graph in
            that file defines the node and edge types of the model.
        date: Run identifier used to locate the result folder.
        args: Mapping of training arguments. ``num_heads``, ``dropout``,
            ``residual``, ``norm`` and ``device`` are required. ``hidden_dim``,
            ``rel_input_dim``, ``rel_hidden_dim`` and ``num_layers`` are used
            when present and otherwise fall back to the previously hard-coded
            values (16, 8, 16, 2).
        sample_edge_type: Edge type the model was trained on; selects the
            checkpoint file.
        root: Root folder of the results tree.

    Returns:
        The ``nn.Sequential(r_hgnn, link_scorer)`` model with the checkpoint
        loaded, switched to evaluation mode.

    Raises:
        RuntimeError: from ``load_state_dict`` when the checkpoint does not
            match the model built from the graph (e.g. different node/edge
            types or feature sizes).
    """
    model_state_path = get_result_folder_model_state(root, date, sample_edge_type)
    glist, _ = load_graphs(f'{data_post_path}/lastfm1b.bin')
    hg = glist[0]

    # Take the architecture from the saved training arguments so the rebuilt
    # model follows the checkpoint instead of duplicated literals.
    hidden_dim = args.get('hidden_dim', 16)
    relation_input_dim = args.get('rel_input_dim', 8)
    relation_hidden_dim = args.get('rel_hidden_dim', 16)
    num_layers = args.get('num_layers', 2)
    # NOTE(review): node input size is kept at 8 as before; presumably it
    # should follow the node feature width used in training -- confirm.
    input_dim = 8

    r_hgnn = R_HGNN(graph=hg,
                input_dim_dict={ntype: input_dim for ntype in hg.ntypes},
                hidden_dim=hidden_dim,
                relation_input_dim=relation_input_dim,
                relation_hidden_dim=relation_hidden_dim,
                num_layers=num_layers,
                n_heads=args['num_heads'],
                dropout=args['dropout'],
                residual=args['residual'],
                norm=args['norm'])
    link_scorer = LinkScorePredictor(hidden_dim * args['num_heads'])

    model = nn.Sequential(r_hgnn, link_scorer)
    model = convert_to_gpu(model, device=args['device'])
    # map_location lets a checkpoint saved on another device be loaded here.
    # NOTE: th.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(th.load(model_state_path, map_location=args['device']))
    # The model is used for inference only, so disable dropout.
    model.eval()
    return model

def get_file_pre_path(data_pre_path, type):
    """Return the path of the preprocessed file for an entity type.

    Args:
        data_pre_path: Prefix the file name is appended to as-is (no separator
            is inserted, so it should end with one).
        type: One of ``'user'``, ``'album'``, ``'artist'``, ``'track'``,
            ``'le'`` or ``'genre'``.

    Returns:
        ``data_pre_path`` followed by the file name for ``type``.

    Raises:
        ValueError: when ``type`` is not one of the supported values.
    """
    file_names = {
        'user': 'LFM-1b_users.txt',
        'album': 'LFM-1b_albums.txt',
        'artist': 'LFM-1b_artists.txt',
        'track': 'LFM-1b_tracks.txt',
        'le': 'LFM-1b_LEs.txt',
        'genre': 'genres_allmusic.txt',
    }
    try:
        file_name = file_names[type]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which are equally unsupported.
        raise ValueError(f'bad "type" parameter in get_file_pre_path: {type!r}') from None
    return data_pre_path + file_name



In [3]:
from DGL_LFM1b.DGL_LFM1b import LFM1b
# Instantiate the LFM1b dataset; the recorded output of this cell is
# "Processing LFM1b". `dataset[0]` is later used as the graph.
dataset=LFM1b()


 Processing LFM1b


In [4]:
def preform_recommendations(graph, data_pre_path, data_post_path,  date, sample_edge_type, root, u_id, k):
    """Load a trained run and print artist recommendations for one user.

    Args:
        graph: Graph the recommendations are computed on.
        data_pre_path: Prefix of the preprocessed text files (file names are
            appended directly, so it should end with a path separator).
        data_post_path: Folder holding the processed graph used to rebuild the model.
        date: Run identifier selecting the result folder.
        sample_edge_type: Edge type the run was trained on.
        root: Root of the results tree; used for both ``args.json`` and the
            model checkpoint.
        u_id: User to recommend for.
        k: Number of entries to print.
    """
    args_path = get_result_folder_args(root, date, sample_edge_type)
    # Close the file deterministically instead of leaking the handle.
    with open(args_path) as args_file:
        args = json.load(args_file)
    print('args',args)
    # Use the caller's root so the arguments and the checkpoint are always
    # read from the same run folder.
    model = build_model(data_post_path, date, args, sample_edge_type, root=root)

    displayTopKRecommendations(
        graph=graph,
        model=model,
        type='artist',
        sampled_edge_type=sample_edge_type,
        k=k,
        user_id=u_id,
        typeFile_path=get_file_pre_path(data_pre_path, type='artist'),
        userFile_path=data_pre_path+'LFM-1b_users.txt',
        artists_path=data_pre_path+'LFM-1b_artists.txt',
        albums_path=data_pre_path+'LFM-1b_albums.txt',
        tracks_path=data_pre_path+'LFM-1b_tracks.txt',
        les_path=data_pre_path+'LFM-1b_LEs.txt',
        device=args['device'],
    )

In [8]:
# Driver cell: choose the user, the run to load and the data folders, then
# print the recommendations.
# NOTE(review): `uer_id` looks like a typo of `user_id`; left unchanged because
# other cells may reference this notebook-level name.
uer_id=0
k=10
graph=dataset[0]
data_root_path='results'
# Both data prefixes end with '/' because file names are appended directly.
data_pre_path='data/DGL_LFM1b/preprocessed/'
data_post_path='data/DGL_LFM1b/processed/'
# `date` and `sample_edge_type` select the result folder
# <root>/lfm1b/<date>/<sample_edge_type> that holds args.json and the checkpoint.
date='31_07_2022_13:10:59'
sample_edge_type='listened_to_artist'

# NOTE(review): the recorded output of this cell is a RuntimeError from
# load_state_dict (unexpected album/track keys and size mismatches), i.e. the
# checkpoint and the model built from the processed graph did not match when
# this was last run -- confirm the graph on disk belongs to this run.
preform_recommendations(graph, data_pre_path, data_post_path,  date, sample_edge_type, data_root_path, uer_id, k)

args {'seed': 0, 'sample_edge_rate': 0.05, 'num_layers': 2, 'batch_size': 512, 'num_neg_samples': 10, 'node_min_neighbors': 10, 'shuffle': True, 'drop_last': False, 'num_workers': 4, 'hidden_dim': 16, 'rel_input_dim': 8, 'rel_hidden_dim': 16, 'num_heads': 8, 'dropout': 0.5, 'residual': True, 'norm': True, 'opt': 'adam', 'weight_decay': 0.0, 'epochs': 100, 'patience': 25, 'split_by_users': True, 'device': 'cuda', 'artists': True, 'albums': True, 'tracks': True, 'playcount_weight': False, 'norm_playcount_weight': False, 'metapath2vec': True, 'emb_dim': 8, 'walk_length': 64, 'context_size': 7, 'walks_per_node': 1, 'metapath2vec_epochs_batch_size': 512, 'learning_rate': 0.001, 'metapath2vec_epochs': 5, 'logs': 100, 'n_users': 25, 'popular_artists': True, 'model_parameters': 1035882}


RuntimeError: Error(s) in loading state_dict for Sequential:
	Unexpected key(s) in state_dict: "0.relation_embedding.album_listened_by", "0.relation_embedding.listened_to_album", "0.relation_embedding.listened_to_track", "0.relation_embedding.preformed", "0.relation_embedding.preformed_by", "0.relation_embedding.produced", "0.relation_embedding.produced_by", "0.relation_embedding.track_listened_by", "0.projection_layer.album.weight", "0.projection_layer.album.bias", "0.projection_layer.track.weight", "0.projection_layer.track.bias", "0.layers.0.node_transformation_weight.album", "0.layers.0.node_transformation_weight.track", "0.layers.0.relation_transformation_weight.album_listened_by", "0.layers.0.relation_transformation_weight.listened_to_album", "0.layers.0.relation_transformation_weight.listened_to_track", "0.layers.0.relation_transformation_weight.preformed", "0.layers.0.relation_transformation_weight.preformed_by", "0.layers.0.relation_transformation_weight.produced", "0.layers.0.relation_transformation_weight.produced_by", "0.layers.0.relation_transformation_weight.track_listened_by", "0.layers.0.relation_propagation_layer.album_listened_by.weight", "0.layers.0.relation_propagation_layer.album_listened_by.bias", "0.layers.0.relation_propagation_layer.produced_by.weight", "0.layers.0.relation_propagation_layer.produced_by.bias", "0.layers.0.relation_propagation_layer.preformed.weight", "0.layers.0.relation_propagation_layer.preformed.bias", "0.layers.0.relation_propagation_layer.produced.weight", "0.layers.0.relation_propagation_layer.produced.bias", "0.layers.0.relation_propagation_layer.preformed_by.weight", "0.layers.0.relation_propagation_layer.preformed_by.bias", "0.layers.0.relation_propagation_layer.track_listened_by.weight", "0.layers.0.relation_propagation_layer.track_listened_by.bias", "0.layers.0.relation_propagation_layer.listened_to_album.weight", "0.layers.0.relation_propagation_layer.listened_to_album.bias", "0.layers.0.relation_propagation_layer.listened_to_track.weight", 
"0.layers.0.relation_propagation_layer.listened_to_track.bias", "0.layers.0.res_fc.album.weight", "0.layers.0.res_fc.album.bias", "0.layers.0.res_fc.track.weight", "0.layers.0.res_fc.track.bias", "0.layers.0.residual_weight.album", "0.layers.0.residual_weight.track", "0.layers.0.layer_norm.album.weight", "0.layers.0.layer_norm.album.bias", "0.layers.0.layer_norm.track.weight", "0.layers.0.layer_norm.track.bias", "0.layers.0.relations_crossing_attention_weight.album_listened_by", "0.layers.0.relations_crossing_attention_weight.listened_to_album", "0.layers.0.relations_crossing_attention_weight.listened_to_track", "0.layers.0.relations_crossing_attention_weight.preformed", "0.layers.0.relations_crossing_attention_weight.preformed_by", "0.layers.0.relations_crossing_attention_weight.produced", "0.layers.0.relations_crossing_attention_weight.produced_by", "0.layers.0.relations_crossing_attention_weight.track_listened_by", "0.layers.1.node_transformation_weight.album", "0.layers.1.node_transformation_weight.track", "0.layers.1.relation_transformation_weight.album_listened_by", "0.layers.1.relation_transformation_weight.listened_to_album", "0.layers.1.relation_transformation_weight.listened_to_track", "0.layers.1.relation_transformation_weight.preformed", "0.layers.1.relation_transformation_weight.preformed_by", "0.layers.1.relation_transformation_weight.produced", "0.layers.1.relation_transformation_weight.produced_by", "0.layers.1.relation_transformation_weight.track_listened_by", "0.layers.1.relation_propagation_layer.album_listened_by.weight", "0.layers.1.relation_propagation_layer.album_listened_by.bias", "0.layers.1.relation_propagation_layer.produced_by.weight", "0.layers.1.relation_propagation_layer.produced_by.bias", "0.layers.1.relation_propagation_layer.preformed.weight", "0.layers.1.relation_propagation_layer.preformed.bias", "0.layers.1.relation_propagation_layer.produced.weight", "0.layers.1.relation_propagation_layer.produced.bias", 
"0.layers.1.relation_propagation_layer.preformed_by.weight", "0.layers.1.relation_propagation_layer.preformed_by.bias", "0.layers.1.relation_propagation_layer.track_listened_by.weight", "0.layers.1.relation_propagation_layer.track_listened_by.bias", "0.layers.1.relation_propagation_layer.listened_to_album.weight", "0.layers.1.relation_propagation_layer.listened_to_album.bias", "0.layers.1.relation_propagation_layer.listened_to_track.weight", "0.layers.1.relation_propagation_layer.listened_to_track.bias", "0.layers.1.res_fc.album.weight", "0.layers.1.res_fc.album.bias", "0.layers.1.res_fc.track.weight", "0.layers.1.res_fc.track.bias", "0.layers.1.residual_weight.album", "0.layers.1.residual_weight.track", "0.layers.1.layer_norm.album.weight", "0.layers.1.layer_norm.album.bias", "0.layers.1.layer_norm.track.weight", "0.layers.1.layer_norm.track.bias", "0.layers.1.relations_crossing_attention_weight.album_listened_by", "0.layers.1.relations_crossing_attention_weight.listened_to_album", "0.layers.1.relations_crossing_attention_weight.listened_to_track", "0.layers.1.relations_crossing_attention_weight.preformed", "0.layers.1.relations_crossing_attention_weight.preformed_by", "0.layers.1.relations_crossing_attention_weight.produced", "0.layers.1.relations_crossing_attention_weight.produced_by", "0.layers.1.relations_crossing_attention_weight.track_listened_by", "0.node_transformation_weight.album_listened_by", "0.node_transformation_weight.listened_to_album", "0.node_transformation_weight.listened_to_track", "0.node_transformation_weight.preformed", "0.node_transformation_weight.preformed_by", "0.node_transformation_weight.produced", "0.node_transformation_weight.produced_by", "0.node_transformation_weight.track_listened_by", "0.relation_transformation_weight.album_listened_by", "0.relation_transformation_weight.listened_to_album", "0.relation_transformation_weight.listened_to_track", "0.relation_transformation_weight.preformed", 
"0.relation_transformation_weight.preformed_by", "0.relation_transformation_weight.produced", "0.relation_transformation_weight.produced_by", "0.relation_transformation_weight.track_listened_by". 
	size mismatch for 0.projection_layer.artist.weight: copying a param with shape torch.Size([128, 8]) from checkpoint, the shape in current model is torch.Size([128, 32]).
	size mismatch for 0.projection_layer.genre.weight: copying a param with shape torch.Size([128, 8]) from checkpoint, the shape in current model is torch.Size([128, 32]).
	size mismatch for 0.projection_layer.user.weight: copying a param with shape torch.Size([128, 8]) from checkpoint, the shape in current model is torch.Size([128, 32]).