In [1]:
import torch as th 
import torch.nn as nn
import pandas as pd
import numpy as np
import json
import copy
import os
import warnings

from DGL_LFM1b.data_utils import *
from utils.utils import convert_to_gpu
from dgl.data.utils import load_graphs
from utils.LinkScorePredictor import LinkScorePredictor
from model.R_HGNN import R_HGNN
from tqdm import tqdm
from IPython.core.display import HTML
HTML("""
.output_png {
    display: table-cell;
    text-align: right;
    vertical-align: middle;
}
""")

%load_ext autoreload
%autoreload 2

2022-07-08 23:24:28.952342: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-08 23:24:28.952390: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Data Manipulation functions

# Setting Arugments and initializing graph

# Recommendation with Link Prediction Models

In [2]:
from DGL_LFM1b.data_utils import get_fileSize, get_col_names, setType, isValid, get_preprocessed_ids

def get_ids_from_txtFile(path_txt_file, type ,ids):
    chunksize=1000000
    chunks=[]
    size = get_fileSize(path_txt_file)
    col_names=get_col_names(type)
    df_chunks = pd.read_csv(path_txt_file, names=col_names, sep="\t", encoding='utf8', header = 0, chunksize=chunksize)
    for chunk in tqdm(df_chunks, total=size//chunksize):
        for col in chunk.columns:
            try:
                chunk[col]=chunk[col].apply(lambda x: setType(x, col))
            except:
                chunk[col]=chunk[col].apply(lambda x: isValid(x, col))
                chunk[col]=chunk[col].apply(lambda x: setType(x, col))
        chunk = chunk[chunk[col_names[0]].isin(ids)] 
        chunks.append(chunk)
    df = pd.concat(chunks)
    # print(f'{type} df has shpe:',df.shape)
    return df

def topK_playcounts(path_to_txt_data, series, type, k, user_id):
    val_count_pairs=series.value_counts().head(k)
    ids=list(val_count_pairs.keys())
    type_df = get_ids_from_txtFile(path_to_txt_data, type, ids)
    print(f'Top {k} {type}s for user {user_id}')
    print('ID       PLAYCOUNT      NAME')
    for id, playcount in val_count_pairs.items():
        row=type_df.loc[type_df[type+'_id'] == id]
        name = row[type+'_name'].item()
        print(f'{id}        {playcount}         {name} ')

def topK_recommendations(path_to_txt_data, ids, type, k, user_id, artist_path_to_txt_data=None):
    type_df = get_ids_from_txtFile(path_to_txt_data, type, ids)
    if artist_path_to_txt_data != None:
        artist_ids=[type_df.loc[type_df[type+'_id'] == id]['artist_id'].item() for id in ids]
        artist_df= get_ids_from_txtFile(artist_path_to_txt_data, type='artist', ids=artist_ids)
        artist_name_mapping={id: artist_df.loc[artist_df['artist_id'] == id]['artist_name'].item() for id in artist_ids}

    print(f'Top {k} {type} recommendations for user #{user_id}')
    if artist_path_to_txt_data != None:
        print('ID           ARTIST_NAME             NAME')
    else:
        print('ID             NAME')
    for id in ids:
        row=type_df.loc[type_df[type+'_id'] == id]
        name = row[type+'_name'].item()
        try:
            artist_id = row['artist_id'].item()
            print(f'{id}        {artist_name_mapping[artist_id]}        {name}')
        except:
            print(f'{id}  {name}')

def displayTopKRecommendations(graph, model, type, sampled_edge_type, k, user_id, typeFile_path, userFile_path, artists_path, albums_path, tracks_path, les_path, device, artist_path_to_txt_data=None):
    input_features = {(stype, etype, dtype): graph.srcnodes[dtype].data['feat'] for stype, etype, dtype in graph.canonical_etypes}
    nodes_representation, _ = model[0].inference(graph, copy.deepcopy(input_features), device=device)
    user_nodes_representation=nodes_representation['user']
    type_nodes_representation=nodes_representation[type]
    # C = torch.mm(A, B.T)  # same as C = A @ B.T
    listen_to_type_likelihood = C = th.mm(user_nodes_representation, type_nodes_representation.T)

    print(f'listen_to_{type}_likelihood',listen_to_type_likelihood.shape)
    user_type_recommendations={}
    for u_id, row in enumerate(tqdm(listen_to_type_likelihood, total=int(listen_to_type_likelihood.shape[0]))):
        for id, _ in enumerate(row):
            try:
                graph.edge_id(u_id,id, etype=sampled_edge_type)
            except:
                if u_id in user_type_recommendations.keys():
                    user_type_recommendations[u_id].append((u_id, id, listen_to_type_likelihood[u_id,id].item()))
                else:
                    user_type_recommendations[u_id]=list()
                    user_type_recommendations[u_id].append((u_id, id, listen_to_type_likelihood[u_id,id].item()))

    rev_user_mapping = get_id_mapping(path_to_file=userFile_path, type='user', reverse=True)
    rev_type_mapping = get_id_mapping(path_to_file=typeFile_path, type=type, reverse=True)
    user_listens=get_ids_from_txtFile(path_txt_file=les_path, type='le' , ids=[rev_user_mapping[user_id]])
    topK_playcounts(path_to_txt_data=artists_path, series=user_listens['artist_id'], type='artist', k=k, user_id=user_id)
    topK_playcounts(path_to_txt_data=albums_path, series=user_listens['album_id'], type='album', k=k, user_id=user_id)
    topK_playcounts(path_to_txt_data=tracks_path, series=user_listens['track_id'], type='track', k=k, user_id=user_id)
    user_type_recommendations={key: sorted(value, key=lambda x: x[2], reverse=True)[:k] for key, value in  user_type_recommendations.items()}
    user_type_recommendations=[rev_type_mapping[id] for _, id, _ in user_type_recommendations[user_id]]
    if artist_path_to_txt_data !=None:
        topK_recommendations(path_to_txt_data=typeFile_path, ids=user_type_recommendations, type=type, k=k, user_id=user_id, artist_path_to_txt_data=artists_path)
    else:
        topK_recommendations(path_to_txt_data=typeFile_path, ids=user_type_recommendations, type=type, k=k, user_id=user_id)

def get_result_folder_path(root, date, sample_edge_type):
    return f'{root}/lfm1b/{date}/{sample_edge_type}'

def get_result_folder_args(root, date, sample_edge_type):
    return get_result_folder_path(root, date, sample_edge_type)+'/args.json'

def get_result_folder_model_state(root, date, sample_edge_type):
    return get_result_folder_path(root, date, sample_edge_type)+f'/{sample_edge_type}.pkl'

def get_result_folder_metrics(root, date, sample_edge_type):
    return get_result_folder_path(root, date, sample_edge_type)+f'/metrics.pkl'


def get_id_mapping(path_to_file, type, reverse=False):
    ids=get_preprocessed_ids(path_to_file, return_unique_ids=False, type=type, id_list=get_col_names(type))[f'{type}_id']
    if reverse:
        return {i: row for i, row in enumerate(ids)}
    else:
        return {row: i for i, row in enumerate(ids)}


def build_model(data_post_path,  date, args, sample_edge_type, root='results/'):
    model_state_path=get_result_folder_model_state(root, date, sample_edge_type)
    glist,_=load_graphs(f'{data_post_path}/lastfm1b.bin')
    hg=glist[0]
    r_hgnn = R_HGNN(graph=hg,
                input_dim_dict={ntype: hg.nodes[ntype].data['feat'].shape[1] for ntype in hg.ntypes},
                hidden_dim=args['hidden_dim'], 
                relation_input_dim=args['rel_input_dim'],
                relation_hidden_dim=args['rel_hidden_dim'],
                num_layers=args['num_layers'], 
                n_heads=args['num_heads'], 
                dropout=args['dropout'],
                residual=args['residual'], 
                norm=args['norm'])
    link_scorer = LinkScorePredictor(args['hidden_dim'] * args['num_heads'])

    model = nn.Sequential(r_hgnn, link_scorer)
    model = convert_to_gpu(model, device=args['device'])
    model.load_state_dict(th.load(model_state_path, map_location=args['device']))
    return model

def get_file_pre_path(data_pre_path, type):
    print('type:',type)
    if type=='user':
        return data_pre_path+'LFM-1b_users.txt'
    elif type=='album':
        return data_pre_path+'LFM-1b_albums.txt'
    elif type=='artist':
        return data_pre_path+'LFM-1b_artists.txt'
    elif type=='track':
        return data_pre_path+'LFM-1b_tracks.txt'
    elif type=='le':
        return data_pre_path+'LFM-1b_LEs.txt'
    elif type=='genre':
        return data_pre_path+'genres_allmusic.txt'
    else:
        raise Exception('bad "type" parameter in get_col_names')



In [11]:
from DGL_LFM1b.DGL_LFM1b import LFM1b
dataset=LFM1b(tracks=False,albums=False, overwrite_processed=True)


 Processing LFM1b
	 Loading Mapping Data from users.txt
---------------------------- Loading Preprocessed user file  ----------------------------


1it [00:00, 110.85it/s]


	 Loading Mapping Data from allmusic.txt
---------------------------- Loading Preprocessed genre file  ----------------------------


1it [00:00, 163.82it/s]

	 Loading Mapping Data from artists.txt
---------------------------- Loading Preprocessed artist file  ----------------------------



1it [00:00, 63.16it/s]


remapping_ids


100%|██████████| 8964/8964 [00:00<00:00, 825616.31it/s]


	 Loading Graph Data from allmusic.txt


100%|██████████| 8964/8964 [00:00<00:00, 3513479.21it/s]


remapping_ids


100%|██████████| 30118/30118 [00:00<00:00, 1215612.77it/s]


loading artist listen events for every user


1it [00:00,  6.29it/s]


number of user artist edges: 108476
	 Creating DGL HeteroGraph from Graph Data
Graph(num_nodes={'artist': 8964, 'genre': 21, 'user': 10},
      num_edges={('artist', 'artist_listened_by', 'user'): 108476, ('artist', 'in_genre', 'genre'): 30118, ('genre', 'is_genre_of', 'artist'): 30118, ('user', 'listened_to_artist', 'artist'): 108476},
      metagraph=[('artist', 'user', 'artist_listened_by'), ('artist', 'genre', 'in_genre'), ('user', 'artist', 'listened_to_artist'), ('genre', 'artist', 'is_genre_of')])
	 Creating metapath2vec node embeddings
using metapath [('user', 'listened_to_artist', 'artist'), ('artist', 'in_genre', 'genre'), ('genre', 'is_genre_of', 'artist'), ('artist', 'artist_listened_by', 'user')]
training...
 Epoch: 05 of 6, Step: 001/1, Loss: 2.7770 loading...
saved! embedding_dict
	 Loading features from artists.txt
	 Loading features from allmusic.txt
	 Loading features from users.txt
loading artist listen events for every user


1it [00:00,  5.63it/s]


saving graph...
loading graph memory size....
graph is 19897 bytes large
saved!


In [8]:
def preform_recommendations(graph, data_pre_path, data_post_path,  date, sample_edge_type, root, u_id, k):
    args_path=get_result_folder_args(root, date, sample_edge_type)
    args = json.load(open(args_path))
    print('args',args)
    model=build_model(data_post_path,  date, args, sample_edge_type, root='./results/')

    displayTopKRecommendations(
    graph=graph,
    model=model,
    type='artist',
    sampled_edge_type=sample_edge_type,
    k=k,
    user_id=u_id,
    typeFile_path=get_file_pre_path(data_pre_path,type='artist'),
    userFile_path=data_pre_path+'LFM-1b_users.txt',
    artists_path=data_pre_path+'LFM-1b_artists.txt',
    albums_path=data_pre_path+'LFM-1b_albums.txt',
    tracks_path=data_pre_path+'LFM-1b_tracks.txt',
    les_path=data_pre_path+'LFM-1b_LEs.txt',
    device=args['device']
    )

In [13]:
uer_id=0
k=10
graph=dataset[0]
data_root_path='results'
data_pre_path='data/DGL_LFM1b/preprocessed/'
data_post_path='data/DGL_LFM1b/processed/'
date='06_07_2022_21:24:53'
sample_edge_type='listened_to_artist'

preform_recommendations(graph, data_pre_path, data_post_path,  date, sample_edge_type, data_root_path, uer_id, k)

args {'seed': 0, 'sample_edge_rate': 0.1, 'num_layers': 2, 'batch_size': 512, 'num_neg_samples': 5, 'node_min_neighbors': 10, 'shuffle': True, 'drop_last': False, 'num_workers': 4, 'hidden_dim': 32, 'rel_input_dim': 12, 'rel_hidden_dim': 32, 'num_heads': 8, 'dropout': 0.5, 'residual': True, 'norm': True, 'opt': 'adam', 'weight_decay': 0.0, 'epochs': 100, 'patience': 25, 'split_by_users': True, 'device': 'cuda', 'artists': True, 'albums': False, 'tracks': False, 'playcount_weight': False, 'norm_playcount_weight': False, 'metapath2vec': True, 'emb_dim': 32, 'walk_length': 64, 'context_size': 7, 'walks_per_node': 3, 'metapath2vec_epochs_batch_size': 128, 'learning_rate': 0.001, 'metapath2vec_epochs': 5, 'logs': 100, 'n_users': 10, 'popular_artists': True, 'model_parameters': 1775158}
type: artist


inference for the 7-th batch in model 0-th layer: 100%|███████████████████████████████████| 8/8 [00:00<00:00, 16.58it/s]
inference for the 7-th batch in model 1-th layer: 100%|███████████████████████████████████| 8/8 [00:00<00:00, 16.48it/s]


listen_to_artist_likelihood torch.Size([10, 8964])


100%|██████████| 10/10 [00:34<00:00,  3.47s/it]


---------------------------- Loading Preprocessed user file  ----------------------------


1it [00:00, 153.32it/s]


---------------------------- Loading Preprocessed artist file  ----------------------------


1it [00:00, 93.62it/s]
1it [00:00,  3.24it/s]
1it [00:00, 66.83it/s]


Top 10 artists for user 0
ID       PLAYCOUNT      NAME
10624        176         Discharge 
811        142         Miles Davis 
259        101         R.E.M. 
24082        100         Scritti Politti 
7165        82         Elis Regina 
722        72         The Beach Boys 
6100        69         A Tribe Called Quest 
329        69         The Clash 
2646        69         Earth, Wind & Fire 
23154        62         Meat Puppets 


5it [00:07,  1.48s/it]                       


Top 10 albums for user 0
ID       PLAYCOUNT      NAME
21632        118         Hear Nothing See Nothing Say Nothing 
8457        43         Greatest Hits 
5276240        41         Belly Of The Sun 
34551        39         Los Angeles 
787657        36         Moondog (The Viking Of Sixth Avenue) 
676735        35         Provision 
12424198        34         Never Again - Discharge 
12424361        33         Sweet Child - Pentangle 
3675        30         Greatest Hits 
11717875        29         Midnight Blue 


9it [00:15,  1.67s/it]                       


Top 10 tracks for user 0
ID       PLAYCOUNT      NAME
623707        19         Waters Of March 
299489        17         In The Eye 
900800        15         We Are Family 
25198490        14         流れるは飛ぶに非ず 
1739282        14         All That We Are 
190224        14         Nine Out Of Ten 
959150        12         Piss Factory 
2677719        12         蒼氓 
60125        11         Hear Nothing See Nothing Say Nothing 
3709266        11         Tem Boi Na Linha 


1it [00:00, 60.88it/s]

Top 10 artist recommendations for user #0
ID             NAME
258  Mr. Big
6199  The Dillinger Escape Plan
40419  Brazilian Girls
4173  Young the Giant
36499  The Acacia Strain
9518  Jay-Z & Kanye West
26866  Kryptic Minds
543  Band of Horses
3054  Alina Devecerski
26144  MellowHype



