In [26]:
%load_ext autoreload
%autoreload 2
import os, sys, re, datetime, random, gzip, json, copy
from pathlib import Path
import networkx as nx
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
from scipy import spatial
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt
PROJ_PATH = Path(os.path.join(re.sub("/CTGCN.*$", '', os.getcwd()), 'CTGCN'))
sys.path.insert(1, str(str(PROJ_PATH.parents[0] / 'DySubG/src/')))
from ranking import Evaluation
# exec(open(str(PROJ_PATH.parents[0] / 'DySubG/src/ranking.py')).read())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
def make_prediction(user_idx, movie_idx, exp='movielens', method='GCN', num_time_steps=5):
    node_embedding = {}
    nodes = pd.read_csv(f'./data/{exp}/nodes_set/nodes.csv', names=['nodes'])['nodes'].values
    for time_id in range(num_time_steps-2, num_time_steps):
        embs = pd.read_csv(
            './data/{}/2.embedding/{}/{:02d}.csv'.format(exp, method, time_id), index_col=0, sep='\t').values
        dict_embs = {k:v for k,v in zip(nodes, embs)}
        node_embedding[time_id] = dict_embs

    time_id = num_time_steps - 1
    k = 20
    pred = []
    for i, uid in enumerate(user_idx):
        if i % 10 == 0: print(i)
        distances = []
        user_emb = node_embedding[time_id][uid]
        for mid in movie_idx:
            movie_emb = node_embedding[time_id][mid]
            dist = spatial.distance.cosine(user_emb, movie_emb)
            distances.append((uid, mid, dist))
        distances.sort(key=lambda tup: tup[2])
        pred += distances[:k]
    pd_pred = pd.DataFrame(pred, columns=['user_id', 'movie_id', 'rank'])
    
    return pd_pred

def eval_ranking(pd_pred, pd_true):
    predicted_indices = pd_pred.sort_values(
        ['user_id', 'rank']).groupby('user_id').agg({'movie_id': list})['movie_id'].to_list()
    true_indices = pd_true.sort_values(
        ['target_id', 'source_id']).groupby('target_id').agg({'source_id': list})['source_id'].to_list()
    eval_agent = Evaluation(predicted_indices, true_indices)
    return eval_agent.result

def print_report(exp='movielens', methods=[], selected_methods=[]):
    if len(methods) == 0:
        methods = [
            'GCN', 'GAT', 'SAGE', 'GIN', 
            'TgGCN', 'TgGAT', 'TgSAGE', 'TgGIN', 
            'TIMERS', 'DynAE', 'DynRNN', 'DynAERNN', 'DynGEM', 
            'VGRNN', 'EvolveGCN', 'CTGCN-C',
        ]
    if len(selected_methods) == 0:
        selected_methods = [
            'GCN', 'TgGAT', 'TgSAGE', 'TgGIN', 'DynAE', 
            'DynRNN', 'DynAERNN', 'DynGEM', 'EvolveGCN', 
            'TIMERS', 'CTGCN-C']
        
    try:
        pd_edges = pd.read_csv(f'./data/{exp}/0.input/temporal_edge_list.txt', sep=' ', names=['source_id', 'target_id', 'time_id'])
    except:
        pd_edges = pd.read_csv(f'./data/{exp}/0.input/edges.csv')
    
    data = pd.read_pickle(f'./data/{exp}/0.input/data.pkl')
    entity_mapping = pd.read_pickle(f'./data/{exp}/0.input/entity_mapping.pkl')
    num_time_steps = pd_edges['time_id'].max() + 1
    ground_truth = pd.read_csv(f'./data/{exp}/0.input/ground_truth.csv')
    user_idx = sorted(ground_truth['target_id'].unique())
    movie_idx = entity_mapping['movie'].values()
    
    res = []
    for method in methods:
        print(method)
        if os.path.exists(f'./data/{exp}/0.input/{method}_pred.csv'):
            pd_pred = pd.read_csv(f'./data/{exp}/0.input/{method}_pred.csv')
        else:
            pd_pred = make_prediction(user_idx, movie_idx, exp, method, num_time_steps)
            pd_pred.to_csv(f'./data/{exp}/0.input/{method}_pred.csv', index=False)
        
        results = eval_ranking(pd_pred, ground_truth)
        tmp = results[results['k']==20]
        tmp['method'] = method
        res.append(tmp)
    
    df = pd.concat(res)

    print('Full report')
    display(df)

    print('Selected methods')
    display(df['method'].isin(selected_methods))
    return df

In [31]:
report = print_report('movielens', ['GCN'])

GCN
Full report


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,k,recall,mrr,map,ndcg,method
20,20,0.054771,0.231921,0.029417,0.118989,GCN


Selected methods


20    True
Name: method, dtype: bool