In [1]:
import pandas as pd
import numpy as np
import csv

from scipy import sparse

# references: 
# https://github.com/uthsavc/hypergraph-halo-ranking.git
# https://arxiv.org/pdf/1905.08287.pdf

# Loading data

In [2]:
# Caps on how much of the dataset is loaded.
max_users = 100   # number of hyperedges E; the data has up to 480189 users
max_movies = 500  # number of vertices V; the data has up to 17770 movies

E, V = max_users, max_movies

# A rating is withheld from W and R whenever a uniform draw is <= erase_prob;
# it is still recorded in all_ratings.
erase_prob = 0.15

# All three matrices start as zeros and are filled in by the loading cell.
W = np.zeros(shape=(V, E))     # hyperedge-weight matrix, |V| x |E|; one row per movie
R = np.zeros(shape=(E, V))     # edge-dependent vertex-weight matrix, |E| x |V|; one row per user
all_ratings = np.zeros_like(R)  # same layout as R

First, we load the ratings data into a hypergraph: each movie is a vertex and each user is a hyperedge.

In [3]:
curr_avail_user_index = 0   # next free row index to hand out to a new user
curr_movie_index = -1       # -1 until the first movie header row has been read
user_dict = {}              # raw user id -> row index

# for the regular graph: number of (user, movie) ratings kept in R
num_pairs = 0

# Fixed seed so the kept/withheld split is the same on every run. This also
# makes re-running this cell reproduce the same W and R instead of layering a
# different random mask on top of the previous one.
split_seed = 0
rng = np.random.default_rng(split_seed)

# Initial strategy is to use ratings as vertex weights and set hyperedge weights to 1 (if they exist).
with open('combined_data_1.txt', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:

        # New movie: a single field whose last character is dropped to get the id.
        if len(row) == 1:
            curr_movie_index = int(row[0][:-1]) - 1
            if curr_movie_index >= max_movies:
                break

        # Rating row: first field is the user id, second is the rating.
        elif len(row) == 3:
            # A rating before any movie header would otherwise be written to
            # column -1, i.e. silently attributed to the last movie.
            if curr_movie_index < 0:
                continue

            user_label = int(row[0])
            rating = int(row[1])

            # Since user ids have gaps between numbers, we map user ids to indices
            user_index = user_dict.get(user_label)

            # New user: assign the next free index, or skip once max_users is reached.
            if user_index is None:
                if curr_avail_user_index >= max_users:
                    continue
                user_index = curr_avail_user_index
                user_dict[user_label] = user_index
                curr_avail_user_index += 1

            # Keep the rating in the hypergraph unless the draw falls at or below erase_prob.
            if rng.random() > erase_prob:
                W[curr_movie_index][user_index] = 1
                R[user_index][curr_movie_index] = rating

                num_pairs += 1

            # The full record always gets the rating, kept or not.
            all_ratings[user_index][curr_movie_index] = rating

We also create a second hypergraph that is equivalent to an ordinary graph, for comparison purposes. Its vertices are the movies followed by the users. Each retained movie-user pair becomes an edge joining that movie and that user, with weight equal to the rating (the movie's edge-dependent vertex weight in the hypergraph). Vertex weights are all 1 (trivial, edge-independent).

In [4]:
WG = np.zeros((V+E, num_pairs)) # hyperedge weight matrix, weights are ratings
RG = np.zeros((num_pairs, V+E)) # edge-dependent vertex weight matrix, weights are 1

# Every nonzero entry of R becomes one edge. np.nonzero walks R in row-major
# order, so edges are numbered user by user, then movie by movie.
user_ids, movie_ids = np.nonzero(R)
curr_edge_index = user_ids.size          # number of edges created
edge_ids = np.arange(curr_edge_index)
edge_ratings = R[user_ids, movie_ids]

# Vertex layout: movie j sits at index j, user i at index V+i.
RG[edge_ids, V + user_ids] = 1
RG[edge_ids, movie_ids] = 1

WG[V + user_ids, edge_ids] = edge_ratings
WG[movie_ids, edge_ids] = edge_ratings

In [5]:
# We want the nonzero rows of W and R to sum to 1
def row_normalize(X):
    """Return a float copy of 2-D array ``X`` with each row divided by its sum.

    Rows whose sum is exactly 0 are left unchanged. ``X`` itself is never
    modified.

    The copy is always floating point, so integer input is normalized
    correctly instead of being truncated when the quotient is stored.
    """
    Y = np.array(X, dtype=float)               # np.array copies by default
    row_sums = Y.sum(axis=1, keepdims=True)    # shape (rows, 1) for broadcasting
    # Divide in place only where the row sum is nonzero; other rows keep their values.
    np.divide(Y, row_sums, out=Y, where=row_sums != 0)
    return Y

# Row-normalized hypergraph matrices.
Wn, Rn = row_normalize(W), row_normalize(R)

# Row-normalized matrices for the comparison graph.
WGn, RGn = row_normalize(WG), row_normalize(RG)

In [6]:
# Sparse (CSR) copies of the normalized matrices.
Ws, Rs = sparse.csr_matrix(Wn), sparse.csr_matrix(Rn)
WGs, RGs = sparse.csr_matrix(WGn), sparse.csr_matrix(RGn)

# create prob trans matrices: the transposed products W.R (hypergraph) and WG.RG (graph)
P = (Ws @ Rs).T
PG = (WGs @ RGs).T

# Computing personalized PageRank rankings

In [7]:
# given probability transition matrix P
# where P_{v,w} = Prob(w -> v)
# find pagerank scores with restart probability r
def compute_pr(P, r, n, home, eps=1e-8, max_iter=100000):
    """Iterate ``x <- (1 - r) * P x + r * home`` until it stops changing.

    Parameters
    ----------
    P : n x n transition matrix, either a scipy sparse matrix or a dense
        ndarray. It is applied with ``@`` so both give a matrix-vector product.
    r : restart probability.
    n : length of the score vector; the iteration starts from the uniform
        vector ``1/n``.
    home : length-n restart vector, added with weight ``r`` at every step.
    eps : the iteration stops once the L1 change between successive iterates
        is below ``eps`` (and more than 100 steps have already been taken).
    max_iter : upper bound on the number of steps.

    Returns
    -------
    The final iterate, a length-n vector.

    Raises
    ------
    RuntimeError
        If the stopping criterion is not met within ``max_iter`` steps
        (for example because ``P`` contains NaNs).
    """
    x = np.ones(n) / n

    for t in range(max_iter):
        # Scale the product vector rather than the matrix.
        x_new = (1 - r) * (P @ x) + r * home

        # Same stopping rule as before: small change AND more than 100 steps done.
        converged = np.linalg.norm(x_new - x, ord=1) < eps and t > 100
        x = x_new
        if converged:
            return x

    raise RuntimeError(
        "compute_pr did not converge within %d iterations" % max_iter
    )

In [8]:
r = 0.40  # restart probability passed to compute_pr

# One row of scores per user, one column per movie.
rankings_hg = np.zeros((E, V))
rankings_g = np.zeros((E, V))

for u in range(max_users):

    # Hypergraph: personalize by restarting uniformly over the movies for
    # which this user has a nonzero entry in R.
    home_hg = (R[u] != 0).astype(float)
    n_home = np.sum(home_hg)
    if n_home > 0:
        home_hg = home_hg / n_home

    rankings_hg[u, :] = compute_pr(P, r, V, home_hg).flatten()

    # Graph: restart at the user's own vertex (index V+u); keep only the
    # scores of the first V vertices, which are the movies.
    home_g = np.zeros(V+E)
    home_g[V+u] = 1

    curr_rankings_g = compute_pr(PG, r, V+E, home_g).flatten()
    rankings_g[u, :] = curr_rankings_g[:V]

# Evaluating rankings

In [9]:
# Source: https://www.aaai.org/Papers/IJCAI/2007/IJCAI07-444.pdf
def calc_avg_doa_unrated(num_users, num_movies, given_ratings, true_ratings, rankings):
    """Fraction of movie pairs whose ranking order agrees with the true ratings.

    For each of the first ``num_users`` users, every pair of movies
    ``i < j < num_movies`` with ``given_ratings[user][i] == 0`` and
    ``given_ratings[user][j] == 0`` is considered. Pairs whose true ratings
    differ count toward the total; a pair counts as correct when
    ``rankings`` orders the two movies strictly the same way as
    ``true_ratings``. Pairs with equal true ratings are ignored.

    The counts are pooled over all users before dividing; it is not a mean of
    per-user scores.

    Parameters
    ----------
    num_users, num_movies : number of rows / columns to evaluate.
    given_ratings, true_ratings, rankings : 2-D array-likes indexed
        ``[user][movie]``.

    Returns
    -------
    ``correct_pairs / total_pairs`` as a float, or ``-1`` if no pair qualifies.
    """
    given = np.asarray(given_ratings)
    truth = np.asarray(true_ratings)
    scores = np.asarray(rankings)

    total_pairs = 0
    correct_pairs = 0

    for user in range(num_users):
        # Movies with a zero entry in given_ratings for this user.
        candidates = np.flatnonzero(given[user, :num_movies] == 0)

        # All index pairs (i, j) with i < j among those movies.
        first, second = np.triu_indices(candidates.size, k=1)
        i, j = candidates[first], candidates[second]

        true_i, true_j = truth[user, i], truth[user, j]
        rank_i, rank_j = scores[user, i], scores[user, j]

        lower = true_i < true_j
        higher = true_i > true_j

        total_pairs += int(np.count_nonzero(lower | higher))
        correct_pairs += int(np.count_nonzero(
            (lower & (rank_i < rank_j)) | (higher & (rank_i > rank_j))
        ))

    if total_pairs == 0:
        return -1
    return correct_pairs / total_pairs

In [10]:
# Score both ranking methods: the given ratings are R, the true ratings are all_ratings.
avgdoa1 = calc_avg_doa_unrated(E, V, R, all_ratings, rankings_hg)
avgdoa2 = calc_avg_doa_unrated(E, V, R, all_ratings, rankings_g)

for label, score in (
    ("Hypergraph ranking DOA:", avgdoa1),
    ("Graph ranking DOA:", avgdoa2),
):
    print(label, score)

NameError: name 'calc_avg_doa' is not defined