In [1]:
import pandas as pd
import numpy as np
import json
import csv
import matplotlib.pyplot as plt

import pandas as pd
import gzip

from scipy import sparse

# Reference: https://beckernick.github.io/music_recommender/

# Seed NumPy's legacy global RNG so the np.random.rand draws that decide which
# ratings get erased are reproducible from one run to the next.
np.random.seed(0)

# Loading data

In [2]:
# Load the review records (one JSON object per line), then filter in a single
# chain:
#   1. keep only rows that report a height;
#   2. drop every row whose user_id occurs more than once -- keep=False flags
#      all occurrences, so only users with exactly one remaining row survive.
# NOTE(review): step 2 leaves a single rating per user -- confirm that this is
# intended rather than de-duplicating (user_id, item_id) pairs.
df = (
    pd.read_json('modcloth_final_data.json', lines=True)
    .loc[lambda frame: pd.notnull(frame['height'])]
    .loc[lambda frame: ~frame.duplicated(subset=['user_id'], keep=False)]
)

In [3]:
# Plain-text dump of the filtered review frame for a quick visual check.
print(df)

       bra size bust   category cup size    fit    height  hips  item_id  \
0          34.0   36        new        d  small   5ft 6in  38.0   123373   
1          36.0  NaN        new        b  small   5ft 2in  30.0   123373   
2          32.0  NaN        new        b  small   5ft 7in   NaN   123373   
4          36.0  NaN        new        b  small   5ft 2in   NaN   123373   
5          36.0  NaN        new        c  small   5ft 4in  41.0   123373   
6          32.0  NaN        new        b  large   5ft 3in   NaN   123373   
7          38.0  NaN        new        d  small   5ft 5in  42.0   123373   
8          42.0  NaN        new        d  small  5ft 10in  50.0   123373   
9          36.0   39        new     dd/e    fit   5ft 6in  41.0   123373   
10         40.0  NaN        new        d  small   5ft 6in  49.0   123373   
11         44.0  NaN        new    ddd/f  small   5ft 4in  60.0   123373   
12         34.0  NaN        new        c    fit   5ft 5in   NaN   123373   
13         3

# Creating hypergraph

In [4]:
# Caps on how many distinct users / items get an index in the matrices below.
max_users = 500
max_items = 500

# Hypergraph dimensions: one hyperedge per user, one vertex per item.
E, V = max_users, max_items

# The matrices are allocated dense at full capacity; slots that are never
# assigned simply stay zero.
W = np.zeros(shape=(V, E))   # hyperedge-weight matrix, |V| x |E|: one row per item
R = np.zeros(shape=(E, V))   # edge-dependent vertex-weight matrix, |E| x |V|: one row per user
true_R = np.zeros_like(R)    # same layout as R, but keeps the ratings that are erased from R

num_pairs = 0       # running count of (user, item) pairs that were loaded
erase_prob = 0.15   # probability that a loaded rating is hidden from W and R

In [5]:
# Next free matrix index to hand out to a user / item seen for the first time.
curr_avail_user_index = 0
curr_avail_item_index = 0

# Raw id -> matrix index lookups, filled while the reviews are scanned.
user_dict = dict()
item_dict = dict()

# heights[u] holds the height recorded for user index u (None until assigned).
heights = [None] * max_users

In [6]:
# Scan the reviews once: hand out matrix indices to users/items on first sight,
# record every rating in true_R, and record it in W/R unless it is erased.
# Iterating the four needed columns directly avoids building a Series per row.
for user, item, height, rating in zip(df['user_id'], df['item_id'],
                                      df['height'], df['quality']):
    # A missing rating would be stored as NaN; NaN counts as "non-zero" in the
    # later `!= 0` checks and would propagate through row normalisation into
    # every transition matrix built from R, so such rows are ignored entirely.
    if pd.isnull(rating):
        continue

    user_index = user_dict.get(user)
    if user_index is None:
        if curr_avail_user_index >= max_users:
            continue  # user capacity reached: ignore users not seen before
        user_index = curr_avail_user_index
        user_dict[user] = user_index
        curr_avail_user_index += 1

    heights[user_index] = height

    item_index = item_dict.get(item)
    if item_index is None:
        if curr_avail_item_index >= max_items:
            continue  # item capacity reached: ignore items not seen before
        item_index = curr_avail_item_index
        item_dict[item] = item_index
        curr_avail_item_index += 1

    # Ground truth always keeps the rating.
    true_R[user_index][item_index] = rating

    # With probability erase_prob the rating is held out: it stays in true_R
    # only and is left as zero in W and R.
    if np.random.rand() > erase_prob:
        W[item_index][user_index] = 1
        R[user_index][item_index] = rating

    num_pairs += 1

In [7]:
# We want the nonzero rows of W and R to sum to 1
def row_normalize(X):
    """Return a float copy of the 2-D array ``X`` with each row scaled to sum to 1.

    Rows whose sum is exactly zero are left unchanged. ``X`` itself is never
    modified.

    The copy is always floating point, so integer input is normalised
    correctly instead of being truncated when the quotients are written back.

    Parameters
    ----------
    X : array-like, shape (rows, cols)

    Returns
    -------
    numpy.ndarray of floats with the same shape as ``X``.
    """
    Y = np.array(X, dtype=float)  # np.array copies, so X stays untouched
    row_sums = Y.sum(axis=1, keepdims=True)
    scalable = (row_sums != 0).ravel()
    # Divide only the rows that have something to normalise.
    Y[scalable] = Y[scalable] / row_sums[scalable]
    return Y

In [8]:
# Sparse, row-normalised versions of the visible rating and weight matrices.
Rs = sparse.csr_matrix(row_normalize(R))
Ws = sparse.csr_matrix(row_normalize(W))

# Item-to-item transition matrix of the unpersonalised walk (item -> user ->
# item), transposed so that P[v, w] is the probability of stepping from w to v.
P = Ws.dot(Rs).transpose()

In [9]:
# Cache of height-boosted hyperedge-weight matrices, keyed by user height.
W_dict = dict()

In [10]:
def get_Wi(user_index):
    """Return the hyperedge-weight matrix personalised for ``user_index``.

    The result is a copy of ``W`` in which the column of every *other* user
    with the same recorded height as ``user_index`` is multiplied by 10; the
    user's own column keeps its original weights.

    ``W_dict`` caches, per height, the matrix with *all* same-height columns
    boosted. The user-specific part (restoring the user's own column) is
    applied to a fresh copy on every call. Caching the already-personalised
    matrix by height alone would hand a second user of that height a matrix
    personalised for the first one.

    Parameters
    ----------
    user_index : int
        Index of the user in ``heights`` and of that user's column in ``W``.

    Returns
    -------
    numpy.ndarray with the shape of ``W``; a new array on each call.
    """
    boost = 10
    height = heights[user_index]

    boosted = W_dict.get(height)
    if boosted is None:
        # Boolean column mask over the user slots sharing this height.
        same_height = np.array([h == height for h in heights[:max_users]],
                               dtype=bool)
        boosted = np.copy(W)
        boosted[:, same_height] *= boost
        W_dict[height] = boosted

    Wi = np.copy(boosted)
    # The user is not their own "similar user": undo the boost on their column.
    Wi[:, user_index] = W[:, user_index]
    return Wi

In [11]:
def get_P_hg(Wi):
    """Build the hypergraph transition matrix for the weight matrix ``Wi``.

    ``Wi`` is row-normalised, converted to CSR, multiplied by the module-level
    ``Rs`` and the product is returned transposed.
    """
    normalized_weights = row_normalize(Wi)
    item_to_item = sparse.csr_matrix(normalized_weights).dot(Rs)
    return item_to_item.transpose()

# Creating graph

In [12]:
def get_P_g(Wi):
    """Build the transition matrix of the ordinary-graph baseline for ``Wi``.

    The graph has ``V + E`` vertices: indices ``0..V-1`` are items and
    ``V..V+E-1`` are users. Every visible rating (non-zero entry of ``R``)
    becomes one edge joining its item and its user, weighted by
    ``Wi[item][user] * R[user][item]``.

    The edges are located with ``np.nonzero`` rather than a Python loop over
    all V*E cells. The incidence matrices are sized by the number of edges
    actually present; edges that do not exist contribute nothing to the
    product, so the returned matrix does not depend on any padding.

    Parameters
    ----------
    Wi : array-like, shape (V, E)
        Hyperedge-weight matrix to take the edge weights from.

    Returns
    -------
    Sparse (V+E) x (V+E) matrix: the transpose of
    ``row_normalize(WiG) . row_normalize(RG)``.
    """
    Wi = np.asarray(Wi)

    # (item, user) index pairs of all visible ratings, item-major order.
    item_idx, user_idx = np.nonzero(R[:E, :V].T)
    n_edges = len(item_idx)
    edge_idx = np.arange(n_edges)

    # Edge -> vertex incidence: each edge touches its item and its user.
    RG = np.zeros((n_edges, V + E))
    RG[edge_idx, item_idx] = 1
    RG[edge_idx, V + user_idx] = 1

    # Vertex -> edge weights: both endpoints see the same edge weight.
    edge_weight = Wi[item_idx, user_idx] * R[user_idx, item_idx]
    WiG = np.zeros((V + E, n_edges))
    WiG[V + user_idx, edge_idx] = edge_weight
    WiG[item_idx, edge_idx] = edge_weight

    WiGs = sparse.csr_matrix(row_normalize(WiG))
    RGs = sparse.csr_matrix(row_normalize(RG))
    PiG = np.transpose(WiGs.dot(RGs))

    return PiG

# Compute rankings

In [13]:
# given probability transition matrix P
# where P_{v,w} = Prob(w -> v)
# find pagerank scores with restart probability r
def compute_pr(P, r, n, home, eps=1e-8, max_iter=10000):
    """Personalised PageRank by power iteration.

    Iterates ``x <- (1 - r) * P x + r * home`` from the uniform vector.

    Parameters
    ----------
    P : scipy.sparse matrix or numpy.ndarray, shape (n, n)
        Transition matrix with ``P[v, w] = Prob(w -> v)``. The product is
        taken with ``.dot`` so that dense arrays get a true matrix-vector
        product as well (``*`` would broadcast element-wise for an ndarray).
    r : float
        Restart probability.
    n : int
        Number of vertices.
    home : numpy.ndarray, shape (n,)
        Restart distribution.
    eps : float
        L1 change between successive iterates below which the iteration is
        considered converged.
    max_iter : int
        Upper bound on the number of iterations. Without it the loop never
        ends when the iterates contain NaN, because a NaN change is never
        ``< eps``.

    Returns
    -------
    numpy.ndarray, shape (n,): the last iterate. A ``RuntimeWarning`` is
    issued if ``max_iter`` was reached without convergence.
    """
    import warnings

    # Convergence is only accepted once the iteration counter exceeds this.
    burn_in = 100

    # (1 - r) * P does not change between iterations, so scale it once.
    damped = (1 - r) * P
    restart = home * r

    x = np.ones(n) / n
    for t in range(max_iter):
        x_new = damped.dot(x) + restart
        change = np.linalg.norm(x_new - x, ord=1)
        x = x_new
        if change < eps and t > burn_in:
            return x

    warnings.warn(
        "compute_pr stopped after %d iterations without converging" % max_iter,
        RuntimeWarning,
    )
    return x

In [14]:
def compute_rankings(user_index, P, Pi, PiG, r=0.15):
    """Score all items for one user with the three random-walk variants.

    Parameters
    ----------
    user_index : row of ``R`` belonging to the user.
    P, Pi : item-to-item transition matrices (unpersonalised / personalised).
    PiG : transition matrix over the ``V + E`` graph vertices.
    r : restart probability passed to ``compute_pr``.

    Returns
    -------
    ``(ranking_dumb, ranking_hg, ranking_g)``: three score vectors of length
    ``V``, or ``(None, None, None)`` when the user has no non-zero entry in
    ``R``.
    """
    # The item walks restart uniformly on the items the user has a visible
    # (non-zero) rating for.
    home_hg = np.where(R[user_index][:V] != 0, 1.0, 0.0)
    n_seen = np.sum(home_hg)

    if not n_seen > 0:
        return None, None, None

    home_hg = home_hg / n_seen

    ranking_dumb = compute_pr(P, r, V, home_hg).flatten()
    ranking_hg = compute_pr(Pi, r, V, home_hg).flatten()

    # The graph walk restarts on the user's own vertex (users sit after the
    # V item vertices); only the item part of the scores is kept.
    home_g = np.zeros(V + E)
    home_g[V + user_index] = 1
    graph_scores = compute_pr(PiG, r, V + E, home_g).flatten()
    ranking_g = graph_scores[:V]

    return ranking_dumb, ranking_hg, ranking_g

# Evaluate rankings

In [15]:
# Source: https://www.aaai.org/Papers/IJCAI/2007/IJCAI07-444.pdf
def calc_doa(num_movies, given_rating, true_rating, ranking):
    """Degree of agreement between ``ranking`` and the held-out true ratings.

    An item is *held out* when its given rating is 0 but its true rating is
    not. Every pair of held-out items whose true ratings differ is counted;
    the pair is correct when ``ranking`` orders the two items the same way
    (strictly) as the true ratings do.

    Only held-out items can contribute, so they are collected first and the
    pair loop runs over them alone instead of over all ``num_movies`` items.

    Parameters
    ----------
    num_movies : int
        Number of leading entries of the three sequences to consider.
    given_rating, true_rating, ranking : indexable sequences of numbers.

    Returns
    -------
    ``correct_pairs / total_pairs`` as a float, or ``-1`` when there is no
    comparable pair.
    """
    held_out = [k for k in range(num_movies)
                if given_rating[k] == 0 and true_rating[k] != 0]

    total_pairs = 0
    correct_pairs = 0

    for pos, i in enumerate(held_out):
        for j in held_out[pos + 1:]:
            if true_rating[i] < true_rating[j]:
                total_pairs += 1
                if ranking[i] < ranking[j]:
                    correct_pairs += 1
            elif true_rating[i] > true_rating[j]:
                total_pairs += 1
                if ranking[i] > ranking[j]:
                    correct_pairs += 1
            # Equal true ratings carry no ordering information: not counted.

    if total_pairs == 0:
        return -1
    return correct_pairs / total_pairs

In [16]:
# Accumulators for the per-method mean degree of agreement (DOA).
doa_dumb = 0
doa_hg = 0
doa_g = 0

count = 0  # number of users that contributed a valid DOA

for i in range(E):
    # compute_rankings returns nothing for a user without any visible rating,
    # so skip such users before building the per-user matrices.
    if not np.any(R[i] != 0):
        continue

    Wi = get_Wi(i)
    Pi = get_P_hg(Wi)
    PiG = get_P_g(Wi)
    r_dumb, r_hg, r_g = compute_rankings(i, P, Pi, PiG)

    if r_dumb is None:
        continue

    print(i)

    curr_doa_dumb = calc_doa(V, R[i], true_R[i], r_dumb)
    curr_doa_hg = calc_doa(V, R[i], true_R[i], r_hg)
    curr_doa_g = calc_doa(V, R[i], true_R[i], r_g)

    print(curr_doa_dumb)
    print(curr_doa_hg)
    print(curr_doa_g)
    print()

    # calc_doa returns -1 when the user has no comparable held-out pair. Any
    # other value -- including a legitimate 0.0 -- has to enter the average;
    # filtering on "> 0" would discard the worst scores and bias the mean up.
    if curr_doa_dumb >= 0 and curr_doa_hg >= 0 and curr_doa_g >= 0:
        doa_dumb += curr_doa_dumb
        doa_hg += curr_doa_hg
        doa_g += curr_doa_g
        count += 1

if count:
    doa_hg /= count
    doa_g /= count
    doa_dumb /= count
else:
    # No user had a comparable pair: report "undefined" instead of dividing by 0.
    doa_dumb = doa_hg = doa_g = float('nan')

0
0.0
0.0
0.0

1
-1
-1
-1

2
-1
-1
-1

3
-1
-1
-1

4
-1
-1
-1

5
-1
-1
-1

6
-1
-1
-1

7
-1
-1
-1

8
-1
-1
-1

9
-1
-1
-1

10
-1
-1
-1

11
-1
-1
-1

12
-1
-1
-1

13
-1
-1
-1

14
-1
-1
-1

15
0.3333333333333333
0.3333333333333333
0.3333333333333333

17
-1
-1
-1

18
-1
-1
-1

19
-1
-1
-1

20
-1
-1
-1

21
-1
-1
-1

22
-1
-1
-1

23
-1
-1
-1

25
-1
-1
-1

27
0.0
0.0
0.0

28
-1
-1
-1

29
-1
-1
-1

30
-1
-1
-1

31
-1
-1
-1

32
-1
-1
-1

33
-1
-1
-1

34
-1
-1
-1

35
-1
-1
-1

36
-1
-1
-1

37
-1
-1
-1

38
-1
-1
-1

39
-1
-1
-1

40
-1
-1
-1

41
-1
-1
-1

42
-1
-1
-1

43
-1
-1
-1

44
-1
-1
-1

45
-1
-1
-1

46
-1
-1
-1

47
-1
-1
-1

48
-1
-1
-1

49
-1
-1
-1

50
-1
-1
-1

51
-1
-1
-1

52
-1
-1
-1

54
-1
-1
-1

55
-1
-1
-1

56
-1
-1
-1

57
-1
-1
-1

58
-1
-1
-1

59
-1
-1
-1

60
-1
-1
-1

62
-1
-1
-1

63
-1
-1
-1

64
-1
-1
-1

65
-1
-1
-1

66
-1
-1
-1

68
-1
-1
-1

70
-1
-1
-1

71
-1
-1
-1

72
-1
-1
-1

73
-1
-1
-1

74
-1
-1
-1

75
0.0
0.0
0.0

76
-1
-1
-1

78
-1
-1
-1

79
0.0
0.0
0.0

80
-1
-1
-1

8

KeyboardInterrupt: 

In [None]:
# Mean degree of agreement, one line each: hypergraph walk, graph walk,
# unpersonalised baseline.
for mean_doa in (doa_hg, doa_g, doa_dumb):
    print(mean_doa)

In [18]:
# Print the visible-rating row of every user slot, one row per print call.
for user_row in R[:max_users]:
    print(user_row)

[5. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 5. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0. 0. 4. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 5. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0