In [None]:
# Iteration count. None of the generator/solver classes visible in this part of
# the notebook read T; presumably a later experiment cell loops over it (TODO confirm).
T = 2 #number of iterations

In [None]:
#Generator
%matplotlib inline
import scipy.io as io
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
from numpy import linalg as LA
from sklearn.cluster import KMeans
import subprocess
import time
import math
import pandas as pd
import pdb

class Data_Generator:
    """Builds a user graph from a Facebook100-style ``.mat`` file and samples synthetic ratings.

    The constructor reads the ``'A'`` (sparse adjacency) and ``'local_info'`` entries of
    the file, keeps the rows whose first ``local_info`` column equals 1 and whose
    second-to-last column equals ``2006 + true_group_index[i]`` for one of the first ``k``
    offsets, and restricts the graph to the largest connected component of those rows.
    (The original variable names suggest column 0 is a student flag and column -2 the
    class year -- confirm against the dataset description.)

    Parameters
    ----------
    p_obs : float
        Probability that a single rating is observed.
    p_list :
        Stored unchanged on the instance; not used by this class.
    k : int
        Number of user clusters (one per selected year offset).
    m : int
        Number of items (columns of the rating matrix).
    mat_path : str, optional
        Location of the ``.mat`` file. Defaults to the path the notebook always used.
    true_group_index : sequence of int, optional
        Year offsets (relative to 2006) that define clusters 0..k-1.

    Raises
    ------
    ValueError
        If ``k`` exceeds ``len(true_group_index)``.

    Attributes set here: ``n`` (number of kept users), ``cluster_id`` (length-n labels
    in 0..k-1), ``n_per_cluster_list``, ``Adj_matrix`` (n x n float array of 0/1) and
    ``Adj_list`` (``[values, rows, cols]`` of the non-zero adjacency entries).
    """

    def __init__(self, p_obs, p_list, k, m,
                 mat_path="facebook100/Vassar85.mat", true_group_index=(0, 2, 3)):
        ## storing input parameters within the class
        self.p_obs = p_obs
        self.p_list = p_list
        self.k = k  # number of clusters of users (unequal sizes)
        self.m = m

        ## define additional class variables
        self.U = np.zeros((k, m))  # u_1, u_2, ..., u_K
        self.U_set = False
        self.M_ground_truth = {}
        self.M_train = None

        # Fail early (before touching the file) instead of with a bare IndexError below.
        if k > len(true_group_index):
            raise ValueError(
                "k=%d clusters requested but only %d year offsets are available"
                % (k, len(true_group_index)))

        z = io.loadmat(mat_path)
        local_info = z['local_info']
        years = local_info[:, -2]
        status_is_one = local_info[:, 0] == 1

        # Candidate users, concatenated cluster by cluster so that labels end up sorted.
        candidate_idx = np.concatenate([
            np.where((years == 2006 + true_group_index[i]) & status_is_one)[0]
            for i in range(k)
        ])

        # Sub-graph on the candidates; used only to find the largest connected component.
        A = z['A']
        C = A[candidate_idx, :]
        C = C[:, candidate_idx]
        n_comp, labels = sp.csgraph.connected_components(C)

        # Component labels are assigned in discovery order, so label 0 is merely the
        # component of the first candidate. Pick the most populous label explicitly.
        largest = np.bincount(labels).argmax() if labels.size else 0
        kept_idx = candidate_idx[labels == largest]

        B_final = A[kept_idx, :]
        B_final = B_final[:, kept_idx]
        D = B_final.toarray()

        cluster_id = np.zeros(len(kept_idx), dtype='int')
        kept_years = years[kept_idx]
        for i in range(k):
            cluster_id[kept_years == 2006 + true_group_index[i]] = i

        n = len(cluster_id)
        self.n = n

        # Dense 0/1 adjacency plus a coordinate-style list [values, rows, cols].
        edge_rows, edge_cols = np.where(D == 1)
        Adj_matrix = np.zeros((n, n))
        Adj_matrix[edge_rows, edge_cols] = 1
        Adj_list = [[float(1)] * len(edge_rows), list(edge_rows), list(edge_cols)]

        self.Adj_matrix = Adj_matrix
        self.Adj_list = Adj_list

        self.n_per_cluster_list = [int(np.count_nonzero(cluster_id == i)) for i in range(k)]
        self.cluster_id = cluster_id

    def set_U(self, U):
        """Store the k x m matrix of per-cluster probabilities of a +1 rating."""
        self.U = U
        self.U_set = True

    def generate_rating_data(self):
        """Sample a partially observed n x m rating matrix.

        Entry (i, j) is +1 with probability ``U[c, j]`` (c = cluster of user i) and -1
        otherwise; each entry is then kept with probability ``p_obs`` and set to 0
        (unobserved) otherwise. Rows of ``U`` are repeated per cluster size, which
        matches ``cluster_id`` because users are stored cluster by cluster.

        Returns ``None`` (after printing a notice) if ``set_U`` has not been called.
        """
        if not self.U_set:
            print("U is not set yet")
            return None

        per_user_prob = np.repeat(self.U, self.n_per_cluster_list, axis=0)
        liked = np.array(np.random.random((self.n, self.m)) <= per_user_prob, dtype=float)
        X_full_obs = -1 + 2 * liked
        observed = np.array(np.random.random((self.n, self.m)) <= self.p_obs, dtype=float)
        return X_full_obs * observed

    def generate_graph(self):
        """Return ``(Adj_matrix, Adj_list, cluster_id, n)``.

        ``Adj_matrix`` is returned as a copy: the solver classes in this notebook
        zero out heavy rows of the matrix they are given, and handing out the
        internal array would let that truncation leak into every later call.
        """
        return self.Adj_matrix.copy(), self.Adj_list, self.cluster_id, self.n

In [None]:
#solver
class CVR:
    """Clusters users from a graph plus partially observed +/-1 ratings.

    ``spectral_clustering_and_vote`` runs five stages:

    1. spectral clustering of the degree-truncated adjacency matrix (k-means on the
       real part of the top-k eigenvectors);
    2. per-cluster voting that yields ``B_est`` (fraction of +1 ratings), ``p_hat``
       (d probability levels) and ``R_est`` (a level for every cluster/item pair);
    3. optional local refinement of the cluster labels by likelihood maximisation;
    4. stage 2 repeated with the refined labels;
    5. stage 3 repeated with the new ``R_est``.

    Parameters
    ----------
    M_obs : ndarray, shape (n, m)
        Ratings; 0 means "not observed", -1 and every other non-zero value are
        treated as the two possible ratings.
    Adj_matrix : ndarray, shape (n, n)
        0/1 adjacency matrix. It is never modified by this class.
    Adj_list :
        Stored on the instance only; the solver does not read it.
    n, m, k : int
        Number of users, items and user clusters.
    p_gt : ndarray
        Only its ``size`` is used: the number ``d`` of probability levels.
    """

    MAX_N_OF_REFINEMENT_STEPS = 10

    # Replacement values for probabilities that come out as exactly 0 or 1, so that
    # log(p) and log(1 - p) stay finite.
    _P_MIN = 0.00001
    _P_MAX = 0.99999

    def __init__(self, M_obs, Adj_matrix, Adj_list, n, m, k, p_gt):
        self.M_obs = M_obs
        self.M_obs_locations = np.where(M_obs != 0)  # (row indices, column indices)
        self.Adj_matrix = Adj_matrix
        self.Adj_list = Adj_list
        self.n = n
        self.m = m
        self.k = k  # number of clusters of users
        self.d = p_gt.size  # number of probabilities p_1, ..., p_d

    def spectral_clustering_and_vote(self, truncation_threshold = 6, local_refinement_flag = False):
        """Run the full pipeline.

        Parameters
        ----------
        truncation_threshold : float
            Rows/columns whose degree exceeds ``truncation_threshold`` times the
            average degree are zeroed before the eigen-decomposition.
        local_refinement_flag : bool
            When False, stages 3 and 5 are skipped and the stage-1 labels are returned
            as the refined labels.

        Returns
        -------
        B_est : ndarray (k, m)
            Fraction of +1 among observed ratings per cluster/item (-1 if none observed),
            computed in stage 4.
        p_hat : ndarray (d,)
            Estimated probability levels, ascending, from stage 4.
        stage1_clustering_results : ndarray (n,)
            Labels produced by spectral clustering.
        stage3_clustering_results : ndarray (n,)
            Labels after the last refinement stage.
        R_est : ndarray (k, m)
            Probability level assigned to each cluster/item pair in stage 4.
        """
        n = self.n
        k = self.k

        # Work on private copies: truncating self.Adj_matrix in place would corrupt
        # the caller's graph (and this object's own input on a second call).
        Adj_original = np.array(self.Adj_matrix, dtype=float)
        Adj = Adj_original.copy()

        # Stage 1. Spectral clustering
        # Caution: This may be slow for very large n
        deg_th = truncation_threshold * np.sum(Adj) / n
        heavy_rows = np.where(np.sum(Adj, 1) > deg_th)[0]
        Adj[heavy_rows, :] = 0
        Adj[:, heavy_rows] = 0
        _, vv = sp.linalg.eigs(Adj, k = k)
        kmeans = KMeans(n_clusters=k, random_state=0).fit(np.real(vv))
        stage1_clustering_results = np.copy(kmeans.labels_)

        # Stage 2. Majority voting
        B_est, p_hat, R_est = self._estimate_ratings(stage1_clustering_results)

        # Stage 3. Local refinement
        stage3_clustering_results = np.copy(stage1_clustering_results)
        if local_refinement_flag:
            stage3_clustering_results = self._refine_clusters(
                stage3_clustering_results, R_est, Adj_original)

        # Stage 4. iteration of p_hat estimation
        B_est, p_hat, R_est = self._estimate_ratings(stage3_clustering_results)

        # Stage 5. Iteration of local refinement
        if local_refinement_flag:
            stage3_clustering_results = self._refine_clusters(
                stage3_clustering_results, R_est, Adj_original)

        return B_est, p_hat, stage1_clustering_results, stage3_clustering_results, R_est

    def _estimate_ratings(self, labels):
        """Vote within each cluster and quantise the result to d probability levels.

        Returns ``(B_est, p_hat, R_est)`` for the given cluster labels. Draws one
        random sample of item columns from NumPy's global RNG.

        Raises ``ValueError`` when fewer than ``d`` sampled cluster/item pairs have
        any observed rating, because ``d`` levels cannot be formed then.
        """
        k, m, d = self.k, self.m, self.d
        rows, cols = self.M_obs_locations
        values = np.asarray(self.M_obs)[rows, cols]
        owner = np.asarray(labels)[rows]  # cluster of the user behind each observation

        # Per (cluster, item): number of observations, of +1 and of -1 ratings.
        n_ct = np.zeros((k, m))
        pos_ct = np.zeros((k, m))
        neg_ct = np.zeros((k, m))
        np.add.at(n_ct, (owner, cols), 1)
        is_pos = values == 1
        is_neg = values == -1
        np.add.at(pos_ct, (owner[is_pos], cols[is_pos]), 1)
        np.add.at(neg_ct, (owner[is_neg], cols[is_neg]), 1)

        # Caution: The row indices of B_est and the ground truth do not match in general
        B_est = np.full((k, m), -1.0)  # -1 marks "nothing observed"
        seen = n_ct > 0
        B_est[seen] = pos_ct[seen] / n_ct[seen]

        # Sample items (with replacement) and pool the observed fractions of all clusters.
        m_0 = 5 * d * int(np.ceil(np.log(m)))
        ran = np.random.choice(m, m_0)
        pooled = B_est[:, ran].ravel()
        a = np.sort(pooled[pooled >= 0])  # a_j in Alg 1
        if a.size < d:
            raise ValueError(
                "only %d sampled cluster/item pairs have observed ratings; "
                "need at least d=%d to estimate p_hat" % (a.size, d))

        # Split the sorted values at the d-1 widest gaps (ties: lowest index first)
        # and average every group. l_1..l_d / r_1..r_d in Alg 1.
        gaps = np.diff(a)
        cuts = sorted(np.argsort(-gaps, kind="stable")[:d - 1].tolist())
        c_r = cuts + [len(a) - 1]
        c_l = [0] + [r + 1 for r in cuts]
        p_hat = np.zeros(d)
        for i in range(d):
            group = a[c_l[i]:c_r[i] + 1]
            p_hat[i] = sum(group) / len(group)
        p_hat[p_hat == 0] = CVR._P_MIN
        p_hat[p_hat == 1] = CVR._P_MAX

        # For every (cluster, item) pick the level with the smallest negative
        # log-likelihood of the observed ratings; any non-zero rating other than -1
        # counts as a positive one, as in the per-entry formulation.
        other_ct = n_ct - neg_ct
        cost = -(neg_ct[:, :, None] * np.log(1 - p_hat)
                 + other_ct[:, :, None] * np.log(p_hat))
        R_est = p_hat[np.argmin(cost, axis=2)]  # u_hat, v_hat

        return B_est, p_hat, R_est

    def _edge_rate(self, n_edges, n_pairs):
        """Return n_edges / n_pairs, replacing 0 (or no pairs) and 1 by _P_MIN / _P_MAX."""
        rate = n_edges / n_pairs if n_pairs > 0 else 0.0
        if not rate > 0:
            return CVR._P_MIN
        if rate >= 1:
            return CVR._P_MAX
        return rate

    def _refine_clusters(self, labels, R_est, adjacency):
        """Reassign users to their most likely cluster until nothing changes.

        ``alpha_hat`` / ``beta_hat`` (within- / between-cluster edge rates) are
        estimated once from the incoming labels. Each pass then updates all users
        simultaneously, using the labels from the start of the pass, and at most
        ``MAX_N_OF_REFINEMENT_STEPS`` passes are made.

        Returns the refined label array (same dtype as ``labels``).
        """
        n, k = self.n, self.k
        labels = np.array(labels)

        # Edge-rate estimates from the upper triangle of the adjacency matrix.
        sizes = np.bincount(labels, minlength=k)
        pairs_same = float(np.sum(sizes * (sizes - 1) / 2))
        pairs_diff = float((np.sum(sizes) ** 2 - np.sum(sizes ** 2)) / 2)
        upper = np.triu(adjacency, 1)
        same_cluster = labels[:, None] == labels[None, :]
        alpha_hat = self._edge_rate(upper[same_cluster].sum(), pairs_same)
        beta_hat = self._edge_rate(upper[~same_cluster].sum(), pairs_diff)

        # Symmetric 0/1 neighbour indicator built from the upper triangle.
        linked = upper == 1
        linked = (linked | linked.T).astype(float)
        row_deg = np.asarray(adjacency.sum(axis=1)).astype(np.int64)

        # Rating log-likelihood of every user under every cluster's R_est row.
        M = np.asarray(self.M_obs)
        rated_neg = (M == -1).astype(float)
        rated_other = ((M != 0) & (M != -1)).astype(float)
        ll_correct = rated_other @ np.log(R_est).T
        ll_incorrect = rated_neg @ np.log(1 - R_est).T

        log_a, log_not_a = np.log(alpha_hat), np.log(1 - alpha_hat)
        log_b, log_not_b = np.log(beta_hat), np.log(1 - beta_hat)

        for _ in range(self.MAX_N_OF_REFINEMENT_STEPS):
            sizes = np.bincount(labels, minlength=k)
            # edges_per_cluster[i, c] = number of neighbours of user i in cluster c.
            edges_per_cluster = linked @ np.eye(k)[labels]

            deg_internal_1 = edges_per_cluster
            # Note: "- 1" (excluding the user itself) is applied for every candidate
            # cluster, including those the user is not currently in.
            deg_internal_0 = sizes[None, :] - 1 - deg_internal_1
            deg_external_1 = row_deg[:, None] - deg_internal_1
            deg_external_0 = n - sizes[None, :] - deg_external_1

            likelihood = (log_a * deg_internal_1
                          + log_not_a * deg_internal_0
                          + log_b * deg_external_1
                          + log_not_b * deg_external_0
                          + ll_correct
                          + ll_incorrect)

            best = np.argmax(likelihood, axis=1).astype(labels.dtype)
            if np.array_equal(best, labels):  # nothing happened
                break
            labels = best

        return labels

In [None]:
#solver
class CVR_Ahns:
    """Graph-assisted rating estimator that uses a single shared noise rate.

    The solver clusters users from the adjacency matrix, estimates one +/-1
    rating per (cluster, item) by majority vote, and summarises the
    disagreement between votes and observations in one scalar ``theta_hat``.
    Cluster labels can optionally be refined by a per-node likelihood test
    that combines graph edges and observed ratings.
    """
    # Cap on refinement passes. The loop condition below uses `<=`, so up to
    # MAX_N_OF_REFINEMENT_STEPS + 1 passes can run; kept as-is so existing
    # results do not change.
    MAX_N_OF_REFINEMENT_STEPS = 10

    def __init__(self, M_obs, Adj_matrix, Adj_list, n, m, k, p_gt):
        """Store the problem data.

        Parameters
        ----------
        M_obs : 2-D array of observed ratings; 0 marks "not observed".
        Adj_matrix : dense (n, n) adjacency matrix with 0/1 entries.
        Adj_list : stored but not used by this solver.
        n, m, k : number of users, items and user clusters.
        p_gt : array; only its size is kept (as ``self.d``).
        """
        self.M_obs = M_obs
        self.M_obs_locations = np.where(M_obs != 0)
        self.Adj_matrix = Adj_matrix
        self.Adj_list = Adj_list
        self.n = n
        self.m = m
        self.k = k #number of clusters of users
        self.d = p_gt.size

    def spectral_clustering_and_vote(self, truncation_threshold = 6, local_refinement_flag = False):
        """Run clustering, voting and (optionally) local refinement.

        Parameters
        ----------
        truncation_threshold : rows whose degree exceeds this multiple of the
            average degree are zeroed before the eigen-decomposition.
        local_refinement_flag : when True, Stages 3 and 5 re-assign nodes to
            the cluster maximising their log-likelihood.

        Returns
        -------
        (B_est, theta_hat, stage1_clustering_results,
         stage3_clustering_results, R_est)
            ``B_est`` is the (k, m) matrix of +/-1 majority votes, ``R_est``
            the matching (k, m) matrix holding ``1 - theta_hat`` where the
            vote is +1 and ``theta_hat`` elsewhere. Both come from the final
            labels. Cluster indices are arbitrary and need not match any
            ground-truth numbering.

        Raises
        ------
        ZeroDivisionError
            If no rating is observed (``theta_hat`` is undefined).
        """
        n = self.n
        k = self.k

        # Work on private copies so the caller's adjacency matrix is never
        # modified (the truncation below writes zeros into `Adj`).
        Adj_original = np.copy(self.Adj_matrix)
        Adj = np.copy(Adj_original)

        # Stage 1. Spectral clustering
        # Caution: This may be slow for very large n
        deg_th = truncation_threshold * np.sum(Adj)/n
        heavy_rows = np.where(np.sum(Adj,1) > deg_th)[0]
        Adj[heavy_rows,:] = 0
        Adj[:,heavy_rows] = 0
        _, vv = sp.linalg.eigs(Adj, k = k)
        kmeans = KMeans(n_clusters=k, random_state=0).fit(np.real(vv))
        stage1_clustering_results = np.copy(kmeans.labels_)

        # Stage 2. Majority voting on the Stage-1 clusters
        B_est, theta_hat, R_est = self._majority_vote(stage1_clustering_results)

        # Stage 3. Local refinement
        stage3_clustering_results = np.copy(stage1_clustering_results)
        if local_refinement_flag:
            stage3_clustering_results = self._refine_labels(
                stage3_clustering_results, R_est, Adj_original)

        # Stage 4. Re-estimate votes and theta from the refined clusters
        B_est, theta_hat, R_est = self._majority_vote(stage3_clustering_results)

        # Stage 5. Second round of local refinement
        if local_refinement_flag:
            stage3_clustering_results = self._refine_labels(
                stage3_clustering_results, R_est, Adj_original)

        return B_est, theta_hat, stage1_clustering_results, stage3_clustering_results, R_est

    def _majority_vote(self, labels):
        """Vote per (cluster, item) and estimate the shared flip rate.

        Returns ``(B_est, theta_hat, R_est)``. Ties are resolved towards +1.
        ``theta_hat`` is the fraction of observed ratings that disagree with
        the vote of their user's cluster.
        """
        k, m = self.k, self.m
        rows, cols = self.M_obs_locations
        ratings = np.asarray(self.M_obs)[rows, cols]
        clusters = np.asarray(labels)[rows]

        # Two fixed count planes (-1 and +1). Sizing these from a reused
        # loop variable is what made the old Stage-4 buffer grow with the
        # number of observations.
        is_neg = ratings == -1
        is_pos = ratings == 1
        neg_ct = np.zeros((k, m))
        pos_ct = np.zeros((k, m))
        np.add.at(neg_ct, (clusters[is_neg], cols[is_neg]), 1)
        np.add.at(pos_ct, (clusters[is_pos], cols[is_pos]), 1)
        B_est = np.where(pos_ct >= neg_ct, 1.0, -1.0)

        n_ct = len(rows)
        diff_ct = int(np.count_nonzero(ratings != B_est[clusters, cols]))
        theta_hat = diff_ct/n_ct

        R_est = np.where(B_est == 1, 1 - theta_hat, theta_hat)
        return B_est, theta_hat, R_est

    def _refine_labels(self, labels, R_est, Adj_original):
        """Move each node to its maximum-likelihood cluster until stable.

        The within/between-cluster edge rates ``alpha_hat`` / ``beta_hat``
        are estimated once from the incoming ``labels``. All nodes are then
        re-scored simultaneously on every pass (a move only affects the
        following pass). Returns a new label array; ``labels`` is untouched.
        """
        n, k = self.n, self.k
        labels = np.copy(labels)

        # Edge-rate estimates from the upper triangle (each pair once).
        sizes = np.bincount(labels, minlength=k)
        pairs_same = np.sum(sizes * (sizes - 1) / 2)
        pairs_diff = (np.sum(sizes) ** 2 - np.sum(sizes ** 2)) / 2
        upper = np.triu(Adj_original, 1)
        same_cluster = labels[:, None] == labels[None, :]
        alpha_hat = upper[same_cluster].sum() / pairs_same
        beta_hat = upper[~same_cluster].sum() / pairs_diff

        # edges_per_cluster[i, c] = number of neighbours of i labelled c.
        src, dst = np.nonzero(upper == 1)
        edges_per_cluster = np.zeros((n, k))
        np.add.at(edges_per_cluster, (src, labels[dst]), 1)
        np.add.at(edges_per_cluster, (dst, labels[src]), 1)
        degrees = Adj_original.sum(axis=1).astype(int)

        # Rating log-likelihood of every node under every cluster. It only
        # depends on R_est, so it is computed once. Any rating that is not
        # -1 is scored as a +1, as before.
        rows, cols = self.M_obs_locations
        is_neg = np.asarray(self.M_obs)[rows, cols] == -1
        ll_pos = np.zeros((n, k))
        ll_neg = np.zeros((n, k))
        np.add.at(ll_pos, rows[~is_neg], np.log(R_est[:, cols[~is_neg]]).T)
        np.add.at(ll_neg, rows[is_neg], np.log(1 - R_est[:, cols[is_neg]]).T)

        log_a, log_not_a = np.log(alpha_hat), np.log(1 - alpha_hat)
        log_b, log_not_b = np.log(beta_hat), np.log(1 - beta_hat)

        list_of_changes = []
        n_of_refinement_steps = 0
        while n_of_refinement_steps <= self.MAX_N_OF_REFINEMENT_STEPS:
            n_of_refinement_steps += 1

            # Incrementally update neighbour counts for nodes that moved in
            # the previous pass (empty on the first pass).
            for node, cluster_old, cluster_new in list_of_changes:
                neighbours = np.nonzero(Adj_original[:, node])[0]
                edges_per_cluster[neighbours, cluster_old] -= 1
                edges_per_cluster[neighbours, cluster_new] += 1

            sizes = np.bincount(labels, minlength=k)
            deg_internal_1 = edges_per_cluster
            deg_internal_0 = sizes - 1 - deg_internal_1
            deg_external_1 = degrees[:, None] - deg_internal_1
            deg_external_0 = n - sizes - deg_external_1

            likelihood = (log_a * deg_internal_1
                          + log_not_a * deg_internal_0
                          + log_b * deg_external_1
                          + log_not_b * deg_external_0
                          + ll_pos
                          + ll_neg)
            best = np.argmax(likelihood, axis=1)

            moved = np.nonzero(best != labels)[0]
            if moved.size == 0: # nothing happened
                break
            list_of_changes = [(i, labels[i], best[i]) for i in moved]
            labels[moved] = best[moved]

        return labels

In [None]:
def split_M_into_train_test(M):
    """Hold out 20% of the observed (non-zero) entries of ``M`` for testing.

    Parameters
    ----------
    M : 2-D numpy array of ratings in which 0 means "not observed".

    Returns
    -------
    (M_train, M_test)
        ``M_train`` is a copy of ``M`` with the held-out entries set to 0.
        ``M_test`` maps ``(i, j)`` to the held-out rating ``M[i, j]``.
        ``M`` itself is not modified.
    """
    observation_idx = np.where(M!=0)
    n_of_total_observations = observation_idx[0].size
    n_train = int(0.8 * n_of_total_observations)
    n_test = n_of_total_observations - n_train
    # replace=False: sampling with replacement (the default) draws the same
    # entry several times, so fewer than n_test distinct entries were held out.
    test_idx = np.random.choice(n_of_total_observations, n_test, replace=False)

    M_train = M.copy()
    M_test = {}

    for each_test_idx in test_idx:
        i = observation_idx[0][each_test_idx]
        j = observation_idx[1][each_test_idx]
        M_train[i,j] = 0
        M_test[(i,j)] = M[i,j]
    return M_train, M_test

In [None]:
class Export:
    """Writes the rating and trust text files under ``librec-plot2/data/ours``."""

    @staticmethod
    def data(M_data, Adj_list):
        """Dump ratings and graph edges as whitespace-separated integer triples.

        Parameters
        ----------
        M_data : dict mapping ``(row, col)`` to a rating; one line
            ``row col rating`` is written per entry.
        Adj_list : three parallel sequences; line ``i`` of the trust file is
            ``Adj_list[1][i] Adj_list[2][i] Adj_list[0][i]``
            (presumably source, target, weight - verify against the generator).

        The target directories must already exist.
        """
        # `with` guarantees the files are closed even if a write fails.
        with open('librec-plot2/data/ours/rating/rating.txt', 'w') as f_data:
            for each_key in M_data.keys():
                f_data.write("%d %d %d\n" % (each_key[0], each_key[1], M_data[each_key]))

        with open('librec-plot2/data/ours/trust/trust.txt', 'w') as f_trust:
            for i in range(len(Adj_list[0])):
                f_trust.write("%d %d %d\n" % (Adj_list[1][i], Adj_list[2][i], Adj_list[0][i]))

In [None]:
class Grader:
    """Scores a solver's (cluster labels, R_est) output on held-out ratings."""

    @staticmethod
    def measure_accruacy(M_test, stage3_clustering_results, R_est, U_true=None, true_cluster_id=None):
        """Compute error metrics on the held-out entries.

        Parameters
        ----------
        M_test : non-empty dict mapping ``(i, j)`` to the held-out +/-1 rating.
        stage3_clustering_results : estimated cluster label per user.
        R_est : (k, m) array; ``R_est[c, j]`` is read as the estimated
            probability that cluster ``c`` rates item ``j`` as +1.
        U_true : optional reference probabilities indexed as
            ``U_true[cluster][item]``. Defaults to the notebook global ``U``.
        true_cluster_id : optional reference cluster id per user. Defaults to
            the notebook global ``cluster_id``.

        Returns
        -------
        (mae_rr, mae_rp, mae_pp, acc, lke), each averaged over ``M_test``:
            mae_rr - |thresholded +/-1 prediction - rating|
            mae_rp - |(2*p - 1) - rating|
            mae_pp - |p - reference probability|
            acc    - fraction of thresholded predictions equal to the rating
            lke    - mean log-likelihood of the ratings under p
        """
        # Fall back to the notebook-level globals so existing three-argument
        # calls keep working; passing them explicitly avoids hidden state.
        if U_true is None:
            U_true = U
        if true_cluster_id is None:
            true_cluster_id = cluster_id

        mae_rr = 0
        mae_rp = 0
        mae_pp = 0
        acc = 0
        lke = 0
        n_of_test_entries = len(M_test)

        for each_key in M_test.keys():
            i,j = each_key
            rating = M_test[each_key]
            p = R_est[stage3_clustering_results[i], j]
            hard_prediction = 2*(p >= 0.5) - 1

            mae_rr += abs(hard_prediction - rating)   # FAIR MAE COMPARISON
            mae_rp += abs(2*p - 1 - rating)
            mae_pp += abs(p - U_true[true_cluster_id[i]][j])
            acc += hard_prediction == rating
            if rating == 1:
                lke += np.log(p)
            else:
                lke += np.log(1-p)
        mae_rr = mae_rr/float(n_of_test_entries)
        mae_rp = mae_rp/float(n_of_test_entries)
        mae_pp = mae_pp/float(n_of_test_entries)
        acc = acc/float(n_of_test_entries)
        lke = lke/float(n_of_test_entries)

        return mae_rr, mae_rp, mae_pp, acc, lke

    # Correctly spelled alias; the original (misspelled) name stays for callers.
    measure_accuracy = measure_accruacy

In [None]:
# Experiment configuration (arbitrary m; U entries drawn from 0.05 / 0.5 / 0.95)
k = 3        # number of user clusters
m = 1000     # number of items
p_gt = np.array([0.05, 0.5, 0.95])
#d = p_gt.size
d = 3

# One row per cluster. Each row is five equal-width item blocks; the block
# values below are repeated int(m/5) times along the item axis.
U = np.repeat(
    np.array([[0.05, 0.05, 0.5, 0.95, 0.95],
              [0.05, 0.95, 0.05, 0.05, 0.05],
              [0.95, 0.95, 0.95, 0.95, 0.95]]),
    int(m/5),
    axis=1,
)

In [None]:
# Observation rates visited by the sweep, in order.
obs_rate = []

# Per-rate averages for our solver ...
mae_rr_ours, mae_rp_ours, mae_pp_ours = [], [], []
acc_ours, lke_ours = [], []

# ... and for the single-theta (Ahn's) solver.
mae_rr_Ahns, mae_rp_Ahns, mae_pp_Ahns = [], [], []
acc_Ahns, lke_Ahns = [], []

# Method order used for every `maes` row; entries from index 2 onwards are
# also used as config-file stems for the external baseline runs.
algos = ('ours', 'Ahns', 'itemaverage', 'useraverage', 'userknn', 'itemknn', 'biasedmf', 'soreg', 'trustsvd')

# maes[a] collects one averaged MAE per observation rate for algos[a].
maes = [[] for _ in algos]

In [None]:
from datetime import datetime

encoding = 'utf-8'

# Sweep the observation rate. For each rate, every metric is averaged over
# T independently generated data sets.
for t in range(8):
    print(t)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)
    p_obs = 0.005+0.005*t

    # Running sums over the T trials of this observation rate.
    mae_rr_ours_tmp = 0
    mae_rp_ours_tmp = 0
    mae_pp_ours_tmp = 0
    acc_ours_tmp = 0
    lke_ours_tmp = 0

    mae_rr_Ahns_tmp = 0
    mae_rp_Ahns_tmp = 0
    mae_pp_Ahns_tmp = 0
    acc_Ahns_tmp = 0
    lke_Ahns_tmp = 0

    maes_tmp = [0 for i in range(len(algos))]

    for l in range(T):

        dg = Data_Generator(p_obs, p_gt, k, m)
        dg.set_U(U)
        Adj_matrix, Adj_list, cluster_id, n = dg.generate_graph()
        M = dg.generate_rating_data()

        # All observed ratings, keyed by (user, item), in row-major order.
        M_data = {(i, j): M[i, j] for i, j in zip(*np.nonzero(M))}
        M_train, M_test = split_M_into_train_test(M)
        # The baselines receive the full rating file; any train/test split
        # for them happens inside the external tool.
        Export.data(M_data, Adj_list)

        # Both solvers are fitted on M_train. Fitting on M would include the
        # very entries that M_test is scored on.

        #ours
        solver = CVR(M_train, Adj_matrix, Adj_list, n, m, k, p_gt)
        B_est, p_hat, stage1_clustering_results, stage3_clustering_results, R_est = solver.spectral_clustering_and_vote(local_refinement_flag = True)
        mae_rr, mae_rp, mae_pp, acc, lke = Grader.measure_accruacy(M_test, stage3_clustering_results, R_est)

        mae_rr_ours_tmp += mae_rr
        mae_rp_ours_tmp += mae_rp
        mae_pp_ours_tmp += mae_pp
        acc_ours_tmp += acc
        lke_ours_tmp += lke

        #Ahns
        solver = CVR_Ahns(M_train, Adj_matrix, Adj_list, n, m, k, p_gt)
        B_est, theta_hat, stage1_clustering_results, stage3_clustering_results, R_est = solver.spectral_clustering_and_vote(local_refinement_flag = True)
        mae_rr, mae_rp, mae_pp, acc, lke = Grader.measure_accruacy(M_test, stage3_clustering_results, R_est)

        mae_rr_Ahns_tmp += mae_rr
        mae_rp_Ahns_tmp += mae_rp
        mae_pp_Ahns_tmp += mae_pp
        acc_Ahns_tmp += acc
        lke_Ahns_tmp += lke

        #baseline algorithms
        for i in range(len(algos)-2):
            process = subprocess.Popen('cd librec-plot2/ && bin/librec rec -exec -conf %s.md' %algos[i+2], shell=True,
                                                   stdout=subprocess.PIPE,
                                                   stderr=subprocess.PIPE)
            # wait for the process to terminate
            out, err = process.communicate()
            errcode = process.returncode
            if errcode != 0:
                # Otherwise a failed run silently contributes an MAE of 0.
                print("WARNING: librec run for %s exited with code %d" % (algos[i+2], errcode))

            # The MAE is parsed from the tool's stderr log lines.
            for line in str(err, encoding).split('\n'):
                if "MAE" in line:
                    maes_tmp[i+2] += (float(line.split()[-1]))

    obs_rate.append(p_obs)
    mae_rr_ours.append(mae_rr_ours_tmp/T)
    mae_rp_ours.append(mae_rp_ours_tmp/T)
    mae_pp_ours.append(mae_pp_ours_tmp/T)
    acc_ours.append(acc_ours_tmp/T)
    lke_ours.append(lke_ours_tmp/T)

    mae_rr_Ahns.append(mae_rr_Ahns_tmp/T)
    mae_rp_Ahns.append(mae_rp_Ahns_tmp/T)
    mae_pp_Ahns.append(mae_pp_Ahns_tmp/T)
    acc_Ahns.append(acc_Ahns_tmp/T)
    lke_Ahns.append(lke_Ahns_tmp/T)

    for i in range(len(algos)-2):
        maes[i+2].append(maes_tmp[i+2]/T)

print(obs_rate)
print("mae_rr_ours", mae_rr_ours, "mae_rp_ours", mae_rp_ours, "mae_pp_ours", mae_pp_ours, "acc_ours", acc_ours, "lke_ours", lke_ours)
print("mae_rr_Ahns", mae_rr_Ahns, "mae_rp_Ahns", mae_rp_Ahns, "mae_pp_Ahns", mae_pp_Ahns, "acc_Ahns", acc_Ahns, "lke_Ahns", lke_Ahns)
print(algos)
print(maes)

In [None]:
# Rows 0 and 1 of `maes` belong to the two in-notebook solvers; the MAE
# reported for them is the rating-vs-probability variant (mae_rp).
maes[0], maes[1] = mae_rp_ours, mae_rp_Ahns

In [None]:
import scipy.io as io

# Persist the MAE table so the plot can be rebuilt without re-running the sweep.
io.savemat('Facebook_data', mdict={'maes': maes})

In [None]:
import scipy.io as io


# Reload the saved MAE table and switch to display names for the legend.
loaded = io.loadmat('Facebook_data')
maes = loaded['maes']
algos = (
    'Ours', "Ahn's", 'Item Avg', 'User Avg', 'User k-NN',
    'Item k-NN', 'Biased MF', 'SoReg', 'Trust SVD',
)
# NOTE(review): these hard-coded rates differ from the values the sweep loop
# generates (p_obs = 0.005 + 0.005*t); confirm which run the saved file
# actually corresponds to.
obs_rate = [0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09]

In [None]:
# Long-format rows [observation rate, method name, MAE], grouped by rate and
# ordered by method within each rate.
result = [
    [rate, method, maes[method_idx][rate_idx]]
    for rate_idx, rate in enumerate(obs_rate)
    for method_idx, method in enumerate(algos)
]

In [None]:
import seaborn as sns
import matplotlib.pylab as plt
import matplotlib as mpl
import pandas as pd

# Global matplotlib styling: serif font, LaTeX text rendering, small labels.
plt.rc('font', family='serif', serif='Times')
# usetex=True makes matplotlib render text through LaTeX.
plt.rc('text', usetex=True)
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=11)
plt.rc('axes', labelsize=11)
plt.rc('axes', linewidth=0.5)

# Figure size in inches; height follows from the 1.618 width/height ratio.
width = 3.27
height = width / 1.618
aspect = 1.618


mpl.rc("figure", figsize = (width, height))
# `result` holds [p, method, MAE] rows built in the previous cell.
df = pd.DataFrame(result, columns=['p', 'Method', 'MAE'])
# One point-plot line per method; palette and markers are listed in the same order as `algos`.
g = sns.catplot(x='p', y = 'MAE', hue = 'Method', legend = False, palette = ['firebrick','darkgray','b', 'y', 'r', 'm', 'g', 'c', 'k'], data= df, scale = 0.3, height = height, aspect = aspect, kind = 'point', markers = ['o','s','^','<','>','v','x','d','D'] )
g.fig.subplots_adjust(left=.18, bottom=.23, right=.99, top=.97)
#fig.set_size_inches(width, height)
#color = ['firebrick','firebrick','firebrick','firebrick','firebrick','firebrick','firebrick','firebrick','firebrick']
plt.xlabel(r'$p$')
plt.ylabel('MAE')
plt.legend(loc='upper right', fontsize=7, ncol = 3, columnspacing = -0.3, bbox_to_anchor=(1.0, 0.9))
# NOTE(review): dotted red reference level; the origin of this constant is not
# shown in this notebook - name it or document where it comes from.
plt.axhline(y=0.23621502748930973, color='r', linestyle=':', linewidth=1)
#plt.locator_params(axis='x', nbins=8)
plt.locator_params(axis='y', nbins=8)
#from matplotlib.ticker import AutoMinorLocator, FormatStrFormatter
#ax.xaxis.set_minor_locator(AutoMinorLocator())
#ax.yaxis.set_minor_locator(AutoMinorLocator())
#ax.xaxis.set_minor_formatter(FormatStrFormatter("%.3f"))
#plt.ticklabel_format(style='plain', axis='x')
# Black lines at y = 1 and x = 7.49; presumably these draw the top and right
# frame edges of the plot (y = 1 is the upper y-limit set below) - TODO confirm.
plt.axhline(y = 1, color = 'k', linewidth=0.9)
plt.axvline(x = 7.49, ymin = 0, ymax = 1, color = 'k', linewidth=0.7)
plt.ylim(0.2, 1)
# Save before show() so the written file contains the rendered figure.
plt.savefig('Figure_1_b.eps', format='eps', dpi=1000)
plt.show()