In [None]:
# Number of independent trials that are run and averaged for every observation
# rate p_obs. Use T = 100 to reproduce the paper's results (takes roughly 4-5 hours).
T = 2

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
from numpy import linalg as LA
from sklearn.cluster import KMeans
import subprocess
import time
import math
import pandas as pd
import pdb

class Data_Generator:
    """Synthetic data source for the experiments.

    Produces (a) a partially observed +/-1 rating matrix whose rows are drawn
    from per-cluster Bernoulli parameters ``U`` and (b) a random graph in which
    users of the same cluster are linked with probability ``alpha`` and users of
    different clusters with probability ``beta``.

    The ``n`` users are split into ``k`` contiguous, equally sized clusters
    (``n`` must be divisible by ``k``).
    """

    def __init__(self, alpha, beta, p_obs, p_list, k, n, m):
        # Keep every constructor argument on the instance.
        self.alpha, self.beta = alpha, beta
        self.p_obs, self.p_list = p_obs, p_list
        self.k, self.n, self.m = k, n, m

        # One row of rating probabilities per cluster; all zeros until set_U().
        self.U = np.zeros((k, m))
        self.U_set = False

        # Users 0..n-1 are assigned to clusters in contiguous runs: 0 0 ... 1 1 ...
        self.n_per_cluster = int(n/k)
        self.cluster_id = np.arange(n)//self.n_per_cluster

        # Placeholders; none of the methods below fill them in.
        self.M_ground_truth = {}
        self.M_train = None
        self.Adj_matrix = None
        self.Adj_list = None

        # The clusters must all have the same size.
        assert( self.n % self.k == 0 )

    def set_U(self, U):
        """Install the (k, m) matrix of per-cluster rating probabilities."""
        self.U = U
        self.U_set = True

    def generate_rating_data(self):
        """Draw an (n, m) rating matrix with entries in {-1, 0, +1}.

        Entry (i, j) is +1 when a uniform draw is <= U[cluster(i), j] and -1
        otherwise; it is then kept when a second uniform draw is <= p_obs and
        replaced by 0 (unobserved) otherwise.

        Returns None (after printing a message) when set_U() was never called.
        """
        if not self.U_set:
            print("U is not set yet")
            return None

        shape = (self.n, self.m)
        # First batch of draws decides the sign of every rating.
        sign_draws = np.random.random(shape)
        rows_per_cluster = np.full(self.k, self.n_per_cluster, dtype=int)
        per_user_prob = np.repeat(self.U, rows_per_cluster, axis=0)
        X_full_obs = -1 + 2*(sign_draws <= per_user_prob).astype(float)

        # Second batch of draws decides which ratings are revealed.
        revealed = (np.random.random(shape) <= self.p_obs).astype(float)
        return X_full_obs * revealed

    def generate_graph(self):
        """Sample the undirected user graph.

        Returns:
            (Adj_matrix, Adj_list): a symmetric (n, n) 0/1 float matrix with a
            zero diagonal, and a dict mapping every node to the list of its
            neighbours.
        """
        n = self.n
        labels = self.cluster_id
        Adj_matrix = np.zeros((n, n))
        Adj_list = {node: [] for node in range(n)}

        # Exactly one uniform draw per unordered pair (u, v), u < v.
        for u in range(n):
            for v in range(u + 1, n):
                edge_prob = self.alpha if labels[u] == labels[v] else self.beta
                if np.random.rand() <= edge_prob:
                    Adj_matrix[u, v] = Adj_matrix[v, u] = 1
                    Adj_list[u].append(v)
                    Adj_list[v].append(u)

        return Adj_matrix, Adj_list

In [None]:
class CVR:
    """Three-stage estimator of user clusters and per-cluster rating probabilities.

    Stage 1 clusters the users by spectral clustering of the (degree-truncated)
    graph, stage 2 estimates each cluster's rating of every item by majority
    vote, and the optional stage 3 re-assigns users to the cluster that
    maximises a likelihood combining graph edges and observed ratings.

    ``M_obs`` is an (n, m) array with ratings -1 / +1 and 0 for "not observed".
    ``Adj_matrix`` is the (n, n) adjacency matrix of the user graph.
    """

    # Upper bound on the number of refinement sweeps performed in stage 3.
    MAX_N_OF_REFINEMENT_STEPS = 10

    def __init__(self, M_obs, Adj_matrix, Adj_list, n, m, k, p_gt):
        """Store the observations and problem sizes.

        Args:
            M_obs: (n, m) rating array; 0 marks an unobserved entry.
            Adj_matrix: (n, n) adjacency matrix (never modified by this class).
            Adj_list: adjacency list; stored but not used by the solver.
            n: number of users.
            m: number of items.
            k: number of user clusters.
            p_gt: array of ground-truth probabilities; only its size is kept.
        """
        self.M_obs = M_obs
        self.M_obs_locations = np.where(M_obs != 0)
        self.Adj_matrix = Adj_matrix
        self.Adj_list = Adj_list
        self.n = n
        self.m = m
        self.k = k #number of clusters of users
        self.d = p_gt.size

    def spectral_clustering_and_vote(self, truncation_threshold = 6, local_refinement_flag = False):
        """Run the estimator.

        Args:
            truncation_threshold: nodes whose degree exceeds this multiple of the
                average degree are disconnected before the spectral step.
            local_refinement_flag: when True, run the stage-3 refinement.

        Returns:
            (B_est, stage1_clustering_results, stage3_clustering_results, R_est):
            B_est is the (k, m) array of +/-1 majority votes, the two clustering
            results are length-n label arrays (identical when refinement is
            off), and R_est is the (k, m) array of estimated probabilities of a
            +1 rating. Row indices of B_est / R_est are the estimated cluster
            labels, which in general do not match ground-truth cluster indices.
        """
        M_obs = self.M_obs
        obs_rows, obs_cols = self.M_obs_locations
        n, m, k = self.n, self.m, self.k

        # The truncation below must not leak into the caller's matrix, so it is
        # done on a private copy; the untouched graph is used in stage 3.
        Adj_original = np.asarray(self.Adj_matrix)
        Adj = np.copy(Adj_original)

        # Stage 1. Spectral clustering
        # Caution: This may be slow for very large n
        deg_th = truncation_threshold * np.sum(Adj)/n
        heavy_rows = np.where(np.sum(Adj, 1) > deg_th)[0]
        Adj[heavy_rows, :] = 0
        Adj[:, heavy_rows] = 0
        _, vv = sp.linalg.eigs(Adj, k = k)
        kmeans = KMeans(n_clusters=k, random_state=0).fit(np.real(vv))
        k_mean_results = kmeans.labels_
        stage1_clustering_results = np.copy(k_mean_results)

        # Stage 2. Majority voting
        obs_values = M_obs[obs_rows, obs_cols]
        obs_clusters = k_mean_results[obs_rows]
        is_neg = obs_values == -1
        is_pos = obs_values == 1

        # Count the -1 and +1 votes that every cluster casts on every item.
        neg_votes = np.zeros((k, m))
        pos_votes = np.zeros((k, m))
        np.add.at(neg_votes, (obs_clusters[is_neg], obs_cols[is_neg]), 1)
        np.add.at(pos_votes, (obs_clusters[is_pos], obs_cols[is_pos]), 1)
        B_est = np.where(pos_votes >= neg_votes, 1.0, -1.0)  # ties go to +1

        # theta_hat: fraction of observed ratings that disagree with the vote.
        n_ct = obs_values.size
        diff_ct = np.count_nonzero(obs_values != B_est[obs_clusters, obs_cols])
        theta_hat = diff_ct/n_ct if n_ct else 0.0  # no observations -> nothing disagrees

        # Estimated probability of a +1 rating for every (cluster, item).
        R_est = np.where(B_est == 1, 1 - theta_hat, theta_hat)

        # Stage 3. Local refinement
        stage3_clustering_results = np.copy(k_mean_results)

        if local_refinement_flag:
            # Edge-probability estimates from the stage-1 clustering. The
            # denominators are the numbers of within-/cross-cluster pairs when
            # all k clusters have n/k members.
            upper = np.triu(Adj_original, 1)
            same_cluster = k_mean_results[:, None] == k_mean_results[None, :]
            number_of_edges_same_cluster = upper[same_cluster].sum()
            number_of_edges_diff_cluster = upper[~same_cluster].sum()
            cluster_size = n/k
            alpha_hat = number_of_edges_same_cluster/(n*(cluster_size - 1)/2)
            beta_hat = number_of_edges_diff_cluster/(n*(n - cluster_size)/2)
            log_edge_odds = np.log((alpha_hat*(1-beta_hat))/(beta_hat*(1-alpha_hat)))

            # edges_per_cluster[i, c] = number of neighbours of i currently in c.
            is_edge = np.triu(Adj_original == 1, 1)
            neighbours = is_edge | is_edge.T
            edges_per_cluster = np.zeros((n, k))
            for c in range(k):
                edges_per_cluster[:, c] = neighbours[:, stage3_clustering_results == c].sum(axis=1)

            # Rating log-likelihood of every user under every cluster's R_est
            # row. It depends only on R_est, so it is computed once. A rating of
            # -1 contributes log(1 - R_est), any other observed rating log(R_est).
            weighted_sum_of_correct_ratings_per_cluster = np.zeros((n, k))
            weighted_sum_of_incorrect_ratings_per_cluster = np.zeros((n, k))
            not_neg = ~is_neg
            np.add.at(weighted_sum_of_incorrect_ratings_per_cluster,
                      obs_rows[is_neg],
                      np.log(1 - R_est[:, obs_cols[is_neg]]).T)
            np.add.at(weighted_sum_of_correct_ratings_per_cluster,
                      obs_rows[not_neg],
                      np.log(R_est[:, obs_cols[not_neg]]).T)

            list_of_changes = []
            for _ in range(CVR.MAX_N_OF_REFINEMENT_STEPS):
                # Bring the neighbour counts up to date with last sweep's moves.
                for node, cluster_old, cluster_new in list_of_changes:
                    affected = neighbours[:, node]
                    edges_per_cluster[affected, cluster_old] -= 1
                    edges_per_cluster[affected, cluster_new] += 1

                # All users are re-assigned simultaneously from the same counts.
                likelihood = log_edge_odds * edges_per_cluster + \
                             weighted_sum_of_correct_ratings_per_cluster + \
                             weighted_sum_of_incorrect_ratings_per_cluster
                best_cluster = np.argmax(likelihood, axis=1)
                moved = np.flatnonzero(best_cluster != stage3_clustering_results)
                if moved.size == 0: # nothing happened
                    break

                list_of_changes = [(i, stage3_clustering_results[i], best_cluster[i]) for i in moved]
                stage3_clustering_results[moved] = best_cluster[moved]

        return B_est, stage1_clustering_results, stage3_clustering_results, R_est
    
    

In [None]:
## parameters
alpha = 0.7  # edge probability between two users of the same cluster
beta = 0.3   # edge probability between two users of different clusters
k = 2        # number of user clusters
n = 2000     # number of users
m = 1000     # number of items
p_gt = np.array([0.3, 0.7])  # ground-truth probabilities of a +1 rating
d = p_gt.size

# Result accumulators: one entry per trial, and one entry per observation rate.
obs_rate = []
max_error = []
L1_error = []
obs_rate_mean = []
max_error_mean = []
L1_error_mean = []

# Ground-truth rating probabilities, one row per cluster. The two clusters
# agree on the first three quarters of the items and differ on the last
# quarter. The widths are derived from m so that U follows any change of m.
n_shared_items = m - m // 4
U = np.array([[p_gt[0]] * m,
              [p_gt[0]] * n_shared_items + [p_gt[1]] * (m - n_shared_items)])
assert U.shape == (k, m), "U must have one row per cluster and one column per item"


## compute the optimal p
# M_max is the largest value of sqrt(p q) + sqrt((1-p)(1-q)) over consecutive
# entries (p, q) of p_gt.
M_max = 0
for p_1, p_2 in zip(p_gt[:-1], p_gt[1:]):
    affinity = np.sqrt(p_1 * p_2) + np.sqrt((1-p_1)*(1-p_2))
    if affinity > M_max:
        M_max = affinity
gamma = np.sum(U[0,:] != U[1,:])/m  # fraction of items on which the two clusters differ
I_s = -2*np.log(np.sqrt(alpha*beta) + np.sqrt((1-alpha)*(1-beta)))
p_opt = max((np.log(n) - n*I_s/2)/(gamma*n), 2*np.log(m)/n)/(1-M_max)
p_opt_wo_graph = max(np.log(n)/(gamma*n), 2*np.log(m)/n)/(1-M_max)
print((np.log(n))/(gamma*n))
print((np.log(n) - n*I_s/2)/(gamma*n))
print(2*np.log(m)/n)
print("I_s", I_s)
print("gamma", gamma)
print("p_opt", p_opt)
print("p_opt_wo_graph", p_opt_wo_graph)
## 

In [None]:
from datetime import datetime

# Start from empty result lists so that re-running this cell replaces the
# results instead of appending a second copy to lists left over from a
# previous run.
obs_rate, max_error, L1_error = [], [], []
obs_rate_mean, max_error_mean, L1_error_mean = [], [], []

# Fix the random stream so that the synthetic data, and therefore the reported
# errors, are the same on every run of the cell.
np.random.seed(0)

for l in range(5):
    p_obs = 0.04+0.02*l
    mean_max_err = 0
    mean_L1_err = 0
    print(l)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

    for j in range(T):
        # Fresh synthetic graph and ratings for every trial.
        dg = Data_Generator(alpha, beta, p_obs, p_gt, k, n, m)
        dg.set_U(U)
        Adj_matrix, Adj_list = dg.generate_graph()
        solver = CVR(dg.generate_rating_data(), Adj_matrix, Adj_list, n, m, k, p_gt)
        B_est, stage1_clustering_results, stage3_clustering_results, R_est = solver.spectral_clustering_and_vote(local_refinement_flag = True)

        # Compare, user by user, the estimated probability row of the user's
        # estimated cluster with the true probability row of the user's true
        # cluster. The true labels come from the generator, so this works for
        # any k.
        ground_truth_cluster = dg.cluster_id
        abs_err = np.abs(R_est[stage3_clustering_results] - U[ground_truth_cluster])
        total_err = abs_err.sum()
        max_err = abs_err.max()
        normalized_L1_norm = total_err/n/m

        obs_rate.append(p_obs)
        max_error.append(max_err)
        L1_error.append(normalized_L1_norm)

        mean_max_err += max_err
        mean_L1_err += normalized_L1_norm

    # Average of the per-trial errors over the T trials at this p_obs.
    mean_max_err = mean_max_err/T
    mean_L1_err = mean_L1_err/T

    obs_rate_mean.append(p_obs)
    max_error_mean.append(mean_max_err)
    L1_error_mean.append(mean_L1_err)

print("obs_rate", obs_rate)
print("max_error", max_error)
print("L1_error", L1_error)
print("obs_rate_mean", obs_rate_mean)
print("max_error_mean", max_error_mean)
print("L1_error_mean", L1_error_mean)

In [None]:
import scipy.io as io

# Persist the per-trial results and the per-observation-rate means to a
# MATLAB file named after the base name 'data_4_a_prev'.
results_to_save = {
    'obs': obs_rate,
    'max': max_error,
    'L1': L1_error,
    'obs_mean': obs_rate_mean,
    'max_mean': max_error_mean,
    'L1_mean': L1_error_mean,
}
io.savemat('data_4_a_prev', results_to_save)

In [None]:
# pyplot (not the discouraged pylab namespace) keeps `plt` bound to the same
# module that the first cell imported.
import matplotlib.pyplot as plt
import pandas as pd


def _scatter_error(x, y, label, color=None):
    """Show one scatter plot of an error measure against the observation rate."""
    plt.scatter(x, y, color=color, label=label)
    plt.legend(loc="best")
    plt.xlabel('Observation Rate')
    plt.ylabel('Error Rate')
    plt.show()


# One row per observation rate: a = p_obs, b = mean max error, c = mean L1 error.
dset = pd.DataFrame({'a':obs_rate_mean, 'b':max_error_mean, 'c':L1_error_mean})

_scatter_error(dset['a'], dset['b'], "(max_err)", color='red')
_scatter_error(dset['a'], dset['c'], "(L1_err)")