In [1]:
import numpy as np

from numba import jit


In [2]:
def load_graph_data(file_path):
    with open(file_path , 'r') as f:
        data = f.read()

    data = [  i.split(',') for i in  data.split('\n')]

    print(f"there's {len(data)} datas")
    return data

In [3]:
def load_IBmdata(file_path):
    
    with open(file_path , 'r') as f:
        data = f.read()

    data = [  i.split()[-2:] for i in  data.split('\n')]

    print(f"there's {len(data)} datas")

    return data

In [10]:
class Graph():
    def __init__(self , graph_data:list ) -> None:

        # init auth and hub matrix , index
        self.init_graph(graph_data)

    def init_graph(self , graph_data:list)->None:

        self.uni_nodes = set( [int(item) for sublist in graph_data for item in sublist])
        self.uni_nodes = sorted(list(self.uni_nodes) , key=lambda x:int(x) )
        self.n = len(self.uni_nodes)

        # create idx in matrix for every node
        self.node_to_idx = {  int(n):int(idx) for idx,n in  enumerate(self.uni_nodes)  }
        self.idx_to_node = {  idx:n for n,idx in self.node_to_idx.items() }

        self.hub_matrix = np.zeros([self.n,self.n]) 

        for a_node , b_node in graph_data:
            a_idx = self.node_to_idx[int(a_node)]
            b_idx = self.node_to_idx[int(b_node)]
            self.hub_matrix[a_idx][b_idx] = 1 

        self.auth_matrix = np.transpose(self.hub_matrix)
        
        self.out_neighbors = { i:np.nonzero(arr)[0] for i,arr in enumerate(self.hub_matrix)  }
        self.in_neighbors = { i:np.nonzero(arr)[0] for i,arr in enumerate(self.auth_matrix)  }
    
    def hits_algorithm(self , iter = 100):
        
        hub_score = np.ones(self.n)
        auth_score = np.ones(self.n)

        for _ in range(iter):

            # new hub and auth score
            new_auth_score =  np.zeros( self.n )
            new_hub_score =  np.zeros( self.n )

            # auth update and hub update
            for i in range(self.n):
                new_auth_score[i]  = np.sum( hub_score[ self.in_neighbors[i] ] )
                new_hub_score[i] = np.sum( auth_score[ self.out_neighbors[i] ] )
        
        # norm and update
        auth_score =  new_auth_score / new_auth_score.sum()
        hub_score = new_hub_score / new_hub_score.sum()

        return hub_score , auth_score
    
    def pagerank_algorithm(self,iter=100 , d = 0.5):
        
        page_rank = np.ones( self.n ) / self.n

        for _ in range(iter):
            new_page_rank = np.zeros_like(page_rank)
            for i in range(self.n):
                # for every node points to me , update page_rank score as sum of (old_page[ni]) / (ni out links)
                for n in self.in_neighbors[i]:
                    new_page_rank[i] +=  d/self.n + (1-d) * (page_rank[n] / len(self.out_neighbors[n]) )
        # norm
        page_rank /= page_rank.sum()     
        
        return page_rank

    def simrank_algorithm(self,iter= 100,C = 0.5):
        
        simrank_matrix = np.eye( self.n )

        for _ in range(iter):
            # create new simrank matrix
            new_simrank_matrix = np.zeros_like(simrank_matrix)
            # update every node of a:b
            for node_a in range(self.n):
                for node_b in range(self.n):
                    if node_a == node_b:
                        continue
                    new_simrank_matrix[node_a][node_b] = self.get_simrank_score( node_a,node_b,simrank_matrix,C=C )

            simrank_matrix = new_simrank_matrix
        return simrank_matrix
        
    def get_simrank_score( self , node_a , node_b , old_simrank_matrix , C=0.5):

        if len(self.in_neighbors[node_a]) == 0 or len(self.in_neighbors[node_b]) == 0:
            return 0.0

        simrank_sum = 0
        for a in self.in_neighbors[node_a]:
            for b in self.in_neighbors[node_b]:
                simrank_sum += old_simrank_matrix[a][b]

        scale = C/(len( self.in_neighbors[node_a])*len(self.in_neighbors[node_b]) )
        simrank_sum *= scale
        
        return simrank_sum

In [11]:
#graph_data = load_graph_data("/home/q56104076/projects/NCKU-work/dataMining/hw3/dataset/hw3dataset/graph_2.txt")

graph_data = load_IBmdata("./dataset/ibm-5000.txt")

there's 4798 datas


In [12]:
#graph_data

In [13]:
g = Graph(graph_data)

In [14]:
g.simrank_algorithm(50,0.9)

array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        9.08776058e-134, 9.07302289e-134, 0.00000000e+000],
       ...,
       [0.00000000e+000, 0.00000000e+000, 9.08776058e-134, ...,
        0.00000000e+000, 7.18533284e-134, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 9.07302289e-134, ...,
        7.18533284e-134, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])

In [15]:
g.hits_algorithm(50)

(array([0.2, 0.2, 0.2, 0.2, 0.2]), array([0.2, 0.2, 0.2, 0.2, 0.2]))