In [1]:
import numpy as np
import glob

# 1. load dataset

In [2]:
def load_graph_data(file_path:str):
    """Read a comma-separated edge list from a text file.

    Each non-blank line is split on ',' and returned as a list of string
    tokens with surrounding whitespace removed (for example the line
    "1,2" becomes ['1', '2']).

    Blank lines are skipped, so an empty file yields an empty list rather
    than a row holding a single empty string.

    Args:
        file_path: path of the text file to read.

    Returns:
        A list with one list of string tokens per non-blank line.
    """
    with open(file_path , 'r') as f:
        # iterate line by line; a whitespace-only line carries no edge
        return [
            [token.strip() for token in line.strip().split(',')]
            for line in f
            if line.strip()
        ]

In [3]:
def load_IBmdata(file_path:str):
    """Read a whitespace-separated file, keeping the last two fields per line.

    Each non-blank line is split on whitespace and only its final two
    tokens are kept (for example "1 1 2" becomes ['1', '2']).

    Blank lines are skipped, so they no longer produce empty rows, and an
    empty file yields an empty list.

    Args:
        file_path: path of the text file to read.

    Returns:
        A list with one list of (at most two) string tokens per non-blank line.
    """
    with open(file_path , 'r') as f:
        rows = []
        for line in f:
            fields = line.split()
            if not fields:
                # whitespace-only line: nothing to record
                continue
            rows.append(fields[-2:])
    return rows

# 2. graph class

In [4]:
class Graph():
    """Directed graph held as a dense adjacency matrix, with link-analysis scores.

    The graph is built from a list of (source, target) pairs whose items are
    convertible to int. Node labels are mapped to consecutive matrix indices
    in ascending label order.

    Attributes set by init_graph:
        uni_nodes:     sorted list of the distinct integer node labels.
        n:             number of distinct nodes.
        node_to_idx:   node label -> matrix index.
        idx_to_node:   matrix index -> node label.
        hub_matrix:    n x n array, hub_matrix[a, b] == 1 when a -> b.
        auth_matrix:   transpose of hub_matrix (auth_matrix[b, a] == 1 when a -> b).
        out_neighbors: index -> array of indices the node points to.
        in_neighbors:  index -> array of indices pointing to the node.
    """

    def __init__(self , graph_data:list ) -> None:

        # init auth and hub matrix , index
        self.init_graph(graph_data)

    def init_graph(self , graph_data:list)->None:
        """Build the index maps, adjacency matrices and neighbour lists.

        Args:
            graph_data: iterable of two-item rows (source, target); every
                item must be convertible with int().

        Raises:
            ValueError: if a row does not hold exactly two items or an item
                is not an integer literal.
        """
        edges = [ (int(a_node), int(b_node)) for a_node , b_node in graph_data ]

        self.uni_nodes = sorted( { node for edge in edges for node in edge } )
        self.n = len(self.uni_nodes)

        # create idx in matrix for every node
        self.node_to_idx = { node:idx for idx,node in enumerate(self.uni_nodes) }
        self.idx_to_node = { idx:node for node,idx in self.node_to_idx.items() }

        self.hub_matrix = np.zeros([self.n,self.n])

        # if a->b,record as 1 (duplicate edges collapse to a single link)
        for a_node , b_node in edges:
            self.hub_matrix[ self.node_to_idx[a_node] , self.node_to_idx[b_node] ] = 1

        self.auth_matrix = np.transpose(self.hub_matrix)

        self.out_neighbors = { i:np.nonzero(arr)[0] for i,arr in enumerate(self.hub_matrix)  }
        self.in_neighbors = { i:np.nonzero(arr)[0] for i,arr in enumerate(self.auth_matrix)  }

    @staticmethod
    def _normalize(scores):
        """Scale scores to sum to 1; return them unchanged when the sum is 0."""
        total = scores.sum()
        return scores / total if total != 0 else scores

    def hits_algorithm(self , iter):
        """Run HITS for a fixed number of iterations.

        Both score vectors start at all ones. In every iteration the new
        authority of a node is the sum of the previous hub scores of its
        in-neighbours, and the new hub score is the sum of the previous
        authority scores of its out-neighbours; each vector is then divided
        by its own sum.

        Args:
            iter: number of iterations to perform.

        Returns:
            Tuple (hub_score, auth_score) of length-n arrays.
        """
        hub_score = np.ones(self.n)
        auth_score = np.ones(self.n)

        for _ in range(iter):
            # row i of auth_matrix marks the in-neighbours of i, row i of
            # hub_matrix its out-neighbours, so one product updates all nodes
            new_auth_score = self.auth_matrix @ hub_score
            new_hub_score = self.hub_matrix @ auth_score
            # norm and update
            auth_score = self._normalize(new_auth_score)
            hub_score = self._normalize(new_hub_score)

        return hub_score , auth_score

    def pagerank_algorithm(self,iter , d):
        """Run PageRank for a fixed number of iterations.

        Scores start at all ones. Each iteration computes, for every node,
        the sum over its in-neighbours of (their score / their out-degree),
        then sets score = (1 - d) * that_sum + d / n. The final vector is
        divided by its sum.

        Args:
            iter: number of iterations to perform.
            d: weight of the uniform d / n term; (1 - d) weights the link term.

        Returns:
            Length-n array of scores; an empty array for an empty graph.
        """
        if self.n == 0:
            # d / n below would divide by zero; there is nothing to rank
            return np.ones(0)

        page_rank = np.ones( self.n )

        # out-degree is loop invariant; nodes without out-links never appear
        # as an in-neighbour, so the divisor chosen for them is irrelevant
        out_degree = self.hub_matrix.sum(axis=1)
        safe_out_degree = np.where(out_degree > 0 , out_degree , 1.0)

        for _ in range(iter):
            incoming = self.auth_matrix @ (page_rank / safe_out_degree)
            page_rank = (1-d) * incoming + d/self.n

        # norm
        return self._normalize(page_rank)

    def simrank_algorithm(self,iter,C):
        """Run SimRank for a fixed number of iterations.

        The matrix starts as the identity. In every iteration, each pair of
        distinct nodes that both have in-neighbours gets
        C * mean(previous scores over all pairs of their in-neighbours);
        the diagonal stays 1 and pairs involving a node without
        in-neighbours stay 0.

        Args:
            iter: number of iterations to perform.
            C: decay factor multiplying the averaged neighbour similarity.

        Returns:
            n x n array of similarity scores.
        """
        simrank_matrix = np.eye( self.n )
        if self.n == 0:
            return simrank_matrix

        # Row a of avg_in holds 1/|in(a)| at each in-neighbour of a (all zero
        # when a has none), so (avg_in @ S @ avg_in.T)[a, b] is the mean of S
        # over in(a) x in(b) -- the same quantity get_simrank_score averages.
        in_degree = self.auth_matrix.sum(axis=1)
        safe_in_degree = np.where(in_degree > 0 , in_degree , 1.0)
        avg_in = self.auth_matrix / safe_in_degree[:, None]

        for _ in range(iter):
            simrank_matrix = C * (avg_in @ simrank_matrix @ avg_in.T)
            # a node is always fully similar to itself
            np.fill_diagonal(simrank_matrix , 1.0)

        return simrank_matrix

    def get_simrank_score( self,node_a,node_b,old_simrank_matrix , C)->float:
        """Return C times the mean of old scores over in(node_a) x in(node_b).

        Args:
            node_a: matrix index of the first node.
            node_b: matrix index of the second node.
            old_simrank_matrix: n x n array of scores from the previous iteration.
            C: decay factor.

        Returns:
            The averaged, decayed score; 0.0 when either node has no
            in-neighbours (instead of dividing by zero).
        """
        in_a = self.in_neighbors[node_a]
        in_b = self.in_neighbors[node_b]
        if len(in_a) == 0 or len(in_b) == 0:
            return 0.0

        # np.ix_ selects the sub-matrix of every (a, b) in-neighbour pair
        pair_total = old_simrank_matrix[np.ix_(in_a , in_b)].sum()
        return float( (pair_total * C) / (len(in_a) * len(in_b)) )

# 3. Page rank ,Hits ,SimRank
+ For every graph and IBM dataset, write the results to 4 files and print the authority and hub scores to stdout
+ damping factor d = 0.15
+ decay factor C = 0.9
+ iter = 100

In [5]:
import os

In [6]:
# Run this cell to clear previously generated result files before recreating them.
# Pure-Python equivalent of `rm -rf ./result/*` followed by creating the folder
# when it is missing, so it also works where no POSIX shell is available.
import shutil

os.makedirs("./result", exist_ok=True)
for stale_entry in glob.glob("./result/*"):
    if os.path.isdir(stale_entry) and not os.path.islink(stale_entry):
        # real sub-directory: remove it together with its contents
        shutil.rmtree(stale_entry)
    else:
        # regular file or symlink
        os.remove(stale_entry)

In [7]:
# Run configuration.
# data_path: every .txt file directly under ./dataset, in sorted order.
data_path = sorted(glob.glob("./dataset/*.txt"))
# ITER: iteration count passed to all three algorithms.
# DECAY_FACTOR_C: the C argument of SimRank.
# DAMPING_FACTOR_D: the d argument of PageRank.
ITER, DECAY_FACTOR_C, DAMPING_FACTOR_D = 100, 0.9, 0.15

In [8]:
# For every dataset: compute HITS, SimRank and PageRank, print the HITS
# scores and save all four result arrays under ./result.
for d_path in data_path:

    file_name = os.path.basename(d_path)

    # load data from file,two format
    # (decide on the file name only, so a directory called e.g. "ibm_runs"
    # cannot switch every file to the IBM parser)
    if "ibm" in file_name:
        graph_data = load_IBmdata(str(d_path))
    else:
        graph_data = load_graph_data(str(d_path))

    # create graph
    graph = Graph(graph_data)

    # run the three algorithms
    hitshub,hits_auth =  graph.hits_algorithm(iter=ITER)
    simrank = graph.simrank_algorithm(iter=ITER,C = DECAY_FACTOR_C)
    pagerank = graph.pagerank_algorithm(iter=ITER,d= DAMPING_FACTOR_D)

    # print result
    print(f"Dataset [{d_path}]\n")
    print(f"Authority:\n{hits_auth}")
    print(f"Hub:\n{hitshub}")
    print("\n================================\n")

    # output folder
    # Build "./result/<name>" from the base name and extension instead of
    # str.replace, which would also rewrite "dataset" or ".txt" occurring
    # elsewhere in the path.
    os.makedirs("./result", exist_ok=True)
    out_stem = os.path.join("./result", os.path.splitext(file_name)[0])

    # save file
    np.savetxt( f"{out_stem}_HITS_authority.txt" , hits_auth ,fmt='%.5f', newline=' ')
    np.savetxt( f"{out_stem}_HITS_hub.txt" , hitshub,fmt='%.5f', newline=' ')
    np.savetxt( f"{out_stem}_SimRank.txt" , simrank,fmt='%.5f', newline='\n')
    np.savetxt( f"{out_stem}_PageRank.txt" , pagerank,fmt='%.5f', newline=' ')

Dataset [./dataset/graph_1.txt]

Authority:
[0.  0.2 0.2 0.2 0.2 0.2]
Hub:
[0.2 0.2 0.2 0.2 0.2 0. ]


Dataset [./dataset/graph_2.txt]

Authority:
[0.2 0.2 0.2 0.2 0.2]
Hub:
[0.2 0.2 0.2 0.2 0.2]


Dataset [./dataset/graph_3.txt]

Authority:
[0.19098301 0.30901699 0.30901699 0.19098301]
Hub:
[0.19098301 0.30901699 0.30901699 0.19098301]


Dataset [./dataset/graph_4.txt]

Authority:
[0.13948389 0.17791203 0.20082321 0.14017775 0.20142536 0.05608926
 0.08408849]
Hub:
[0.27545318 0.04776231 0.10868324 0.19865956 0.1837346  0.11673471
 0.06897241]


Dataset [./dataset/graph_5.txt]

Authority:
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
 0.00000000e+000 2.61604965e-057 2.68924599e-027 3.52327050e-047
 3.86356655e-086 3.86356655e-086 3.52327050e-047 2.68924599e-027
 3.86356655e-086 3.86356655e-086 2.68924599e-027 1.47000504e-003
 2.98870217e-046 4.92980706e-003 2.63006533e-048 1.08947980e-036
 1.97060198e-030 2.33388958e-030 1.08947980e-036 2.84750347e-030
 3.36694340e-0