In [205]:
import networkx as nx
import re

import numpy as np
from tabulate import tabulate
from nltk.corpus import stopwords
# nltk.download('stopwords')

In [206]:
# Load the pickled web graph from the working directory. The helper functions
# defined further down read this module-level `web_graph` directly.
# NOTE(review): read_gpickle unpickles the file, so only load files you trust;
# it was also removed in networkx 3.0 — confirm the installed version.
web_graph = nx.read_gpickle('web_graph.gpickle')

In [207]:
# web_graph

In [208]:
# web_graph.nodes[50]

In [209]:
# pos = {i: web_graph.nodes[i]['pos'] for i in range(len(web_graph.nodes))}
# nx.draw(web_graph, pos)

In [210]:
# # Create a postings list
# postings_list = {}
# stop_words = set(stopwords.words('english'))
# # print(stop_words)
# for i in range(len(web_graph.nodes)):
#     node = web_graph.nodes[i]
#     content = node['page_content']
#     print(content)
#     content = re.sub(r'[^\w\s]', '', content)
#     content = re.sub(r'\s+', ' ', content)
#     content = content.lower()
#     # print(content)
#     content = [c for c in content.split(' ') if c != '' and len(c) > 2]
#     # print(content)
#     for word in content:
#         if word not in stop_words:
#             if word not in postings_list:
#                 postings_list[word] = [i]
#             elif postings_list[word][-1] != i:
#                 postings_list[word].append(i)

In [211]:
def generate_sets(query_word, postings_list, graph=None):
    """
    Generates the root and base sets for the query word.

    The root set holds every node listed for ``query_word`` in the postings
    list. The base set is the root set plus every node that shares an edge
    (in either direction) with a root node.

    Args:
        query_word: Word to look up; must be a key of ``postings_list``.
        postings_list: Mapping from word to an iterable of node ids.
        graph: Object exposing ``edges`` as an iterable of
            ``(source, target)`` pairs. Defaults to the module-level
            ``web_graph`` so existing two-argument calls keep working.

    Returns:
        Tuple ``(root_set, base_set)``, both sets of node ids.

    Raises:
        KeyError: If ``query_word`` is not in ``postings_list``.
    """
    if query_word not in postings_list:
        # KeyError is still an Exception, so callers catching Exception keep working.
        raise KeyError('Word not in the postings list')
    if graph is None:
        graph = web_graph

    root_set = set(postings_list[query_word])
    # Start from a copy so that growing the base set never touches the root set.
    base_set = set(root_set)
    for edge in graph.edges:
        source, target = edge[0], edge[1]
        if source in root_set:
            base_set.add(target)
        if target in root_set:
            base_set.add(source)
    return root_set, base_set

In [212]:
def generate_adj_matrix(base_set, graph=None):
    """
    Builds the adjacency matrix of the sub-graph induced by ``base_set``.

    Row/column ``i`` corresponds to the ``i``-th element of ``list(base_set)``,
    so callers can recover the node for a given index with the same
    ``list(base_set)`` call as long as the set is not modified in between.

    Args:
        base_set: Collection of node ids to include.
        graph: Object exposing ``edges`` as an iterable of
            ``(source, target)`` pairs. Defaults to the module-level
            ``web_graph`` so existing one-argument calls keep working.

    Returns:
        A ``len(base_set) x len(base_set)`` float array where entry
        ``[i][j]`` is 1 when the graph has an edge from node ``i`` to node
        ``j`` and 0 otherwise.
    """
    if graph is None:
        graph = web_graph

    size = len(base_set)
    adj = np.zeros((size, size))

    # Precompute node -> index once instead of a linear list.index() scan per
    # edge endpoint. setdefault keeps the first position, matching list.index().
    index_of = {}
    for position, node in enumerate(list(base_set)):
        index_of.setdefault(node, position)

    for edge in graph.edges:
        source, target = edge[0], edge[1]
        if source in index_of and target in index_of:
            adj[index_of[source]][index_of[target]] = 1
    return adj


In [213]:
def _principal_eigenvector(sym_matrix):
    """Returns the eigenvector of the largest eigenvalue of a symmetric matrix,
    scaled so its entries sum to 1, as a ``(1, n)`` row vector."""
    # eigh is the solver for symmetric matrices: it always returns real
    # eigenvalues/eigenvectors, whereas eig may hand back a complex dtype.
    eigenvalues, eigenvectors = np.linalg.eigh(sym_matrix)
    principal = eigenvectors[:, eigenvalues.argmax()]
    # Dividing by the sum also removes the arbitrary sign of the eigenvector.
    principal = principal / principal.sum()
    return np.reshape(principal, (1, -1))


def generate_hub_authority_scores(adj_matrix):
    """
    Computes HITS authority and hub scores by eigen-decomposition.

    Authority scores are the principal eigenvector of ``A^T A`` and hub scores
    the principal eigenvector of ``A A^T``, each scaled to sum to 1.

    Args:
        adj_matrix: Square adjacency matrix (array-like), where entry
            ``[i][j]`` is non-zero when node ``i`` links to node ``j``.

    Returns:
        Tuple ``(a_vec, h_vec)`` of ``(1, n)`` arrays: authority scores
        first, hub scores second. Both are ``(1, 0)`` for an empty matrix.
    """
    adj_matrix = np.asarray(adj_matrix, dtype=float)
    if adj_matrix.size == 0:
        return np.zeros((1, 0)), np.zeros((1, 0))

    authority_matrix = np.dot(adj_matrix.T, adj_matrix)
    hub_matrix = np.dot(adj_matrix, adj_matrix.T)

    a_vec = _principal_eigenvector(authority_matrix)
    h_vec = _principal_eigenvector(hub_matrix)
    return a_vec, h_vec

In [214]:
def _power_iterate(sym_matrix, num_iterations):
    """Runs power iteration from a uniform start vector and returns the result
    as a ``(1, n)`` row vector scaled to sum to 1 (all zeros if it collapses)."""
    size = len(sym_matrix)
    vec = np.full((1, size), 1 / size)
    for _ in range(num_iterations):
        vec = np.dot(vec, sym_matrix)
        peak = np.max(vec)
        if peak == 0:
            # No links contribute any score: report zeros instead of the
            # NaNs a 0/0 rescale would produce.
            return np.zeros((1, size))
        # Rescale each step so repeated multiplication cannot overflow.
        vec = vec / peak
    return vec / vec.sum()


def generate_hub_authority_scores_power_iteration(adj_matrix, num_iterations=100):
    """
    Computes HITS authority and hub scores by power iteration.

    Starting from uniform vectors, the authority vector is repeatedly
    multiplied by ``A^T A`` and the hub vector by ``A A^T``; each is rescaled
    by its maximum every step and finally scaled to sum to 1.

    Args:
        adj_matrix: Square adjacency matrix (array-like), where entry
            ``[i][j]`` is non-zero when node ``i`` links to node ``j``.
        num_iterations: Number of multiplication steps (default 100, the
            previously hard-coded value).

    Returns:
        Tuple ``(a_vec, h_vec)`` of ``(1, n)`` arrays: authority scores
        first, hub scores second. A matrix with no links yields all-zero
        scores; an empty matrix yields ``(1, 0)`` arrays.
    """
    adj_matrix = np.asarray(adj_matrix, dtype=float)
    if len(adj_matrix) == 0:
        return np.zeros((1, 0)), np.zeros((1, 0))

    authority_matrix = np.dot(adj_matrix.T, adj_matrix)
    hub_matrix = np.dot(adj_matrix, adj_matrix.T)

    a_vec = _power_iterate(authority_matrix, num_iterations)
    h_vec = _power_iterate(hub_matrix, num_iterations)
    return a_vec, h_vec

In [215]:
def _normalize_text(text):
    """Strips every character that is neither a word character nor whitespace
    and lower-cases the result."""
    return re.sub(r'[^\w\s]', '', text).lower()


def _build_postings_list(graph, stop_words):
    """Maps each indexed word to the list of node ids whose page contains it."""
    postings_list = {}
    # Iterate the graph's own node ids rather than assuming they are 0..n-1.
    for node_id, attributes in graph.nodes(data=True):
        content = re.sub(r'\s+', ' ', _normalize_text(attributes['page_content']))
        for word in content.split(' '):
            # Skip very short tokens (this also drops empty strings) and stop words.
            if len(word) <= 2 or word in stop_words:
                continue
            postings = postings_list.setdefault(word, [])
            # A node is processed in one go, so checking the tail is enough
            # to avoid listing it twice for the same word.
            if not postings or postings[-1] != node_id:
                postings.append(node_id)
    return postings_list


def main():
    """
    Interactive HITS demo.

    Builds a postings list from the ``page_content`` of every node, asks for a
    query word, derives the base set for it, and prints a table of authority
    and hub scores (power iteration) followed by the two score sums.
    Prints a message and returns early when no word is entered or the word is
    not indexed.
    """
    # Index the same module-level graph that generate_sets and
    # generate_adj_matrix read, so the postings list and the link structure
    # always come from one graph object.
    graph = web_graph
    stop_words = set(stopwords.words('english'))
    postings_list = _build_postings_list(graph, stop_words)

    query_words = input("Enter query word: ").split()
    if not query_words:
        print("No query word entered.")
        return
    # Normalise the query like the indexed tokens (punctuation removed,
    # lower-cased); otherwise e.g. a capitalised word would never match.
    query = _normalize_text(query_words[0])
    if query not in postings_list:
        print("Word not in the postings list")
        return

    _, base_set = generate_sets(query, postings_list)
    adjacency_matrix = generate_adj_matrix(base_set)
    authority, hub = generate_hub_authority_scores_power_iteration(adjacency_matrix)

    # list(base_set) yields the same order generate_adj_matrix used for its
    # rows/columns because the set has not been modified in between.
    base_set_list = list(base_set)
    # Take magnitudes instead of multiplying by -1: the old `<= 0` test also
    # flipped exact zeros and printed them as "-0".
    authority_scores = np.abs(authority[0])
    hub_scores = np.abs(hub[0])

    scores = [
        [node, authority_score, hub_score]
        for node, authority_score, hub_score
        in zip(base_set_list, authority_scores, hub_scores)
    ]
    auth_sum = sum(authority_scores)
    hub_sum = sum(hub_scores)

    print(tabulate(scores, headers = ["Node", "Authority Score", "Hub Score"]))
    print("Authority score sum = "+str(auth_sum))
    print("Hub score sum = "+str(hub_sum))

In [216]:
# Run the interactive demo: prompts for a query word, then prints the
# authority/hub score table and the two score sums.
main()

  Node    Authority Score     Hub Score
------  -----------------  ------------
     0        0.00538727   0.00304554
     1        4.52991e-05  0.000124198
     2        2.01446e-76  4.52015e-29
     5        2.87112e-82  1.41108e-76
     6        0.0127112    0.0014765
     8        2.81882e-28  2.15991e-28
     9        0.0665967    0.00737559
    10       -0            0.00439812
    11        0.0215921    0.0180423
    12        0.114773     0.0705635
    13        0.0192687    0.029721
    14        0.0041356    0.00115926
    15        1.06546e-28  5.92666e-28
    16        2.03223e-05  1.02745e-05
    17        2.83763e-28  2.15991e-28
    18        0.00791009   0.0240182
    21       -0            0.000870596
    22        0.00174523   0.0111628
    23        0.0837232    0.0546154
    24        3.16924e-05  8.46483e-06
    25        2.01446e-76  1.41108e-76
    27        4.39847e-05  7.60189e-05
    28        6.13698e-05  8.97574e-05
    30        3.37151e-05  0.000101131
   