In [253]:
import networkx as nx
import re

import numpy as np
from tabulate import tabulate
from nltk.corpus import stopwords
# nltk.download('stopwords')

In [254]:
# Module-level graph: generate_sets() and generate_adj_matrix() iterate
# `web_graph.edges` from this global when they are called as main() calls them.
# NOTE(review): the gpickle format is presumably pickle-based, so only load a
# file from a trusted source. Also confirm the installed NetworkX still ships
# read_gpickle (believed to be removed in NetworkX 3.0) -- TODO verify.
web_graph = nx.read_gpickle('web_graph.gpickle')

In [255]:
# web_graph

In [256]:
# web_graph.nodes[50]

In [257]:
# pos = {i: web_graph.nodes[i]['pos'] for i in range(len(web_graph.nodes))}
# nx.draw(web_graph, pos)

In [258]:
# # Create a postings list
# postings_list = {}
# stop_words = set(stopwords.words('english'))
# # print(stop_words)
# for i in range(len(web_graph.nodes)):
#     node = web_graph.nodes[i]
#     content = node['page_content']
#     print(content)
#     content = re.sub(r'[^\w\s]', '', content)
#     content = re.sub(r'\s+', ' ', content)
#     content = content.lower()
#     # print(content)
#     content = [c for c in content.split(' ') if c != '' and len(c) > 2]
#     # print(content)
#     for word in content:
#         if word not in stop_words:
#             if word not in postings_list:
#                 postings_list[word] = [i]
#             elif postings_list[word][-1] != i:
#                 postings_list[word].append(i)

In [259]:
def generate_sets(query_word, postings_list, graph=None):
    """
    Generates the root and base sets for the query word.

    The root set is every node id listed under ``query_word`` in
    ``postings_list``. The base set is the root set plus every node that
    shares an edge, in either direction, with a root-set node.

    Args:
        query_word: Term to look up in ``postings_list``.
        postings_list: Mapping from a term to a list of node ids.
        graph: Object whose ``edges`` attribute iterates over edges indexable
            as ``edge[0]`` / ``edge[1]``. When omitted, the module-level
            ``web_graph`` is used, which is what the original code always did.

    Returns:
        Tuple ``(root_set, base_set)``; both are ``set`` objects.

    Raises:
        Exception: If ``query_word`` is not a key of ``postings_list``.
    """
    if query_word not in postings_list:
        raise Exception('Word not in the postings list')
    if graph is None:
        # Backward-compatible default: fall back to the notebook-level graph.
        graph = web_graph

    root_set = set(postings_list[query_word])
    # Start from a copy so that growing the base set never alters the root set.
    base_set = set(root_set)
    for edge in graph.edges:
        source, target = edge[0], edge[1]
        if source in root_set:
            base_set.add(target)
        if target in root_set:
            base_set.add(source)
    return root_set, base_set

In [260]:
def generate_adj_matrix(base_set, graph=None):
    """
    Builds the 0/1 adjacency matrix of the sub-graph induced by ``base_set``.

    Row/column ``k`` corresponds to the ``k``-th element of ``list(base_set)``,
    so a caller can recover the node for each index by calling
    ``list(base_set)`` on the same, unmodified collection.

    Args:
        base_set: Collection of node ids to keep.
        graph: Object whose ``edges`` attribute iterates over edges indexable
            as ``edge[0]`` / ``edge[1]``. When omitted, the module-level
            ``web_graph`` is used, which is what the original code always did.

    Returns:
        ``numpy.ndarray`` of shape ``(len(base_set), len(base_set))`` with
        ``adj[i][j] == 1`` when an edge goes from node ``i`` to node ``j``.
    """
    if graph is None:
        # Backward-compatible default: fall back to the notebook-level graph.
        graph = web_graph

    ordered_nodes = list(base_set)
    # Map node -> first position once, instead of calling list.index() (a
    # linear scan) twice for every edge of the graph.
    position = {}
    for idx, node in enumerate(ordered_nodes):
        position.setdefault(node, idx)

    adj = np.zeros((len(ordered_nodes), len(ordered_nodes)))
    for edge in graph.edges:
        row = position.get(edge[0])
        col = position.get(edge[1])
        if row is not None and col is not None:
            adj[row][col] = 1
    return adj


In [261]:
def _principal_eigenvector(sym_matrix):
    """Return the eigenvector of the largest eigenvalue of a symmetric matrix, scaled to sum to 1, shaped (1, n)."""
    # eigh is the solver for symmetric matrices: it returns real eigenvalues
    # and eigenvectors, whereas the general-purpose eig may return complex dtype.
    eigenvalues, eigenvectors = np.linalg.eigh(sym_matrix)
    principal = eigenvectors[:, eigenvalues.argmax()]
    # Dividing by the sum also fixes the arbitrary sign of the eigenvector
    # whenever all of its components share one sign.
    principal = principal / sum(principal)
    return np.reshape(principal, (1, -1))


def generate_hub_authority_scores(adj_matrix):
    """
    Computes HITS authority and hub scores by eigen-decomposition.

    Authority scores are the principal eigenvector of ``A^T A`` and hub
    scores the principal eigenvector of ``A A^T``; each is scaled so its
    entries sum to 1.

    Args:
        adj_matrix: Square ``numpy.ndarray`` adjacency matrix ``A``.

    Returns:
        Tuple ``(a_vec, h_vec)``, each an array of shape ``(1, n)``.

    Note:
        When the largest eigenvalue is repeated the principal eigenvector is
        not unique, and if the chosen vector's entries sum to zero the
        normalisation divides by zero.
    """
    aTa = np.dot(adj_matrix.T, adj_matrix)
    aaT = np.dot(adj_matrix, adj_matrix.T)
    # Both products are symmetric, so left and right eigenvectors coincide.
    h_vec = _principal_eigenvector(aaT)
    a_vec = _principal_eigenvector(aTa)
    return a_vec, h_vec

In [262]:
def _divide_if_nonzero(vec, denom):
    """Return ``vec / denom``, or ``vec`` unchanged when ``denom`` is zero."""
    return vec / denom if denom != 0 else vec


def generate_hub_authority_scores_power_iteration(adj_matrix, num_iterations=1000):
    """
    Computes HITS authority and hub scores by power iteration.

    Starting from uniform vectors, the authority vector is repeatedly
    multiplied by ``A^T A`` and the hub vector by ``A A^T``; each is rescaled
    by its maximum entry after every step and finally scaled to sum to 1.

    Args:
        adj_matrix: Square ``numpy.ndarray`` adjacency matrix ``A``.
        num_iterations: Number of multiplication steps (default 1000, the
            value that used to be hard-coded).

    Returns:
        Tuple ``(a_vec, h_vec)``, each an array of shape ``(1, n)``. For an
        empty matrix both have shape ``(1, 0)``. For a matrix with no edges
        both are all zeros instead of NaN.
    """
    n = len(adj_matrix)
    if n == 0:
        # 1 / n below would raise ZeroDivisionError for an empty matrix.
        return np.zeros((1, 0)), np.zeros((1, 0))

    aTa = np.dot(adj_matrix.T, adj_matrix)
    aaT = np.dot(adj_matrix, adj_matrix.T)
    a_vec = np.full((1, n), 1 / n)
    h_vec = np.full((1, n), 1 / n)
    for _ in range(num_iterations):
        a_vec = np.dot(a_vec, aTa)
        # Rescale every step so the entries neither overflow nor vanish; skip
        # the division when the vector collapsed to zero (no edges) to avoid 0/0.
        a_vec = _divide_if_nonzero(a_vec, np.max(a_vec))
        h_vec = np.dot(h_vec, aaT)
        h_vec = _divide_if_nonzero(h_vec, np.max(h_vec))
    a_vec = _divide_if_nonzero(a_vec, sum(a_vec[0]))
    h_vec = _divide_if_nonzero(h_vec, sum(h_vec[0]))
    return a_vec, h_vec

In [263]:
def _tokenize(text):
    """Strip punctuation, collapse whitespace, lower-case, and return the tokens longer than two characters."""
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return [token for token in text.lower().split(' ') if len(token) > 2]


def _build_postings_list(graph, stop_words):
    """Map every non-stop-word token of each node's 'page_content' to the list of node ids containing it."""
    postings_list = {}
    # Iterate the nodes themselves rather than range(len(nodes)), so node ids
    # do not have to be the integers 0..n-1.
    for node_id, attrs in graph.nodes(data=True):
        for word in _tokenize(attrs['page_content']):
            if word in stop_words:
                continue
            node_ids = postings_list.setdefault(word, [])
            # A node's tokens are processed back to back, so checking the
            # last entry is enough to avoid duplicate ids.
            if not node_ids or node_ids[-1] != node_id:
                node_ids.append(node_id)
    return postings_list


def main():
    """
    Interactive HITS demo: index the graph, read one query word, print scores.

    Loads 'web_graph.gpickle', builds a postings list from each node's
    'page_content', prompts for a query (only its first word is used),
    computes authority and hub scores over the query's base set by power
    iteration, and prints the top three of each followed by the full table
    and the score sums.

    Raises:
        Exception: Propagated from generate_sets() when the query word is not
            in the postings list.
    """
    # NOTE: generate_sets() and generate_adj_matrix() are called below without
    # a graph argument, so they read edges from the module-level `web_graph`,
    # not from this local one; both are loaded from the same file.
    web_graph = nx.read_gpickle('web_graph.gpickle')
    postings_list = _build_postings_list(web_graph, set(stopwords.words('english')))

    query_terms = input("Enter query word: ").split()
    if not query_terms:
        # Indexing [0] on an empty split used to raise IndexError.
        print("No query word entered.")
        return
    # Index terms are lower-cased, so the query must be as well to match.
    query = query_terms[0].lower()

    _root_set, bs = generate_sets(query, postings_list)
    adjacency_matrix = generate_adj_matrix(bs)
    authority, hub = generate_hub_authority_scores_power_iteration(adjacency_matrix)

    # Row k of the matrix corresponds to the k-th element of list(bs).
    base_set_list = list(bs)
    # abs() removes a possible sign flip in the score vectors without turning
    # exact zeros into -0.0, which the previous `x * -1` negation did.
    auth_scores = np.abs(authority[0])
    hub_scores = np.abs(hub[0])

    scores = [
        [node, auth, hub_score]
        for node, auth, hub_score in zip(base_set_list, auth_scores, hub_scores)
    ]
    top_k = 3
    scores_auth = sorted(
        ([row[0], row[1]] for row in scores), key=lambda x: x[1], reverse=True
    )[:top_k]
    scores_hub = sorted(
        ([row[0], row[2]] for row in scores), key=lambda x: x[1], reverse=True
    )[:top_k]
    auth_sum = sum(auth_scores)
    hub_sum = sum(hub_scores)

    print("Top 3 authority scores are:")
    print(tabulate(scores_auth, headers = ["Node", "Authority Score"]))
    print("\nTop 3 Hub scores are:")
    print(tabulate(scores_hub, headers = ["Node", "Hub Score"]))

    print("\n List of all scores:")
    print(tabulate(scores, headers = ["Node", "Authority Score", "Hub Score"]))
    print("Authority score sum = "+str(auth_sum))
    print("Hub score sum = "+str(hub_sum))

In [264]:
# Run the demo; main() blocks on input() until a query word is entered.
main()

Top 3 authority scores are:
  Node    Authority Score
------  -----------------
     0           0.320012
     3           0.21121
    61           0.21121

Top 3 Hub scores are:
  Node    Hub Score
------  -----------
    10     0.320012
    75     0.320012
     0     0.204815

 List of all scores:
  Node    Authority Score    Hub Score
------  -----------------  -----------
     0           0.320012     0.204815
     3           0.21121      0
    10          -0            0.320012
    75           0.128785     0.320012
    87           0.128785     0.155162
    61           0.21121      0
Authority score sum = 1.0
Hub score sum = 1.0
