In [19]:
import json
import networkx as nx
from operator import itemgetter

class WikiPage:
    """Plain record describing one Wikipedia page.

    Attributes:
        url: Address of the page; also used as its graph node identifier.
        title: Human-readable page title.
        snippet: Short text excerpt shown next to the title when printing.
    """

    def __init__(self, url, title, snippet):
        # Store the three fields verbatim; no validation or normalisation.
        self.url, self.title, self.snippet = url, title, snippet

##### Parser #####
        
def parse_wiki_json(file_path, use_only_wiki_page_nodes=True):
    """Build a directed link graph from a JSON dump of wiki pages.

    The JSON file must contain a list of objects, each with the keys
    ``"url"``, ``"title"``, ``"info"`` and ``"out_urls"``.

    Args:
        file_path: Path of the JSON file to read.
        use_only_wiki_page_nodes: When true, an out-link becomes an edge only
            if its target URL is itself one of the pages listed in the file.
            When false, every out-link becomes an edge, so targets that are
            not listed in the file are added to the graph as extra nodes
            (they have no entry in the returned dictionary).

    Returns:
        A ``(graph, wiki_pages_dict)`` tuple: the ``networkx.DiGraph`` whose
        nodes are URLs, and a dict mapping each listed URL to its ``WikiPage``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a page object lacks one of the required keys.
    """
    graph = nx.DiGraph()
    wiki_pages_dict = dict()

    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 (the standard JSON encoding) instead of relying on
    # the platform default.
    with open(file_path, encoding="utf-8") as json_file:
        nodes = json.load(json_file)

    # First pass: register every listed page, so the membership test in the
    # second pass sees the complete set of known URLs.
    for node in nodes:
        url = node["url"]
        wiki_pages_dict[url] = WikiPage(url, node["title"], node["info"])
        graph.add_node(url)

    # Second pass: add the link edges.
    for node in nodes:
        source_url = node["url"]
        for target_url in node["out_urls"]:
            if (target_url in wiki_pages_dict) or (not use_only_wiki_page_nodes):
                graph.add_edge(source_url, target_url)

    return graph, wiki_pages_dict

##### Ranking things #####

def print_wiki_page(url, rank, wiki_pages_dict):
    """Print one ranked entry: title with rank, then URL, then snippet.

    Args:
        url: URL of the node to print.
        rank: Score to show next to the title.
        wiki_pages_dict: Mapping from URL to page object. URLs missing from
            it are printed with "..." in place of the title and snippet.
    """
    entry_template = "%s[rank=%s]\n%s\n%s\n"

    # Unknown URL: no page metadata is available, so print placeholders.
    if url not in wiki_pages_dict:
        print(entry_template % ("...", rank, url, "..."))
        return

    page = wiki_pages_dict[url]
    print(entry_template % (page.title, rank, page.url, page.snippet))

def print_top_ranks(ranks, wiki_pages_dict):
    """Print the ten highest-scoring entries of ``ranks``, best first.

    Args:
        ranks: Mapping from URL to numeric score.
        wiki_pages_dict: Mapping from URL to page object, passed through to
            ``print_wiki_page`` for the title and snippet.
    """
    scored_urls = list(ranks.items())
    # Order by the score (second tuple element), highest first.
    scored_urls.sort(key=itemgetter(1), reverse=True)

    for url, rank in scored_urls[:10]:
        print_wiki_page(url, rank, wiki_pages_dict)

def print_pagerank_results(graph, wiki_pages_dict, alpha, tag):
    """Run PageRank on ``graph`` and print its top-ranked pages.

    Args:
        graph: The networkx graph to rank.
        wiki_pages_dict: Mapping from URL to page object, used for printing.
        alpha: Second positional argument handed to ``nx.pagerank``.
        tag: Label shown in the heading to identify this run.
    """
    heading = "PageRank results [%s]:\n" % tag
    print(heading)

    scores = nx.pagerank(graph, alpha)
    print_top_ranks(scores, wiki_pages_dict)
    
def analyze_wiki_graph_with_pagerank(graph, wiki_pages_dict):
    """Print PageRank top lists for the default alpha and three alternatives.

    Args:
        graph: The networkx graph to rank.
        wiki_pages_dict: Mapping from URL to page object, used for printing.
    """
    # The default run comes first, followed by the alternative alphas
    # in the order 0.95, 0.5, 0.3.
    runs = [(0.85, "default")]
    runs.extend((alpha, "alpha = %s" % alpha) for alpha in (0.95, 0.5, 0.3))

    for alpha, tag in runs:
        print_pagerank_results(graph, wiki_pages_dict, alpha, tag)
        
def analyze_wiki_graph_with_hits(graph, wiki_pages_dict):
    """Run HITS on ``graph`` and print top pages by hub, authority and mean score.

    Args:
        graph: The networkx graph to analyse.
        wiki_pages_dict: Mapping from URL to page object, used for printing.
    """
    hub_scores, authority_scores = nx.hits(graph, max_iter=500)

    # Per-URL arithmetic mean of the hub and authority scores.
    mean_scores = {}
    for url, hub_value in hub_scores.items():
        mean_scores[url] = (hub_value + authority_scores[url]) / 2

    sections = (
        ("HITS results [hubs]\n", hub_scores),
        ("HITS results [authorities]\n", authority_scores),
        ("HITS results [average]\n", mean_scores),
    )
    for heading, scores in sections:
        print(heading)
        print_top_ranks(scores, wiki_pages_dict)

In [23]:
# Experiment 1: keep only links whose target is itself a page listed in the
# JSON file, then print PageRank top lists for several alpha values.
print("=== BUILDING GRAPH ONLY BETWEEN WIKI PAGE NODES ===\n")
graph, info_dict = parse_wiki_json("wiki_links_2000.json", use_only_wiki_page_nodes=True)
analyze_wiki_graph_with_pagerank(graph, info_dict)

=== BUILDING GRAPH ONLY BETWEEN WIKI PAGE NODES ===

PageRank results [default]:

World War I[rank=0.005972652748346344]
https://en.wikipedia.org/wiki/First_World_War
World War I (often abbreviated to WWI or WW1), also known as the First World War, the Great War, or the War to End All Wars,[5] was a global war originating in Europe that lasted from 28 July 1914 to 11 November 1918. More than 70 million military personn...

Nitrogen[rank=0.005378930042623438]
https://en.wikipedia.org/wiki/Nitrogen
Nitrogen is a chemical element with symbol N and atomic number 7. It was first discovered and isolated by Scottish physician Daniel Rutherford in 1772. Although Carl Wilhelm Scheele and Henry Cavendish had independently done so at about the same time, Rut...

Middle Ages[rank=0.005259042660362889]
https://en.wikipedia.org/wiki/Middle_Ages
In the history of Europe, the Middle Ages (or Medieval Period) lasted from the 5th to the 15th century. It began with the fall of the Western Roman Empire an

In [21]:
# Experiment 2: keep every out-link as an edge. Link targets that are not
# listed in the JSON file become graph nodes without an entry in info_dict,
# so they are printed with placeholders instead of a title and snippet.
# NOTE: this rebinds the notebook-global `graph` and `info_dict`.
print("=== BUILDING GRAPH USING ALL NODES AND ALL EDGES ===")
graph, info_dict = parse_wiki_json("wiki_links_2000.json", use_only_wiki_page_nodes=False)
analyze_wiki_graph_with_pagerank(graph, info_dict)

PageRank results [default]:

<No Title>[rank=5.689745233317509e-05]
https://en.wikipedia.org/wiki/United_States
<No snippet>

<No Title>[rank=4.301376008126008e-05]
https://en.wikipedia.org/wiki/United_Kingdom
<No snippet>

<No Title>[rank=3.2089155494927085e-05]
https://en.wikipedia.org/wiki/New_York_City
<No snippet>

<No Title>[rank=2.9544802761535557e-05]
https://en.wikipedia.org/wiki/World_War_II
<No snippet>

<No Title>[rank=2.4799725761962793e-05]
https://en.wikipedia.org/wiki/Canada
<No snippet>

<No Title>[rank=2.476351510606923e-05]
https://en.wikipedia.org/wiki/European_Union
<No snippet>

<No Title>[rank=2.466688082339292e-05]
https://en.wikipedia.org/wiki/Mathematics
<No snippet>

<No Title>[rank=2.389933299978422e-05]
https://en.wikipedia.org/wiki/California
<No snippet>

<No Title>[rank=2.3271243841727207e-05]
https://en.wikipedia.org/wiki/Russia
<No snippet>

<No Title>[rank=2.298263974893003e-05]
https://en.wikipedia.org/wiki/London
<No snippet>

PageRank results [alph