## PageRank

In [None]:
import data_readers 
import networkx as nx
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics
import math
from networkx.drawing.nx_pydot import graphviz_layout

In [None]:
# Load the Wikispeedia graph and show its node view.
w = data_readers.read_wikispeedia_graph()
print(w.nodes)

PageRank Dictionary: {key: value} -> {'article_title': pagerank}

In [None]:
# Compute the PageRank of the graph; `dic` maps each article title to its score.
dic = nx.pagerank(w)
# NOTE(review): this prints the complete mapping, one entry per node of `w`.
print(dic)

Assign the PageRank as a node attribute of the Wikispeedia graph

In [None]:
# Store each score from `dic` on its node under the 'pagerank' attribute.
# set_node_attributes takes the {node: value} mapping directly, so there is no
# need to build a temporary list of nodes and loop over it by hand.
nx.set_node_attributes(w, dic, name='pagerank')
print(w.nodes(data=True))

Average PageRank

In [None]:
# Arithmetic mean of the PageRank scores: sum of the values over the number of entries.
average = sum(dic.values()) / len(dic)
print(average)

In [None]:
# Articles whose PageRank is at least the average.
# (Maybe the above-median set would be a better choice here, because the other
# graphs are already above the mean.)
above_average = {title: score for title, score in dic.items() if score >= average}
# Same entries, ordered from the highest to the lowest PageRank.
above_average_ordered = dict(
    sorted(above_average.items(), key=lambda pair: pair[1], reverse=True)
)
# Empty graph; its nodes are added in a later cell.
above_average_graph = nx.Graph()

In [None]:
# Add every above-average article as a node carrying its score in 'pagerank'.
above_average_graph.add_nodes_from(
    (title, {'pagerank': score}) for title, score in above_average.items()
)
print(above_average_graph.nodes)

In [None]:
# Number of entries in the above-average set; label and count go on separate lines.
num_elements_above_avg = len(above_average)
print("Number of elements above average", num_elements_above_avg, sep="\n")

Let's create a graph visualization in which the size of each node is proportional to its PageRank.

In [None]:
# Keep only the N best-ranked nodes; to avoid extra computing time, try not to
# set N higher than 80.
N = 60
# Nodes that carry a 'pagerank' attribute strictly greater than the average.
nodos_greater_than_average = [
    name
    for name, attrs in w.nodes(data=True)
    if 'pagerank' in attrs and attrs['pagerank'] > average
]
# Sort those nodes by PageRank (highest first) and keep the first N of them.
nodes_ordered_per_pagerank = sorted(
    nodos_greater_than_average,
    key=lambda name: w.nodes[name]['pagerank'],
    reverse=True,
)[:N]
# Select these nodes from the original Wikispeedia graph as an independent copy.
wavg = w.subgraph(nodes_ordered_per_pagerank).copy()
print(wavg)

In [None]:
def hierarchical_layout(net):
    """Return the layout of ``net`` computed by ``graphviz_layout`` with ``prog="dot"``."""
    layout = graphviz_layout(net, prog="dot")
    return layout

# Each node is labelled with its own name.
node_labels = {name: name for name in wavg.nodes()}

pos = hierarchical_layout(wavg)
plt.figure(figsize=(50, 50))
nx.draw(
    wavg,
    pos,
    # Node size is the node's PageRank multiplied by a fixed scaling constant.
    node_size=[(wavg.nodes[name]['pagerank'] * 10500000) for name in wavg],
    node_color='skyblue',
    linewidths=8,
    with_labels=True,
    labels=node_labels,
    font_color='purple',
    font_size=35,
    font_weight='bold',
)
plt.title('Above average Pagerank Nodes', fontsize=100)
plt.show()

When comparing this graph with the other one, we should look for differences in distribution of nodes between both graphs, and differences in size between nodes of the same graph, not differences in size of the same node between graphs.

### PageRank in descending order of the articles above the average

In [None]:
# Horizontal bar chart of the above-average articles, in the order of
# `above_average_ordered` (highest PageRank first).
names = list(above_average_ordered)
values = list(above_average_ordered.values())

plt.figure(figsize=(30, 200))
sns.barplot(
    x=values,
    y=names,
    palette='viridis',
)
plt.title('PageRank Classification of Wikispeedia Articles above the Wikispeedia average PageRank')
plt.xlabel('PageRank')
plt.ylabel('Articles')
plt.show()

Median PageRank

In [None]:
# Median of the PageRank scores.
# Note: `values` is rebound here and is reassigned again by the later plotting
# cells, so its content depends on which cell ran last.
values = list(dic.values())
median = statistics.median(values)
print(median)

In [None]:
# Articles whose PageRank is at least the median.
above_median = {title: score for title, score in dic.items() if score >= median}
# Same entries, ordered from the highest to the lowest PageRank.
above_median_ordered = dict(
    sorted(above_median.items(), key=lambda pair: pair[1], reverse=True)
)

In [None]:
# Number of articles whose PageRank is at least the median.
num_elements_above_median = len(above_median)
# The label previously read "above average median"; the count is relative to
# the median only.
print('Number of elements above median')
print(num_elements_above_median)

In [None]:
# Horizontal bar chart of the above-median articles, in the order of
# `above_median_ordered` (highest PageRank first).
names = list(above_median_ordered)
values = list(above_median_ordered.values())

plt.figure(figsize=(30, 300))
sns.barplot(
    x=values,
    y=names,
    palette='viridis',
)
plt.title('PageRank Classification of Wikispeedia Articles above the Wikispeedia median PageRank')
plt.xlabel('PageRank')
plt.ylabel('Articles')
plt.show()

### All Articles

In [None]:
# Sort every article by PageRank (highest first) and keep the lower-ranked half.
dic_ordered = dict(sorted(dic.items(), key=lambda item: item[1], reverse=True))
half_elements = len(dic_ordered) // 2
second_half_data = dict(list(dic_ordered.items())[half_elements:])
print(second_half_data)
names = list(second_half_data.keys())
# Natural logarithm of the scores (presumably so that the small values of this
# half remain distinguishable in the chart).
values = np.log(list(second_half_data.values()))

plt.figure(figsize=(30, 500))
sns.barplot(x=values, y=names, palette='viridis')
# The bars show log-transformed scores, so the axis label has to say so.
plt.xlabel('log(PageRank)')
plt.ylabel('Articles')
# Only the lower-ranked half is plotted, so the title must not claim all articles.
plt.title('Wikispeedia PageRank Classification (lower half of the articles, log scale)')
plt.show()


Helpers:

In [None]:
# Quick inspection helpers for the Wikispeedia graph; each label is printed on
# its own line, followed by the corresponding value.
print("Directed nodes from Bede", w['Bede'], sep="\n")
print("First 10 edges of the list of edges", list(w.edges)[:10], sep="\n")
print("Number of nodes and edges", w, sep="\n")