In [102]:
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import eigs
import pandas as pd

edges_file = open('wisconsin_edges.csv', "r")
nodes_file = open('wisconsin_nodes.csv', "r")

# create a dictionary where nodes_dict[i] = name of wikipedia page
nodes_dict = {}
for line in nodes_file:
    nodes_dict[int(line.split(',',1)[0].strip())] = line.split(',',1)[1].strip()

node_count = len(nodes_dict)

# create adjacency matrix
A = np.zeros((node_count, node_count))
for line in edges_file:
    from_node = int(line.split(',')[0].strip())
    to_node = int(line.split(',')[1].strip())
    A[to_node, from_node] = 1.0

## Add code below to (1) prevent traps and (2) find the most important pages     
# Hint -- instead of computing the entire eigen-decomposition of a matrix X using
# s, E = np.linalg.eig(A)
# you can compute just the first eigenvector with:
# s, E = eigs(csc_matrix(A), k = 1)

In [111]:
# Problem 1a - i
# Simply add 0.001 to every entry in A
A = A+0.001

# Problem 1a - ii
# For each entry in each column of A, divide it by the sum down that column
# Compute the sum of each column
column_sums = [np.sum(A[:,col]) for col in range(np.shape(A)[1])]

# Divide each entry in each column by the sum of its column
A = A / column_sums

# Verify that each column sums to 1:
print(f"Column 1 sum: {np.round(np.sum(A[:,0]))}")
print(f"Column 2 sum: {np.round(np.sum(A[:,1]))}")
print(f"Column 2 sum: {np.round(np.sum(A[:,2]))}")

# Problem 1a - iii
# Compute 1st eigenvalue (should be 1) and associated eigenvector
s, E = eigs(csc_matrix(A), k = 1)
# (Add np.real to remove annoying +0j components)
print(f"\nFirst eigenvalue: {np.real(s)}\nAssociated eigenvector:\n {np.real(E)}")

Column 1 sum: 1.0
Column 2 sum: 1.0
Column 2 sum: 1.0

First eigenvalue: [1.]
Associated eigenvector:
 [[0.00849655]
 [0.00852945]
 [0.00849655]
 ...
 [0.02157236]
 [0.00866803]
 [0.00849655]]


In [149]:
# Problem 1b
# Know that the first eigenvector, P, gives importance of pages.

# Turn P into a dictionary so we can keep track of indices
P = dict(enumerate(np.real(E)))

# Sort dictionary P by value while preserving index
sorted_P = sorted(P.items(), key=lambda x: x[1], reverse=True)

# Display first n items of sorted P
n=10
print("First n most important items:\n ")
for index, item in enumerate(sorted_P[:n]):
    print(f"#{index + 1}: {item}")
    print()  # This will add a new line between each item
    
print(f"1st most important article title: {nodes_dict.get(5089)}")

# Conclusion: Item # 5089, title simply "Wisconsin" is the most important.

# Problem 1c
# From the above, we notice that the 3rd most important page is item #1345 in nodes_dict

print(f"3rd most important article title: {nodes_dict.get(1345)}")

# Hooray for Madison!

# For fun... also noticed that index#2230 is UW-Madison: 
print(f"4th most important article title: {nodes_dict.get(2230)}")

First n most important items:
 
#1: (5089, array([0.58556416]))

#2: (2312, array([0.44693652]))

#3: (1345, array([0.07074235]))

#4: (2230, array([0.04778512]))

#5: (379, array([0.03021724]))

#6: (2545, array([0.02981724]))

#7: (517, array([0.02588714]))

#8: (1380, array([0.02586532]))

#9: (4354, array([0.02467508]))

#10: (1603, array([0.02397484]))

1st most important article title: "Wisconsin"
3rd most important article title: "Madison, Wisconsin"
4th most important article title: "University of Wisconsinâ\u80\u93Madison"
