# Homework 1 - PageRank

### Imports

In [None]:
import numpy as np
from scipy import sparse
from scipy.sparse import linalg as splinalg
import sys

### Define constants

In [3]:
m = 0.15
file_names = ["Graphs/graph1.dat", "Graphs/graph2.dat", "Graphs/graph1_modified.dat", "Graphs/hollins.dat"]

### Useful functions

In [4]:
def read_dat(file_name):
    labels = {}
    row_indices = [] # Lists to store sparse matrix coordinates
    col_indices = []
    
    try:
        with open(file_name, 'r') as file:
            first_line = file.readline().strip()
            if not first_line:
                 return None, None
            parts = first_line.split()
            num_nodes = int(parts[0])
            num_edges = int(parts[1])
            
            # Use COO format for construction (efficient for appending)
            # Later convert to CSC (Compressed Sparse Column) for calculation
            
            for _ in range(num_nodes):
                line = file.readline().strip()
                if line:
                    parts = line.split(maxsplit=1) 
                    node_id = int(parts[0])
                    node_name = parts[1]
                    labels[node_id] = node_name

            for _ in range(num_edges):
                line = file.readline().strip()
                if line:
                    parts = line.split()
                    source = int(parts[0])
                    target = int(parts[1])
                    # Store coordinates instead of filling dense matrix directly
                    # A[target-1][source-1]=1
                    row_indices.append(target - 1)
                    col_indices.append(source - 1)
            
            # Create sparse matrix with 1s at specific coordinates
            data = np.ones(len(row_indices))
            A = sparse.coo_matrix((data, (row_indices, col_indices)), shape=(num_nodes, num_nodes)).tocsc()
            
            # Efficient column normalization for sparse matrix
            # Calculate sum of each column
            col_sums = np.array(A.sum(axis=0)).flatten()
            
            # Avoid division by zero. If sum is 0, scaling factor is 0.
            with np.errstate(divide='ignore', invalid='ignore'):
                scale_factors = np.where(col_sums != 0, 1.0 / col_sums, 0)
            
            # Multiply A by diagonal matrix of inverse sums to normalize
            D_inv = sparse.diags(scale_factors)
            A = A @ D_inv
                    
    except FileNotFoundError:
        print(f"Error: File '{file_name}' not found.")
        return None, None
    except Exception as e:
        print(f"Errore during the analysis of the file: {e}")
        return None, None
    
    return A, labels

In [12]:
def power_iteration_with_vector(A, s, m, output, tolerance=1e-6, max_iterations=1000):
    n = A.shape[0]
    x = np.ones(n) / n # initial vector (normalized)
    for iteration in range(max_iterations):
        # Sparse matrix multiplication (@) is efficient here
        x_new = (1 - m) * (A @ x) + m * s
        x_new = x_new / np.sum(x_new) # normalized
        if np.linalg.norm(x_new - x, 1) < tolerance:
            print(f"  Converged in {iteration + 1} iterations", file=output)
            break
        x = x_new
    else:
        print(f"  Warning: Maximum iterations ({max_iterations}) reached", file=output)
    return x

In [13]:
def check_dangling_nodes(A):
    # Summing sparse columns is faster using built-in sum
    col_sums = np.array(A.sum(axis=0)).flatten()
    dangling = []
    for i in range(len(col_sums)):
        if col_sums[i] == 0:
            dangling.append(i)
    return dangling

In [14]:
def exercise_4_analysis(A, labels):
    n = A.shape[0]
    
    # Use helper to get eigenvalues efficiently
    eigenvalues, eigenvectors = get_eigenpairs(A)
    
    idx = np.argsort(np.abs(eigenvalues))[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    perron_eigenvalue = np.real(eigenvalues[0])
    perron_eigenvector = np.real(eigenvectors[:, 0])
    print(f"PERRON EIGENVALUE (largest): λ = {perron_eigenvalue:.6f}")
    # Make eigenvector non-negative and normalize
    if np.any(perron_eigenvector < 0):
        perron_eigenvector = -perron_eigenvector
    # Ensure real values
    if np.iscomplexobj(perron_eigenvector):
        perron_eigenvector = np.real(perron_eigenvector)
    # Normalize to sum to 1
    perron_eigenvector = perron_eigenvector / np.sum(perron_eigenvector)
    print(f"\nPerron eigenvector (normalized to sum=1):")
    sorted_indices = np.argsort(perron_eigenvector)[::-1]
    print(f"{'-'*50}")
    for rank, idx in enumerate(sorted_indices, 1):
        node_label = labels[idx + 1]
        score = perron_eigenvector[idx]
        print(f"  {rank}. {node_label:20s}: {score:.6f}")    
    # Verify it's an eigenvector
    result = A @ perron_eigenvector
    expected = perron_eigenvalue * perron_eigenvector
    error = np.linalg.norm(result - expected)
    print(f"\nVerification: ||A·v - λ·v|| = {error:.2e}")
    return perron_eigenvalue, perron_eigenvector

In [16]:
def analyze_graph(filename, m=0.15):
    
    A, labels = read_dat(filename)
    if A is None: return None, None
    
    n = A.shape[0]
    s = np.ones(n) / n
    is_hollins = filename == "Graphs/hollins.dat"
    output_file = None
    if is_hollins:
        output_file = open("hollins_results.txt", "w", encoding="utf-8")
        output = output_file
    else:
        output = sys.stdout
    print(f"\nGraph {filename}", file=output)
    x = power_iteration_with_vector(A, s, m, output)
    dangling = check_dangling_nodes(A)
    if dangling:
        print(f"  - Warning: Found {len(dangling)} dangling node(s): {[labels[i+1] for i in dangling]}", file=output)
        print(f"    (These nodes have initial importance score ≈ {m/n:.6f})", file=output)
        if filename == "Graphs/graph1_modified.dat":
            exercise_4_analysis(A, labels)
            if output_file:
                output_file.close()
            return
    else:
        print(f"  - No dangling nodes detected", file=output)
    
    sorted_indices = np.argsort(x)[::-1]
    print(f"PageRank scores (sorted by importance):", file=output)
    print(f"{'-'*50}", file=output)
    # Output limited to top 20 if graph is huge, but here we keep full for small graphs
    for rank, idx in enumerate(sorted_indices, 1):
        node_label = labels[idx + 1]
        score = x[idx]
        print(f"  {rank}. {node_label:20s}: {score:.6f}", file=output)
    print("\n" + "="*70, file=output)
    
    if output_file:
        output_file.close()
        print(f"\n{filename} results saved to hollins_results.txt")
    
    return x, labels

In [None]:
def get_eigenpairs(A, k=None):
    # Helper function: Use Scipy for large matrices, Numpy for small ones.
    # Scipy eigs requires k < N-1, which fails on very small graphs (e.g., 4 nodes).
    n = A.shape[0]
    if k is None: k = min(n - 2, 6)
    if k < 1: k = 1

    if n < 10 or k >= n-1:
        # Fallback to dense for small graphs or when many eigenvalues are needed
        vals, vecs = np.linalg.eig(A.toarray())
    else:
        try:
            # 'LM' = Largest Magnitude
            vals, vecs = splinalg.eigs(A, k=k, which='LM')
        except:
            vals, vecs = np.linalg.eig(A.toarray())
    return vals, vecs

In [19]:
def swap_node_indices(input_filename, output_filename, i, j):
    str_i = str(i)
    str_j = str(j)
    try:
        with open(input_filename, 'r') as infile:
            lines = infile.readlines()
        new_lines = []
        for k, line in enumerate(lines):
            line = line.strip()
            if not line:
                new_lines.append('\n')
                continue
            parts = line.split()

            # First line (header)
            if k == 0 and len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
                new_lines.append(line + '\n')
                continue

            # General sostitution logic (nodes and links)
            modified_parts = []
            for part in parts:
                if part.isdigit():
                    if part == str_i:
                        modified_parts.append(str_j)
                    elif part == str_j:
                        modified_parts.append(str_i)
                    else:
                        modified_parts.append(part)
                else:
                    modified_parts.append(part)
            new_lines.append(" ".join(modified_parts) + '\n')

        # Output file
        with open(output_filename, 'w') as outfile:
            outfile.writelines(new_lines)
        print(f"Indecises {i} and {j} swapped (Nodes and Links). Result saved in '{output_filename}'.")

    except FileNotFoundError:
        print(f"ERROR: file '{input_filename}' not found.")
    except Exception as e:
        print(f"Error happened: {e}")
    return output_filename

## Exercise 1

In [8]:
filename="Graphs/graph1.dat"
A, labels = read_dat(filename)
print("Exercise 1 Analysis:")
#Graph 1
eigenvalues, eigenvectors = get_eigenpairs(A)

# Check if eigenvalue 1 exists
idx_list = np.where(np.isclose(eigenvalues, 1))[0]
if len(idx_list) > 0:
    idx = idx_list[0]
    x_raw = np.real(eigenvectors[:, idx])
    importance_score = x_raw / x_raw.sum()
    
    print("Importance scores for Graph 1:")
    sorted_indices = np.argsort(importance_score)[::-1]
    for rank, idx in enumerate(sorted_indices, 1):
        node_label = labels[idx + 1]
        score = importance_score[idx]
        print(f"  {rank}. {node_label:20s}: {score:.6f}")

#Graph 1 with node 5 added
filename="Graphs/exercise1_graph.dat"
A_modified, labels_modified = read_dat(filename)
if A_modified is not None:
    eigenvalues, eigenvectors = get_eigenpairs(A_modified)
    idx_list = np.where(np.isclose(eigenvalues, 1))[0]
    if len(idx_list) > 0:
        idx = idx_list[0]
        x_raw = np.real(eigenvectors[:, idx])
        importance_score_withnode5 = x_raw / x_raw.sum()
        
        print("\nImportance scores for Graph 1 with Node 5 added:")
        sorted_indices = np.argsort(importance_score_withnode5)[::-1]
        for rank, idx in enumerate(sorted_indices, 1):
            node_label = labels_modified[idx + 1]
            score = importance_score_withnode5[idx]
            print(f"  {rank}. {node_label:20s}: {score:.6f}")

Exercise 1 Analysis:
Importance scores for Graph 1:
  1. Node1               : 0.387097
  2. Node3               : 0.290323
  3. Node4               : 0.193548
  4. Node2               : 0.129032

Importance scores for Graph 1 with Node 5 added:
  1. Node3               : 0.367347
  2. Node1               : 0.244898
  3. Node5               : 0.183673
  4. Node4               : 0.122449
  5. Node2               : 0.081633


We can see thet the addition of Page 5 created a self-reinforcing feedback loop that allowed Page 3 to successfully manipulate the ranking system and overtake Page 1.

## Exercise 2

In [9]:
filename="Graphs/exercise2_graph.dat"
print("Exercise 2 Analysis:")
A, labels = read_dat(filename) 

# Use dense fallback for accurate counting of multiplicity on small graphs
eigenvalues, eigenvectors = get_eigenpairs(A, k=A.shape[0]-1)
dimension = np.sum(np.isclose(eigenvalues, 1))
print(f"The dimension of the eigenspace associated with the eigenvalue 1 is: {dimension} >= of the number of the components in the web graph(4).")

Exercise 2 Analysis:
The dimension of the eigenspace associated with the eigenvalue 1 is: 4 >= of the number of the components in the web graph(4).


## Exercise 3

In [11]:
filename="Graphs/exercise3_graph.dat"
print("Exercise 3 Analysis:")
A, labels = read_dat(filename) 

eigenvalues, eigenvectors = get_eigenpairs(A, k=A.shape[0]-1)
dimension = np.sum(np.isclose(eigenvalues, 1))
print(f"The dimension of the eigenspace associated with the eigenvalue 1 is: {dimension} because the web contains two closed strongly connected components. Indeed from the node group {1,2} we can't reach the node group {3,4,5} and from the node group {3,4} we can't reach the node group {1,2}.")

Exercise 3 Analysis:
The dimension of the eigenspace associated with the eigenvalue 1 is: 2 because the web contains two closed strongly connected components. Indeed from the node group (1, 2) we can't reach the node group (3, 4, 5) and from the node group (3, 4) we can't reach the node group (1, 2).


## Exercise 4

In [17]:
filename="Graphs/hollins.dat"
analyze_graph(filename, m)


Graphs/hollins.dat results saved to hollins_results.txt


(array([2.80419483e-05, 1.76690929e-02, 5.94542086e-05, ...,
        3.43596947e-05, 3.43596947e-05, 1.45427231e-04], shape=(6012,)),
 {1: 'http://www1.hollins.edu/',
  2: 'http://www.hollins.edu/',
  3: 'http://www1.hollins.edu/Docs/CompTech/Network/webmail_faq.htm',
  4: 'http://www1.hollins.edu/Docs/Forms/GetForms.htm',
  5: 'http://www1.hollins.edu/Docs/misc/travel.htm',
  6: 'http://www1.hollins.edu/Docs/GVCalendar/gvmain.htm',
  7: 'http://www1.hollins.edu/docs/events/events.htm',
  8: 'http://www1.hollins.edu/docs/comptech/mainviruses.htm',
  9: 'http://www1.hollins.edu/Docs/Academics/acad.htm',
  10: 'http://www1.hollins.edu/Docs/CompTech/Blackboard/bb_faq.htm',
  11: 'http://www1.hollins.edu/Docs/comptech/comptech.htm',
  12: 'http://www1.hollins.edu/Docs/Academics/international_programs/index.htm',
  13: 'http://www1.hollins.edu/Docs/academics/online/cyber.htm',
  14: 'http://www1.hollins.edu/Registrar/registrar.htm',
  15: 'http://www1.hollins.edu/Docs/Academics/writingcente

## Exercise 5

**Prove that in any web the importance score of a page with no backlinks is zero.**

The importance score $x_k$ of a page $k$ is determined by the weighted sum of scores of all pages $j$ that link to it. The definition is given by:

$$x_{k}=\sum_{j\in L_{k}}\frac{x_{j}}{n_{j}}$$

where $L_k$ is the set of pages linking to page $k$ (backlinks) and $n_j$ is the number of outgoing links from page $j$.

If a page $k$ has **no backlinks**, the set $L_k$ is empty ($L_k = \emptyset$). The summation is then performed over an empty set. By definition of summation over an empty set, the result is zero.

Thus, if a page has no backlinks, its importance score is $x_k = 0$.

## Exercise 6

In [20]:

def exercise_6(filename,i,j):
    print("\n" + "="*70)
    print("Exercise 6 Analysis:")
    A, labels = read_dat(filename)
    if A is None: return

    A_eigenvalues, A_eigenvectors = get_eigenpairs(A)
    x = A_eigenvectors[:, 0] # vector c chooosen for proving y=P·x
    l = A_eigenvalues[0] # corresponding eigenvalue

    # Create permutation matrix P that swaps nodes i and j
    # Use sparse matrix for P to maintain efficiency
    n = A.shape[0]
    P = sparse.eye(n, format='lil') # LIL is efficient for changing structure
    P[i-1, i-1] = 0
    P[j-1, j-1] = 0
    P[i-1, j-1] = 1
    P[j-1, i-1] = 1
    P = P.tocsc()
    
    # Compute the theoretical permuted adjacency matrix A2_theoretical
    A2_theoretical = P @ A @ P

    # Apply the swap to the graph file
    print(f"Referring to graph {filename}, we swap the pages with indices i={i} and j={j}.")
    output_file = "Graphs/exercise6_graph.dat"
    swap_node_indices(filename, output_file, i, j)
    A2, labels2 = read_dat(output_file)

    # Verify A2_theoretical == A2
    # Convert to dense for element-wise comparison loop (graph is small)
    A2_theo_dense = A2_theoretical.toarray()
    A2_dense = A2.toarray()

    print("1)")
    print(f"Verifying that the permuted adjacency matrix A2_theoretical matches the matrix A2 obtained by applying PagesRank on the new graph:", end=" ") 
    for r in range(A2_theo_dense.shape[0]):
        for c in range(A2_theo_dense.shape[1]):
            if (A2_theo_dense[r][c]!=A2_dense[r][c]):
                print(f"Mismatch at position ({r},{c}): A2_theoretical={A2_theo_dense[r][c]}, A2={A2_dense[r][c]}")
                return
    print("the matrices match perfectly.")

    print("2)")
    # Verify that l is an eigenvalue of A2 and find corresponding eigenvector y
    A2_eigenvalues, A2_eigenvectors = get_eigenpairs(A2)
    print(f"Eigenvalue lambda={np.real(l):.6f} of A is an eigenvalue for A2 too: {np.any(np.isclose(A2_eigenvalues, l))}.")
    
    idx_list = np.where(np.isclose(A2_eigenvalues, l))[0]
    if len(idx_list) > 0:
        idx = idx_list[0]
        y_found = A2_eigenvectors[:, idx]
        y_theoretical = P @ x
        
        # Normalize vectors for comparison (eigenvectors are direction only)
        y_found = y_found / np.linalg.norm(y_found)
        y_theoretical = y_theoretical / np.linalg.norm(y_theoretical)

        # 1. verify if y_found is equal to y_theoretical (Px)
        match_positive = np.allclose(y_found, y_theoretical)
        # 2. verify if y_found is equal to -y_theoretical (-Px)
        match_negative = np.allclose(y_found, -y_theoretical)
        # Theorem verified if y_found is equal to either Px or -Px
        is_proven = match_positive or match_negative
        print(f"Verifying that y corresponding to l is equal to P·x (or -P·x): {is_proven}.")    

    """
    Argumentation on the Invariance of Importance Scores:

    1. Transposition Result (Swapping two pages i and j):
    The analysis showed that the relabeled link matrix is Ã = P A P.
    The importance score eigenvector y of Ã is related to the eigenvector x of A by y = Px (or y = -Px).
    Since P is an elementary transposition matrix, the operation y = Px has the effect of SWAPPING the i-th and j-th components of the vector x.
    
    This means that:
    - The new score for the page with index i (y_i) is the original score of page j (x_j).
    - The new score for the page with index j (y_j) is the original score of page i (x_i).
    
    The intrinsic importance of each page (based on its connectivity) is PRESERVED; only the index assigned to it has changed.

    2. Generalization to Any Permutation:
    Any general permutation matrix Q (representing an arbitrary relabeling of N pages) can be expressed as the product of a sequence of elementary transposition matrices (Q = P_k * ... * P_1).
    
    Since we proved that each single transposition (P) does not alter the MAGNITUDES of the scores (only their position in the vector), a sequence of such operations (Q) will also keep the score magnitudes unchanged.
    
    Therefore, ANY arbitrary relabeling of pages leaves the intrinsic importance scores unchanged; it merely permutes (reorganizes) those values within the score vector.
    """
    return

In [21]:
filename="Graphs/exercise2_graph.dat"
i=2
j=3
exercise_6(filename,i,j)


Exercise 6 Analysis:
Referring to graph Graphs/exercise2_graph.dat, we swap the pages with indices i=2 and j=3.
Indecises 2 and 3 swapped (Nodes and Links). Result saved in 'Graphs/exercise6_graph.dat'.
1)
Verifying that the permuted adjacency matrix A2_theoretical matches the matrix A2 obtained by applying PagesRank on the new graph: the matrices match perfectly.
2)
Eigenvalue lambda=-1.000000 of A is an eigenvalue for A2 too: True.
Verifying that y corresponding to l is equal to P·x (or -P·x): False.


## Exercise 7

In [22]:
def exercise_7_stochastic_proof(filename):
    print("\n" + "="*70)
    print("Exercise 7 Analysis:")
    m=0.15
    A, labels = read_dat(filename)
    if A is None: return
    """
    Exercise 7: Proof that M = (1-m)A + mS is column-stochastic.
    This function provides the formal proof and a numerical verification.
    """
    n = A.shape[0]
    # 1. Formal Proof (Printed Argument)
    """
    Formal Proof:
    1. A is column-stochastic: Sum_i(A_ij) = 1 for all j.
    2. S is column-stochastic: S_ij = 1/n, so Sum_i(S_ij) = n * (1/n) = 1 for all j.

    Sum of the j-th column of M:
    Sum_i(M_ij) = Sum_i[ (1-m)A_ij + mS_ij ]
                = (1-m) * Sum_i(A_ij) + m * Sum_i(S_ij)  (By linearity)
                = (1-m) * (1) + m * (1)                 (Substituting the known sums)
                = 1 - m + m = 1

    Conclusion: Since the sum of every column of M is 1, M is column-stochastic.
    """
    
    # 2. Numerical Verification (Using the input matrix A)
    print(f"Numerical Verification (using graph: {filename}):")
    # For stochastic check, we can use column sums directly without forming dense M
    # sum(M) = (1-m)sum(A) + m*sum(S). sum(A) is 1 (if no dangling), sum(S) is 1.
    
    col_sums_A = np.array(A.sum(axis=0)).flatten()
    # Handle dangling nodes where sum(A) is 0 for verification context
    col_sums_M = (1 - m) * col_sums_A + m * 1.0
    
    print(f"Matrix M is column-stochastic: {np.allclose(col_sums_M, 1.0)}")
    return

In [24]:
exercise_7_stochastic_proof("Graphs/exercise2_graph.dat")


Exercise 7 Analysis:
Numerical Verification (using graph: Graphs/exercise2_graph.dat):
Matrix M is column-stochastic: True


## Exercise 8

**Prove that the product of two column-stochastic matrices is also column-stochastic.**

Let $A$ and $B$ be two $n \times n$ column-stochastic matrices. Let $C = AB$ be their product.
A matrix is column-stochastic if all its entries are non-negative and the entries in each column sum to one.

### Formal Proof
We must show that the sum of the elements in the $j$-th column of $C$ is $1$. The element $C_{ij}$ is defined by the matrix multiplication:

$$C_{ij} = \sum_{k=1}^n A_{ik} B_{kj}$$

The sum of the $j$-th column of $C$ is:

$$\sum_{i=1}^n C_{ij} = \sum_{i=1}^n \left[ \sum_{k=1}^n A_{ik} B_{kj} \right]$$

Swap the order of summation (using Fubini's theorem for finite sums):

$$\sum_{i=1}^n C_{ij} = \sum_{k=1}^n \left[ \sum_{i=1}^n A_{ik} B_{kj} \right]$$

Since $B_{kj}$ is constant with respect to the index $i$, we factor it out of the inner sum:

$$\sum_{i=1}^n C_{ij} = \sum_{k=1}^n \left[ B_{kj} \sum_{i=1}^n A_{ik} \right]$$

Since $A$ is column-stochastic, the sum of the elements in the $k$-th column of $A$ is $1$:

$$\sum_{i=1}^n A_{ik} = 1$$

Substitute this into the expression:

$$\sum_{i=1}^n C_{ij} = \sum_{k=1}^n \left[ B_{kj} \cdot 1 \right] = \sum_{k=1}^n B_{kj}$$

Since $B$ is also column-stochastic, the sum of the elements in the $j$-th column of $B$ is $1$:

$$\sum_{k=1}^n B_{kj} = 1$$

Therefore, $\sum_{i=1}^n C_{ij} = 1$. The product matrix $C = AB$ is also column-stochastic.

## Exercise 9

**Show that a page with no backlinks is given importance score $m/n$ by formula (3.2).**

The modified importance score equation (3.2) is given by:

$$x = (1-m)Ax + m s$$

where $x_i$ is the importance score of page $i$, $A$ is the link matrix, $m$ is the damping factor ($0 \le m \le 1$), and $s$ is the column vector with all entries $1/n$.

We consider the $i$-th component of the equation, corresponding to a page $i$ with no backlinks.

1. **Term $Ax$**: The term $(Ax)_i$ is the score contribution to page $i$ from all pages linking to it. If page $i$ has no backlinks, the $i$-th row of the link matrix $A$ contains only zeros ($A_{ij} = 0$ for all $j$).
$$(Ax)_i = \sum_{j=1}^n A_{ij} x_j = \sum_{j=1}^n 0 \cdot x_j = 0$$

2. **Term $s$**: The vector $s$ is the column vector with all entries $1/n$. Thus, $s_i = 1/n$.

Substituting these components into the $i$-th equation for the importance score $x_i$:

$$x_i = (1-m)(Ax)_i + m s_i$$
$$x_i = (1-m)(0) + m \left(\frac{1}{n}\right)$$
$$x_i = \frac{m}{n}$$

The importance score for a page with no backlinks is $\frac{m}{n}$. This guarantees a non-zero score if $m>0$.

## Exercise 10

**Suppose that $A$ is the link matrix for a strongly connected web of $n$ pages (any page can be reached from any other page by following a finite number of links). Show that $\dim(V_1(A)) = 1$ as follows.**

* **Note that page $i$ can be reached from page $j$ in one step if and only if $A_{ij} > 0$. Show that $(A^2)_{ij} > 0$ if and only if page $i$ can be reached from page $j$ in exactly two steps.**
    
    By definition of matrix multiplication, $(A^2)_{ij} = \sum_{k=1}^n A_{ik}A_{kj}$. Since all entries of $A$ are non-negative ($A_{ij} \ge 0$), the sum is strictly positive if and only if there exists at least one index $k$ such that the term $A_{ik}A_{kj} > 0$.
    The condition $A_{ik}A_{kj} > 0$ implies both $A_{ik} > 0$ and $A_{kj} > 0$.
    * $A_{kj} > 0$ means there is a link $j \to k$.
    * $A_{ik} > 0$ means there is a link $k \to i$.
    
    Therefore, $(A^2)_{ij} > 0$ implies there is a path $j \to k \to i$ of exactly two steps.

* **Show more generally that $(A^p)_{ij} > 0$ if and only if page $i$ can be reached from page $j$ in exactly $p$ steps.**
    
    We prove this by induction. The base case $p=1$ is given. Assume the statement holds for $p-1$.
    Using matrix multiplication: $(A^p)_{ij} = \sum_{k=1}^n A_{ik} (A^{p-1})_{kj}$.
    Since entries are non-negative, $(A^p)_{ij} > 0$ if and only if there exists an intermediate page $k$ such that $A_{ik} > 0$ (1 step from $k$ to $i$) and $(A^{p-1})_{kj} > 0$ ($p-1$ steps from $j$ to $k$).
    Concatenating the path from $j$ to $k$ with the link to $i$ results in a path of exactly $(p-1)+1 = p$ steps.

* **Argue that $(I + A + A^2 + \cdots + A^p)_{ij} > 0$ if and only if page $i$ can be reached from page $j$ in $p$ or fewer steps.**
    
    Let $S_p = \sum_{k=0}^p A^k$. The entry $(S_p)_{ij}$ is the sum of non-negative terms: $(S_p)_{ij} = (I)_{ij} + (A)_{ij} + \dots + (A^p)_{ij}$.
    This sum is strictly positive if and only if **at least one** term in the summation is positive.
    That is, $(S_p)_{ij} > 0$ iff $\exists k \in \{0, \dots, p\}$ such that $(A^k)_{ij} > 0$.
    Since $(A^k)_{ij} > 0$ corresponds to reachability in exactly $k$ steps, the positivity of the sum implies that page $i$ is reachable from page $j$ in $0$ steps, or $1$ step, $\dots$, or $p$ steps. 

* **Explain why $I + A + A^2 + \cdots + A^{n-1}$ is a positive matrix if the web is strongly connected.**
    
    A web of $n$ pages is **strongly connected** if every page $i$ is reachable from every other page $j$. The longest shortest path in such a graph is at most $n-1$ steps.
    Therefore, the sum matrix $S_{\text{sum}}$ must be a **positive matrix** (all entries strictly greater than zero):
    $$S_{\text{sum}} = I + A + A^2 + \dots + A^{n-1}$$
    Since every page $i$ is reachable from every page $j$ within $n-1$ steps, for every entry $(i, j)$, there is at least one term $(A^k)_{ij}$ in the sum that is positive.

* **Use the last part (and Exercise 8) to show that $B$ is positive and column-stochastic.**
    
    We define the auxiliary matrix $B$ as:
    $$B = \frac{1}{n}(I + A + A^2 + \cdots + A^{n-1})$$
    The matrix $B$ has two crucial properties:
    1.  $B$ is **Positive** ($B_{ij} > 0$ for all $i, j$): This follows directly from the positivity of $S_{\text{sum}}$ established above.
    2.  $B$ is **Column-Stochastic**: Since $A$ is column-stochastic, all its powers $A^k$ and the identity matrix $I$ are also column-stochastic. The sum of $n$ column-stochastic matrices results in a matrix where all columns sum to $n$. Dividing by $n$ ensures $B$ is column-stochastic.
    
    Since $B$ is a positive and column-stochastic matrix, Lemma 3.2 guarantees that the eigenspace $V_1(B)$ has a unique dimension: $\dim(V_1(B)) = 1$.

* **Show that if $x \in V_1(A)$ then $x \in V_1(B)$. Why does this imply that $\dim(V_1(A)) = 1$?**
    
    We must show that any eigenvector of $A$ (for $\lambda=1$) is also an eigenvector of $B$.
    Let $x$ be an eigenvector such that $Ax = x$. It follows that $A^k x = x$ for all powers $k \ge 0$. Substituting this into the definition of $B$:
    
    $$Bx = \frac{1}{n}(Ix + Ax + A^2x + \dots + A^{n-1}x)$$
    $$Bx = \frac{1}{n}(\underbrace{x + x + x + \dots + x}_{n \text{ terms}}) = \frac{1}{n}(nx) = x$$
    
    Thus, $x \in V_1(A)$ implies $x \in V_1(B)$, meaning $V_1(A)$ is a subspace of $V_1(B)$ ($V_1(A) \subseteq V_1(B)$).
    
    Since $\dim(V_1(B)) = 1$, the only non-empty subspaces of $V_1(B)$ are the zero subspace and $V_1(B)$ itself. Since $A$ is column-stochastic, we know 1 is an eigenvalue, so $V_1(A)$ contains non-zero vectors.
    Therefore, we conclude that:
    $$\mathbf{\dim(V_1(A)) = 1}$$
    This guarantees that the PageRank vector is **unique** for any strongly connected web.

## Exercise 11

In [25]:
def exercise_11():
    print("Exercise 11 Analysis:")
    analyze_graph("Graphs/exercise11_graph.dat", m=0.15)
    return

In [26]:
exercise_11()

Exercise 11 Analysis:

Graph Graphs/exercise11_graph.dat
  Converged in 29 iterations
  - No dangling nodes detected
PageRank scores (sorted by importance):
--------------------------------------------------
  1. Node3               : 0.348894
  2. Node1               : 0.237141
  3. Node5               : 0.178280
  4. Node4               : 0.138495
  5. Node2               : 0.097190



## Exercise 12

In [27]:
def exercise_12():
    print("Exercise 12 Analysis:")
    A,labels = read_dat("Graphs/exercise12_graph.dat")
    if A is None: return
    
    eigenvalues, eigenvectors = get_eigenpairs(A)
    idx_list = np.where(np.isclose(eigenvalues, 1))[0]
    
    if len(idx_list) > 0:
        idx = idx_list[0]
        x_raw = np.real(eigenvectors[:, idx])
        importance_score = x_raw / x_raw.sum()
        print("Importance scores with matrix A:")
        sorted_indices = np.argsort(importance_score)[::-1]
        for rank, idx in enumerate(sorted_indices, 1):
            node_label = labels[idx + 1]
            score = importance_score[idx]
            print(f"  {rank}. {node_label:20s}: {score:.6f}")
            
    print("\nNow using PageRank with m=0.15:")
    analyze_graph("Graphs/exercise12_graph.dat", m=0.15)
    print("The Exercise 12 results demonstrate that the original PageRank model (Matrix A) fails to assign any importance to the dangling Node 6 (0.00) because it lacks backlinks, whereas the modified PageRank model (Matrix M) successfully incorporates Node 6's contribution by giving it a positive minimal score (m/n = 0.025000), distributing its importance across the web and providing a more robust, non-ambiguous ranking where Node 3 remains the most important page in both scenarios.\n\n")
    return
exercise_12()

Exercise 12 Analysis:
Importance scores with matrix A:
  1. Node3               : 0.367347
  2. Node1               : 0.244898
  3. Node5               : 0.183673
  4. Node4               : 0.122449
  5. Node2               : 0.081633
  6. Node6               : -0.000000

Now using PageRank with m=0.15:

Graph Graphs/exercise12_graph.dat
  Converged in 28 iterations
  - No dangling nodes detected
PageRank scores (sorted by importance):
--------------------------------------------------
  1. Node3               : 0.340172
  2. Node1               : 0.231212
  3. Node5               : 0.173823
  4. Node4               : 0.135033
  5. Node2               : 0.094760
  6. Node6               : 0.025000

The Exercise 12 results demonstrate that the original PageRank model (Matrix A) fails to assign any importance to the dangling Node 6 (0.00) because it lacks backlinks, whereas the modified PageRank model (Matrix M) successfully incorporates Node 6's contribution by giving it a positive mini

## Exercise 13

In [28]:
def exercise_13():
    print("="*70)
    print("Exercise 13 Analysis:")
    analyze_graph("Graphs/exercise13_graph.dat", m=0.15)
    print("The analysis using matrix M shows that the isolated pair (Nodes 6-7) outranks the peripheral nodes of the larger cluster (Nodes 2-5). This demonstrates that out-degree dilution (x_1/4) significantly weakens the authority transferred by the central hub compared to the undiluted reciprocity (x_j/1) retained within the smaller clique.\n\n")
    return
exercise_13()

Exercise 13 Analysis:

Graph Graphs/exercise13_graph.dat
  Converged in 85 iterations
  - No dangling nodes detected
PageRank scores (sorted by importance):
--------------------------------------------------
  1. Node1               : 0.339768
  2. NOde7               : 0.142857
  3. Node6               : 0.142857
  4. Node5               : 0.093629
  5. Node4               : 0.093629
  6. Node3               : 0.093629
  7. Node2               : 0.093629

The analysis using matrix M shows that the isolated pair (Nodes 6-7) outranks the peripheral nodes of the larger cluster (Nodes 2-5). This demonstrates that out-degree dilution (x_1/4) significantly weakens the authority transferred by the central hub compared to the undiluted reciprocity (x_j/1) retained within the smaller clique.




## Exercise 14

In [29]:
def exercise_14():
    print("\n" + "="*70)
    print("Exercise 14 Analysis (Convergence Speed):")
    
    filename = "Graphs/exercise11_graph.dat"
    m = 0.15
    A, labels = read_dat(filename)
    n = A.shape[0]
    
    # Explicitly construct M to calculate eigenvalues and c
    S = np.ones((n, n)) / n
    M = (1 - m) * A + m * S
    
    # Calculation of c according to Proposition 4
    min_M_ij = np.min(M) 
    c_bound = 1 - 2 * min_M_ij
    
    # Calculation of eigenvalues to find lambda_2
    eigenvalues, _ = np.linalg.eig(M)
    sorted_abs_eig = np.sort(np.abs(eigenvalues))[::-1] 
    lambda_2 = sorted_abs_eig[1] # second largest eigenvalue
    
    print(f"Theoretical Bound c (Prop 4): {c_bound:.6f}")
    print(f"Second Largest Eigenvalue |lambda_2|: {lambda_2:.6f}")
    print(f"Expected Convergence Rate (1-m): {1-m:.6f}")
    print("-" * 60)

    # 3. Find the "True" q (using a very tight convergence)
    s_vec = np.ones(n) / n
    q = power_iteration_with_vector(A, s_vec, m, sys.stdout, tolerance=1e-14, max_iterations=2000)
    
    np.random.seed(42)
    x_k = np.random.rand(n)
    x_k = x_k / np.sum(x_k)
    
    # Initial error (k=0)
    error_prev = np.linalg.norm(x_k - q, 1)
    
    print(f"\nIteration Analysis:")
    print(f"{'k':<5} | {'Error ||M^k x - q||_1':<25} | {'Ratio (Err_k / Err_k-1)':<25}")
    print("-" * 60)
    
    target_steps = range(5,51,5)
    
    for k in range(1, 51):
        
        x_new = (1 - m) * (A @ x_k) + m * s_vec
        x_new = x_new / np.sum(x_new)
        
        error_curr = np.linalg.norm(x_new - q, 1)
        ratio = error_curr / error_prev if error_prev > 0 else 0
        
        if k in target_steps:
            print(f"{k:<5} | {error_curr:<25} | {ratio:<25}")
        
        x_k = x_new
        error_prev = error_curr

    print("The results confirm that the PageRank algorithm converges much faster than the pessimistic theoretical bound suggested by Proposition 4, effectively stabilizing at a rate determined by the second largest eigenvalue lambda_2=0.61, which is well below the upper limit of 1-m = 0.85.\n\n")
    return
exercise_14()


Exercise 14 Analysis (Convergence Speed):
Theoretical Bound c (Prop 4): 0.940000
Second Largest Eigenvalue |lambda_2|: 0.611269
Expected Convergence Rate (1-m): 0.850000
------------------------------------------------------------
  Converged in 66 iterations

Iteration Analysis:
k     | Error ||M^k x - q||_1     | Ratio (Err_k / Err_k-1)  
------------------------------------------------------------
5     | 0.003573691412488089      | 0.3299608024509418       
10    | 0.00022442754413923294    | 0.658539955774984        
15    | 1.83865894923807e-05      | 0.6099292256586637       
20    | 1.577051431014831e-06     | 0.6112216854667655       
25    | 1.3453636904525723e-07    | 0.6112793481431336       
30    | 1.1481787284828293e-08    | 0.6112679104506388       
35    | 9.798663408444597e-10     | 0.6112635693374094       
40    | 8.362867343070235e-11     | 0.6113269674755295       
45    | 7.131586987618732e-12     | 0.6105859372215212       
50    | 6.139810881933272e-13     | 0

## Exercise 15

**To see why the second largest eigenvalue plays a role in bounding**

$$
\frac{\lVert M^k x_0 - q \rVert_1}{\lVert M^{k-1} x_0 - q \rVert_1},
$$

consider an $n \times n$ positive column-stochastic matrix $M$ that is diagonalizable. Let $x_0$ be any vector with non-negative components that sum to one. Since $M$ is diagonalizable, we can create a basis of eigenvectors $\{q, v_1, \ldots, v_{n-1}\}$, where $q$ is the steady state vector, and then write:

$$
x_0 = a q + \sum_{k=1}^{n-1} b_k v_k.
$$

Determine $M^k x_0$, and then show that $a = 1$ and the sum of the components of each $v_k$ must equal $0$. Next apply Proposition 4 to prove that, except for the non-repeated eigenvalue $\lambda = 1$, the other eigenvalues are all strictly less than one in absolute value. Use this to evaluate:

$$
\lim_{k \to \infty} \frac{\lVert M^k x_0 - q \rVert_1}{\lVert M^{k-1} x_0 - q \rVert_1}.
$$

## Spectral Decomposition and Initial Setup

We assume $M$ is a positive, column-stochastic, and diagonalizable matrix. The desired PageRank vector $q$ is the eigenvector associated with the unique unit eigenvalue $\lambda=1$. Let $\{q, v_1, v_2, \dots, v_{n-1}\}$ be a basis of eigenvectors for $M$, with corresponding eigenvalues $\{1, \lambda_2, \lambda_3, \dots, \lambda_{n-1}\}$.

The initial probability vector $x_0$ (where $\sum_i x_{0_i}=1$) is expressed by the spectral expansion:
$$x_0 = a q + \sum_{j=2}^{n} b_j v_j$$

Applying the matrix $M$ for $k$ iterations yields the $k$-th approximation:
$$M^k x_0 = a M^k q + \sum_{j=2}^{n} b_j M^k v_j = a q + \sum_{j=2}^{n} b_j \lambda_j^k v_j$$

## Determining Coefficients and Vector Sums

Let $e$ be the column vector of all ones. The sum of the components of any vector $y$ is $e^T y$.

1.  **Sum of components of $v_j$:**
    Since $M$ is column-stochastic, we have $e^T M = e^T$. Applying this to the eigenvalue equation $M v_j = \lambda_j v_j$:
    $$e^T M v_j = \lambda_j e^T v_j \implies (1 - \lambda_j) (e^T v_j) = 0$$
    Since $\lambda_j \neq 1$ for $j \ge 2$, we conclude that $e^T v_j = 0$.

2.  **Determination of $a$:**
    Substituting the expansion of $x_0$ into $e^T x_0 = 1$:
    $$1 = a (e^T q) + \sum_{j=2}^{n} b_j (e^T v_j) = a(1) + 0 \implies a = 1$$

## Eigenvalue Magnitudes and Limit

With $a=1$, the error vector is $M^k x_0 - q = \sum_{j=2}^{n} b_j \lambda_j^k v_j$.
[Image of power method convergence and eigenvalues]
Applying Proposition 4, for any vector $v$ with $\sum v_i = 0$, $||Mv||_1 \le c ||v||_1$ with $c < 1$. This implies $|\lambda_j| \le c < 1$ for all $j \ge 2$.

As $k \to \infty$, the error is dominated by the second largest eigenvalue $\lambda_2$:
$$L = \lim_{k\rightarrow\infty} \frac{||b_2 \lambda_2^k v_2||_1}{||b_2 \lambda_2^{k-1} v_2||_1} = |\lambda_2|$$

## Exercise 16

**Consider the link matrix $A= \begin{pmatrix}0&\frac{1}{2}&\frac{1}{2}\\ 0&0&\frac{1}{2}\\ 1&\frac{1}{2}&0\end{pmatrix}$. Show that $M=(1-m)A+mS$ (where $S_{ij}=1/3$) is not diagonalizable for $0 < m < 1$.**

For a matrix to be diagonalizable, the **geometric multiplicity** (m.g.) of every eigenvalue must equal its **algebraic multiplicity** (m.a.).

### 1. Eigenvalues of $M$
The eigenvalues of $A$ are $\lambda_{A,1} = 1$ (m.a. 1) and $\lambda_{A,2} = -1/2$ (m.a. 2).
The eigenvalues of $M$ follow the transformation $\lambda_{M,j} = (1-m)\lambda_{A,j}$ (except for $\lambda=1$):
* $\lambda_1 = 1$ (m.a. = 1)
* $\lambda^* = -\frac{1-m}{2}$ (m.a. = 2)

### 2. Geometric Multiplicity of $\lambda^*$
We compute $\text{m.g.}(\lambda^*) = 3 - \text{rank}(M - \lambda^* I)$. Let $D = M - \lambda^* I$.

Analysing the columns of $D$:
* The columns $d_2$ and $d_3$ are linearly dependent ($d_3 = d_2$).
* However, $d_1$ and $d_2$ are linearly independent for any $0 < m < 1$.

Thus, $\text{rank}(D) = 2$.

### 3. Conclusion
$$\text{m.g.}(\lambda^*) = 3 - 2 = 1$$
Since $\text{m.g.}(\lambda^*) = 1$ and $\text{m.a.}(\lambda^*) = 2$, the matrix $M$ is **not diagonalizable**.

## Exercise 17

**How should the value of $m$ be chosen? How does this choice affect the rankings and the computation time?**

The parameter $m$ (damping factor) balances the link structure $A$ and the random jump matrix $S$.


### Effect on Computation Time
The convergence speed of the Power Method depends on the second largest eigenvalue $|\lambda_2| \le 1 - m$:
* **High $m$:** Small $|\lambda_2|$, **fast convergence**.
* **Low $m$:** $|\lambda_2|$ close to 1, **slow convergence**.

### Effect on Rankings
* **$m \approx 1$:** The ranking becomes "egalitarian" (all pages equal), ignoring the actual web structure.
* **$m \approx 0$:** The ranking is pure but unstable (problems with disconnected components and dangling nodes).
* **Balanced $m$ (e.g., 0.15):** Ensures a unique solution, handles disconnected graphs, and maintains high performance.

**Conclusion:** Google's $m=0.15$ is a heuristic choice that provides a stable, unique ranking while ensuring the algorithm converges in a reasonable number of iterations.