# Randomized Truncated SVD

- subtitle: My favorite algorithm and why you should have one too.
- badges: true
- categories: [fastpages, jupyter]

In [1]:
!pip install altair

import numpy as np
import altair as alt
import pandas as pd

from collections import namedtuple



- Modified from [scikit-learn's implementation](https://github.com/scikit-learn/scikit-learn/blob/15a949460dbf19e5e196b8ef48f9712b72a3b3c3/sklearn/utils/extmath.py#L246)

In [2]:
# Container for the three factors of a decomposition M ~= U @ diag(s) @ Vt.
SVD = namedtuple("SVD", "U, s, Vt")

def svd_flip(svd: SVD):
    """Fix the sign ambiguity of an SVD, modifying its factors in place.

    Each column of ``U`` is multiplied by the sign of its largest-magnitude
    entry, so that entry ends up positive.  The matching row of ``Vt`` gets
    the same sign, which leaves the product ``U @ diag(s) @ Vt`` unchanged.

    Parameters
    ----------
    svd : SVD
        Decomposition whose ``U`` has as many columns as ``Vt`` has rows.
        Its ``U`` and ``Vt`` arrays are overwritten.

    Returns
    -------
    SVD
        The very same ``svd`` object that was passed in.
    """
    left, right = svd.U, svd.Vt
    # Row index of the dominant entry of every column of U.
    pivot_rows = np.abs(left).argmax(axis=0)
    column_signs = np.sign(left[pivot_rows, np.arange(left.shape[1])])
    # Write through ``out=`` so the caller's arrays are updated in place.
    np.multiply(left, column_signs, out=left)
    np.multiply(right, column_signs.reshape(-1, 1), out=right)
    return svd

def randomized_range_finder(M, *, size, n_iter, rgen):
    """Compute an orthonormal basis that approximates the range of ``M``.

    A Gaussian test matrix is pushed through ``n_iter`` rounds of power
    (subspace) iteration and the result is orthonormalized with QR.

    The iterate is re-orthonormalized after every multiplication by ``M`` or
    ``M.T``.  Without that step all columns drift towards the dominant
    singular direction and the weaker directions are lost to rounding error
    after a few iterations.

    Parameters
    ----------
    M : array of shape (n_samples, n_features)
        Matrix whose range is sought.
    size : int
        Number of random test vectors, i.e. the requested basis size.
    n_iter : int
        Number of power iterations; ``0`` means a single projection.
    rgen : object with a ``normal(size=...)`` method
        Random generator; exactly one ``normal`` draw of shape
        ``(n_features, size)`` is consumed.

    Returns
    -------
    Q : array of shape (n_samples, k)
        Matrix with orthonormal columns, where ``k`` is ``size`` capped by
        the dimensions of ``M`` (reduced QR cannot return more columns than
        its input has rows).
    """
    Q = rgen.normal(size=(M.shape[1], size))
    for _ in range(n_iter):
        # Orthonormalize after each half-step to keep the basis
        # well-conditioned.
        Q, _r = np.linalg.qr(M @ Q)
        Q, _r = np.linalg.qr(M.T @ Q)
    Q, _r = np.linalg.qr(M @ Q)
    return Q

def randomized_svd(M, n_components, *, rgen, n_oversamples=10, n_iter=7):
    """Approximate the leading ``n_components`` singular triplets of ``M``.

    ``M`` is projected onto a random low-dimensional basis of its range, the
    small projected matrix is decomposed exactly, and the left singular
    vectors are mapped back to the original space.

    Parameters
    ----------
    M : array of shape (n_samples, n_features)
        Matrix to decompose.
    n_components : int
        Number of singular values/vectors to return.
    rgen : object with a ``normal(size=...)`` method
        Random generator handed to ``randomized_range_finder``.
    n_oversamples : int, default 10
        Extra random vectors sampled on top of ``n_components``.
    n_iter : int, default 7
        Number of power iterations used by the range finder.

    Returns
    -------
    SVD
        Named tuple ``(U, s, Vt)`` truncated to ``n_components``.
    """
    n_random = n_components + n_oversamples

    Q = randomized_range_finder(M, size=n_random, n_iter=n_iter, rgen=rgen)
    # Only the leading rows of Vt are kept, so request the thin SVD instead
    # of building a full (n_features, n_features) right factor.
    Uhat, s, Vt = np.linalg.svd(Q.T @ M, full_matrices=False)
    U = Q @ Uhat
    return SVD(U[:, :n_components], s[:n_components], Vt[:n_components, :])

In [3]:
def sample_lowrank_matrix(size, max_rank, *, rgen, noise=0.0):
    """Draw a random ``size x size`` matrix that is low-rank plus noise.

    The low-rank part is ``A @ A.T`` for a Gaussian ``A`` of shape
    ``(size, max_rank)``.  A dense Gaussian matrix scaled by ``noise`` is
    added and the sum is divided by ``size``.

    Both Gaussian draws are always taken from ``rgen``, even when
    ``noise`` is zero, so the generator advances by the same amount
    regardless of ``noise``.
    """
    factor = rgen.normal(size=(size, max_rank))
    perturbation = rgen.normal(size=(size, size))
    gram = factor @ factor.T
    return (gram + noise * perturbation) / size

In [4]:
# Seeded generator so the notebook draws the same matrices on every run.
rgen = np.random.RandomState(1234)
# Noise-free 1000 x 1000 test matrix of rank at most 3 (noise defaults to 0).
M = sample_lowrank_matrix(1000, 3, rgen=rgen)
# Randomized estimate of the 10 leading singular triplets; it draws from the
# same generator, so the statement order here determines the results.
svd_random = randomized_svd(M, 10, rgen=rgen)
# Baseline: NumPy's dense SVD of the same matrix.
svd_standard = SVD(*np.linalg.svd(M))

In [5]:
def plot_singular_values(limit_ranks, **kwargs):
    """Scatter-plot the leading singular values of several decompositions.

    Parameters
    ----------
    limit_ranks : int
        Keep only the first ``limit_ranks`` singular values of each entry.
    **kwargs
        Mapping from a legend label to an object exposing a 1-D ``s``
        array of singular values.

    Returns
    -------
    An interactive Altair chart with one point per (algorithm, rank) pair.
    """
    frames = []
    for label, decomposition in kwargs.items():
        values = decomposition.s[:limit_ranks]
        frames.append(
            pd.DataFrame(
                {
                    "Rank": np.arange(len(values)),
                    "Value": values,
                    "Algorithm": label,
                }
            )
        )
    table = pd.concat(frames)

    value_axis = alt.Y("Value", axis=alt.Axis(title="Singular Value"))
    points = alt.Chart(table).mark_point(size=40)
    chart = points.encode(
        x="Rank",
        y=value_axis,
        color="Algorithm",
        shape="Algorithm",
        tooltip=["Algorithm", "Value"],
    )
    return chart.interactive()

def plot_singular_vector_errors(svd1, svd2, *, limit_ranks):
    """Plot the per-rank distance between two sets of left singular vectors.

    Both decompositions are truncated to ``limit_ranks`` components and
    sign-normalized with ``svd_flip``.  The Euclidean norm of the difference
    of corresponding columns of ``U`` is then drawn as a dashed gray line.

    The sign normalization runs on truncated copies, so the arrays held by
    ``svd1`` and ``svd2`` are left untouched.  Truncating first also keeps
    ``U`` and ``Vt`` at matching sizes when a full SVD of a non-square
    matrix is passed in.

    Parameters
    ----------
    svd1, svd2 : SVD
        Decompositions to compare; their ``U`` factors must have the same
        number of rows.
    limit_ranks : int
        Number of leading singular vectors to compare.

    Returns
    -------
    An interactive Altair line chart of error versus rank.
    """
    def _aligned_left_vectors(svd):
        # svd_flip works in place, so hand it private copies.
        truncated = SVD(
            svd.U[:, :limit_ranks].copy(),
            svd.s[:limit_ranks],
            svd.Vt[:limit_ranks].copy(),
        )
        return svd_flip(truncated).U

    errors = np.linalg.norm(
        _aligned_left_vectors(svd1) - _aligned_left_vectors(svd2), axis=0
    )
    data = pd.DataFrame(
        {
            "Rank": np.arange(len(errors)), "Error": errors
        }
    )
    return (
        alt.Chart(data)
        .mark_line(color="gray", strokeDash=[2, 2])
        .encode(
            x="Rank",
            y=alt.Y("Error", axis=alt.Axis(title='Error', titleColor="gray")),
            tooltip=["Error"],
        )
    ).interactive()

# Overlay the singular values (points) with the per-rank singular-vector
# error (dashed line), each on its own y-axis.  The legend labels must match
# the data: the exact decomposition is "SVD", the approximation is
# "Randomized SVD".
alt.layer(
    plot_singular_values(limit_ranks=10, **{"SVD": svd_standard, "Randomized SVD": svd_random}),
    plot_singular_vector_errors(svd_random, svd_standard, limit_ranks=10)
).resolve_scale(y="independent")

In [6]:
# Repeat the experiment with a Gaussian perturbation of scale 10 added to the
# rank-3 part.  rgen is not reseeded; it continues from its current state.
M = sample_lowrank_matrix(1000, 3, rgen=rgen, noise=10)
# Randomized estimate of the 10 leading singular triplets of the noisy matrix.
svd_random = randomized_svd(M, 10, rgen=rgen)
# Baseline: NumPy's dense SVD of the same matrix.
svd_standard = SVD(*np.linalg.svd(M))

In [8]:
# Same overlay for the noisy matrix: singular values as points, per-rank
# singular-vector error as a dashed line, on independent y-axes.  The exact
# decomposition is labelled "SVD" and the approximation "Randomized SVD".
alt.layer(
    plot_singular_values(limit_ranks=10, **{"SVD": svd_standard, "Randomized SVD": svd_random}),
    plot_singular_vector_errors(svd_random, svd_standard, limit_ranks=10)
).resolve_scale(y="independent")