# Project DD2434
## Run instructions
Tested in python 3.12.6, in a .venv environment

## Datasets
### Text document datasets
The original paper used 4 datsets from different newsgroups, sci.crypt, sci.med, sci.space and soc.religion.christian, which are available at `URL`

We also used `DATASET`

### Image datasets


The original paper used 13 monochromatic images of still life as the basis for the image datasets, which are unfortunately not available any longer. 




In [44]:
import numpy as np
from matplotlib import pyplot as plt
import scipy

In [53]:
def normalizecolumns(d):
    """
    Return a matrix where the columns have unit length
    """
    
    o = np.zeros(d.shape)
    for col in range(d.shape[1]):
        total_length = 0
        for row in range(d.shape[0]):
            total_length += d[(row,col)]**2
        
        for row in range(d.shape[0]):
            o[(row,col)] = d[(row,col)]/np.sqrt(total_length)
    return o
    
    

def RP(dataset,k):
    """
    Dataset - d x N, where d is the # of dimensions, N is the # of data points.
    Creates appropriate matrix R (k x d), to transform into a lower dimension representation
    """
    d = dataset.shape[0]
    return normalizecolumns(np.random.normal(size=(k,d)))


def SRP(dataset,k):
    d = dataset.shape[0]
    o = np.zeros((k,d))
    for row in range(k):
        for col in range(d):
            r = np.random.random()
            if r < 1/6:
                o[(row,col)] = np.sqrt(3)
            elif r > 5/6:
                o[(row,col)] = -np.sqrt(3)
    return normalizecolumns(o)

def SVD(dataset,k):
    """
    Returns d x k matrix, correponding to the k largest eigenvalues's eigenvectors.
    Calculated through SVD - more efficient methods exist since most are ignored
    """
    svd_res = np.linalg.svd(dataset)
    U = svd_res.U
    return U[:,:k].transpose()

def DCT(dataset,k):
    return(scipy.fftpack.dct(dataset.transpose(),n = k,type=2,norm='ortho').transpose())

In [None]:
def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0: 
       return v
    return v / norm

def similarity(v1,v2,v_type):
    """
    Returns a measurement of similarity, depending on the v_type. 
    """
    nv1 = normalize(v1)
    nv2 = normalize(v2)
    
    if v_type == "text":
        return np.dot(nv1,nv2)
    elif v_type == "image":
        return np.linalg.norm(nv1 - nv2)
    else:
        raise ValueError("Invalid v_type, should be either 'text', or 'image'")

In [57]:
a = np.matrix([[1,2,3,4],[1,2,3,8],[1,2,3,8],[1,2,3,8],[1,2,3,8],[1,2,3,8],[1,2,3,8]])

print(SVD(a,2) * a)

print(DCT(a,2))

[[ -2.61604051  -5.23208101  -7.84812152 -19.98687238]
 [ -0.3953885   -0.79077701  -1.18616551   0.7245222 ]]
[[ 1.41421356  2.82842712  4.24264069  8.48528137]
 [ 0.          0.          0.         -2.82842712]]
