In [1]:
import os

# Location of the local Spark distribution used by findspark.
# A SPARK_HOME that is already set in the environment takes precedence, so the
# notebook can run on another machine without editing this cell; the path
# below is only the fallback.
os.environ.setdefault(
    "SPARK_HOME",
    "/home/com3dian/Documents/github/Period4/DIS/spark-3.2.1-bin-hadoop3.2",
)

import findspark

# Call once, before importing pyspark.
findspark.init()

from pyspark.sql import SparkSession

# More options can be chained with .config(...) before getOrCreate().
spark = (
    SparkSession.builder
    .master("local[8]")  # local mode, 8 worker threads
    .config("spark.app.name", "session_one")
    .getOrCreate()
)

In [201]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
import numpy as np

def dataframe2NumpyArray(df, colName):
    '''
    Convert the selected column(s) of a Spark DataFrame to a numpy array.

    The selection is collected first, so every selected row is materialized
    before being handed to numpy.
    ----------------------------
    df: object exposing select(...).collect() (a Spark DataFrame in this notebook)
    colName: column selector forwarded unchanged to df.select
    ----------------------------
    return: np.ndarray built from the list of collected rows
            (one leading axis entry per collected row)
    '''
    selected = df.select(colName)
    collectedRows = selected.collect()
    return np.array(collectedRows)
    
def numpyArray2Matrix(array):
    '''
    convert numpy array to spark RowMatrix

    Every entry along the first axis of the array becomes one row (a dense
    vector) of the resulting matrix.
    ----------------------------
    array: array-like of shape (n, k), or of shape (n, 1, k); the 3-d form
           is flattened to (n, k) first
    ----------------------------
    return: RowMatrix
    raises: ValueError (from numpy) if a 3-d array cannot be reshaped to
            (shape[0], shape[-1]), i.e. when its middle axis is not of length 1
    '''
    array = np.asarray(array)

    if array.ndim == 3:
        # ndarray.reshape returns a new array instead of modifying in place,
        # so the result has to be re-bound. Without the assignment the rows
        # were passed on as (1, k) arrays instead of flat vectors.
        array = array.reshape((array.shape[0], array.shape[-1]))

    denseVectorList = [Vectors.dense(row) for row in array]

    rowsRDD = spark.sparkContext.parallelize(denseVectorList)
    return RowMatrix(rowsRDD)
    
def SVDsimilarity(matrix, numDimension = 1, normalization = False, verbose = True):
    r'''
    Generalized cosine similarity using SVD (singular value decomposition).

    The SVD decomposes the input matrix Y into three matrices U, S, V with Y = U S V^T.
    Keeping only the K largest singular values yields a lower-rank approximation \hat{Y} of Y,
    which is optimal in the sense of minimizing the Frobenius norm of the
    reconstruction error || \hat{Y} - Y ||^{2}_{F}.
    The similarity is derived from the ratio s = \frac{||\hat{Y}||_F}{||Y||_F} (the square root
    of the 'order K coefficient of determination' \frac{||\hat{Y}||^2_F}{||Y||^2_F}).
    The returned value is 2 s^2 - 1.
    ----------------------
    In the case of only 2 unit-length vectors with K = 1, the returned value equals
    their cosine similarity.
    ----------------------
    With normalization = True the ratio s is rescaled to \frac{n s - 1}{n - 1} before the
    final 2 s^2 - 1 mapping, where n is the number of rows of the matrix.
    With fewer than two rows this rescaling is undefined (division by zero), so the
    unnormalized ratio is used instead.
    NOTE(review): this rescaling maps s = 1/n to 0, but 1/n looks like the lower bound of the
    squared ratio s^2 rather than of s itself - confirm the intended formula.

    ------------------------------------------
    matrix: pyspark RowMatrix, represents a row-oriented distributed Matrix with no meaningful row indices
            each row is an input vector
            all elements in the matrix should be positive
    numDimension: integer K, number of leading (largest) singular values to use
    normalization: if True, apply the rescaling described above
    verbose: if True (default), print the singular values, ||\hat{Y}||_F and ||Y||_F

    return: the similarity score 2 s^2 - 1
    '''
    N = matrix.numRows()

    # Frobenius norm of the rank-K approximation: sqrt of the sum of squared singular values
    svd = matrix.computeSVD(numDimension, computeU=False)
    sVector = svd.s.toArray()
    YApproximate = np.sum(sVector * sVector) ** 0.5

    # Frobenius norm of Y itself: trace(Y^T Y) is the sum of all squared entries
    GramianMatrix = matrix.computeGramianMatrix().toArray()
    Y = np.trace(GramianMatrix) ** 0.5

    if verbose:
        print(sVector)
        print(YApproximate)
        print(Y)

    similarityScore = YApproximate / Y
    if normalization and N > 1:
        # N > 1 guards the (N - 1) denominator; a single row keeps the plain ratio
        similarityScore = (similarityScore * N - 1) / (N - 1)

    return 2 * similarityScore**2 - 1

In [211]:
# Five row vectors that are all positive multiples of (2, 1): (4, 2), (2, 1),
# (6, 3), (4, 2), (4, 2). They are perfectly collinear.
RDD = spark.sparkContext.parallelize(
    [Vectors.dense(2.0 * scale, 1.0 * scale) for scale in (2, 1, 3, 2, 2)]
)
mat = RowMatrix(RDD)

SVDsimilarity(mat, 1)

[10.48808848]
10.488088481701515
10.488088481701515


1.0

In [210]:
# Two unit-length row vectors 60 degrees apart: (cos 60, sin 60) and (1, 0).
RDD = spark.sparkContext.parallelize(
    [Vectors.dense(*xy) for xy in ((0.5, 0.5 * 3 ** .5), (1.0, 0.0))]
)
mat = RowMatrix(RDD)

SVDsimilarity(mat, 1, normalization=False)

[1.22474487]
1.224744871391589
1.4142135623730951


0.49999999999999933