In [None]:
!wget https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar -xvf spark-3.2.1-bin-hadoop3.2.tgz
!java -version
!pip install findspark

In [2]:
import os

# findspark reads SPARK_HOME to locate the Spark installation, so this must be
# set before findspark.init() is called.
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"

import findspark
findspark.init()

# pyspark only becomes importable after findspark.init() has run.
from pyspark.sql import SparkSession

# Further .config(key, value) calls can be chained here before getOrCreate().
# master("local[8]") runs Spark locally with 8 worker threads.
spark = (
    SparkSession.builder
    .master("local[8]")
    .config("spark.app.name", "session_one")
    .getOrCreate()
)

In [3]:
# Load the Zomato listing: the first line holds the column names and the
# column types are inferred from the data.
dfs = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("zomato.csv")
)
dfs.show()

+--------------------+--------------------+-------+--------------------+-------------+
|               links|               names|ratings|             cuisine|price for one|
+--------------------+--------------------+-------+--------------------+-------------+
|https://www.zomat...|       Sahara Bakers|    3.7|Chinese, Bakery, ...|          100|
|https://www.zomat...|                 KFC|    3.9|Burger, Fast Food...|          100|
|https://www.zomat...| Subbaiah Gari Hotel|    4.1|South Indian, And...|          100|
|https://www.zomat...|    Paradise Biryani|    3.9|Biryani, Kebab, D...|          100|
|https://www.zomat...|  Pista House Bakery|    4.3|Fast Food, Sandwi...|          100|
|https://www.zomat...|Shah Ghouse Hotel...|      4|North Indian, Chi...|          100|
|https://www.zomat...|       Swagath Hotel|    4.2|South Indian, Chi...|          100|
|https://www.zomat...|       Just Parantha|    4.2|        North Indian|          100|
|https://www.zomat...|              Mehfil|

In [4]:
# Inspect the inferred schema as a list of StructField entries (name, type, nullable).
dfs.schema.fields

[StructField(links,StringType,true),
 StructField(names,StringType,true),
 StructField(ratings,StringType,true),
 StructField(cuisine,StringType,true),
 StructField(price for one,IntegerType,true)]

In [5]:
# Names of every string-typed column, plus the matching "<name>_indexed"
# output column name for each of them.
string_cols = [name for name, dtype in dfs.dtypes if dtype == "string"]
print(string_cols)
stringindex_cols = [f"{name}_indexed" for name in string_cols]
print(stringindex_cols)

['links', 'names', 'ratings', 'cuisine']
['links_indexed', 'names_indexed', 'ratings_indexed', 'cuisine_indexed']


In [6]:
from pyspark.ml.feature import StringIndexer

# Map every string column to a numeric index column, most frequent label first.
# handleInvalid="keep" is configured up front, so invalid values are kept
# rather than raising an error.
indexer = StringIndexer(
    inputCols=string_cols,
    outputCols=stringindex_cols,
    handleInvalid="keep",
    stringOrderType="frequencyDesc",
)
indexed = indexer.fit(dfs).transform(dfs)
indexed.show()

+--------------------+--------------------+-------+--------------------+-------------+-------------+-------------+---------------+---------------+
|               links|               names|ratings|             cuisine|price for one|links_indexed|names_indexed|ratings_indexed|cuisine_indexed|
+--------------------+--------------------+-------+--------------------+-------------+-------------+-------------+---------------+---------------+
|https://www.zomat...|       Sahara Bakers|    3.7|Chinese, Bakery, ...|          100|        472.0|        458.0|            6.0|          178.0|
|https://www.zomat...|                 KFC|    3.9|Burger, Fast Food...|          100|        302.0|        301.0|            4.0|          141.0|
|https://www.zomat...| Subbaiah Gari Hotel|    4.1|South Indian, And...|          100|        555.0|        537.0|            1.0|          359.0|
|https://www.zomat...|    Paradise Biryani|    3.9|Biryani, Kebab, D...|          100|        423.0|        409.0|    

In [7]:
from pyspark.sql.types import *

# Keep the name of every column of the indexed frame whose type is not StringType.
allnonstringcols = [
    field.name
    for field in indexed.schema.fields
    if field.dataType != StringType()
]
print(allnonstringcols)

['price for one', 'links_indexed', 'names_indexed', 'ratings_indexed', 'cuisine_indexed']


In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack all non-string columns into one vector column named "features".
vecAssembler = VectorAssembler(inputCols=allnonstringcols, outputCol="features")
print(vecAssembler)
dataset = vecAssembler.transform(indexed)

VectorAssembler_6af4e5d2e936


In [9]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans
import time

numIterations = 100
numberClusters = 10

# Fit k-means on the assembled dataset; the fixed seed keeps runs repeatable.
start = time.time()
kmeans = KMeans().setMaxIter(numIterations).setK(numberClusters).setSeed(1)
model = kmeans.fit(dataset)
# transform() is lazy: it only defines the cluster-label column, so the time
# reported below is dominated by fit() and does not include labelling records.
predictions = model.transform(dataset)
end = time.time() - start  # elapsed seconds (a duration, not an end timestamp)
# Report the k that was actually used rather than a hard-coded value.
print(f"time taken to cluster with k = {numberClusters}:", end, "seconds.")

time taken to cluster with k = 10: 6.997028589248657 seconds.


In [10]:
#p1 = predictions.orderBy('prediction')
#p1.show(truncate=False)


In [11]:
#print(p1.count())
#dfs.count()

In [12]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
import numpy as np

In [13]:
def dataframe2NumpyArray(df, colName):
    '''
    pull one column of a spark dataframe onto the driver as a numpy array
    ----------------------------
    df: dataframe to read from
    colName: column handed to df.select
    ----------------------------
    return: np.array built from the collected rows
    '''
    collectedRows = df.select(colName).collect()
    return np.array(collectedRows)


def numpyArray2Matrix(array):
    '''
    convert numpy array to spark RowMatrix whose rows are scaled to unit L2 norm
    ----------------------------
    array: numpy array with one vector per row; a 3-D array is flattened to
           (first dimension, last dimension) before conversion
    ----------------------------
    return: RowMatrix
    '''
    if len(array.shape) == 3:
        # ndarray.reshape returns a new array rather than modifying in place,
        # so the result has to be assigned back to take effect.
        array = array.reshape((array.shape[0], array.shape[-1]))

    def unitRow(row):
        # An all-zero row is left untouched: dividing by its zero norm would
        # put NaNs into the matrix.
        norm = np.linalg.norm(row, 2)
        return row / norm if norm > 0 else row

    # The data already lives on the driver, so normalize here and ship it to
    # Spark once instead of mapping on the cluster and collecting it back.
    denseVectorList = [Vectors.dense(unitRow(row)) for row in array]

    RDD = spark.sparkContext.parallelize(denseVectorList)
    matrix = RowMatrix(RDD)

    return matrix


def SVDsimilarity(matrix, numDimension = 1, normalization = False):
    r'''
    generalized cosine similarity using SVD (singular value decomposition)
    SVD decomposes the input matrix Y into three matrices U, S, V with Y = U S V^T;
    keeping only the largest singular values in S gives a lower-rank approximation \hat{Y} of Y.
    that approximation is optimal in the sense of minimizing the Frobenius norm of the
    reconstruction error || \hat{Y} - Y ||^{2}_{F}, so the 'order K coefficient of determination'
    \frac{||\hat{Y}||^2_F}{||Y||^2_F} serves as a similarity measure.
    ----------------------
    in the case of only 2 vectors, the SVD similarity is equal to the cosine similarity
    ----------------------
    the un-normalized SVD similarity ranges from 1/n to 1, where n is the number of vectors
    (range as stated by the original author).
    with normalization on, r = ||\hat{Y}||_F / ||Y||_F is rescaled to \frac{N r - 1}{N - 1},
    N being the number of rows, before the final step; this is undefined when N = 1.

    ------------------------------------------
    matrix: pyspark RowMatrix, represents a row-oriented distributed Matrix with no meaningful row indices
            each column/row is an input vector
            all elements in matrix should be positive
    numDimension: integer, number of leading (largest) singular values to use
    normalization: if true, apply the rescaling described above
    ------------------------------------------
    return: 2 * score**2 - 1 (double angle formula), where score is r or its rescaled value

    '''
    # SVD: only the singular values are needed, so U is not computed
    svd = matrix.computeSVD(numDimension, computeU=False)
    sVector = svd.s.toArray()
    # Frobenius norm of the truncated approximation
    YApproximate = np.sum(sVector*sVector)**0.5

    # trace(Y^T Y) is the squared Frobenius norm of Y
    GramianMatrix = matrix.computeGramianMatrix().toArray()
    Y = np.trace(GramianMatrix)**0.5

    if not normalization:
        similarityScore = YApproximate/Y
    else:
        # the row count is only needed for this branch, so it is not
        # requested from the matrix unless normalization is on
        N = matrix.numRows()
        similarityScore = ((YApproximate/Y * N) - 1)/(N -1)

    # double angle formula
    return 2 * similarityScore**2 - 1

In [14]:
def crossHomogeneityScore(df, clusterColName, featureColName):
    '''
    size-weighted average of the SVD similarity computed inside each cluster
    ------------------------------------------
    df: spark dataframe holding the feature vectors and, optionally, cluster labels
    clusterColName: name of the column with the cluster label of each row;
                    if df has no column with this name, the whole dataframe is
                    scored as a single group instead
    featureColName: name of the column with the feature vectors
    ------------------------------------------
    return: sum over clusters of similarity * (rows in cluster / total rows),
            or the similarity of the whole dataframe when clusterColName is absent
    '''
    if clusterColName not in df.schema.names:
        # no cluster labels available: score everything as one group
        npArray = dataframe2NumpyArray(df, featureColName)
        matrix = numpyArray2Matrix(npArray)
        return SVDsimilarity(matrix)

    totalRows = df.count()
    # distinct() de-duplicates the labels before they are collected; each label is
    # read from its Row by column name (a Row only has a '.query' attribute when
    # the column itself is called 'query')
    labelRows = df.select(clusterColName).distinct().collect()
    clusterLabels = [row[clusterColName] for row in labelRows]

    homogeneityScore = 0

    for label in clusterLabels:
        # rows belonging to this cluster
        dfQuery = df.filter(df[clusterColName] == label)

        npArray = dataframe2NumpyArray(dfQuery, featureColName)
        # one array entry per collected row, so no separate count() job is needed
        numRows = len(npArray)
        matrix = numpyArray2Matrix(npArray)

        similarity = SVDsimilarity(matrix)
        homogeneityScore += similarity * numRows/totalRows

    return homogeneityScore

In [17]:
# Time the homogeneity computation over the clustered DataFrame.
calcstart = time.time()
# NOTE(review): KMeans names its output column 'prediction' by default, and
# crossHomogeneityScore starts with `clusterColName not in df.schema.names`, so
# 'predictions' presumably matches no column and the value below is the
# similarity of the whole DataFrame rather than a per-cluster average.
# Confirm the intended column name before relying on this score.
score = crossHomogeneityScore(predictions, 'predictions', 'features')
# Elapsed wall-clock seconds for the call above.
calcend = time.time() - calcstart
print("homog score:", score)
print("time taken to calculate homogeneity:", calcend, "seconds.")

homog score: 0.632042901742482
time taken to calculate homogeneity: 1.453683853149414 seconds.
