In [1]:
import os 
os.environ["SPARK_HOME"] = "/home/com3dian/Documents/github/Period4/DIS/spark-3.2.1-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession

## You can add more config while building 
spark = SparkSession.builder.master("local[16]").\
                    config("spark.app.name","session_one").\
                    getOrCreate() #number of threads = 16

In [2]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
import numpy as np


In [3]:
def dataframe2NumpyArray(df, colName):
    '''
    convert spark dataframe to numpy array
    '''
    return np.array(df.select(colName).collect())


def numpyArray2Matrix(array):
    '''
    convert numpy array to spark Rowmatrix
    ----------------------------
    return: Rowmatrix
    '''
    if len(array.shape) == 3:
        array.reshape((array.shape[0], array.shape[-1]))
    
    
    denseVectorList = []
    for i in range(len(array)):
        denseVectorList.append(Vectors.dense(array[i]))
        
    
    RDD = spark.sparkContext.parallelize(denseVectorList)
    normVectors = RDD.map(lambda x: x/(np.linalg.norm(x, 2)))
    
    RDD = spark.sparkContext.parallelize(normVectors.collect())
    matrix = RowMatrix(RDD)
        
    return matrix


def SVDsimilarity(matrix, numDimension = 1, normalization = False):
    '''
    generalized cosine similarity using SVD(singular value decomposition)
    by doing SVD, the input matrix Y will be decomposited into 3 matrix: U, S, V, with Y = USV^T
    where S can be considered as a lower rank approximation of Y
    the SVD optimal in the sense that minimizing the Frobinius norm of reconstruction error || \hat{Y} - Y ||^{2}_{F}
    therefore, by comparing the 'order K coefficient of determination' \frac{||\hat{Y} ||^2_F}{||Y ||^2_F}, we shall a similarity.
    ----------------------
    in the case of only 2 vectors, the SVD similarity is equal to the cosine similarity
    ----------------------
    the original SVD similarity is ranged from 1/n to 1, where n is the number of vectors
    to get it can range over the entire [0,1] interval, one can normalize it by \frac{}{} if only using the first sigular value
    
    ------------------------------------------
    matrix: pyspark RowMatrix, represents a row-oriented distributed Matrix with no meaningful row indices
            each column/row is an input vector
            all element in matrix should be positive
    numDimension: integer, if not 1 then use the first(largest) few singular value
    normalization: if true then do normalization
    
    '''
    N = matrix.numRows()
    # SVD
    svd = matrix.computeSVD(numDimension, computeU=False)
    sVector = svd.s.toArray()
    YApproximate = np.sum(sVector*sVector)**0.5
    
    GramianMatrix = matrix.computeGramianMatrix().toArray()
    Y = np.trace(GramianMatrix)**0.5
    
    # normalization
    if not normalization:
        similarityScore = YApproximate/Y
    else:
        similarityScore = ((YApproximate/Y * N) - 1)/(N -1)
        
    # return 2 * similarityScore**2 - 1
    # double angle formula
    return 2 * similarityScore**2 - 1

In [4]:
from tqdm import tqdm
def crossHomogeneityScore(df, queryColName, featureColName):
    '''
    
    '''
    if queryColName not in df.schema.names:
        
        npArray = dataframe2NumpyArray(df, featureColName)
        matrix = numpyArray2Matrix(npArray)
        similarity = SVDsimilarity(matrix)
        return similarity
    
    totalRows = df.count()
    queries = list(set(df.select(queryColName).collect()))
    homogeneityScore = 0
    
    for query in tqdm(queries):
        # get each cluster
        dfQuery = df.filter(df[queryColName] == query.query)
        # get number of rows
        numRows = dfQuery.count()
        npArray = dataframe2NumpyArray(dfQuery, featureColName)
        matrix = numpyArray2Matrix(npArray)
        similarity = SVDsimilarity(matrix)
        homogeneityScore += similarity * numRows/totalRows
    
    return homogeneityScore

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Row
import pyspark.sql.functions as f
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

def listOfFrequencyTables(df): #take main dataframe, generate frequency dataframes
    '''
    
    '''
    histograms = []
    for col in df.dtypes:
        h=df.groupBy(col[0]).count()
        h = h.sort(desc("count"))
        histograms.append(h)
    return histograms

In [6]:
def getDecompFromTopFrequencies(df, histograms):
    clusterlst=[]
    for i in range(len(histograms)): #query database with top values of all columns
        d= str(histograms[i].first()) #value of the first row
        #print(d)
        d = d.split(",")[0].split('=')[1] #the splits are for formatting the string
        #print(d)
        #print("d before:",d)
        if "'"  in d:
            d = d.split("'")[1]
        
        #print("d after:",d)
        #print(type(d))
        cname = str(histograms[i][0]).split("'")[1]
        data = (df.filter(col(cname) == d))
        clusterlst.append(data)
    return clusterlst

In [7]:
def are_dfs_equal(df1, df2): #this works, i tested it
    res = df1.subtract(df2) #set subtraction on the two dataframes. 
    if res.count() == 0: #subtraction yielded empty set
        print("dataframes are equal")
        return True
    else:
        print("error! these rows are not in the union of your queries:")
        res.show() #show which tuples are not included in your query union
        return False

from functools import reduce
from pyspark.sql.functions import lit
from pyspark.sql import DataFrame
from tqdm import tqdm

def getDecompUsingFreqTable(df, freqdf):
    colName = freqdf.columns[0]
    
    df = df.withColumn('new_query_1', lit(colName))
    df = df.withColumn('new_query', concat_ws(' = ', 'new_query_1', colName))
    df = df.drop('new_query_1')
    
    if 'query' in df.columns:
        df = df.withColumn('new_query', concat_ws(', ', 'new_query', 'query'))
        df = df.drop('query')
    df = df.withColumnRenamed('new_query', 'query')
    return df

In [58]:
def addFeatureVector(df): #get feature vector for any dataframe for homogeneity function
    string_cols = [c for c, t in df.dtypes if t =='string' and c != 'query'] #get all columns that have stringtype, except query column
    
    stringindex_cols = [(i + "_indexed") for i in string_cols]
    indexer  = StringIndexer( inputCols=string_cols, outputCols=stringindex_cols, handleInvalid='error', stringOrderType='frequencyDesc')
    indexer.setHandleInvalid("skip") #change to "skip" to remove problematic rows
    indexed = indexer.fit(df).transform(df) #dataframe with indexed columns attached
    
    allnonstringcols = [column.name for column in indexed.schema if column.dataType != StringType()]
    vecAssembler = VectorAssembler(outputCol="features")
    
    # normalizaing
    # for col in allnonstringcols:
    #    maxValue = indexed.agg({col: "max"}).collect()[0][0]
    #    print(maxValue)
    #    minValue = indexed.agg({col: "min"}).collect()[0][0]
    #    print(minValue)
    #    indexed = indexed.withColumn(col + '_normalized', (indexed[col] - minValue)/(maxValue - minValue + 0.001*minValue))
    #    print(indexed.columns)
    # allnonstringcols = [col + '_normalized' for col in allnonstringcols]
    print(allnonstringcols)
    vecAssembler.setInputCols(allnonstringcols) #all numerical columns are put into feature vector, including indexed cols
    result = ( vecAssembler.transform(indexed)) #return the dataframe with feature column attached
    # for col in allnonstringcols:
    #    result = result.drop(col)
    for col in stringindex_cols:
        result = result.drop(col)
    return result

In [59]:
import datetime
def shuffle(hist):
    return sorted(hist, key = lambda x: x.count())

def getDecompositionbyColumn(df, K):
    histograms = shuffle(listOfFrequencyTables(df))
    histList = list(range(len(histograms)))
    colLeft = len(histograms)
    
    decomUnionWithVec = addFeatureVector(df)
    decomUnionWithVec.show()
    decomUnionWithVec.select('features').show(decomUnionWithVec.count(), False)
    overAllHomoScore = crossHomogeneityScore(decomUnionWithVec, 'query', 'features')
    print('over all homogeneity score: ', overAllHomoScore)
    
    nBucketsBefore = 1
    crossScoreBefore = overAllHomoScore
    
    update = True
    
    while update:
        if nBucketsBefore >= K or colLeft == 0:

            break
        
        
        update = False
        removeIndex = -1
        
        for i in histList:
            freqdf = histograms[i]
            
            unionWithVec = getDecompUsingFreqTable(decomUnionWithVec, freqdf)
            nBucketsAfter = unionWithVec.select('query').distinct().count()
            
            if nBucketsAfter == nBucketsBefore or nBucketsAfter > K:
                continue
            
            crossScoreAfter = crossHomogeneityScore(unionWithVec, 'query', 'features')
            if crossScoreAfter > crossScoreBefore :
                
                # update using new decomposition
                
                crossScoreBefore = crossScoreAfter
                
                removeIndex = i 
                update = True
                
        histList = [_ for _ in histList if _ != removeIndex]
        colLeft = len(histList)
        
        if update:
            
            decomUnionWithVec = getDecompUsingFreqTable(decomUnionWithVec, histograms[removeIndex])
            nBucketsBefore = decomUnionWithVec.select('query').distinct().count()
        
    if 'query' not in decomUnionWithVec.columns:
        print('user requested K =', str(K), ', but we can only got ', str(1), 'clusters.') 
        return decomUnionWithVec.drop('features')
    
    if nBucketsBefore != K:
        print('user requested K =', str(K), ', but we can only got ', str(nBucketsBefore), 'clusters.') 
    return decomUnionWithVec.drop('features')

In [60]:
df = spark.read.csv("/home/com3dian/Documents/github/DIS_project_2022/data/cities.csv",header=True,inferSchema=True)
df.show()

+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+
|LatD| "LatM"| "LatS"| "NS"| "LonD"| "LonM"| "LonS"| "EW"|            "City"| "State"|
+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+
|41.0|    5.0|   59.0|  "N"|   80.0|   39.0|    0.0|  "W"|      "Youngstown"|      OH|
|42.0|   52.0|   48.0|  "N"|   97.0|   23.0|   23.0|  "W"|         "Yankton"|      SD|
|46.0|   35.0|   59.0|  "N"|  120.0|   30.0|   36.0|  "W"|          "Yakima"|      WA|
|42.0|   16.0|   12.0|  "N"|   71.0|   48.0|    0.0|  "W"|       "Worcester"|      MA|
|43.0|   37.0|   48.0|  "N"|   89.0|   46.0|   11.0|  "W"| "Wisconsin Dells"|      WI|
|36.0|    5.0|   59.0|  "N"|   80.0|   15.0|    0.0|  "W"|   "Winston-Salem"|      NC|
|49.0|   52.0|   48.0|  "N"|   97.0|    9.0|    0.0|  "W"|        "Winnipeg"|      MB|
|39.0|   11.0|   23.0|  "N"|   78.0|    9.0|   36.0|  "W"|      "Winchester"|      VA|
|34.0|   14.0|   24.0|  "N"|   77.0|   55.0

In [61]:
result = getDecompositionbyColumn(df, 11)

['LatD', ' "LatM"', ' "LatS"', ' "LonD"', ' "LonM"', ' "LonS"', ' "NS"_indexed', ' "EW"_indexed', ' "City"_indexed', ' "State"_indexed']
+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+-------------+-------------+---------------+----------------+--------------------+
|LatD| "LatM"| "LatS"| "NS"| "LonD"| "LonM"| "LonS"| "EW"|            "City"| "State"| "NS"_indexed| "EW"_indexed| "City"_indexed| "State"_indexed|            features|
+----+-------+-------+-----+-------+-------+-------+-----+------------------+--------+-------------+-------------+---------------+----------------+--------------------+
|41.0|    5.0|   59.0|  "N"|   80.0|   39.0|    0.0|  "W"|      "Youngstown"|      OH|          0.0|          0.0|          119.0|             6.0|[41.0,5.0,59.0,80...|
|42.0|   52.0|   48.0|  "N"|   97.0|   23.0|   23.0|  "W"|         "Yankton"|      SD|          0.0|          0.0|          118.0|            18.0|[42.0,52.0,48.0,9...|
|46.0|   35.0|   5

over all homogeneity score:  0.724587374106495


100%|██████████| 10/10 [00:08<00:00,  1.16it/s]
100%|██████████| 10/10 [00:08<00:00,  1.18it/s]


user requested K = 11 , but we can only got  10 clusters.
