In [None]:
!wget https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar -xvf spark-3.2.1-bin-hadoop3.2.tgz
!java -version
!pip install findspark

In [2]:
import os 
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Row
import pyspark.sql.functions as f
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

## You can add more config while building 
spark = SparkSession.builder.master("local[8]").\
                    config("spark.app.name","session_one").\
                    getOrCreate() #number of threads = 16



In [3]:
df = spark.read.csv("testpeople.csv",header=True,inferSchema=True)
df.show()

+----+---------+---+
|Name|     city|age|
+----+---------+---+
|John|  utrecht| 20|
|Mary|amsterdam| 21|
|Nick|rotterdam| 22|
|Nick|  utrecht| 21|
|John|rotterdam| 21|
|Mary|rotterdam| 22|
|Nick|  utrecht| 20|
|John|rotterdam| 21|
|Nick|amsterdam| 20|
|Mary|  utrecht| 22|
+----+---------+---+



In [4]:
def listOfFrequencyTables(df): #take main dataframe, generate frequency dataframes
  histograms = []
  for col in df.dtypes:
      h=df.groupBy(col[0]).count()
      h = h.sort(desc("count"))
      histograms.append(h)
      h.show() #comment this line to suppress output
  return histograms
histograms = listOfFrequencyTables(df)

+----+-----+
|Name|count|
+----+-----+
|Nick|    4|
|Mary|    3|
|John|    3|
+----+-----+

+---------+-----+
|     city|count|
+---------+-----+
|  utrecht|    4|
|rotterdam|    4|
|amsterdam|    2|
+---------+-----+

+---+-----+
|age|count|
+---+-----+
| 21|    4|
| 22|    3|
| 20|    3|
+---+-----+



In [5]:
def getDecompFromTopFrequencies(histograms):
  clusterlst=[]
  for i in range(len(histograms)): #query database with top values of all columns
    d= str(histograms[i].first()) #value of the first row
    print(d)
    d = d.split(",")[0].split('=')[1] #the splits are for formatting the string
    print(d)
    #print("d before:",d)
    if "'"  in d:
      d = d.split("'")[1]

    #print("d after:",d)
    #print(type(d))
    cname = str(histograms[i][0]).split("'")[1]
    print(cname,"=",d)
    
    data = (df.filter(col(cname) == d))
    
    data.show(15)
    clusterlst.append(data)
  return clusterlst

clusterlst = getDecompFromTopFrequencies(histograms)


Row(Name='Nick', count=4)
'Nick'
Name = Nick
+----+---------+---+
|Name|     city|age|
+----+---------+---+
|Nick|rotterdam| 22|
|Nick|  utrecht| 21|
|Nick|  utrecht| 20|
|Nick|amsterdam| 20|
+----+---------+---+

Row(city='rotterdam', count=4)
'rotterdam'
city = rotterdam
+----+---------+---+
|Name|     city|age|
+----+---------+---+
|Nick|rotterdam| 22|
|John|rotterdam| 21|
|Mary|rotterdam| 22|
|John|rotterdam| 21|
+----+---------+---+

Row(age=21, count=4)
21
age = 21
+----+---------+---+
|Name|     city|age|
+----+---------+---+
|Mary|amsterdam| 21|
|Nick|  utrecht| 21|
|John|rotterdam| 21|
|John|rotterdam| 21|
+----+---------+---+



In [6]:
def are_dfs_equal(df1, df2): #this works, i tested it
  res = df1.subtract(df2) #set subtraction on the two dataframes. 
  if res.count() == 0: #subtraction yielded empty set
    print("dataframes are equal")
    return True
  else:
    print("error! these rows are not in the union of your queries:")
    res.show() #show which tuples are not included in your query union
    return False

In [7]:
from functools import reduce
from pyspark.sql.functions import lit
from pyspark.sql import DataFrame
def getDecompUsingFreqTable(df,freqdf): #takes original database and one frequency table as input, returns union of all queried dataframes as output
  print("this is frequency table:")
  freqdf.show()
  cname = freqdf.columns[0]
  valuelist = (freqdf.select(freqdf.columns[0]).rdd.flatMap(lambda x: x).collect()) #list of all values of frequency column

  unionlst = []
  for v in valuelist: #each unique value in the freq. table is used as a query
    result = df.filter(col(cname) == v)
    querystr = cname + "=" + str(v)
    containsquery = False
    for c in df.columns: #check if query column exists in the input dataframe
      if "query" in c:
        containsquery= True
    
    if(containsquery): #check if query column already exists in the input
      newquery = str(result.select(col("query")).first()).split("'")[1] + "," + querystr #concatenate existing with new queries

      newres=result.withColumn("query",lit(newquery))
    else:
      print("creating query column:")
      newres=result.withColumn("query",lit(querystr))
    print(querystr)
    #newres.show()
    unionlst.append(newres)
  unn = reduce(DataFrame.unionAll, unionlst) #put all queried dataframes back together as one
  print("union:")
  #unn.show()

  return unn


union = (getDecompUsingFreqTable(df,histograms[0])) #function call with 'Name' frequency table
print("equality result:")
subdf = are_dfs_equal(df,union.drop('query')) #checks if union of queries covers whole database
print(subdf)
print("\nnext run:")
union1 = (getDecompUsingFreqTable(union,histograms[1]))
union2 = (getDecompUsingFreqTable(union1,histograms[2]))

print("after queries:\n")
union.show(10,False)
union1.show(10,False)
union2.show(10,False)


this is frequency table:
+----+-----+
|Name|count|
+----+-----+
|Nick|    4|
|Mary|    3|
|John|    3|
+----+-----+

creating query column:
Name=Nick
creating query column:
Name=Mary
creating query column:
Name=John
union:
equality result:
dataframes are equal
True

next run:
this is frequency table:
+---------+-----+
|     city|count|
+---------+-----+
|  utrecht|    4|
|rotterdam|    4|
|amsterdam|    2|
+---------+-----+

city=rotterdam
city=utrecht
city=amsterdam
union:
this is frequency table:
+---+-----+
|age|count|
+---+-----+
| 21|    4|
| 22|    3|
| 20|    3|
+---+-----+

age=21
age=22
age=20
union:
after queries:

+----+---------+---+---------+
|Name|city     |age|query    |
+----+---------+---+---------+
|Nick|rotterdam|22 |Name=Nick|
|Nick|utrecht  |21 |Name=Nick|
|Nick|utrecht  |20 |Name=Nick|
|Nick|amsterdam|20 |Name=Nick|
|Mary|amsterdam|21 |Name=Mary|
|Mary|rotterdam|22 |Name=Mary|
|Mary|utrecht  |22 |Name=Mary|
|John|utrecht  |20 |Name=John|
|John|rotterdam|21 |Name=J

In [8]:
def are_dfs_equal(df1, df2): #this works, i tested it
  res = df1.subtract(df2) #set subtraction on the two dataframes. 
  if res.count() == 0: #subtraction yielded empty set
    print("dataframes are equal")
    return True
  else:
    print("error! these rows are not in the union of your queries:")
    res.show() #show which tuples are not included in your query union
    return False

In [9]:
listOfFrequencyTables(clusterlst[0])

+----+-----+
|Name|count|
+----+-----+
|Nick|    4|
+----+-----+

+---------+-----+
|     city|count|
+---------+-----+
|  utrecht|    2|
|amsterdam|    1|
|rotterdam|    1|
+---------+-----+

+---+-----+
|age|count|
+---+-----+
| 20|    2|
| 22|    1|
| 21|    1|
+---+-----+



[DataFrame[Name: string, count: bigint],
 DataFrame[city: string, count: bigint],
 DataFrame[age: int, count: bigint]]

In [10]:

def addFeatureVector(df): #get feature vector for any dataframe for homogeneity function
  string_cols = [c for c, t in df.dtypes if t =='string' and c != 'query'] #get all columns that have stringtype, except query column

  stringindex_cols = [(i + "_indexed") for i in string_cols]
  indexer  = StringIndexer( inputCols=string_cols, outputCols=stringindex_cols, handleInvalid='error', stringOrderType='frequencyDesc')
  indexer.setHandleInvalid("keep") #change to "skip" to remove problematic rows
  indexed = indexer.fit(df).transform(df) #dataframe with indexed columns attached

  allnonstringcols = [column.name for column in indexed.schema if column.dataType != StringType()]
  vecAssembler = VectorAssembler(outputCol="features")
  vecAssembler.setInputCols(allnonstringcols) #all numerical columns are put into feature vector, including indexed cols

  result=  ( vecAssembler.transform(indexed)) #return the dataframe with feature column attached
  return result
 

union2withvec = addFeatureVector(union2)
union2withvec.show()

+----+---------+---+--------------------+------------+------------+--------------+
|Name|     city|age|               query|Name_indexed|city_indexed|      features|
+----+---------+---+--------------------+------------+------------+--------------+
|John|rotterdam| 21|Name=Nick,city=ro...|         1.0|         0.0|[21.0,1.0,0.0]|
|John|rotterdam| 21|Name=Nick,city=ro...|         1.0|         0.0|[21.0,1.0,0.0]|
|Nick|  utrecht| 21|Name=Nick,city=ro...|         0.0|         1.0|[21.0,0.0,1.0]|
|Mary|amsterdam| 21|Name=Nick,city=ro...|         2.0|         2.0|[21.0,2.0,2.0]|
|Nick|rotterdam| 22|Name=Nick,city=ro...|         0.0|         0.0|[22.0,0.0,0.0]|
|Mary|rotterdam| 22|Name=Nick,city=ro...|         2.0|         0.0|[22.0,2.0,0.0]|
|Mary|  utrecht| 22|Name=Nick,city=ro...|         2.0|         1.0|[22.0,2.0,1.0]|
|Nick|  utrecht| 20|Name=Nick,city=ut...|         0.0|         1.0|[20.0,0.0,1.0]|
|John|  utrecht| 20|Name=Nick,city=ut...|         1.0|         1.0|[20.0,1.0,1.0]|
|Nic