# DMH artwork

A set of N artworks from Design Museum Helsinki. The dataset was created in the context of the [SPICE project](https://spice-h2020.eu/).

In [1]:
import json
  
# Load the DMH artwork export.
# JSON text is UTF-8, so the encoding is pinned explicitly instead of relying on
# the platform default.
# NOTE(review): the garbled collection names in the recorded output look like
# UTF-8 decoded with a legacy code page - confirm they are clean after a re-run.
with open('./DMH/items.json', encoding='utf-8') as f:
    dataFile = json.load(f)
# The artwork records are stored under the top-level "data" key.
data = dataFile["data"]
data

[{'_id': '634f19096c70ef5cf152f237',
  'title': 'Scandia',
  'Object': 'CUTLERY',
  'Special name': '*',
  'id': '44171',
  'author': 'Franck, Kaj',
  'Production date': '1952',
  'Collection': 'KÃ¤yttÃ¶kokoelma',
  'Manufacturer': 'Hackman Sorsakoski',
  'Dimension in cm': 'fork: 17.5 cm (length) x 0.15cm (thickness)  knife: 18.6 cm (length) x 0.3/0.15 cm (thickness)  spoon: 18 cm (length)',
  'Weight in kg': '*',
  'Materials': ['metal'],
  'Colour': ['metal'],
  '_datasetid': '0daa0287-d7f4-4f03-a068-95f43afcc347',
  '_timestamp': 1668599056,
  '_timestamp_year': 2022,
  '_timestamp_month': 11,
  '_timestamp_day': 16,
  '_timestamp_hour': 11,
  '_timestamp_minute': 44,
  '_timestamp_second': 16,
  '_updated': 1.0,
  'year': 1952.0,
  'ColourRGB': [[109, 114, 120]],
  'image': 'https://i.postimg.cc/Zqn1M35m/44171.png',
  'Object group': 'cutlery',
  'Color': [{'colorName': 'metal', 'rgb': [109, 114, 120]}]},
 {'_id': '634f1908b7693f159a62e2f6',
  'title': 'Savonia',
  'Object': 'CUTL

In [2]:
# Identifier of every artwork, in the same order as the records in `data`.
itemIds = [artwork["_id"] for artwork in data]

In [3]:
from colormath.color_objects import LabColor, sRGBColor
from colormath.color_conversions import convert_color
from colormath.color_diff import delta_e_cie1976, delta_e_cie1994, delta_e_cie2000, delta_e_cmc

import numpy as np
def patch_asscalar(a):
    """Return the Python scalar held by `a` by delegating to its `.item()` method."""
    return a.item()


# Expose the helper as `np.asscalar`.
# NOTE(review): presumably a compatibility shim for the colormath delta-E
# functions imported above - confirm it is still required.
np.asscalar = patch_asscalar

class EqualSim:
    """Exact-match local similarity: 1.0 for equal values, 0.0 otherwise."""

    def __init__(self):
        # Stateless; the initializer exists only for symmetry with the other
        # similarity classes.
        pass

    def compute(self,att1, att2):
        """Compare two attribute values with `==`.

        Returns:
            1.0 when `att1 == att2`, else 0.0.
        """
        return 1.0 if att1==att2 else 0.0
    
class ColorSim:
    """Local similarity between two colour descriptors that carry an "rgb" triple.

    Both colours are converted to Lab and compared with the configured delta-E
    function; the distance is scaled by 1/100, clamped to 1.0 and inverted.
    """

    def __init__(self, distanceFunction= delta_e_cie2000):
        # Callable taking two Lab colours and returning their distance.
        self.__distanceFunction = distanceFunction

    @staticmethod
    def _rgbToLab(rgb):
        """Convert an upscaled (0-255) RGB triple into a Lab colour."""
        return convert_color(sRGBColor(rgb[0], rgb[1], rgb[2], is_upscaled=True), LabColor)

    def compute(self,att1, att2):
        """Return the similarity of two colour descriptors.

        Each argument must contain an "rgb" entry with exactly three
        components; if either does not, the similarity is 0.0.
        """
        for att in (att1, att2):
            if ("rgb" not in att) or (len(att["rgb"]) != 3):
                return 0.0

        lab1 = self._rgbToLab(att1["rgb"])
        lab2 = self._rgbToLab(att2["rgb"])

        # The distance is treated as lying in [0, 100], so dividing by 100
        # normalises it; anything above 1.0 is clamped.
        scaledDistance = self.__distanceFunction(lab1, lab2)/100
        return 1.0-min(scaledDistance, 1.0)

        
class ListSim:
    """Similarity between two lists, built from an element-level similarity.

    Every element of the shorter list is paired with its best-scoring element
    of the longer list; the per-element best scores are then reduced by the
    aggregation object.
    """

    def __init__(self, similarityFunction, aggFunction):
        # Object exposing compute(a, b) for two list elements.
        self.__similarityFunction = similarityFunction
        # Object exposing aggregate(values) for the list of best scores.
        self.__aggFunction = aggFunction

    def compute(self,list1, list2):
        """Return the aggregated best-match similarity of two lists.

        Returns 0.0 when either argument is not exactly a `list` or is empty.
        """
        for candidate in (list1, list2):
            if (type(candidate) is not list) or len(candidate) == 0:
                return 0.0

        # Iterate over the shorter list and search the longer one.
        if len(list1) > len(list2):
            shorter, longer = list2, list1
        else:
            shorter, longer = list1, list2

        bestMatches = [
            max(self.__similarityFunction.compute(element, other) for other in longer)
            for element in shorter
        ]
        return self.__aggFunction.aggregate(bestMatches)
    
    
class AggregationFunction:
    """Adapter that exposes a plain reducer callable through `aggregate`."""

    def __init__(self, aggFunction=max):
        # Callable applied to the whole list of values; defaults to `max`.
        self.__aggFunction = aggFunction

    def aggregate(self,valueList):
        """Apply the configured reducer to `valueList` and return its result."""
        reducer = self.__aggFunction
        return reducer(valueList)
    


        

In [4]:
def computeLocalSimilarityOnAttribute(itemA, itemB, attName, aLocalSimilarityFunction):
    """Apply a local similarity function to one attribute of two items.

    Args:
        itemA, itemB: containers that are indexed by `attName`.
        attName: name of the attribute to compare.
        aLocalSimilarityFunction: object exposing `compute(valueA, valueB)`.

    Returns:
        Whatever `aLocalSimilarityFunction.compute` returns for the two values.

    Raises:
        Exception: if `attName` is missing from either item.
    """
    if (attName not in itemA) or (attName not in itemB):
        raise Exception("Attribute not found: "+attName)
    return aLocalSimilarityFunction.compute(itemA[attName], itemB[attName])


def weightedAverage(values, weights):
    """Return the weighted mean of `values`, using `weights` as per-value weights.

    Thin wrapper around `np.average`; used as the global aggregation function
    that combines local similarity values.
    """
    weightedMean = np.average(values, weights=weights)
    return weightedMean

def computeSimilarity(simDescription, data, idAtt, itemIds, localSimFunctions, globalAggFunction):
    """Compute the similarity of every unordered pair of items.

    Args:
        simDescription: description object copied as-is into the result.
        data: indexable collection of items; items are taken by position.
        idAtt: attribute whose value identifies an item in the output.
        itemIds: only its length is used, as the number of items to pair up.
        localSimFunctions: iterable of mappings with the keys "attName",
            "localSimFunction" and "weight".
        globalAggFunction: callable `(values, weights)` that combines the
            local similarity values into the global one.

    Returns:
        A dict with "similarityDescription" and "similarityData"; the latter
        holds one record per pair (i < j) with "id1", "id2" and a "value"
        containing the "global" score and the per-attribute "local" scores.
    """
    itemCount = len(itemIds)
    pairRecords = []
    # Upper triangle only: each pair is visited once and never with itself.
    indexPairs = ((first, second)
                  for first in range(itemCount)
                  for second in range(first + 1, itemCount))
    for first, second in indexPairs:
        itemA, itemB = data[first], data[second]
        pairRecord = {"id1": itemA[idAtt], "id2": itemB[idAtt]}

        perAttribute = {}
        scores = []
        weights = []
        for localAtt in localSimFunctions:
            attName = localAtt["attName"]
            score = computeLocalSimilarityOnAttribute(itemA, itemB, attName, localAtt["localSimFunction"])
            perAttribute[attName] = score
            scores.append(score)
            weights.append(localAtt["weight"])

        pairRecord["value"] = {"global": globalAggFunction(scores, weights), "local": perAttribute}
        pairRecords.append(pairRecord)

    return {"similarityDescription": simDescription, "similarityData": pairRecords}


## Color + author (0.5 - 0.5)

In [5]:
# Colour lists are compared element-wise with ColorSim; AggregationFunction()
# defaults to max, so the best-matching colour pair decides the score.
aSimilarityFunction = ColorSim()
anAggFunction = AggregationFunction()
theColorSimFunction = ListSim(aSimilarityFunction, anAggFunction)

# Colour and author contribute equally to the global similarity.
localSimFunctions = [
    {"attName": "Color", "localSimFunction": theColorSimFunction, "weight": 0.5},
    {"attName": "author", "localSimFunction": EqualSim(), "weight": 0.5},
]

# Human-readable description stored next to the similarity values.
simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "Color": {"simFunction": "max(delta_e_cie2000)", "weight": 0.5},
        "author": {"simFunction": "equals", "weight": 0.5},
    },
}

dataFile = computeSimilarity(simDescription, data, "_id", itemIds, localSimFunctions, weightedAverage)
with open("./DMH/Author50Color50.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)

## Color + author (0.8 - 0.2)

In [7]:
# Same measure as the 0.5 / 0.5 variant, with colour weighted 0.8 and author 0.2.
# Reuses `theColorSimFunction` built in an earlier cell.
simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "Color": {"simFunction": "max(delta_e_cie2000)", "weight": 0.8},
        "author": {"simFunction": "equals", "weight": 0.2},
    },
}

localSimFunctions = [
    {"attName": "Color", "localSimFunction": theColorSimFunction, "weight": 0.8},
    {"attName": "author", "localSimFunction": EqualSim(), "weight": 0.2},
]

dataFile = computeSimilarity(simDescription, data, "_id", itemIds, localSimFunctions, weightedAverage)
# NOTE(review): the file name reads "Author80Color20" but the weights above are
# Color 0.8 / author 0.2 - confirm with consumers of this file before renaming.
with open("./DMH/Author80Color20.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)

## max(Color)

The similarity is the maximum color similarity between colors in artworks

In [8]:
# Colour is the only attribute; `theColorSimFunction` is the one built in an
# earlier cell with the default (max) aggregation.
localSimFunctions = [
    {"attName": "Color", "localSimFunction": theColorSimFunction, "weight": 1.0},
]
simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "Color": {"simFunction": "max(delta_e_cie2000)", "weight": 1.0},
    },
}
dataFile = computeSimilarity(simDescription, data, "_id", itemIds, localSimFunctions, weightedAverage)
with open("./DMH/simMaxColor.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)

## avg(Color)

The similarity is the average color similarity between colors in artworks

In [9]:
# Rebuild the colour-list similarity so that the per-colour best matches are
# averaged (np.average) instead of reduced with max.
aSimilarityFunction = ColorSim()
anAggFunction = AggregationFunction(np.average)
theColorSimFunction = ListSim(aSimilarityFunction, anAggFunction)

localSimFunctions = [
    {"attName": "Color", "localSimFunction": theColorSimFunction, "weight": 1.0},
]
simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "Color": {"simFunction": "avg(delta_e_cie2000)", "weight": 1.0},
    },
}
dataFile = computeSimilarity(simDescription, data, "_id", itemIds, localSimFunctions, weightedAverage)
with open("./DMH/simAvgColor.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)


## range(decades)

The similarity is computed from the year: each year is mapped to its decade, and two items are more similar the closer their decades are.

In [10]:
class DecadeSimilarity:
    """Similarity between two years based on the distance between their decades.

    The decades of `minYear` and `maxYear` define the scale; each decade is
    normalised on that scale and the similarity is 1 minus the absolute
    difference of the normalised decades.
    """

    def __init__(self, minYear, maxYear):
        self.minDecade = self.convertYearToDecade(minYear)
        self.maxDecade = self.convertYearToDecade(maxYear)
        # Width of the scale, in years; 0 when both bounds share a decade.
        self.decadeRange = self.maxDecade - self.minDecade

    def compute(self,year1, year2):
        """Return the decade similarity of two years.

        Returns:
            0.0 when either year is None; otherwise 1 minus the normalised
            distance between the two decades.
        """
        if (year1 is None) or (year2 is None):
            return 0.0

        decade1 = self.convertYearToDecade(year1)
        decade2 = self.convertYearToDecade(year2)

        if self.decadeRange == 0:
            # Single-decade scale: normalising would divide by zero, so the
            # comparison degenerates to "same decade or not".
            return 1.0 if decade1 == decade2 else 0.0

        normalizeDecade1 = (decade1 - self.minDecade) / self.decadeRange
        normalizeDecade2 = (decade2 - self.minDecade) / self.decadeRange
        return 1-abs(normalizeDecade2 - normalizeDecade1)

    def convertYearToDecade(self,year):
        """Map a year to the first multiple of ten strictly below it.

        For example 1951..1960 all map to 1950, while 1950 maps to 1940.
        """
        return int((year-1)/10) * 10
    


In [11]:
# Known years fix the decade scale; records whose year is None are left out of
# the min/max computation.
years = [artwork["year"] for artwork in data if artwork["year"] is not None]

decadeSimilarityFunction = DecadeSimilarity(min(years), max(years))
localSimFunctions = [
    {"attName": "year", "localSimFunction": decadeSimilarityFunction, "weight": 1.0},
]
simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "year": {"simFunction": "Range(Decades)", "weight": 1.0},
    },
}

dataFile = computeSimilarity(simDescription, data, "_id", itemIds, localSimFunctions, weightedAverage)
with open("./DMH/simDecades.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)


# Blood Alcohol Domain

Available at: <https://github.com/gateslm/Blood-Alcohol-Domain>

```bibtex
@misc{BAC23,
  title = {Blood Alcohol Content Domain},
  howpublished={\url{https://github.com/gateslm/Blood-Alcohol-Domain}}, 
  year = {2023},
  url={https://github.com/gateslm/Blood-Alcohol-Domain}, 
  author = {Doyle, D{\'o}nal and Cunningham, P{\'a}draig and Coyle, Lorcan}
}
```

Used in: <https://link.springer.com/chapter/10.1007/978-3-031-40177-0_10>

In [12]:
import pandas as pd
# Read the cases indexed by "casename" and drop the "features__" marker from
# every column label.
df = (
    pd.read_csv('./blood-alcohol/blood-alcohol-domain.csv', index_col='casename')
    .rename(columns=lambda name: name.replace("features__", ""))
)
df

Unnamed: 0_level_0,Gender,FrameSize,AmountConsumed,Meal,Duration,solution
casename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
n1,male,1,1,snack,60,0.2
n2,female,2,3,none,120,0.8
n3,female,4,4,full,90,0.8
n4,male,4,6,none,120,1.0++
n5,male,4,3,none,60,0.5
...,...,...,...,...,...,...
n81,male,4,5,none,120,0.8
n82,male,5,5,none,120,0.7
n83,male,6,5,none,120,0.6
n84,male,7,5,snack,120,0.5


In [13]:
# Turn the "casename" index back into a regular column and export one record
# (dict) per case. reset_index() is applied to a copy so that re-running this
# cell does not keep adding index columns to `df`.
content = json.loads(df.reset_index().to_json(orient='records'))



In [14]:
# Dataset envelope: free-text description, declared attribute types, the name
# of the identifying attribute, and the case records themselves.
dataFile = {
    "description": "Blood Alcohol Domain with additional files about processing and background information.",
    "attributes": {
        "casename": "string",
        "Gender": "string",
        "FrameSize": "number",
        "AmountConsumed": "number",
        "Meal": "string",
        "Duration": "number",
        "solution": "string",
    },
    "id": "casename",
    "data": content,
}


{'description': 'Blood Alcohol Domain with additional files about processing and background information.',
 'attributes': {'casename': 'string',
  'Gender': 'string',
  'FrameSize': 'number',
  'AmountConsumed': 'number',
  'Meal': 'string',
  'Duration': 'number',
  'solution': 'string'},
 'id': 'casename',
 'data': [{'casename': 'n1',
   'Gender': 'male',
   'FrameSize': 1,
   'AmountConsumed': 1,
   'Meal': 'snack',
   'Duration': 60,
   'solution': '0.2'},
  {'casename': 'n2',
   'Gender': 'female',
   'FrameSize': 2,
   'AmountConsumed': 3,
   'Meal': 'none',
   'Duration': 120,
   'solution': '0.8'},
  {'casename': 'n3',
   'Gender': 'female',
   'FrameSize': 4,
   'AmountConsumed': 4,
   'Meal': 'full',
   'Duration': 90,
   'solution': '0.8'},
  {'casename': 'n4',
   'Gender': 'male',
   'FrameSize': 4,
   'AmountConsumed': 6,
   'Meal': 'none',
   'Duration': 120,
   'solution': '1.0++'},
  {'casename': 'n5',
   'Gender': 'male',
   'FrameSize': 4,
   'AmountConsumed': 3,
   '

In [15]:
# Persist the Blood Alcohol dataset envelope (description, attribute types, id
# attribute and records) as JSON indented by two spaces.
with open("./blood-alcohol/blood-alcohol-domain.json", "w") as fp:
    json.dump(dataFile, fp,  indent=2)


In [68]:
class RangeSimilarity:
    """Linear similarity between two numbers on a fixed [minValue, maxValue] scale.

    Both operands are normalised on the scale and the similarity is 1 minus
    the absolute difference of the normalised values.
    """

    def __init__(self, minValue, maxValue):
        self.minValue = minValue
        self.maxValue = maxValue
        # Width of the scale; 0 when minValue == maxValue.
        self.range = abs(self.maxValue - self.minValue)

    def _isOnScale(self, value):
        """Tell whether `value` is not None and lies within [minValue, maxValue]."""
        return (value is not None) and (value>=self.minValue) and (value<=self.maxValue)

    def compute(self,v1, v2):
        """Return the similarity of `v1` and `v2`.

        Returns:
            0.0 when either operand is None or outside the scale; otherwise
            1 minus the normalised distance between the operands.
        """
        if not (self._isOnScale(v1) and self._isOnScale(v2)):
            return 0.0
        if self.range == 0:
            # Degenerate scale: both operands equal the single admissible
            # value, and normalising would divide by zero.
            return 1.0
        normalizeV1 = (v1 - self.minValue) / self.range
        normalizeV2 = (v2 - self.minValue) / self.range
        return 1-abs(normalizeV2 - normalizeV1)

class NominalRangeSimilarity(RangeSimilarity):
    """Similarity between ordered nominal values, derived from their positions.

    The position of each value in `listNominalValues` is used as its numeric
    value on a scale from 0 to len(listNominalValues)-1.
    """

    def __init__(self, listNominalValues):
        self.values = listNominalValues
        super().__init__(0, len(listNominalValues)-1)

    def compute(self,v1, v2):
        """Return the positional similarity of two nominal values.

        Returns:
            0.0 when either value is not one of the configured nominal values;
            otherwise the range similarity of their list positions.
        """
        # Both operands must be known: an unknown v2 has to yield 0.0 instead
        # of reaching index(), which raises ValueError for missing values.
        if (v1 in self.values) and (v2 in self.values):
            indexV1 = self.values.index(v1)
            indexV2 = self.values.index(v2)
            return super().compute(indexV1, indexV2)
        else:
            return 0.0

In [20]:
# Four equally weighted attributes: gender (exact match), amount consumed and
# duration (numeric ranges) and meal (ordered nominal scale).
itemIds = [case["casename"] for case in content]
localSimFunctions = [
    {"attName": "Gender", "localSimFunction": EqualSim(), "weight": 0.25},
    {"attName": "AmountConsumed", "localSimFunction": RangeSimilarity(1,14), "weight": 0.25},
    {"attName": "Meal", "localSimFunction": NominalRangeSimilarity(['none', 'snack', 'full']), "weight": 0.25},
    {"attName": "Duration", "localSimFunction": RangeSimilarity(30,240), "weight": 0.25},
]

simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "Gender": {"simFunction": "equals", "weight": 0.25},
        "AmountConsumed": {"simFunction": "RangeSimilarity", "weight": 0.25},
        "Meal": {"simFunction": "NominalRangeSimilarity", "weight": 0.25},
        "Duration": {"simFunction": "RangeSimilarity", "weight": 0.25},
    },
}

dataFile = computeSimilarity(simDescription, content, "casename", itemIds, localSimFunctions, weightedAverage)
with open("./blood-alcohol/GenderAmountMealDuration.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)

In [22]:
# Four equally weighted attributes: gender (exact match), amount consumed and
# frame size (numeric ranges) and meal (ordered nominal scale).
itemIds = [case["casename"] for case in content]
localSimFunctions = [
    {"attName": "Gender", "localSimFunction": EqualSim(), "weight": 0.25},
    {"attName": "AmountConsumed", "localSimFunction": RangeSimilarity(1,14), "weight": 0.25},
    {"attName": "Meal", "localSimFunction": NominalRangeSimilarity(['none', 'snack', 'full']), "weight": 0.25},
    {"attName": "FrameSize", "localSimFunction": RangeSimilarity(1,8), "weight": 0.25},
]

simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "Gender": {"simFunction": "equals", "weight": 0.25},
        "AmountConsumed": {"simFunction": "RangeSimilarity", "weight": 0.25},
        "Meal": {"simFunction": "NominalRangeSimilarity", "weight": 0.25},
        "FrameSize": {"simFunction": "RangeSimilarity", "weight": 0.25},
    },
}

dataFile = computeSimilarity(simDescription, content, "casename", itemIds, localSimFunctions, weightedAverage)
with open("./blood-alcohol/GenderAmountMealFrameSize.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)

In [23]:
# Three attributes with the same weight (0.33 each): amount consumed and frame
# size (numeric ranges) and meal (ordered nominal scale).
itemIds = [case["casename"] for case in content]
localSimFunctions = [
    {"attName": "AmountConsumed", "localSimFunction": RangeSimilarity(1,14), "weight": 0.33},
    {"attName": "Meal", "localSimFunction": NominalRangeSimilarity(['none', 'snack', 'full']), "weight": 0.33},
    {"attName": "FrameSize", "localSimFunction": RangeSimilarity(1,8), "weight": 0.33},
]

simDescription = {
    "globalSim": {"simFunction": "Weighted average"},
    "localSim": {
        "AmountConsumed": {"simFunction": "RangeSimilarity", "weight": 0.33},
        "Meal": {"simFunction": "NominalRangeSimilarity", "weight": 0.33},
        "FrameSize": {"simFunction": "RangeSimilarity", "weight": 0.33},
    },
}

dataFile = computeSimilarity(simDescription, content, "casename", itemIds, localSimFunctions, weightedAverage)
with open("./blood-alcohol/AmountMealFrameSize.json", "w") as fp:
    json.dump(dataFile, fp, indent=2)

# Breast Cancer Wisconsin Data

<https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original>



In [203]:
# Column labels for the header-less CSV, in file order.
columnNames = [
    "SampleNumber",
    "ClumpThickness",
    "UniformityCellSize",
    "UniformityCellShape",
    "MarginalAdhesion",
    "SingleEpithelialCellSize",
    "BareNuclei",
    "BlandChromatin",
    "NormalNucleoli",
    "Mitoses",
    "Class",
]
# The sample number read from the file is discarded and replaced by the row
# position: dropping the column and resetting the index puts the positional
# index first, and re-applying `columnNames` labels it "SampleNumber".
df = (
    pd.read_csv("./BCWD/breast-cancer-wisconsin.csv", header=None, names=columnNames)
    .drop(columns=["SampleNumber"])
    .reset_index()
)
df.columns = columnNames
df

Unnamed: 0,SampleNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Class
0,0,5,1,1,1,2,1,3,1,1,2
1,1,5,4,4,5,7,10,3,2,1,2
2,2,3,1,1,1,2,2,3,1,1,2
3,3,6,8,8,1,3,4,3,7,1,2
4,4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,694,3,1,1,1,3,2,1,1,1,2
695,695,2,1,1,1,2,1,1,1,1,2
696,696,5,10,10,3,7,3,8,10,2,4
697,697,4,8,6,4,3,4,10,6,1,4


In [204]:
# Translate the numeric class codes into labels. Values that are not a known
# code are left as they are, so re-running this cell keeps the labels intact
# and an unexpected code is never silently reported as malignant.
classLabels = {2: 'benign', 4: 'malignant'}
df['Class'] = df['Class'].apply(lambda value: classLabels.get(value, value))
# A '?' entry in BareNuclei is encoded as 0; every other entry is cast to int.
df["BareNuclei"] = df["BareNuclei"].apply(lambda value: 0 if value== '?' else int(value))

In [205]:

# Round-trip through JSON to obtain one plain dict per sample.
recordsJson = df.to_json(orient='records')
content = json.loads(recordsJson)
content

[{'SampleNumber': 0,
  'ClumpThickness': 5,
  'UniformityCellSize': 1,
  'UniformityCellShape': 1,
  'MarginalAdhesion': 1,
  'SingleEpithelialCellSize': 2,
  'BareNuclei': 1,
  'BlandChromatin': 3,
  'NormalNucleoli': 1,
  'Mitoses': 1,
  'Class': 'benign'},
 {'SampleNumber': 1,
  'ClumpThickness': 5,
  'UniformityCellSize': 4,
  'UniformityCellShape': 4,
  'MarginalAdhesion': 5,
  'SingleEpithelialCellSize': 7,
  'BareNuclei': 10,
  'BlandChromatin': 3,
  'NormalNucleoli': 2,
  'Mitoses': 1,
  'Class': 'benign'},
 {'SampleNumber': 2,
  'ClumpThickness': 3,
  'UniformityCellSize': 1,
  'UniformityCellShape': 1,
  'MarginalAdhesion': 1,
  'SingleEpithelialCellSize': 2,
  'BareNuclei': 2,
  'BlandChromatin': 3,
  'NormalNucleoli': 1,
  'Mitoses': 1,
  'Class': 'benign'},
 {'SampleNumber': 3,
  'ClumpThickness': 6,
  'UniformityCellSize': 8,
  'UniformityCellShape': 8,
  'MarginalAdhesion': 1,
  'SingleEpithelialCellSize': 3,
  'BareNuclei': 4,
  'BlandChromatin': 3,
  'NormalNucleoli': 

In [206]:
# Dataset envelope: description, declared attribute types, the name of the
# identifying attribute, and the sample records.
# NOTE(review): "SampleNumber" is declared as "string" while the records shown
# above hold integers - confirm which one consumers expect.
dataFile = {
    "description": "[Breast Cancer Wisconsin Data](https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original)",
    "attributes": {
        "SampleNumber": "string",
        "ClumpThickness": "number",
        "UniformityCellSize": "number",
        "UniformityCellShape": "number",
        "MarginalAdhesion": "number",
        "SingleEpithelialCellSize": "number",
        "BareNuclei": "number",
        "BlandChromatin": "number",
        "NormalNucleoli": "number",
        "Mitoses": "number",
        "Class": "string",
    },
    "id": "SampleNumber",
    "data": content,
}

In [207]:
# Persist the BCWD dataset envelope as JSON (no indent argument is passed).
with open("./BCWD/bcwd.json", "w") as fp:
    json.dump(dataFile, fp)

In [208]:
itemIds = [item["SampleNumber"] for item in content ]
# Every column except the identifier (first) and the class label (last).
atts = columnNames[1:-1]
attWeight = 1/len(atts)

# The features of this dataset are graded on a 1-10 scale, so that is the range
# used for normalisation. BareNuclei entries that were '?' are encoded as 0
# earlier in the notebook; they fall outside the scale and RangeSimilarity
# scores them 0.0.
attMin, attMax = 1, 10
localSimFunctions = [dict(attName=name, localSimFunction=RangeSimilarity(attMin, attMax), weight=attWeight) for name in atts]

# One independent description entry per attribute (no dict object is shared).
localSim = {att: dict(simFunction = "RangeSimilarity", weight=attWeight) for att in atts}

simDescription = dict(
    globalSim= dict(
        simFunction="Weighted average"
    ),
    localSim = localSim
)

dataFile = computeSimilarity(simDescription, content, "SampleNumber", itemIds, localSimFunctions, weightedAverage)
with open("./BCWD/AllAttributes.json", "w") as fp:
    json.dump(dataFile, fp)