In [1]:
import os
import sys

import numpy as np

# The utils module lives in ../utils, which is not on the import path by
# default; register that directory on sys.path (once) before importing it.
utilsModulePath = os.path.abspath("../utils")
if sys.path.count(utilsModulePath) == 0:
    sys.path.append(utilsModulePath)
# Now we can import utils
import utils

# Cars

A 1k-case sample of the [Used cars dataset](https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data). Some attributes from the original dataset were removed.

In [28]:
import json
# Load the exported case file. JSON text is UTF-8, so the encoding is pinned
# explicitly instead of relying on the platform's default locale encoding.
with open('./cars-1k.json', encoding="utf-8") as f:
    dataFile = json.load(f)
# The cases themselves are stored under the top-level "cases" key.
data = dataFile["cases"]

In [29]:
# Index the cases by their "id" field so each one can be looked up by id.
object_data = dict((case["id"], case) for case in data)

In [30]:
import cbrkit
# The retrieval cells refer to the id-keyed case mapping as `casebase`.
casebase = object_data

# Wu-Palmer


In [35]:
# Taxonomy-based similarity: the taxonomy is read from the YAML file and
# scored with cbrkit's Wu-Palmer measure.
taxFunction = cbrkit.sim.taxonomy.build(
    "./cars-taxonomy.yaml",
    cbrkit.sim.taxonomy.wu_palmer(),
)

# Cases are compared on the "manufacturer" attribute only; the per-attribute
# scores are pooled with "mean".
simFunction = cbrkit.sim.attribute_value(
    attributes={"manufacturer": taxFunction},
    aggregator=cbrkit.sim.aggregator(pooling="mean"),
)


In [36]:
# Build a retriever around the similarity function defined above.
retriever = cbrkit.retrieval.build(simFunction)
# The casebase is passed as both the first and the second argument, so
# (presumably) every case is used as a query against the whole casebase.
result = cbrkit.retrieval.apply_queries(casebase, casebase, retriever)

In [37]:
from cbrkit.model import Result, ResultStep
def getSteps(self):
    """Return the ``steps`` attribute of the object this accessor is bound to."""
    retrieval_steps = self.steps
    return retrieval_steps
def getAllSimilarityData(self):
    """Collect the ``similarities`` of every entry in ``self.queries``.

    Returns:
        dict: maps each key of ``self.queries`` to the ``similarities``
        attribute of the corresponding value.
    """
    return {
        query_key: query_result.similarities
        for query_key, query_result in self.queries.items()
    }

# Attach the two accessors defined above as methods on the cbrkit result
# classes, so they can be called on retrieval results.
setattr(ResultStep, "getAllSimilarityData", getAllSimilarityData)
setattr(Result, "getSteps", getSteps)

# Extract the per-query similarities from the first step of the result.
steps = result.getSteps()
simdata = steps[0].getAllSimilarityData()

In [38]:
import orjson

def default(obj):
    """Fallback serializer passed to ``orjson.dumps`` through ``default=``.

    Converts NumPy floating-point scalars (``np.float64``, ``np.float32``, ...)
    to a built-in ``float``.

    Args:
        obj: the value the serializer was asked to convert.

    Returns:
        float: the value of ``obj`` as a plain Python float.

    Raises:
        TypeError: if ``obj`` is not a NumPy floating-point scalar.
    """
    # np.floating is the common base of every NumPy float scalar type, so this
    # still accepts np.float64 and additionally covers float32/float16.
    if isinstance(obj, np.floating):
        return float(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Bundle the retrieval metadata with the similarity scores.
# OPT_NON_STR_KEYS lets orjson serialize dict keys that are not strings.
simprint = orjson.dumps(
    dict(similarityConfiguration=result.metadata, similarityScores=simdata),
    default=default,
    option=orjson.OPT_NON_STR_KEYS,
)
file_path = "cbrkit_cars_tax_wupalmer.json"
# orjson.dumps returns UTF-8 encoded bytes; write them unchanged in binary
# mode instead of decoding and re-encoding with the platform's default
# text encoding.
with open(file_path, "wb") as file:
    file.write(simprint)

## Year (linear interval) and manufacturer (equality)

In [53]:

# Two attributes are compared: "year" with a linear interval similarity over
# 1950-2021, and "manufacturer" with plain equality. The aggregator pools the
# two scores with "mean", weighting year 0.8 and manufacturer 0.2.
simFunction = cbrkit.sim.attribute_value(
    attributes={
        "year": cbrkit.sim.numbers.linear_interval(1950, 2021),
        "manufacturer": cbrkit.sim.generic.equality(),
    },
    aggregator=cbrkit.sim.aggregator(
        pooling="mean",
        pooling_weights={"year": 0.8, "manufacturer": 0.2},
    ),
)



In [54]:
# Rebuild the retriever so it uses the year/manufacturer similarity function.
retriever = cbrkit.retrieval.build(simFunction)
# The casebase is passed as both the first and the second argument, so
# (presumably) every case is used as a query against the whole casebase.
result = cbrkit.retrieval.apply_queries(casebase, casebase, retriever)

In [55]:
# Read the steps of the retrieval result, then map every query key of the
# first step to that query's similarities.
steps = result.steps
simdata = {
    query_key: query_result.similarities
    for query_key, query_result in steps[0].queries.items()
}

In [56]:
import orjson

# NOTE(review): this repeats the `default` defined in the Wu-Palmer export
# cell; consider defining it once and reusing it.
def default(obj):
    """Fallback serializer passed to ``orjson.dumps`` through ``default=``.

    Converts NumPy floating-point scalars (``np.float64``, ``np.float32``, ...)
    to a built-in ``float``.

    Args:
        obj: the value the serializer was asked to convert.

    Returns:
        float: the value of ``obj`` as a plain Python float.

    Raises:
        TypeError: if ``obj`` is not a NumPy floating-point scalar.
    """
    # np.floating is the common base of every NumPy float scalar type, so this
    # still accepts np.float64 and additionally covers float32/float16.
    if isinstance(obj, np.floating):
        return float(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Bundle the retrieval metadata with the similarity scores.
# OPT_NON_STR_KEYS lets orjson serialize dict keys that are not strings.
simprint = orjson.dumps(
    dict(similarityConfiguration=result.metadata, similarityScores=simdata),
    default=default,
    option=orjson.OPT_NON_STR_KEYS,
)
file_path = "cbrkit_year_manufacturer.json"
# orjson.dumps returns UTF-8 encoded bytes; write them unchanged in binary
# mode instead of decoding and re-encoding with the platform's default
# text encoding.
with open(file_path, "wb") as file:
    file.write(simprint)