In [26]:
import numpy as np
import json

# Load json data from a file
def load_data1(path="data/recommendations.json"):
    """Load the recommendations JSON as one wide record per item.

    The file is read as a JSON array whose entries each provide an ``ID``,
    a ``Name`` and a ``Distance`` mapping of category -> distance. The
    mapping is merged into the record itself, so every category becomes a
    key next to ``ID`` and ``Name``.

    Args:
        path: Location of the JSON file. Defaults to
            ``data/recommendations.json``, the path that was previously
            hard-coded.

    Returns:
        A list of dicts ``{"ID": ..., "Name": ..., <category>: float, ...}``
        in the order the entries appear in the file.

    Raises:
        KeyError: If an entry lacks ``ID``, ``Name`` or ``Distance``.
        ValueError: If a distance cannot be converted with ``float()``.
    """
    with open(path, "r", encoding="utf-8") as jf:
        entries = json.load(jf)

    flat_data = []
    for entry in entries:
        record = {"ID": entry["ID"], "Name": entry["Name"]}
        # Categories share the record's key space: a category literally named
        # "ID" or "Name" would overwrite the identifying field set above.
        for category, distance in entry["Distance"].items():
            record[category] = float(distance)
        flat_data.append(record)

    return flat_data

def load_data2(path="data/recommendations.json"):
    """Load the recommendations JSON as one long record per (item, category).

    The file is read as a JSON array whose entries each provide an ``ID``,
    a ``Name`` and a ``Distance`` mapping of category -> distance. Every
    pair in that mapping becomes its own record.

    Args:
        path: Location of the JSON file. Defaults to
            ``data/recommendations.json``, the path that was previously
            hard-coded.

    Returns:
        A list of dicts with exactly the keys ``ID``, ``Name``, ``Category``
        and ``Distance`` (a ``float``), in file order.

    Raises:
        KeyError: If an entry lacks ``ID``, ``Name`` or ``Distance``.
        ValueError: If a distance cannot be converted with ``float()``.
    """
    with open(path, "r", encoding="utf-8") as jf:
        entries = json.load(jf)

    flat_data = []
    for entry in entries:
        # Named item_id / item_name so the builtin `id` is not shadowed.
        item_id = entry["ID"]
        item_name = entry["Name"]
        for category, distance in entry["Distance"].items():
            flat_data.append({
                "ID": item_id,
                "Name": item_name,
                "Category": category,
                "Distance": float(distance),
            })

    return flat_data

In [27]:
# # Convert the loaded data into a Numpy table
# data = load_data1()
# dtype = [(key, 'f8') if key.startswith('distance') else (key, 'U50') for key in data[0].keys()]
# table = np.array([tuple(item.values()) for item in data], dtype=dtype)

In [28]:
# Numpy table
# Build a structured array with one row per (item, category) record returned by
# load_data2(). The schema is spelled out instead of being read off data[0]:
# an empty input then yields an empty table rather than an IndexError, and the
# row tuples no longer depend on the key order of the individual dicts.
data = load_data2()
table_fields = ("ID", "Name", "Category", "Distance")
# Distance is stored as float64; the text columns stay Python objects.
dtype = [(key, 'f8') if key == 'Distance' else (key, object) for key in table_fields]
table = np.array([tuple(item[key] for key in table_fields) for item in data], dtype=dtype)


In [29]:
# # Export to csv
# np.savetxt("data/recommendations.csv", table, delimiter=",", fmt="%s", header="ID,Name,Category,Distance", comments="")

In [30]:
# For every unique category, find the item with the largest distance
# and collect summary statistics over all distances in that category.
result_table = []

unique_categories = np.unique(table['Category'])
for category in unique_categories:
    category_items = table[table['Category'] == category]
    distances = category_items['Distance']

    # Rank the rows once, largest distance first. The stable sort keeps tied
    # distances in table order, so (for NaN-free data) the best row is the
    # first occurrence of the maximum, as np.argmax would pick, and the
    # runner-up is always a different row. Taking the best from argmax and the
    # runner-up from a separate non-stable argsort could return the same row
    # twice when distances tie.
    ranking = np.argsort(-distances, kind="stable")
    max_distance_item = category_items[ranking[0]]

    # A category with a single row has no runner-up; report it as missing
    # instead of failing on an out-of-range index.
    if ranking.size > 1:
        second_item = category_items[ranking[1]]
        second_distance = second_item['Distance']
        second_id = second_item['ID']
    else:
        second_distance = np.nan
        second_id = None

    # Key order defines the column order of the printed table.
    result = {
        'ID': max_distance_item['ID'],
        'Category': category[:30],  # truncate category name to first 30 chars
        'BestDistance': max_distance_item['Distance'],
        'SecondBestDistance': second_distance,
        'SecondBestID': second_id,
        'MeanDistance': np.mean(distances),
        'MedianDistance': np.median(distances),
        'StdDistance': np.std(distances),
    }

    result_table.append(result)

# create table
from tabulate import tabulate
print(f"Table 1: Best items per category with statistics ({len(result_table)} categories)")
print(tabulate(result_table, headers="keys", tablefmt="grid"))

Table 1: Best items per category with statistics (360 categories)
+----------------------------------+--------------------------------+----------------+----------------------+----------------------------------+----------------+------------------+---------------+
| ID                               | Category                       |   BestDistance |   SecondBestDistance | SecondBestID                     |   MeanDistance |   MedianDistance |   StdDistance |
| a7685a4a3b151cd5231df69b8f24b18c | 100 site analysis essentials a |       0.996794 |             0.466668 | 72569a021fcccaa39da52efaa13be225 |      0.184706  |        0.182472  |     0.0619688 |
+----------------------------------+--------------------------------+----------------+----------------------+----------------------------------+----------------+------------------+---------------+
| dee6b1e4a09715a86c1d802649dc21cf | 15 math concepts every data sc |       0.139558 |             0.127244 | 2dc50cbdde05ad54502c14dfae65f173 |  

In [31]:
# Empty the list of flattened recommendation records in place before the
# name `data` is rebound to the base-case mapping below.
data.clear()

# Read through a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open("data/base_cases.json", "r", encoding="utf-8") as jf:
    data = json.load(jf)

# Flatten the category -> titles mapping into one record per (category, title) pair.
flat_data = []

for category, titles in data.items():
    flat_data.extend({"Category": category, "Title": title} for title in titles)
    print(f"Category: {category}, Titles: {len(titles)}")

Category: HR, Titles: 100
Category: IoT, Titles: 100
Category: accounting, Titles: 100
Category: anthropology, Titles: 100
Category: archeology, Titles: 100
Category: architecture, Titles: 100
Category: arts, Titles: 100
Category: astronomy, Titles: 100
Category: botany, Titles: 100
Category: business, Titles: 100
Category: chemistry, Titles: 100
Category: control, Titles: 100
Category: corporate, Titles: 100
Category: criminology, Titles: 100
Category: culture, Titles: 100
Category: cybersecurity, Titles: 100
Category: data, Titles: 100
Category: database, Titles: 100
Category: design, Titles: 100
Category: ecology, Titles: 100
Category: economics, Titles: 100
Category: education, Titles: 100
Category: electronics, Titles: 100
Category: engineering, Titles: 100
Category: entrepreneurship, Titles: 123
Category: finance, Titles: 100
Category: food, Titles: 100
Category: agriculture, Titles: 100
Category: genetics, Titles: 100
Category: graphics, Titles: 114
Category: health, Titles: 100