In [13]:
import json
from collections import defaultdict

In [14]:
# Load the exported title tokens
with open('data/title_tokens_clean.json', 'r') as file:
    title_tokens_data = json.load(file)

# Load the similarity results
with open('data/similarity_results.json', 'r') as file:
    similarity_results = json.load(file)

In [22]:
# Create a structured dictionary where each section name contains tokens and their matches
similarity_dict = defaultdict(lambda: defaultdict(list))
for result in similarity_results:
    token = result['title_token']
    section = result['section_name']  # Assume section_name is part of the results
    if result['similarity'] > 0.1:  # Filter to only include positive similarity scores
        similarity_entry = {
            'section_token': result['section_token'],
            'similarity': result['similarity'],
            'object_id': result['object_id'],
            'title_text': result['title_text'],
            'section_id': result['section_id']
        }
        similarity_dict[section][token].append(similarity_entry)

In [23]:
# Example of how this structured dictionary is used
for section, tokens in list(similarity_dict.items())[:1]:  # Limit to the first section for example
    print(f"Section: {section}")
    for token, entries in tokens.items():
        print(f"  Token: {token}, Matches Count: {len(entries)}")
        for entry in entries[:3]:  # Show only the first 3 matches for each token
            print(f"    Matches with: {entry['section_token']}, Similarity: {entry['similarity']:.4f}")

Section: Textiles Costume and Jewelry
  Token: wallpap, Matches Count: 343
    Matches with: fan, Similarity: 0.2568
    Matches with: uniform, Similarity: 0.1045
    Matches with: pictur, Similarity: 0.1005
  Token: bit, Matches Count: 912
    Matches with: pocket, Similarity: 0.1487
    Matches with: walk, Similarity: 0.1654
    Matches with: ear, Similarity: 0.1046
  Token: dirk, Matches Count: 38
    Matches with: bouquet, Similarity: 0.2088
    Matches with: bootjack, Similarity: 0.1190
    Matches with: collar, Similarity: 0.1194
  Token: candl, Matches Count: 2176
    Matches with: eyeglass, Similarity: 0.1279
    Matches with: muff, Similarity: 0.1171
    Matches with: ear, Similarity: 0.1663
  Token: holder, Matches Count: 3040
    Matches with: suspend, Similarity: 0.2137
    Matches with: carpet, Similarity: 0.1646
    Matches with: pocket, Similarity: 0.1824
  Token: doorway, Matches Count: 1442
    Matches with: needlework, Similarity: 0.1882
    Matches with: render, Simi

In [24]:
# Check the number of unique object IDs per token
unique_check = {}
for section, tokens in similarity_dict.items():
    for token, entries in tokens.items():
        if token not in unique_check:
            unique_check[token] = set()
        unique_check[token].update([entry['object_id'] for entry in entries])

# Print out the results
for token, object_ids in unique_check.items():
    print(f"Token: {token}, Unique Object IDs: {len(object_ids)}")

Token: wallpap, Unique Object IDs: 49
Token: bit, Unique Object IDs: 57
Token: dirk, Unique Object IDs: 2
Token: candl, Unique Object IDs: 136
Token: holder, Unique Object IDs: 190
Token: doorway, Unique Object IDs: 103
Token: door, Unique Object IDs: 140
Token: restor, Unique Object IDs: 41
Token: draw, Unique Object IDs: 67
Token: grandfath, Unique Object IDs: 40
Token: clock, Unique Object IDs: 227
Token: tripod, Unique Object IDs: 6
Token: tabl, Unique Object IDs: 385
Token: ring, Unique Object IDs: 34
Token: bottl, Unique Object IDs: 155
Token: indian, Unique Object IDs: 246
Token: ladl, Unique Object IDs: 35
Token: sugar, Unique Object IDs: 173
Token: bowl, Unique Object IDs: 264
Token: cover, Unique Object IDs: 124
Token: glass, Unique Object IDs: 212
Token: lamp, Unique Object IDs: 347
Token: chamber, Unique Object IDs: 3
Token: spark, Unique Object IDs: 12
Token: pewter, Unique Object IDs: 273
Token: coffe, Unique Object IDs: 110
Token: pot, Unique Object IDs: 141
Token: mug, 

In [25]:
# Example to limit the number of tokens processed for printing
max_tokens_to_print = 10  # Set a reasonable limit
count = 0

for section, tokens in similarity_dict.items():
    for token, entries in tokens.items():
        if count >= max_tokens_to_print:
            break
        print(f"Section: {section}, Token: {token}")
        for entry in entries[:5]:  # Limit the number of entries per token to print
            print(f"  Object ID: {entry['object_id']}, Title Text: {entry['title_text']}")
            print(f"  Matched Token: {entry['section_token']}, Similarity: {entry['similarity']:.4f}")
        count += 1
        print("---")  # Separator for clarity
    if count >= max_tokens_to_print:
        break


Section: Textiles Costume and Jewelry, Token: wallpap
  Object ID: 29580, Title Text: Wallpaper
  Matched Token: fan, Similarity: 0.2568
  Object ID: 29580, Title Text: Wallpaper
  Matched Token: uniform, Similarity: 0.1045
  Object ID: 29580, Title Text: Wallpaper
  Matched Token: pictur, Similarity: 0.1005
  Object ID: 29580, Title Text: Wallpaper
  Matched Token: clog, Similarity: 0.1385
  Object ID: 29580, Title Text: Wallpaper
  Matched Token: waist, Similarity: 0.1442
---
Section: Textiles Costume and Jewelry, Token: bit
  Object ID: 26713, Title Text: Bit
  Matched Token: pocket, Similarity: 0.1487
  Object ID: 26713, Title Text: Bit
  Matched Token: walk, Similarity: 0.1654
  Object ID: 26713, Title Text: Bit
  Matched Token: ear, Similarity: 0.1046
  Object ID: 26713, Title Text: Bit
  Matched Token: children, Similarity: 0.1674
  Object ID: 26713, Title Text: Bit
  Matched Token: cuff, Similarity: 0.1781
---
Section: Textiles Costume and Jewelry, Token: dirk
  Object ID: 2944

***

### Organize Data by Token and Section

In [41]:
# Initialize a nested dictionary to organize data by token, then by section
token_section_matches = defaultdict(lambda: defaultdict(list))

for result in similarity_results:
    token = result['title_token']
    section = result['section_name']
    # Only include results with a positive similarity or above a certain threshold
    if result['similarity'] > 0.33:
        token_section_matches[token][section].append({
            'matched_token': result['section_token'],
            'similarity': result['similarity'],
            'object_id': result['object_id'],
            'title_text': result['title_text'],
            'section_id': result['section_id']
        })

In [42]:
# Print a sample of the organized data
for token, sections in list(token_section_matches.items())[:5]:  # Limit to first 5 tokens for demonstration
    print(f"Token: {token}")
    for section, matches in sections.items():
        print(f"  Section: {section}, Matches Count: {len(matches)}")
        for match in matches[:3]:  # Show only the first 3 matches for brevity
            print(f"    Matches with: {match['matched_token']}, Similarity: {match['similarity']:.4f}")
        print("---")

Token: wallpap
  Section: Architecture and Naive Art, Matches Count: 49
    Matches with: wallpap, Similarity: 1.0000
    Matches with: wallpap, Similarity: 1.0000
    Matches with: wallpap, Similarity: 1.0000
---
Token: bit
  Section: The Art and Design of Utopian and Religious Communities, Matches Count: 57
    Matches with: bit, Similarity: 1.0000
    Matches with: bit, Similarity: 1.0000
    Matches with: bit, Similarity: 1.0000
---
  Section: Tools Hardware Firearms and Vehicles, Matches Count: 57
    Matches with: bit, Similarity: 1.0000
    Matches with: bit, Similarity: 1.0000
    Matches with: bit, Similarity: 1.0000
---
Token: holder
  Section: Textiles Costume and Jewelry, Matches Count: 190
    Matches with: holder, Similarity: 1.0000
    Matches with: holder, Similarity: 1.0000
    Matches with: holder, Similarity: 1.0000
---
  Section: Domestic Utensils, Matches Count: 190
    Matches with: holder, Similarity: 1.0000
    Matches with: holder, Similarity: 1.0000
    Matche

In [45]:
# Write to CSV for detailed analysis
import csv

with open('token_section_matches.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Token', 'Section', 'Matched Token', 'Similarity', 'Object ID', 'Title Text', 'Section ID'])
    for token, sections in token_section_matches.items():
        for section, matches in sections.items():
            for match in matches:
                writer.writerow([token, section, match['matched_token'], match['similarity'], match['object_id'], match['title_text'], match['section_id']])

## Alt approach (by object id with multiple tokens)

In [55]:
# Assuming similarity_results has been loaded and contains entries with object_id
similarity_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for result in similarity_results:
    object_id = result['object_id']
    token = result['title_token']
    section = result['section_name']
    section_id = result['section_id']
    if result['similarity'] > 0.33:  # Include only positive similarity scores
        entry = {
            'matched_token': result['section_token'],
            'similarity': result['similarity'],
            'section_id': result['section_id'],
            'section_name': section,
            'section_id': section_id
        }
        similarity_dict[object_id][token][section].append(entry)

# Optionally, transform this into a more compact format if needed

In [53]:
# Example of how to print this structure for a sample object
sample_object_id = 29580  # Example object ID
if sample_object_id in similarity_dict:
    for section, tokens in similarity_dict[sample_object_id].items():
        print(f"Object ID: {sample_object_id}, Section: {section}")
        for token, details in tokens.items():
            print(f"  Token: {token}")
            for detail in details:
                print(f"    Matches with: {detail['matched_token']}, Similarity: {detail['similarity']:.4f}")
else:
    print(f"No similarity data available for Object ID: {sample_object_id}")

Object ID: 29580, Section: wallpap
  Token: Architecture and Naive Art
    Matches with: wallpap, Similarity: 1.0000


In [57]:
# Load title tokens data
with open('data/title_tokens_clean.json', 'r') as file:
    title_tokens_data = json.load(file)

# Append structured similarity results to each title based on object_id
for title in title_tokens_data:
    object_id = title['object_id']
    title['similarity_results'] = similarity_dict[object_id] if object_id in similarity_dict else {}

# Print the structured data for a sample of titles
for title in title_tokens_data[:5]:  # Limit to the first five titles for clarity
    print(f"Title: {title['title']} (Object ID: {title['object_id']})")
    tokens = title.get('tokens', [])
    for token in tokens:
        print(f"  Token: {token}")
        token_sections = title['similarity_results'].get(token, {})
        for section, matches in token_sections.items():
            print(f"    Section: {section} (Section Id: {matches[0]['section_id'] if matches else 'N/A'})")
            for match in matches:
                print(f"      Matches with: {match['matched_token']}, Similarity: {match['similarity']:.4f}")
    print("---")


Title: Wallpaper (Object ID: 29580)
  Token: wallpap
    Section: Architecture and Naive Art (Section Id: 2)
      Matches with: wallpap, Similarity: 1.0000
---
Title: Bit (Object ID: 26713)
  Token: bit
    Section: The Art and Design of Utopian and Religious Communities (Section Id: 1)
      Matches with: bit, Similarity: 1.0000
    Section: Tools Hardware Firearms and Vehicles (Section Id: 3)
      Matches with: bit, Similarity: 1.0000
---
Title: Dirk (Object ID: 29441)
  Token: dirk
---
Title: Candle Holder (Object ID: 29740)
  Token: candl
  Token: holder
    Section: Textiles Costume and Jewelry (Section Id: 0)
      Matches with: holder, Similarity: 1.0000
    Section: Domestic Utensils (Section Id: 4)
      Matches with: holder, Similarity: 1.0000
---
Title: Doorway and Doors (Object ID: 29814)
  Token: door
    Section: The Art and Design of Utopian and Religious Communities (Section Id: 1)
      Matches with: religi, Similarity: 0.3486
    Section: Wood Carvings and Weatherva

***

### Export

In [59]:
# Optionally, save the updated data back to JSON
output_file_path = './data/token_section_similarity_match.json'

# with open(output_path, 'w') as file:
#     json.dump(title_tokens_data, file, indent=4)
# print(f"Data enriched with similarity results has been saved to {output_path}")

# Open the file in write mode and use json.dump to serialize the data to JSON and write it to the file
try:
    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(title_tokens_data, file, indent=4, ensure_ascii=False)  # Using indent for pretty-printing
    print(f"Data successfully saved to {output_file_path}")
except Exception as e:
    print(f"An error occurred while writing to the file: {e}")

Data successfully saved to ./data/token_section_similarity_match.json


In [60]:
num_results = len(title_tokens_data)

print(num_results)

18259
