# stem_objectid.ipynb
Iterating from previous Major Studio 1 analysis using Google Colab

In [1]:
import json

In [2]:
import nltk
from nltk.stem import PorterStemmer

In [3]:
# Initialize the Porter stemmer.
# Within the visible notebook, the only reference to `stemmer` is the
# commented-out `generate_word_variants` helper under "Second approach".
stemmer = PorterStemmer()

## Load Dataset 1
Catalog Stems ("Word") to Title Stems ("LinkedWords")<br>
_"LinkedWords" are stems from the titles; e.g., stem - "appliqu", word - "applique"_

In [4]:
# Read the JSON file containing the stemmed words array.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('data/textLinks-results-v2.json', 'r', encoding='utf-8') as file:
    try:
        stemmed_words_data = json.load(file)
    except json.JSONDecodeError as e:
        # Report the problem and fall back to an empty list so later cells
        # that iterate over stemmed_words_data still run.
        print(f"Error reading JSON file: {e}")
        stemmed_words_data = []

In [5]:
# # ---- DEBUGGING STATEMENT ----
# # Print the loaded stemmed words data
# print("Loaded stemmed words data:")
# print(stemmed_words_data)

In [6]:
# # ---- SAMPLE FOR TESTING ----
# # Read a portion of the JSON file containing the stemmed words array
# sample_size = 10  # Adjust the sample size as needed
# with open('data/textLinks-results-v2.json', 'r') as file:
#     try:
#         stemmed_words_data = json.load(file)[:sample_size]
#     except json.JSONDecodeError as e:
#         print(f"Error reading stemmed words JSON file: {e}")
#         stemmed_words_data = []

# # Print the loaded portion of stemmed words data
# print("Loaded portion of stemmed words data:")
# print(stemmed_words_data)

In [7]:
# Read the entire JSON file containing the stemmed words array.
# This repeats the read done in the first load cell and rebinds
# stemmed_words_data to the full (unsampled) contents.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('data/textLinks-results-v2.json', 'r', encoding='utf-8') as file:
    try:
        stemmed_words_data = json.load(file)
    except json.JSONDecodeError as e:
        # Report the problem and fall back to an empty list so later cells
        # that iterate over stemmed_words_data still run.
        print(f"Error reading stemmed words JSON file: {e}")
        stemmed_words_data = []

# # Print the loaded stemmed words data
# print("Loaded stemmed words data:")
# print(stemmed_words_data)

In [8]:
# # Print the length/count of values per key for the textLinks-results json
# print("Length/count of values per key for textLinks-results JSON:")
# for item in stemmed_words_data:
#     if 'Word' in item and 'LinkedWords' in item:
#         word = item['Word']
#         linked_words = item['LinkedWords'].strip("[]").replace("'", "").split(", ")
#         print(f"Key: {word}, Count: {len(linked_words)}")

## Load Dataset 2
NGA Data (with image URLs appended)

In [9]:
# Read the JSON file containing the object records (IoAD_artists_imgs.json).
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('data/IoAD_artists_imgs.json', 'r', encoding='utf-8') as file:
    try:
        json_objects = json.load(file)
    except json.JSONDecodeError as e:
        # Report the problem and fall back to an empty list so later cells
        # that iterate over json_objects still run.
        print(f"Error reading JSON file: {e}")
        json_objects = []

In [10]:
# # ---- DEBUGGING STATEMENT ----
# # DATA RATE EXCEEDED: will not print
# # Print the loaded JSON objects
# print("Loaded JSON objects:")
# print(json_objects)

In [11]:
# # Check if json_objects is correctly loaded and print titles and objectids
# for obj in json_objects:
#     if 'title' in obj and 'objectid' in obj:
#         print(f"Title: {obj['title']}, Object ID: {obj['objectid']}")
#     else:
#         print("Invalid object format. Expected 'title' and 'objectid' attributes.")

## Match JSONs (Catalog stems to Titles)
_Purpose: linking catalog stems to related titles (via object ID) should make it possible to filter objects by category_

In [12]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [13]:
from fuzzywuzzy import fuzz

In [14]:
# Create a dictionary to store the mapping of keys to objectids.
# The matching cell fills it as {word: {'object_ids': set()}}, and
# add_object_ids_to_keys reads entries in that same shape.
key_to_objectid = {}

### Fourth approach / Output to new JSON for Dataset 1
Catalog Stems ("Word") to Title Stems ("LinkedWords") + Object IDs ("ObjectIDs")<br>

In [15]:
# Helper: put a collection of object IDs into a repeatable order
def _stable_order(ids):
    """Return ``ids`` as a list sorted naturally, or by text form if mixed-type."""
    ids = list(ids)
    try:
        return sorted(ids)
    except TypeError:
        # IDs of different types (e.g. int and str) cannot be compared directly.
        return sorted(ids, key=str)


# Function to add object IDs to the corresponding key
def add_object_ids_to_keys(stemmed_words_data, key_to_objectid):
    """Attach an 'ObjectIDs' list to every entry that has a 'Word' key.

    Args:
        stemmed_words_data: Iterable of dicts. Each dict containing a 'Word'
            key is mutated in place; dicts without one are left untouched.
        key_to_objectid: Mapping of word -> {'object_ids': iterable of IDs}.

    Returns:
        None. Results are written into the dicts of ``stemmed_words_data``.

    The IDs are emitted in sorted order rather than raw set-iteration order,
    so that serialising the result produces the same output on every run.
    Words absent from ``key_to_objectid`` receive an empty list.
    """
    for item in stemmed_words_data:
        if 'Word' not in item:
            continue
        key = item['Word']
        if key in key_to_objectid:
            item['ObjectIDs'] = _stable_order(key_to_objectid[key]['object_ids'])
        else:
            item['ObjectIDs'] = []

In [16]:
# Read the JSON file containing the object records.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('data/IoAD_artists_imgs.json', 'r', encoding='utf-8') as file:
    try:
        json_objects = json.load(file)
    except json.JSONDecodeError as e:
        print(f"Error reading JSON objects file: {e}")
        json_objects = []

# Minimum fuzz.partial_ratio score for a title to count as a match.
MATCH_THRESHOLD = 80  # Adjust threshold as needed

# Extract stemmed keys from the first dataset and initialize the dictionary.
# Every key is reset to an empty set, so re-running this cell does not
# accumulate IDs from a previous run.
for item in stemmed_words_data:
    if 'Word' in item:
        key = item['Word']  # Use already stemmed keys
        key_to_objectid[key] = {'object_ids': set()}  # Initialize set for unique object IDs

# Iterate through the titles in json_objects
for obj in json_objects:
    # Skip records with no usable title, and records with no 'objectid':
    # the latter would otherwise raise KeyError part-way through the run.
    if 'title' not in obj or not isinstance(obj['title'], str) or 'objectid' not in obj:
        continue
    title = obj['title'].lower()  # Convert title to lowercase for case-insensitive matching
    for key, values in key_to_objectid.items():
        # Perform fuzzy matching between each key and the title
        ratio = fuzz.partial_ratio(key, title)
        if ratio >= MATCH_THRESHOLD:
            values['object_ids'].add(obj['objectid'])  # Add matched objectid to set

# Add object IDs to the corresponding keys in stemmed_words_data
add_object_ids_to_keys(stemmed_words_data, key_to_objectid)

# Write the updated data structure to a new JSON file
output_file = 'data/textLinks-results-v2-with-objectids.json'
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(stemmed_words_data, file, indent=4)

print(f"Updated JSON data written to {output_file}")

Updated JSON data written to data/textLinks-results-v2-with-objectids.json


***

## Appendix

### Sample Set for Comparison:
_(from textLinks-results-v2.json)_<br>
Length/count of values per key for textLinks-results JSON:<br>
Key: coverlet, Count: 73<br>
Key: section, Count: 24<br>
Key: 1, Count: 11<br>
Key: banknot, Count: 1<br>
Key: virginia, Count: 2<br>
Key: 168, Count: 1<br>
Key: john, Count: 15<br>
Key: street, Count: 11<br>
Key: 1829, Count: 1<br>
Key: show, Count: 7<br>

### Fourth approach - fuzzy word matching based on key only
Results: Most accurate matches after comparing counts from Excel, using find and replace

### Fourth approach / Sample Results
_(from fuzzy matching with threshold: 80, matches based on key only)_<br>
Length/count of values per key for the output from the for loop:<br>
Key: coverlet, Count: 338<br>
Key: section, Count: 65<br>
Key: 1, Count: 137<br>
Key: banknot, Count: 3<br>
Key: virginia, Count: 6<br>
Key: 168, Count: 1<br>
Key: john, Count: 20<br>
Key: street, Count: 20<br>
Key: 1829, Count: 1<br>
Key: show, Count: 8<br>

In [17]:
# # Extract stemmed keys from the first dataset and initialize the dictionary
# for item in stemmed_words_data:
#     if 'Word' in item:
#         key = item['Word']  # Use already stemmed keys
#         key_to_objectid[key] = set()  # Initialize set for unique object IDs

# # Iterate through the titles in json_objects
# for obj in json_objects:
#     if 'title' in obj and isinstance(obj['title'], str):
#         title = obj['title'].lower()  # Convert title to lowercase for case-insensitive matching
#         for key, objectids in key_to_objectid.items():
#             # Perform fuzzy matching between each key and the title
#             ratio = fuzz.partial_ratio(key, title)
#             if ratio >= 80:  # Adjust threshold as needed
#                 objectids.add(obj['objectid'])  # Add matched objectid to set

# # Print the mapping of keys and their associated object IDs
# print("Mapping of keys and object IDs:")
# for key, objectids in key_to_objectid.items():
#     print(f"Key: {key}, Object IDs: {list(objectids)}")

In [18]:
# # Print the length/count of values per key for the output from the for loop
# print("\nLength/count of values per key for the output from the for loop:")
# for key, objectids in key_to_objectid.items():
#     print(f"Key: {key}, Count: {len(objectids)}")

### Add titles to confirm match

In [19]:
# # Extract stemmed keys from the first dataset and initialize the dictionary
# for item in stemmed_words_data:
#     if 'Word' in item:
#         key = item['Word']  # Use already stemmed keys
#         key_to_objectid[key] = {'object_ids': set(), 'matched_titles': []}  # Initialize set for unique object IDs

# # Iterate through the titles in json_objects
# for obj in json_objects:
#     if 'title' in obj and isinstance(obj['title'], str):
#         title = obj['title'].lower()  # Convert title to lowercase for case-insensitive matching
#         for key, values in key_to_objectid.items():
#             # Perform fuzzy matching between each key and the title
#             ratio = fuzz.partial_ratio(key, title)
#             if ratio >= 80:  # Adjust threshold as needed
#                 values['object_ids'].add(obj['objectid'])  # Add matched objectid to set
#                 values['matched_titles'].append(obj['title'])  # Add matched title

# # Print the mapping of keys, object IDs, and matched titles
# print("Mapping of keys, object IDs, and matched titles:")
# for key, values in key_to_objectid.items():
#     print(f"Key: {key}")
#     print(f"Object IDs: {list(values['object_ids'])}")
#     print("Matched Titles:")
#     for title in values['matched_titles']:
#         print(title)
#     print()

### Third approach - fuzzy word matching
Results: by using sets (to prevent duplicate object ids), the match counts are still higher than those from the original textLinks JSON, due to matches from both key and value

### Third approach / Sample Results - Test 2:
_(from fuzzy matching with threshold: 80, using sets for the objectids to prevent duplicates)_<br>
Length/count of values per key for the output from the for loop:<br>
Key: coverlet, Count: 513<br>
Key: section, Count: 743<br>
Key: 1, Count: 183<br>
Key: 168, Count: 1<br>
Key: john, Count: 80<br>
Key: street, Count: 270<br>
Key: 1829, Count: 2<br>
Key: show, Count: 66<br>

### Third approach / Sample Results - Test 1:
_(from fuzzy matching with threshold: 80, prior to creating sets for the objectids to prevent duplicates)_<br>
Length/count of values per key for the output from the for loop:<br>
Key: coverlet, Count: 11905<br>
Key: section, Count: 2025<br>
Key: 1, Count: 509<br>
Key: 168, Count: 1<br>
Key: john, Count: 176<br>
Key: street, Count: 278<br>
Key: 1829, Count: 2<br>
Key: show, Count: 77<br>

In [20]:
# # Iterate through the array of stemmed words data
# for item in stemmed_words_data:
#     if 'Word' in item and 'LinkedWords' in item:
#         words = item['LinkedWords'].strip("[]").replace("'", "").split(", ")
#         for word in words:
#             for obj in json_objects:
#                 if 'title' in obj and isinstance(obj['title'], str):
#                     # Perform fuzzy matching
#                     ratio = fuzz.partial_ratio(word, obj['title'])
#                     if ratio >= 80:  # Adjust threshold as needed
#                         key = item['Word']
#                         if key in key_to_objectid:
#                             key_to_objectid[key].append(obj['objectid'])
#                         else:
#                             key_to_objectid[key] = [obj['objectid']]

# # Print the mapping of keys and their associated object IDs
# print("Mapping of keys and object IDs:")
# for key, objectids in key_to_objectid.items():
#     print(f"Key: {key}, Object IDs: {objectids}")

### Second approach - stemmed and unstemmed words
Results: still not reflecting what's seen in textLinks-results-v2.json

In [21]:
# # Function to generate stemmed and unstemmed versions of a word
# def generate_word_variants(word):
#     stemmed_word = stemmer.stem(word)
#     return [word, stemmed_word]

In [22]:
# # Iterate through the array of stemmed words data
# for item in stemmed_words_data:
#     if 'Word' in item and 'LinkedWords' in item:
#         words = item['LinkedWords'].strip("[]").replace("'", "").split(", ")
#         for word in words:
#             for word_variant in generate_word_variants(word):
#                 for obj in json_objects:
#                     if 'title' in obj and isinstance(obj['title'], str) and word_variant in obj['title']:
#                         key = item['Word']
#                         if key in key_to_objectid:
#                             key_to_objectid[key].append(obj['objectid'])
#                         else:
#                             key_to_objectid[key] = [obj['objectid']]
#                         break  # Break to avoid duplicate matches

# # Print the mapping of keys and their associated object IDs
# print("Mapping of keys and object IDs:")
# for key, objectids in key_to_objectid.items():
#     print(f"Key: {key}, Object IDs: {objectids}")

### First approach - (exact) match
Results: not precise due to use of word stems in the textLinks-results-v2.json

In [23]:
# # Iterate through the array of stemmed words data
# for item in stemmed_words_data:
#     if 'Word' in item and 'LinkedWords' in item:
#         key = item['Word']
#         # Convert the key to a string for consistent matching
#         str_key = str(key)

#         for obj in json_objects:
#             if 'title' in obj and isinstance(obj['title'], str) and str_key in obj['title']:
#                 if str_key in key_to_objectid:
#                     key_to_objectid[str_key].append(obj['objectid'])
#                 else:
#                     key_to_objectid[str_key] = [obj['objectid']]

# # Print the mapping of keys and their associated object IDs
# print("Mapping of keys and object IDs:")
# for key, objectids in key_to_objectid.items():
#     print(f"Key: {key}, Object IDs: {objectids}")