# Match Title Tokens back to Catalog

In [1]:
import json

In [2]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install python-Levenshtein

Note: you may need to restart the kernel to use updated packages.


In [10]:
from fuzzywuzzy import fuzz

In [5]:
def extract_tokens(data):
    """Flatten every record's ``"filtered_tokens"`` into a single list.

    Args:
        data: Iterable of records. Records that do not contain a
            ``"filtered_tokens"`` key are skipped.

    Returns:
        list: All tokens in record order; repeated tokens are kept.
    """
    return [
        token
        for record in data
        if "filtered_tokens" in record
        for token in record["filtered_tokens"]
    ]

In [6]:
def extract_words_at_level(data, level):
    """Return the keys found at depth ``level`` of a nested dict.

    The top level counts as level 1.

    Args:
        data: Nested dictionary to inspect.
        level: Depth whose keys are wanted.

    Returns:
        list: For ``level == 1`` every top-level key; for any other level the
        result of ``_extract_words_at_level`` started at depth 1.
    """
    if level != 1:
        # Anything other than the top level is handled by the recursive walker.
        return _extract_words_at_level(data, level, 1)
    return [*data.keys()]

In [7]:
def _extract_words_at_level(data, level, current_level):
    words = []
    for key, value in data.items():
        if isinstance(value, dict):
            if current_level < level:
                words.extend(_extract_words_at_level(value, level, current_level + 1))
            elif current_level == level:
                words.append(key)
    return words

In [11]:
def fuzzy_match(token, words):
    """Find the entry of ``words`` that is most similar to ``token``.

    Both strings are lower-cased before being scored with ``fuzz.ratio``.

    Args:
        token: String to look up.
        words: Iterable of candidate strings.

    Returns:
        tuple: ``(best_word, score)``. When several candidates share the top
        score the earliest one wins; with no candidates the result is
        ``(None, -1)``.
    """
    top_word, top_score = None, -1
    for candidate in words:
        score = fuzz.ratio(token.lower(), candidate.lower())
        if score <= top_score:
            continue  # not strictly better than what we already have
        top_word, top_score = candidate, score
    return top_word, top_score

In [16]:
def match_tokens(json1, json2, level, *, threshold=80):
    """Fuzzy-match title tokens against the catalog keys at one nesting level.

    Args:
        json1: Path to a JSON file containing the records whose
            ``"filtered_tokens"`` are matched (read via ``extract_tokens``).
        json2: Path to a JSON file containing the nested catalog dictionary
            (candidate words come from ``extract_words_at_level``).
        level: Catalog depth whose keys are used as candidates; the top
            level is 1.
        threshold: Keyword-only. Minimum ``fuzzy_match`` score a token's best
            candidate must reach to be reported. Defaults to 80.

    Returns:
        list: ``(token, best_match)`` tuples, one per token occurrence that
        reached the threshold, in token order. Empty when the requested level
        has no candidate words.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding, which differs between operating systems.
    with open(json1, 'r', encoding='utf-8') as f1, open(json2, 'r', encoding='utf-8') as f2:
        data1 = json.load(f1)
        data2 = json.load(f2)

    tokens1 = extract_tokens(data1)
    words2 = extract_words_at_level(data2, level)

    if not words2:
        # Nothing to compare against, so no token can match.
        return []

    # The same token usually occurs in many titles; score each distinct token
    # against the catalog once and reuse the result for later occurrences.
    best_by_token = {}
    matched_pairs = []
    for token in tokens1:
        if token not in best_by_token:
            best_by_token[token] = fuzzy_match(token, words2)
        best_match, score = best_by_token[token]
        if score >= threshold:
            matched_pairs.append((token, best_match))
    return matched_pairs

In [17]:
# Run the matcher once and report what it found.
json1_file = 'cleaned_titles_lc_tokens_sw.json'  # dataset1: JSON file holding the title tokens
json2_file = 'catalog_nested.json'  # dataset2: JSON file holding the nested catalog
level_to_check = 5  # catalog depth whose keys are compared with the tokens

matched_pairs = match_tokens(
    json1=json1_file,
    json2=json2_file,
    level=level_to_check,
)
matched_pairs_count = len(matched_pairs)

# Show every (token, catalog key) pair, then the total.
print("Matched pairs at level", level_to_check, ":", matched_pairs)
print("Number of matched pairs:", matched_pairs_count)

Matched pairs at level 5 : [('bit', 'Bits'), ('bit', 'Bits'), ('clock', 'Clocks'), ('table', 'Tables'), ('ring', 'Rings'), ('bottle', 'Bottles'), ('bowl', 'Bowls'), ('bottle', 'Bottles'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('porringer', 'Porringers'), ('porringer', 'Porringers'), ('pitcher', 'Pitcher'), ('iron', 'Irons'), ('wood', 'Wood'), ('kettle', 'Kettles'), ('uniform', 'Uniforms'), ('earthenware', 'Earthenware'), ('churn', 'Churns'), ('churn', 'Churns'), ('bowl', 'Bowls'), ('bowl', 'Bowls'), ('prints', 'Pins'), ('mold', 'Molds'), ('chair', 'Chairs'), ('wooden', 'Wood'), ('chair', 'Chairs'), ('chair', 'Chairs'), ('wooden', 'Wood'), ('chair', 'Chairs'), ('iron', 'Irons'), ('table', 'Tables'), ('bell', 'Bells'), ('bottle', 'Bottles'), ('iron', 'Irons'), ('chair', 'Chairs'), ('sampler', 'Samplers'), ('eagle', 'Eagles'), ('bowl', 'Bowls'), ('f

In [31]:
def _dict_nodes_at_level(data, level, current_level=1):
    """Map each key at depth ``level`` to the list of dict nodes stored under it."""
    nodes = {}
    for key, value in data.items():
        if not isinstance(value, dict):
            continue  # only dict nodes can receive a 'matched_objectids' entry
        if current_level == level:
            nodes.setdefault(key, []).append(value)
        elif current_level < level:
            deeper = _dict_nodes_at_level(value, level, current_level + 1)
            for name, found in deeper.items():
                nodes.setdefault(name, []).extend(found)
    return nodes


def _objectids_by_token(records):
    """Map each token to the distinct, truthy objectids of the records containing it."""
    index = {}
    for record in records:
        objectid = record.get("objectid")
        if not objectid:
            continue
        for token in record.get("filtered_tokens", []):
            ids = index.setdefault(token, [])
            if objectid not in ids:
                ids.append(objectid)
    return index


def match_tokens(json1, json2, level, output_file, *, threshold=80):
    """Fuzzy-match title tokens to catalog keys and record the matching objects.

    For every token whose best catalog candidate reaches ``threshold``, the
    objectids of the records containing that token are added to a
    ``'matched_objectids'`` list on the matched catalog node, and the
    annotated catalog is written to ``output_file``.

    Args:
        json1: Path to a JSON file containing the records; each record is
            expected to be a dict with ``"filtered_tokens"`` and ``"objectid"``.
        json2: Path to a JSON file containing the nested catalog dictionary.
        level: Catalog depth whose keys are used as candidates; the top
            level is 1.
        output_file: Path the annotated catalog is written to (always
            written, even when nothing matched).
        threshold: Keyword-only. Minimum ``fuzzy_match`` score required for a
            match. Defaults to 80.

    Returns:
        list: ``(token, best_match)`` tuples, one per token occurrence that
        reached the threshold, in token order.

    Notes:
        * Candidate words are bare key names, not paths from the catalog
          root, so target nodes are found by walking the catalog down to
          ``level``. If the same key name occurs under several parents at
          that level, every such node is annotated.
        * Each objectid is stored at most once per node, however many times
          the token occurs or how many different tokens map to that node.
        * Keys whose value is not a dict cannot hold the list and are left
          untouched.
    """
    # Read/write as UTF-8 explicitly instead of relying on the platform's
    # default text encoding.
    with open(json1, 'r', encoding='utf-8') as f1:
        data1 = json.load(f1)
    with open(json2, 'r', encoding='utf-8') as f2:
        data2 = json.load(f2)

    tokens1 = extract_tokens(data1)
    words2 = extract_words_at_level(data2, level)

    matched_pairs = []
    if words2:
        # Built once up front so the main loop never rescans data1 or data2.
        ids_by_token = _objectids_by_token(data1)
        nodes_by_word = _dict_nodes_at_level(data2, level)
        best_by_token = {}

        for token in tokens1:
            first_occurrence = token not in best_by_token
            if first_occurrence:
                best_by_token[token] = fuzzy_match(token, words2)
            best_match, score = best_by_token[token]
            if score < threshold:
                continue

            matched_pairs.append((token, best_match))
            if not first_occurrence:
                # The objectids for this token were attached the first time
                # it was seen; repeating that would only add duplicates.
                continue

            for node in nodes_by_word.get(best_match, []):
                stored = node.setdefault('matched_objectids', [])
                for objectid in ids_by_token.get(token, []):
                    if objectid not in stored:
                        stored.append(objectid)

    # Write modified data to the output file
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(data2, f_out, indent=4)

    return matched_pairs

In [32]:
# Run the matcher, save the annotated catalog, and report what was found.
json1_file = 'cleaned_titles_lc_tokens_sw.json'  # dataset1: JSON file holding the title tokens
json2_file = 'catalog_nested.json'  # dataset2: JSON file holding the nested catalog
level_to_check = 5  # catalog depth whose keys are compared with the tokens

# Destination for the catalog written by match_tokens.
output_file = 'matchTitleTokens.json'

matched_pairs = match_tokens(
    json1=json1_file,
    json2=json2_file,
    level=level_to_check,
    output_file=output_file,
)

matched_pairs_count = len(matched_pairs)

# Show every (token, catalog key) pair, then the total.
print("Matched pairs at level", level_to_check, ":", matched_pairs)
print("Number of matched pairs:", matched_pairs_count)

Matched pairs at level 5 : [('bit', 'Bits'), ('bit', 'Bits'), ('clock', 'Clocks'), ('table', 'Tables'), ('ring', 'Rings'), ('bottle', 'Bottles'), ('bowl', 'Bowls'), ('bottle', 'Bottles'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('lamp', 'Clamps'), ('porringer', 'Porringers'), ('porringer', 'Porringers'), ('pitcher', 'Pitcher'), ('iron', 'Irons'), ('wood', 'Wood'), ('kettle', 'Kettles'), ('uniform', 'Uniforms'), ('earthenware', 'Earthenware'), ('churn', 'Churns'), ('churn', 'Churns'), ('bowl', 'Bowls'), ('bowl', 'Bowls'), ('prints', 'Pins'), ('mold', 'Molds'), ('chair', 'Chairs'), ('wooden', 'Wood'), ('chair', 'Chairs'), ('chair', 'Chairs'), ('wooden', 'Wood'), ('chair', 'Chairs'), ('iron', 'Irons'), ('table', 'Tables'), ('bell', 'Bells'), ('bottle', 'Bottles'), ('iron', 'Irons'), ('chair', 'Chairs'), ('sampler', 'Samplers'), ('eagle', 'Eagles'), ('bowl', 'Bowls'), ('f

In [35]:
def _dict_nodes_at_level(data, level, current_level=1):
    """Map each key at depth ``level`` to the list of dict nodes stored under it."""
    nodes = {}
    for key, value in data.items():
        if not isinstance(value, dict):
            continue  # only dict nodes can receive a 'matched_objectids' entry
        if current_level == level:
            nodes.setdefault(key, []).append(value)
        elif current_level < level:
            deeper = _dict_nodes_at_level(value, level, current_level + 1)
            for name, found in deeper.items():
                nodes.setdefault(name, []).extend(found)
    return nodes


def _objectids_by_token(records):
    """Map each token to the distinct, truthy objectids of the records containing it."""
    index = {}
    for record in records:
        objectid = record.get("objectid")
        if not objectid:
            continue
        for token in record.get("filtered_tokens", []):
            ids = index.setdefault(token, [])
            if objectid not in ids:
                ids.append(objectid)
    return index


def match_tokens(json1, json2, level, output_file, *, threshold=80):
    """Fuzzy-match title tokens to catalog keys and record the matching objects.

    For every token whose best catalog candidate reaches ``threshold``, the
    objectids of the records containing that token are added to a
    ``'matched_objectids'`` list on the matched catalog node, and the
    annotated catalog is written to ``output_file``.

    Args:
        json1: Path to a JSON file containing the records; each record is
            expected to be a dict with ``"filtered_tokens"`` and ``"objectid"``.
        json2: Path to a JSON file containing the nested catalog dictionary.
        level: Catalog depth whose keys are used as candidates; the top
            level is 1.
        output_file: Path the annotated catalog is written to (always
            written, even when nothing matched).
        threshold: Keyword-only. Minimum ``fuzzy_match`` score required for a
            match. Defaults to 80.

    Returns:
        list: ``(token, best_match)`` tuples, one per token occurrence that
        reached the threshold, in token order.

    Notes:
        * Candidate words are bare key names, not paths from the catalog
          root, so target nodes are found by walking the catalog down to
          ``level`` rather than by looking the name up at the root. If the
          same key name occurs under several parents at that level, every
          such node is annotated.
        * Each objectid is stored at most once per node, however many times
          the token occurs or how many different tokens map to that node.
        * Keys whose value is not a dict cannot hold the list; they are
          skipped instead of raising ``AttributeError``.
    """
    # Read/write as UTF-8 explicitly instead of relying on the platform's
    # default text encoding.
    with open(json1, 'r', encoding='utf-8') as f1:
        data1 = json.load(f1)
    with open(json2, 'r', encoding='utf-8') as f2:
        data2 = json.load(f2)

    tokens1 = extract_tokens(data1)
    words2 = extract_words_at_level(data2, level)

    matched_pairs = []
    if words2:
        # Built once up front so the main loop never rescans data1 or data2.
        ids_by_token = _objectids_by_token(data1)
        nodes_by_word = _dict_nodes_at_level(data2, level)
        best_by_token = {}

        for token in tokens1:
            first_occurrence = token not in best_by_token
            if first_occurrence:
                best_by_token[token] = fuzzy_match(token, words2)
            best_match, score = best_by_token[token]
            if score < threshold:
                continue

            matched_pairs.append((token, best_match))
            if not first_occurrence:
                # The objectids for this token were attached the first time
                # it was seen; repeating that would only add duplicates.
                continue

            for node in nodes_by_word.get(best_match, []):
                stored = node.setdefault('matched_objectids', [])
                for objectid in ids_by_token.get(token, []):
                    if objectid not in stored:
                        stored.append(objectid)

    # Write modified data to the output file
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(data2, f_out, indent=4)

    return matched_pairs

# Run the matcher and save the annotated catalog to a second output file.
json1_file = 'cleaned_titles_lc_tokens_sw.json'  # dataset1: JSON file holding the title tokens
json2_file = 'catalog_nested.json'  # dataset2: JSON file holding the nested catalog
level_to_check = 5  # catalog depth whose keys are compared with the tokens
output_file = 'matchTitleTokens2.json'  # destination for the catalog written by match_tokens

matched_pairs = match_tokens(
    json1=json1_file,
    json2=json2_file,
    level=level_to_check,
    output_file=output_file,
)