# Approach 5
## Gensim
Suggested by ChatGPT: use a dedicated semantic-similarity library, e.g. Gensim, a Python library for topic modeling, document indexing, and similarity retrieval with large corpora. It includes implementations of Word2Vec, Doc2Vec, and other models.

In [129]:
!pip install nltk pandas



In [130]:
import json
from nltk.stem import PorterStemmer
from collections import defaultdict
import pandas as pd

In [131]:
# Function to read JSON data from a file
def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [132]:
# Load data from JSON files
try:
    tokenized_titles = read_json_file('data/cleaned_titles_lc_tokens_sw.json')
    tokenized_sections = read_json_file('data/section_tokens.json')
except (OSError, ValueError) as e:
    # OSError: file missing or unreadable. ValueError: malformed JSON (json.JSONDecodeError)
    # or undecodable bytes (UnicodeDecodeError). Any other exception is a programming error
    # and is allowed to surface instead of being replaced by empty data.
    print(f"Error reading JSON files: {e}")
    tokenized_titles, tokenized_sections = [], []

In [133]:
# Stemming setup: a single shared PorterStemmer instance, read by stem_tokens() at call time
stemmer = PorterStemmer()

In [134]:
# Stem every token of a list with the shared stemmer
def stem_tokens(tokens):
    """Return a new list holding the stem of each entry of ``tokens``, in the same order."""
    stems = []
    for word in tokens:
        stems.append(stemmer.stem(word))
    return stems

In [135]:
# Stemmed tokens per title, keyed by the title's 'objectid'
stemmed_titles = dict(
    (entry['objectid'], stem_tokens(entry['filtered_tokens']))
    for entry in tokenized_titles
)
# Stemmed tokens per section, keyed by the section's 'section_no'
stemmed_sections = dict(
    (entry['section_no'], stem_tokens(entry['tokens']))
    for entry in tokenized_sections
)

In [136]:
# Map each title id to (title text, stemmed tokens); incomplete records are skipped
stemmed_titles = {}
for item in tokenized_titles:
    title_id = item.get('objectid')
    title_name = item.get('title')
    filtered_tokens = item.get('filtered_tokens', [])
    # Skip records lacking an id, a title, or any tokens
    if title_id is None or title_name is None or not filtered_tokens:
        continue
    stemmed_titles[title_id] = (title_name, stem_tokens(filtered_tokens))

# Map each section number to its stemmed tokens
stemmed_sections = dict(
    (entry['section_no'], stem_tokens(entry['tokens']))
    for entry in tokenized_sections
)

In [137]:
from gensim.models import Word2Vec

# Toy corpus: two short "documents", each given as a list of word tokens
texts = [["Wallpaper"], ["Textiles", "and", "Decorative", "Arts"]]

# Fit a Word2Vec model on the toy corpus only
model = Word2Vec(texts, min_count=1)

# Collect one embedding per vocabulary entry
vocabulary = model.wv.key_to_index
vectors = [model.wv[entry] for entry in vocabulary]

# Score the first document against the second with n_similarity
first_text, second_text = texts
similarity = model.wv.n_similarity(first_text, second_text)
print("Word2Vec Similarity:", similarity)


Word2Vec Similarity: 0.052267753


## Apply to datasets

### Step 1: Read and Prepare Data
Ensure the data is correctly loaded and then format it appropriately for creating the Gensim dictionary and corpus

In [138]:
import json
from gensim import corpora, models, similarities

In [139]:
# Function to read JSON data from a file
def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load your data
try:
    tokenized_titles = read_json_file('data/cleaned_titles_lc_tokens_sw.json')
    tokenized_sections = read_json_file('data/section_tokens.json')
except (OSError, ValueError) as e:
    # OSError: file missing or unreadable. ValueError: malformed JSON (json.JSONDecodeError)
    # or undecodable bytes (UnicodeDecodeError). Any other exception is a programming error
    # and is allowed to surface instead of being replaced by empty data.
    print(f"Error reading JSON files: {e}")
    tokenized_titles, tokenized_sections = [], []

In [140]:
# Pull the token list out of every title record and every section record
title_tokens = [record['filtered_tokens'] for record in tokenized_titles]
section_tokens = [record['tokens'] for record in tokenized_sections]

In [141]:
# Combine all tokens into a single list for creating the dictionary.
# Titles come first, so section j sits at position len(title_tokens) + j.
all_tokens = title_tokens + section_tokens

# Create a dictionary (token -> integer id)
dictionary = corpora.Dictionary(all_tokens)

# Create a corpus in bag-of-words format
corpus = [dictionary.doc2bow(tokens) for tokens in all_tokens]

# Create a TF-IDF model from the corpus
tfidf = models.TfidfModel(corpus)

# Create a similarity index. The vector width is passed explicitly so it always matches the
# dictionary size instead of being inferred from the largest term id found in the corpus.
index = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

### Step 2: Calculate Similarities
Calculate the similarity of each title to each section using the TF-IDF vectors

In [142]:
# Document counts for both collections, printed one per line
num_titles, num_sections = len(title_tokens), len(section_tokens)

print(num_titles, num_sections, sep="\n")

# # Iterate over each title's tokens
# for i in range(num_titles):
#     title_vec = tfidf[corpus[i]]  # TF-IDF vector for the title
#     sims = index[title_vec]  # similarities of this title to all documents in the corpus

#     # Output similarity of this title to each section
#     print(f"Title {tokenized_titles[i]['objectid']}:")
#     for j in range(num_sections):
#         section_index = num_titles + j  # index of the section in the combined corpus
#         if sims[section_index] > 0:
#             print(f"  Similarity to Section {tokenized_sections[j]['section_no']} ({tokenized_sections[j]['section']}): {sims[section_index]:.4f}")
#     print()

18259
10


In [143]:
# Sanity check: show the first bag-of-words documents and their TF-IDF weights
sample_bows = corpus[:5]
print("Sample corpus:", sample_bows)
sample_weights = [tfidf[corpus[position]] for position in range(5)]
print("Sample TF-IDF vectors:", sample_weights)

Sample corpus: [[(0, 1)], [(1, 1)], [(2, 1)], [(3, 1), (4, 1)], [(5, 1), (6, 1)]]
Sample TF-IDF vectors: [[(0, 1.0)], [(1, 1.0)], [(2, 1.0)], [(3, 0.7316271281383305), (4, 0.6817050281258449)], [(5, 0.7846883266728146), (6, 0.6198904983813013)]]


In [144]:
# Upper bounds on how many titles and sections are printed in this sample.
num_sample_titles = 5
num_sample_sections = 5

# Combined document list: all titles first, then all sections. Section j therefore lives at
# corpus position len(tokenized_titles) + j.
all_documents = ([item['filtered_tokens'] for item in tokenized_titles] +
                 [item['tokens'] for item in tokenized_sections])

# (Re)build the dictionary and the bag-of-words corpus from the combined documents
dictionary = corpora.Dictionary(all_documents)
corpus = [dictionary.doc2bow(tokens) for tokens in all_documents]

# Create TF-IDF model
tfidf = models.TfidfModel(corpus)

# Clamp the sample sizes so the loops never index past the data that was actually loaded
title_limit = min(num_sample_titles, len(tokenized_titles))
section_limit = min(num_sample_sections, len(tokenized_sections))
section_offset = len(tokenized_titles)

# Print matched titles and sections with their TF-IDF vectors
for i in range(title_limit):
    title_item = tokenized_titles[i]
    title_vec = tfidf[corpus[i]]
    title_id = title_item['objectid']
    title_text = title_item.get('title', 'No title')

    print(f"Title ID: {title_id}")
    print(f"Title Text: {title_text}")
    # term_id/weight instead of id/freq: avoids shadowing the builtin id()
    print("TF-IDF Vector for Title:", [(dictionary[term_id], weight) for term_id, weight in title_vec])

    for j in range(section_limit):
        section_index = section_offset + j  # index of section in the corpus
        section_vec = tfidf[corpus[section_index]]
        section_id = tokenized_sections[j]['section_no']
        section_text = tokenized_sections[j]['section']

        print(f"  Section ID: {section_id}")
        print(f"  Section Text: {section_text}")
        print("  TF-IDF Vector for Section:", [(dictionary[term_id], weight) for term_id, weight in section_vec])
    print("\n" + "-"*80 + "\n")


Title ID: 29580
Title Text: Wallpaper
TF-IDF Vector for Title: [('wallpaper', 1.0)]
  Section ID: 0
  Section Text: Textiles Costume and Jewelry
  TF-IDF Vector for Section: [('Accessories', 0.09498514806771222), ('Applique', 0.10695986418278142), ('Aprons', 0.10695986418278142), ('Belts', 0.10695986418278142), ('Bootjacks', 0.10695986418278142), ('Bouquet', 0.10695986418278142), ('Box', 0.10695986418278142), ('Bracelets', 0.10695986418278142), ('Brooches', 0.10695986418278142), ('Buckles', 0.10695986418278142), ('Buttons', 0.10695986418278142), ('Canes', 0.10695986418278142), ('Card', 0.10695986418278142), ('Carpets', 0.10695986418278142), ('Cases', 0.09940465949521264), ('Children', 0.10695986418278142), ('Clogs', 0.10695986418278142), ('Coats', 0.10695986418278142), ('Collar', 0.10695986418278142), ('Combs', 0.09940465949521264), ('Costume', 0.09940465949521264), ('Cotton', 0.10695986418278142), ('Coverlets', 0.10695986418278142), ('Crewel', 0.10695986418278142), ('Cuff', 0.10695986

In [145]:
from gensim import similarities

# Assuming the dictionary, corpus and tfidf model are already created.
# The vector width is passed explicitly so it always matches the dictionary size.
index = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

# Sample limits
num_sample_titles = 5
num_sample_sections = 5

# Minimum similarity a section needs in order to be reported for a title
similarity_threshold = 0.2

# Clamp the sample sizes so the loops never index past the data that was actually loaded
title_limit = min(num_sample_titles, len(tokenized_titles))
section_limit = min(num_sample_sections, len(tokenized_sections))
# Sections are stored after all titles in the combined corpus
section_offset = len(tokenized_titles)

for i in range(title_limit):
    title_item = tokenized_titles[i]
    title_vec = tfidf[corpus[i]]
    title_id = title_item['objectid']
    title_text = title_item.get('title', 'No title')

    # Print Title Information
    print(f"Title ID: {title_id}")
    print(f"Title Text: {title_text}")
    # term_id/weight instead of id/freq: avoids shadowing the builtin id()
    print("TF-IDF Vector for Title:", [(dictionary[term_id], weight) for term_id, weight in title_vec])

    # Similarity of this title to every document in the index (titles and sections)
    sims = index[title_vec]

    # Track if any section has a similarity above the threshold
    has_high_similarity = False

    for j in range(section_limit):
        section_index = section_offset + j
        if sims[section_index] > similarity_threshold:
            has_high_similarity = True
            section_id = tokenized_sections[j]['section_no']
            section_text = tokenized_sections[j]['section']
            section_vec = tfidf[corpus[section_index]]

            # Print Section Information with Similarity Score
            print(f"  Section ID: {section_id} - Similarity Score: {sims[section_index]:.4f}")
            print(f"  Section Text: {section_text}")
            print("  TF-IDF Vector for Section:", [(dictionary[term_id], weight) for term_id, weight in section_vec])

    if not has_high_similarity:
        print(f"  No sections found with similarity > {similarity_threshold}")

    print("\n" + "-"*80 + "\n")


Title ID: 29580
Title Text: Wallpaper
TF-IDF Vector for Title: [('wallpaper', 1.0)]
  No sections found with similarity > 0.2

--------------------------------------------------------------------------------

Title ID: 26713
Title Text: Bit
TF-IDF Vector for Title: [('bit', 1.0)]
  No sections found with similarity > 0.2

--------------------------------------------------------------------------------

Title ID: 29441
Title Text: Dirk
TF-IDF Vector for Title: [('dirk', 1.0)]
  No sections found with similarity > 0.2

--------------------------------------------------------------------------------

Title ID: 29740
Title Text: Candle Holder
TF-IDF Vector for Title: [('candle', 0.7316271281383305), ('holder', 0.6817050281258449)]
  No sections found with similarity > 0.2

--------------------------------------------------------------------------------

Title ID: 29814
Title Text: Doorway and Doors
TF-IDF Vector for Title: [('doors', 0.7846883266728146), ('doorway', 0.6198904983813013)]
  

## Previous Approach's Problem
The previous approach computes similarity between a title and a section as whole entities, not between the individual tokens of a title and the tokens within a section.

# Token-by-Token Matching Approach

In [146]:
!pip install gensim



In [147]:
from gensim.models import Word2Vec
import numpy as np

In [148]:
# Placeholder corpus standing in for the real tokenized data: two sentences of two tokens each
tokenized_data = [['token1', 'token2'], ['tokenA', 'tokenB']]

# Fit a small Word2Vec model (50-dimensional vectors) on the placeholder corpus
model = Word2Vec(sentences=tokenized_data, vector_size=50, window=5, min_count=1, workers=4)

# Look up the learned vector of each placeholder token
vector1, vector2, vectorA, vectorB = (
    model.wv[name] for name in ('token1', 'token2', 'tokenA', 'tokenB')
)

# Token -> vector mappings for the "title" side and the "section" side
title_tokens = dict(token1=vector1, token2=vector2)
section_tokens = dict(tokenA=vectorA, tokenB=vectorB)

In [149]:
from scipy.spatial.distance import cosine

In [150]:
# Token -> vector mappings for both sides (the vectors are numpy arrays)
title_tokens = dict(token1=vector1, token2=vector2)
section_tokens = dict(tokenA=vectorA, tokenB=vectorB)

# Compare every title token with every section token
for title_token in title_tokens:
    title_vec = title_tokens[title_token]
    for section_token in section_tokens:
        section_vec = section_tokens[section_token]
        # scipy's cosine() is a distance, so similarity is its complement
        similarity = 1 - cosine(title_vec, section_vec)
        print(f"Similarity between {title_token} and {section_token}: {similarity}")

Similarity between token1 and tokenA: -0.17424815893173218
Similarity between token1 and tokenB: -0.0144752636551857
Similarity between token2 and tokenA: 0.01107197254896164
Similarity between token2 and tokenB: 0.126700758934021


In [151]:
from scipy.spatial.distance import cosine

# Every (title token, section token) combination, title tokens varying slowest
token_pairs = [(left, right) for left in title_tokens for right in section_tokens]

# Report the cosine similarity of each pair, rounded to four decimals
for title_token, section_token in token_pairs:
    # scipy's cosine() is a distance, so similarity is its complement
    similarity = 1 - cosine(title_tokens[title_token], section_tokens[section_token])
    print(f"Similarity between {title_token} and {section_token}: {similarity:.4f}")


Similarity between token1 and tokenA: -0.1742
Similarity between token1 and tokenB: -0.0145
Similarity between token2 and tokenA: 0.0111
Similarity between token2 and tokenB: 0.1267


In [152]:
# Requires `tokenized_titles`, `tokenized_sections` and a trained `model` from earlier cells
import gensim
print(gensim.__version__)

# `key_to_index` is the vocabulary mapping in Gensim 4.0.0 and newer
# (older Gensim versions expose it as `vocab`)

# Vectors of the tokens of the first 20 titles that the model knows
title_tokens = {
    token: model.wv[token]
    for title in tokenized_titles[:20]          # only the first 20 titles
    for token in title['filtered_tokens']       # tokens are read from 'filtered_tokens'
    if token in model.wv.key_to_index
}

# Vectors of the tokens of every section that the model knows
section_tokens = {
    token: model.wv[token]
    for section in tokenized_sections
    for token in section['tokens']              # tokens are read from 'tokens'
    if token in model.wv.key_to_index
}

4.3.0


In [153]:
# from scipy.spatial.distance import cosine

# # Calculate cosine similarity for each token pair
# results = []  # Store results in a list to possibly sort and filter later
# for title_token, title_vec in title_tokens.items():
#     for section_token, section_vec in section_tokens.items():
#         similarity = 1 - cosine(title_vec, section_vec)  # cosine returns the distance; 1 - distance = similarity
#         results.append((title_token, section_token, similarity))

# # Optionally, sort by similarity score in descending order
# results_sorted = sorted(results, key=lambda x: x[2], reverse=True)

# # Print top 20 results or as needed
# print("Top similarity scores between title and section tokens:")
# for result in results_sorted[:20]:  # Show only top 20 results
#     print(f"Title Token: {result[0]}, Section Token: {result[1]}, Similarity: {result[2]:.4f}")

In [154]:
# # Print tokens and check their presence in the model's vocabulary
# title_sample_tokens = [token for title in tokenized_titles[:20] for token in title['filtered_tokens']]
# section_sample_tokens = [token for section in tokenized_sections for token in section['tokens']]

# print("Sample tokens from first 20 titles:")
# print(title_sample_tokens[:10])  # print first 10 tokens

# print("Sample tokens from sections:")
# print(section_sample_tokens[:10])  # print first 10 tokens

# # Check if these sample tokens are in the model
# title_tokens_present = [token for token in title_sample_tokens if token in model.wv.key_to_index]
# section_tokens_present = [token for token in section_sample_tokens if token in model.wv.key_to_index]

# print("Title tokens present in the model:", title_tokens_present[:10])
# print("Section tokens present in the model:", section_tokens_present[:10])

In [155]:
import re
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def preprocess(token):
    """Normalize a single token: lower-case it, strip non-alphanumeric characters, stem it.

    Args:
        token: The raw token string.

    Returns:
        The normalized token. It can be an empty string when the raw token contains
        no alphanumeric characters, so callers should be prepared to filter those out.
    """
    # Normalize: lowercase and remove non-alphanumeric characters
    token = token.lower()
    # "\W" alone keeps "_" because it counts as a word character, so it is matched explicitly
    token = re.sub(r'[\W_]+', '', token)
    # Reduce the cleaned token to its stem using the module-level Porter stemmer
    token = stemmer.stem(token)
    return token

# Normalize every token of every title and of every section
preprocessed_title_tokens = [
    preprocess(raw)
    for entry in tokenized_titles
    for raw in entry['filtered_tokens']
]
preprocessed_section_tokens = [
    preprocess(raw)
    for entry in tokenized_sections
    for raw in entry['tokens']
]

# Keep only the normalized tokens that the current model's vocabulary contains
known_tokens = model.wv.key_to_index
title_tokens_in_model = [tok for tok in preprocessed_title_tokens if tok in known_tokens]
section_tokens_in_model = [tok for tok in preprocessed_section_tokens if tok in known_tokens]

print("Preprocessed title tokens in model:", len(title_tokens_in_model))
print("Preprocessed section tokens in model:", len(section_tokens_in_model))

# import re

# def preprocess(token):
#     token = token.lower()  # Lowercase
#     token = re.sub(r"[^a-zA-Z0-9]", "", token)  # Remove non-alphanumeric characters
#     return token

Preprocessed title tokens in model: 0
Preprocessed section tokens in model: 0


In [156]:
# Normalize every token of every title into one flat list
all_title_tokens = [
    preprocess(raw)
    for entry in tokenized_titles
    for raw in entry['filtered_tokens']
]

# Overall count versus number of distinct normalized tokens
total_tokens = len(all_title_tokens)
distinct_tokens = set(all_title_tokens)
unique_tokens = len(distinct_tokens)

print(f"Total tokens processed: {total_tokens}")
print(f"Unique tokens after preprocessing: {unique_tokens}")


Total tokens processed: 42454
Unique tokens after preprocessing: 3185


In [157]:
# Flat list of normalized tokens: titles in order, tokens in order within each title
all_title_tokens = [
    preprocess(raw)
    for entry in tokenized_titles
    for raw in entry['filtered_tokens']
]

print(f"Total number of preprocessed tokens collected: {len(all_title_tokens)}")

Total number of preprocessed tokens collected: 42454


In [158]:
# Keep the tokens that still contain something after stripping whitespace
non_empty_tokens = list(filter(str.strip, all_title_tokens))
# Everything else was reduced to nothing by preprocessing
empty_or_removed = len(all_title_tokens) - len(non_empty_tokens)

print(f"Non-empty, useful tokens: {len(non_empty_tokens)}")
print(f"Empty or removed tokens during preprocessing: {empty_or_removed}")

Non-empty, useful tokens: 41419
Empty or removed tokens during preprocessing: 1035


In [159]:
# Raw sample tokens: the tokens of the first 20 titles, and the tokens of every section.
# These two lists are otherwise only assigned inside a commented-out diagnostic cell, so on a
# fresh kernel the lines below would raise NameError.
# NOTE(review): the definitions are copied from that commented-out cell; confirm that a
# 20-title sample is what is intended here.
title_sample_tokens = [token for title in tokenized_titles[:20] for token in title['filtered_tokens']]
section_sample_tokens = [token for section in tokenized_sections for token in section['tokens']]

# Apply preprocessing to title and section tokens
title_tokens_preprocessed = [preprocess(token) for token in title_sample_tokens]
section_tokens_preprocessed = [preprocess(token) for token in section_sample_tokens]

In [161]:
from gensim.models import Word2Vec

In [162]:
# Flat list of every preprocessed sample token (title tokens first, then section tokens).
# Tokens that preprocessing reduced to an empty string are dropped so that "" never becomes
# a vocabulary entry that unrelated titles and sections could match on.
combined_tokens = [
    cleaned
    for cleaned in (preprocess(token) for token in title_sample_tokens + section_sample_tokens)
    if cleaned
]

# Word2Vec takes an iterable of sentences (each a list of tokens). `combined_tokens` is a flat
# list, so it is wrapped in a list and the whole sample is fed to the model as a single sentence.
model = Word2Vec(sentences=[combined_tokens], vector_size=100, window=5, min_count=1, workers=4)

# Now, check presence again with your own model
title_tokens_present = [token for token in title_tokens_preprocessed if token in model.wv.key_to_index]
section_tokens_present = [token for token in section_tokens_preprocessed if token in model.wv.key_to_index]

print("Preprocessed title tokens present in custom model:", title_tokens_present[:10])
print("Preprocessed section tokens present in custom model:", section_tokens_present[:10])

Preprocessed title tokens present in custom model: ['wallpap', 'bit', 'dirk', 'candl', 'holder', 'doorway', 'door', 'restor', 'draw', 'bit']
Preprocessed section tokens present in custom model: ['suspend', 'mitten', 'glove', 'bouquet', 'fob', 'needlework', 'carpet', 'pocket', 'sampler', 'render']


In [163]:
# Token -> embedding lookups for the tokens found in the custom model
title_token_embeddings = dict((tok, model.wv[tok]) for tok in title_tokens_present)
section_token_embeddings = dict((tok, model.wv[tok]) for tok in section_tokens_present)


In [178]:
# Preprocessed section token -> its vector plus the section it was seen in.
# A token occurring in several sections keeps only the last section visited.
section_token_embeddings = {}
for section in tokenized_sections:
    section_id = section['section_no']
    section_name = section['section']  # the section's display name
    for token in section['tokens']:
        processed_token = preprocess(token)
        # Tokens the model has no vector for are skipped
        if processed_token not in model.wv.key_to_index:
            continue
        section_token_embeddings[processed_token] = dict(
            vector=model.wv[processed_token],
            section_id=section_id,
            section_name=section_name,
        )

In [180]:
# Preprocessed title token -> its vector plus the title it was seen in.
# A token occurring in several titles keeps only the last title visited.
title_token_embeddings = {}
for title in tokenized_titles[:20]:  # Assuming you are still focusing on the first 20 titles
    title_id = title['objectid']
    title_text = title.get('title', "No Title Available")  # fall back when the record has no 'title'
    for token in title['filtered_tokens']:
        # The custom model is trained on preprocessed tokens, so the raw token has to be
        # normalized the same way before the vocabulary lookup. Looking up the raw token only
        # matches words whose stem equals their surface form and silently drops the rest.
        processed_token = preprocess(token)
        if processed_token in model.wv.key_to_index:
            title_token_embeddings[processed_token] = {
                'vector': model.wv[processed_token],
                'object_id': title_id,
                'title_text': title_text  # Storing title text
            }

In [181]:
# One tuple per (title token, section token) pair:
# (title_token, section_token, similarity, object_id, title_text, section_id, section_name)
similarity_results = []

for title_token, title_info in title_token_embeddings.items():
    title_vec = title_info['vector']
    object_id = title_info['object_id']
    title_text = title_info['title_text']
    for section_token, section_info in section_token_embeddings.items():
        # scipy's cosine() is a distance, so similarity is its complement
        similarity = 1 - cosine(title_vec, section_info['vector'])
        row = (
            title_token, section_token, similarity,
            object_id, title_text,
            section_info['section_id'], section_info['section_name'],
        )
        similarity_results.append(row)

# Highest similarity first (position 2 of each tuple holds the score)
similarity_results = sorted(similarity_results, key=lambda row: row[2], reverse=True)

In [182]:
# Show the 20 best-scoring pairs together with the title and section they belong to
print("Top similarity scores between title tokens and section tokens:")
for row in similarity_results[:20]:
    title_token, section_token, similarity, object_id, title_text, section_id, section_name = row
    fields = [
        f"Title Token: {title_token}",
        f"Title: {title_text}",
        f"Object ID: {object_id}",
        f"Section Token: {section_token}",
        f"Section: {section_name}",
        f"Similarity: {similarity:.4f}",
        f"Section ID: {section_id}",
    ]
    print(", ".join(fields))

Top similarity scores between title tokens and section tokens:
Title Token: bit, Title: Bit, Object ID: 26710, Section Token: bit, Section: Tools Hardware Firearms and Vehicles, Similarity: 1.0000, Section ID: 3
Title Token: holder, Title: Candle Holder, Object ID: 29740, Section Token: holder, Section: Domestic Utensils, Similarity: 1.0000, Section ID: 4
Title Token: clock, Title: Grandfather Clock, Object ID: 17059, Section Token: clock, Section: The Art and Design of Utopian and Religious Communities, Similarity: 1.0000, Section ID: 1
Title Token: ring, Title: Ring Bottle, Object ID: 18462, Section Token: ring, Section: Textiles Costume and Jewelry, Similarity: 1.0000, Section ID: 0
Title Token: indian, Title: Indian, Object ID: 20171, Section Token: indian, Section: Wood Carvings and Weathervanes, Similarity: 1.0000, Section ID: 6
Title Token: sugar, Title: Sugar Bowl with Cover, Object ID: 22765, Section Token: sugar, Section: Domestic Utensils, Similarity: 1.0000, Section ID: 4
T

******

# Prepare for Export

In [236]:
# Preprocessed title token -> list with one entry per title occurrence of that token
title_token_embeddings = {}
for title in tokenized_titles:
    title_id = title['objectid']
    title_text = title.get('title', "No Title Available")
    for token in title['filtered_tokens']:
        processed_token = preprocess(token)  # same normalization as used for the sections
        if processed_token not in model.wv.key_to_index:
            continue
        occurrences = title_token_embeddings.setdefault(processed_token, [])
        occurrences.append({
            'vector': model.wv[processed_token],
            'object_id': title_id,
            'title_text': title_text
        })

# Preprocessed section token -> list with one entry per section occurrence of that token
section_token_embeddings = {}
for section in tokenized_sections:
    section_id = section['section_no']
    section_name = section['section']
    for token in section['tokens']:
        processed_token = preprocess(token)  # same normalization as used for the titles
        if processed_token not in model.wv.key_to_index:
            continue
        occurrences = section_token_embeddings.setdefault(processed_token, [])
        occurrences.append({
            'vector': model.wv[processed_token],
            'section_id': section_id,
            'section_name': section_name
        })


In [237]:
# Debug aid: dump the first five tokens of the title mapping, then of the section mapping
for embeddings in (title_token_embeddings, section_token_embeddings):
    for token, entries in list(embeddings.items())[:5]:
        print(f"Token: {token}, Entries: {entries}")


Token: wallpap, Entries: [{'vector': array([-0.00174155,  0.00643847, -0.00928389, -0.00917366, -0.00521729,
        0.00092949, -0.00716396, -0.00937762, -0.00891214, -0.00045996,
        0.00739363, -0.0083992 , -0.0053274 ,  0.00151182,  0.00330741,
       -0.00848255,  0.00788013, -0.0082737 ,  0.00473306,  0.00939297,
        0.0056492 ,  0.00619705,  0.00824458,  0.00343484, -0.00716139,
        0.00574457, -0.00832279, -0.00577046,  0.00585838, -0.00791271,
       -0.00585655, -0.00353885,  0.00337825,  0.00520058,  0.00205254,
        0.00514775, -0.00265569,  0.00432234, -0.00707009, -0.00171653,
       -0.00772737,  0.0001331 ,  0.00470148,  0.00479964, -0.00461452,
        0.00171783,  0.00780559, -0.0055684 , -0.00380387, -0.00058878,
       -0.00470388,  0.00868807,  0.00756429,  0.00923703,  0.00935222,
       -0.00843839, -0.00047474, -0.00759986, -0.00483257, -0.00883871,
        0.00341446,  0.00011393, -0.00693071,  0.0079291 ,  0.00092258,
        0.0003016 , -0.0035

In [238]:
# One dict per (title occurrence, section occurrence) pair
similarity_results = []

for title_token, title_infos in title_token_embeddings.items():
    for title_info in title_infos:  # every title in which this token occurs
        similarity_results.extend(
            {
                'title_token': title_token,
                'section_token': section_token,
                # scipy's cosine() is a distance, so similarity is its complement
                'similarity': 1 - cosine(title_info['vector'], section_info['vector']),
                'object_id': title_info['object_id'],
                'title_text': title_info['title_text'],
                'section_id': section_info['section_id'],
                'section_name': section_info['section_name'],
            }
            for section_token, section_infos in section_token_embeddings.items()
            for section_info in section_infos  # every section in which that token occurs
        )

# Debug aid: show the first five records
for sample in similarity_results[:5]:
    print(sample)

{'title_token': 'wallpap', 'section_token': 'suspend', 'similarity': 0.02379784733057022, 'object_id': 29580, 'title_text': 'Wallpaper', 'section_id': 0, 'section_name': 'Textiles Costume and Jewelry'}
{'title_token': 'wallpap', 'section_token': 'mitten', 'similarity': -0.10478957742452621, 'object_id': 29580, 'title_text': 'Wallpaper', 'section_id': 0, 'section_name': 'Textiles Costume and Jewelry'}
{'title_token': 'wallpap', 'section_token': 'glove', 'similarity': -0.19784335792064667, 'object_id': 29580, 'title_text': 'Wallpaper', 'section_id': 0, 'section_name': 'Textiles Costume and Jewelry'}
{'title_token': 'wallpap', 'section_token': 'bouquet', 'similarity': -0.15985095500946045, 'object_id': 29580, 'title_text': 'Wallpaper', 'section_id': 0, 'section_name': 'Textiles Costume and Jewelry'}
{'title_token': 'wallpap', 'section_token': 'fob', 'similarity': 0.009587308391928673, 'object_id': 29580, 'title_text': 'Wallpaper', 'section_id': 0, 'section_name': 'Textiles Costume and Jew

In [248]:
from scipy.spatial.distance import cosine

# Define a list to store similarity results: one dict per (title occurrence, section occurrence)
similarity_results = []

# Iterate over each title token and its list of occurrences
for title_token, title_infos in title_token_embeddings.items():
    # Cache of section_token -> similarity for the current title token. The similarity depends
    # only on the two tokens' vectors, so it is computed once per token pair instead of once
    # per occurrence pair.
    # Assumes every occurrence of a token stores the same vector (true when each entry holds
    # model.wv[token] for that token).
    token_similarity = {}

    for title_info in title_infos:  # Handle multiple title occurrences
        object_id = title_info['object_id']
        title_text = title_info['title_text']

        # Iterate over each section token and its list of occurrences
        for section_token, section_infos in section_token_embeddings.items():
            for section_info in section_infos:  # Handle multiple section occurrences
                if section_token not in token_similarity:
                    # scipy's cosine() is a distance, so similarity is its complement
                    token_similarity[section_token] = 1 - cosine(
                        title_info['vector'], section_info['vector']
                    )

                # Append each result as a dictionary (key order is kept for the export)
                similarity_results.append({
                    'title_token': title_token,
                    'section_token': section_token,
                    'similarity': token_similarity[section_token],
                    'object_id': object_id,
                    'title_text': title_text,
                    'section_id': section_info['section_id'],
                    'section_name': section_info['section_name']
                })

In [254]:
# # Sort results by similarity score in descending order (optional, can be removed if not needed)
# similarity_results.sort(key=lambda x: x['similarity'], reverse=True)

# Directory that receives the exported file
folder_name = 'data'

# Name of the JSON file holding the similarity results
file_name = 'similarity_results.json'

# Relative path of the export target: ./<folder_name>/<file_name>
file_path = '/'.join(('.', folder_name, file_name))

In [255]:
import json
import os

# Create the directory if it does not exist; exist_ok avoids the check-then-create race
os.makedirs(folder_name, exist_ok=True)

# Write results to a JSON file (explicit UTF-8 so the file does not depend on the platform default)
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(similarity_results, f, indent=4)

print(f"Results saved to {file_path}")


Results saved to ./data/similarity_results.json


In [256]:
# How many similarity records were produced
num_results = len(similarity_results)

# Report the count
print("The number of similarity results is: " + str(num_results))

The number of similarity results is: 9578256


In [257]:
import json

# One record per title: its object id, its title text and its distinct preprocessed tokens
data_to_export = [
    {
        'object_id': title['objectid'],
        'title': title['title'],
        # sorted() gives a stable token order; the order of list(set(...)) can differ between
        # runs, which made the exported file change without any change in the data
        'tokens': sorted({preprocess(token) for token in title['filtered_tokens']}),
    }
    for title in tokenized_titles
]

# Define the file path
output_file_path = './data/title_tokens_clean.json'

# Write the data to a JSON file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(data_to_export, f, indent=4)

print(f"Data exported successfully to {output_file_path}")
num_results = len(data_to_export)
print(num_results)

Data exported successfully to ./data/title_tokens_clean.json
18259


In [258]:
# For every section, show its name followed by its preprocessed tokens
for section in tokenized_sections:
    print(f"Original Section Text: {section['section']}")
    tokens = list(map(preprocess, section['tokens']))
    print(f"Processed Tokens: {tokens}")

Original Section Text: Textiles Costume and Jewelry
Processed Tokens: ['suspend', 'mitten', 'glove', 'bouquet', 'fob', 'needlework', 'carpet', 'pocket', 'sampler', 'render', 'cane', 'eyeglass', 'walk', 'overs', 'fan', 'case', 'comb', 'bootjack', 'skate', 'collar', 'muff', 'spectacl', 'ear', 'card', 'coverlet', 'satchel', 'ornament', 'necklac', 'tray', 'parasol', 'cotton', 'holder', 'children', 'watch', 'umbrella', 'embroid', 'uniform', 'bracelet', 'cuff', 'brooch', 'hair', 'jacquard', 'pictur', 'form', 'shoe', 'textil', 'four', 'cufflink', 'spur', 'embroideri', 'ring', 'hook', 'giant', 'dress', 'sachet', 'mitt', 'coat', 'wallet', 'pin', 'costum', 'garter', 'box', 'appliqu', 'jewelri', 'piec', 'buckl', 'clog', 'waist', 'belt', 'pipe', 'linen', 'accessori', 'button', 'quilt', 'wool', 'stock', 'handbag', 'rug', 'purs', 'woven', 'har', 'suit', 'stick', 'crewel', 'apron', 'silk', 'men', 'trouser', 'shoe', 'women']
Original Section Text: The Art and Design of Utopian and Religious Communitie