In [2]:
import hnswlib
import numpy as np
from sentence_transformers import SentenceTransformer
import time
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Configuration Parameters ---
# Embedding Model
MODEL_NAME = 'all-MiniLM-L6-v2' # A good balance of size/performance for embeddings
                               # You could try larger models like 'all-mpnet-base-v2' for higher accuracy
                               # but they will be slower and use more RAM.

# HNSW Index Parameters
HNSW_SPACE = 'cosine'          # Distance metric for HNSW (cosine similarity is common for embeddings)
EF_CONSTRUCTION = 200          # HNSW build time / graph quality trade-off (higher = better graph, slower build)
M_HNSW = 16                    # HNSW connections per node (higher = denser graph, better accuracy, more memory)
K_SEARCH_RESULTS = 10         # Number of nearest neighbors to retrieve per search query
SEARCH_EF = 100                # HNSW search time / accuracy trade-off (higher = more accurate, slower search)
                               # Typically, SEARCH_EF >= K_SEARCH_RESULTS

# Result Display Thresholds (for filtering what's printed to console)
SEMANTIC_SIMILARITY_THRESHOLD = 0.4 # Minimum semantic similarity score (0.0 to 1.0) to display a match
STRING_FUZZY_THRESHOLD = 10    # Minimum basic string similarity score (0 to 100) to display a match

In [3]:
company_names_sample = [
    "Google LLC", "Alphabet Inc.", "Google Inc.", "Microsoft Corporation",
    "Microsft Corp.", "Apple Inc.", "Apple Computers LLC", "Amazon.com Inc.",
    "International Business Machines", "IBM Corp.", "The Coca-Cola Company",
    "PepsiCo Inc.", "ExxonMobil Corporation", "Shell Global", "BP p.l.c.",
    "Siemens AG", "Bosch GmbH", "General Electric Co.", "GE Power",
    "Walmart Inc.", "Target Corporation", "Costco Wholesale Corp.", "NVIDIA Corp.",
    "Advanced Micro Devices Inc.", "Intel Corporation", "Oracle Corp.",
    "SAP SE", "Accenture plc", "Deloitte Touche Tohmatsu Limited",
    "PricewaterhouseCoopers LLP", "Ernst & Young Global Limited",
    "Goldman Sachs Group Inc.", "JPMorgan Chase & Co.", "Bank of America Corp.",
    "Wells Fargo & Company", "Johnson & Johnson", "Pfizer Inc.",
    "Novartis AG", "Roche Holding AG", "Sanofi S.A.", "T-Mobile US Inc.",
    "Verizon Communications Inc.", "AT&T Inc.", "Samsung Electronics Co. Ltd.",
    "Sony Group Corporation", "LG Electronics Inc.",
    "International Business Management", "Big Blue Tech", "Apple Global",
    "General Motors Company", "Ford Motor Company", "Toyota Motor Corporation"
]

In [4]:
# --- Preprocessing Function (Minimalist) ---
def preprocess_name(name):
    """
    Applies basic normalization to company names.
    This version keeps it minimal as discussed, relying on embeddings for complexity.
    """
    return name.lower().replace('.', '').replace(',', '').replace(';', '').replace('&', 'and').strip()

# --- Basic String Similarity (for re-ranking insight) ---
# This is a very simple string similarity. For production, consider fuzzywuzzy or rapidfuzz.
def basic_string_similarity_score(s1, s2):
    """Calculates a basic character-overlap string similarity score (0-100)."""
    s1_clean = s1.replace(' ', '').lower()
    s2_clean = s2.replace(' ', '').lower()
    if not s1_clean or not s2_clean: return 0.0 # Handle empty strings
    
    # Calculate common characters
    common_chars = set(s1_clean) & set(s2_clean)
    matches = len(common_chars)
    
    # Simple overlap ratio
    return (matches / max(len(s1_clean), len(s2_clean))) * 100

In [5]:
print(f"--- HNSW Index Search Performance ({MODEL_NAME} Embeddings) ---")
    
# 3.1. Load Embedding Model
print(f"\n1. Loading embedding model: {MODEL_NAME}...")
start_time = time.time()
try:
    model = SentenceTransformer(MODEL_NAME)
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have an internet connection or the model is cached locally.")
    # In a Jupyter Notebook, you might not want to sys.exit(1), but rather alert the user.
    # For this script, we'll let it proceed, but subsequent steps might fail if the model isn't loaded.
    model = None # Indicate model loading failure
embedding_load_time = time.time() - start_time
print(f"   Model loaded in {embedding_load_time:.2f} seconds.")

if model is None:
    print("\n! MODEL FAILED TO LOAD. Cannot proceed with embedding and HNSW building.")
else:
    # 3.2. Generate Embeddings for Dataset
    print(f"\n2. Generating embeddings for {len(company_names_sample)} sample companies...")
    start_time = time.time()
    preprocessed_names = [preprocess_name(name) for name in company_names_sample]
    embeddings = model.encode(preprocessed_names, convert_to_numpy=True)
    embedding_dim = embeddings.shape[1]
    embedding_gen_time = time.time() - start_time
    print(f"   Embeddings generated in {embedding_gen_time:.2f} seconds. (Dim: {embedding_dim})")

    # Store original names with their IDs and preprocessed versions for lookup
    company_data = {
        i: {'original_name': company_names_sample[i], 'preprocessed_name': preprocessed_names[i]}
        for i in range(len(company_names_sample))
    }

--- HNSW Index Search Performance (all-MiniLM-L6-v2 Embeddings) ---

1. Loading embedding model: all-MiniLM-L6-v2...
   Model loaded in 1.44 seconds.

2. Generating embeddings for 52 sample companies...
   Embeddings generated in 2.72 seconds. (Dim: 384)


In [6]:
if 'embeddings' not in locals(): # Check if embeddings were successfully created
    print("\n! EMBEDDINGS NOT GENERATED. Cannot proceed with HNSW index building.")
else:
    print(f"\n3. Building HNSW index with {len(company_names_sample)} elements...")
    start_time = time.time()
    hnsw_index = hnswlib.Index(space=HNSW_SPACE, dim=embedding_dim)
    hnsw_index.init_index(max_elements=len(company_names_sample), ef_construction=EF_CONSTRUCTION, M=M_HNSW)
    hnsw_index.add_items(embeddings, np.arange(len(company_names_sample), dtype=np.int32))
    index_build_time = time.time() - start_time
    print(f"   HNSW index built in {index_build_time:.2f} seconds. (ef_construction={EF_CONSTRUCTION}, M={M_HNSW})")
    
    hnsw_index.set_ef(SEARCH_EF) # Set ef for search operations


3. Building HNSW index with 52 elements...
   HNSW index built in 0.00 seconds. (ef_construction=200, M=16)


In [None]:
if 'hnsw_index' not in locals():
    print("\n! HNSW INDEX NOT BUILT. Cannot proceed with search.")
else:
    print("\n--- Ready for Interactive Search ---")
    print(f"Type a company name and press Enter to search. Type 'exit' to quit.")
    print(f"Searching for top {K_SEARCH_RESULTS} neighbors with ef_search={SEARCH_EF}.")

    loop = True

    while loop:
        try:
            query = input("\nEnter company name: ").strip()
        except EOFError: # Handles graceful exit if running as a script and input stream ends
            print("\nEOF received. Exiting search.")
            loop = False
            break
        except KeyboardInterrupt: # Handles Ctrl+C
            print("\nKeyboardInterrupt received. Exiting search.")
            loop = False
            break

        if query.lower() == 'exit':
            print("Exiting search. Goodbye!")
            loop = False
            break

        if not query:
            print("Please enter a non-empty query.")
            continue

        # --- Search Process ---
        search_start_time = time.time()

        # 5.1. Preprocess Query
        preprocessed_query = preprocess_name(query)
        
        # 5.2. Embed Query
        # Reshape to (1, -1) because hnswlib expects a 2D array for queries
        query_embedding = model.encode(preprocessed_query, convert_to_numpy=True).reshape(1, -1)
        query_embedding_time = time.time() - search_start_time

        # 5.3. Perform HNSW Query
        hnsw_query_start_time = time.time()
        # hnswlib.knn_query returns distances (float64) and indices (int64) NumPy arrays
        distances, indices = hnsw_index.knn_query(query_embedding, k=K_SEARCH_RESULTS)
        hnsw_query_time = time.time() - hnsw_query_start_time
        
        total_search_time = time.time() - search_start_time

        # 5.4. Display Results
        print(f"\nSearch for '{query}':")
        print(f"   Query embedding time: {query_embedding_time:.4f} seconds")
        print(f"   HNSW query time:      {hnsw_query_time:.4f} seconds")
        print(f"   Total search operation time: {total_search_time:.4f} seconds")

        found_matches = []
        # Loop through the raw NumPy arrays returned by knn_query
        for i in range(indices.shape[1]): # Iterate k times
            raw_indexed_id = indices[0, i] # Get the raw ID from the NumPy array
            distance = distances[0, i] # Get the distance for this ID

            # --- FIX START ---
            # Robustly convert to integer and filter invalid IDs.
            # Check if it's a floating point type and very close to zero or negative,
            # indicating it's likely a placeholder for an unfound neighbor.
            if np.issubdtype(type(raw_indexed_id), np.floating) and (raw_indexed_id < 0.0 or abs(raw_indexed_id) < 1e-6):
                # If it's a problematic float (e.g., very small positive or negative filler), skip it.
                continue
            
            # Now, safely convert to integer.
            try:
                indexed_id = int(raw_indexed_id)
            except ValueError:
                # This catches cases where raw_indexed_id might be NaN or other non-convertible types
                print(f"Warning: Could not convert ID {raw_indexed_id} to integer. Skipping.")
                continue

            # Robustly handle standard invalid indices (-1 from hnswlib) or out of bounds
            if not (0 <= indexed_id < len(company_names_sample)):
                continue 
            # --- FIX END ---

            original_name = company_data[indexed_id]['original_name']
            preprocessed_indexed_name = company_data[indexed_id]['preprocessed_name']
            
            semantic_similarity = 1 - distance # Cosine similarity = 1 - cosine distance

            string_fuzzy_score = basic_string_similarity_score(preprocessed_query, preprocessed_indexed_name)

            if semantic_similarity >= SEMANTIC_SIMILARITY_THRESHOLD and \
               string_fuzzy_score >= STRING_FUZZY_THRESHOLD:
                found_matches.append({
                    'original_name': original_name,
                    'semantic_similarity': semantic_similarity,
                    'string_fuzzy_score': string_fuzzy_score
                })
        
        if found_matches:
            # Sort by semantic similarity first, then string fuzzy score
            found_matches.sort(key=lambda x: (x['semantic_similarity'], x['string_fuzzy_score']), reverse=True)
            print(f"\n   Found Matches (Semantic Sim >= {SEMANTIC_SIMILARITY_THRESHOLD:.2f} & String Fuzzy >= {STRING_FUZZY_THRESHOLD:.1f}%):")
            for match in found_matches:
                print(f"     - {match['original_name']} (Semantic: {match['semantic_similarity']:.3f}, String: {match['string_fuzzy_score']:.1f}%)")
        else:
            print("\n   No strong matches found based on current thresholds.")
            print("   (Consider adjusting SEMANTIC_SIMILARITY_THRESHOLD or STRING_FUZZY_THRESHOLD if expected matches aren't showing.)")


--- Ready for Interactive Search ---
Type a company name and press Enter to search. Type 'exit' to quit.
Searching for top 5 neighbors with ef_search=100.


  semantic_similarity = 1 - distance # Cosine similarity = 1 - cosine distance



Search for 'apple':
   Query embedding time: 0.3555 seconds
   HNSW query time:      0.0002 seconds
   Total search operation time: 0.3557 seconds

   No strong matches found based on current thresholds.
   (Consider adjusting SEMANTIC_SIMILARITY_THRESHOLD or STRING_FUZZY_THRESHOLD if expected matches aren't showing.)

Search for 'motor':
   Query embedding time: 0.0536 seconds
   HNSW query time:      0.0001 seconds
   Total search operation time: 0.0536 seconds

   No strong matches found based on current thresholds.
   (Consider adjusting SEMANTIC_SIMILARITY_THRESHOLD or STRING_FUZZY_THRESHOLD if expected matches aren't showing.)

Search for 'general motor':
   Query embedding time: 0.6979 seconds
   HNSW query time:      0.0000 seconds
   Total search operation time: 0.6979 seconds

   No strong matches found based on current thresholds.
   (Consider adjusting SEMANTIC_SIMILARITY_THRESHOLD or STRING_FUZZY_THRESHOLD if expected matches aren't showing.)
Please enter a non-empty quer