In [None]:
import hnswlib
import numpy as np
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
import random

# --- 1. Simulate Company Data and Embeddings (Replace with your actual data) ---
# In a real scenario, you would load your 200M company names and their embeddings.
# For visualization, we create a small, representative sample.


  from .autonotebook import tqdm as notebook_tqdm


Loading SentenceTransformer model...
Generating embeddings for sample company names...
Building HNSW index with 46 embeddings...
HNSW index built.
Extracting up to 5 connections per node for visualization...
Added 45 connections to the visualization.
hnsw_company_network_visualization.html


  similarity = float(1 - distances[0][j])


In [None]:

# Demo corpus of company names. Several entries look like intentional variants
# of the same company (e.g. "Google LLC" / "Google Inc.", or
# "Microsoft Corporation" / the misspelled "Microsft Corp.").
# Keep the order stable: other cells use each name's list position as its ID.
company_names_sample = [
    "Google LLC",
    "Alphabet Inc.",
    "Google Inc.",
    "Microsoft Corporation",
    "Microsft Corp.",
    "Apple Inc.",
    "Apple Computers LLC",
    "Amazon.com Inc.",
    "International Business Machines",
    "IBM Corp.",
    "The Coca-Cola Company",
    "PepsiCo Inc.",
    "ExxonMobil Corporation",
    "Shell Global",
    "BP p.l.c.",
    "Siemens AG",
    "Bosch GmbH",
    "General Electric Co.",
    "GE Power",
    "Walmart Inc.",
    "Target Corporation",
    "Costco Wholesale Corp.",
    "NVIDIA Corp.",
    "Advanced Micro Devices Inc.",
    "Intel Corporation",
    "Oracle Corp.",
    "SAP SE",
    "Accenture plc",
    "Deloitte Touche Tohmatsu Limited",
    "PricewaterhouseCoopers LLP",
    "Ernst & Young Global Limited",
    "Goldman Sachs Group Inc.",
    "JPMorgan Chase & Co.",
    "Bank of America Corp.",
    "Wells Fargo & Company",
    "Johnson & Johnson",
    "Pfizer Inc.",
    "Novartis AG",
    "Roche Holding AG",
    "Sanofi S.A.",
    "T-Mobile US Inc.",
    "Verizon Communications Inc.",
    "AT&T Inc.",
    "Samsung Electronics Co. Ltd.",
    "Sony Group Corporation",
    "LG Electronics Inc.",
]

# Turn every sample name into a dense vector with a SentenceTransformer model.
# 'all-MiniLM-L6-v2' is used here as a small, fast model for the demo.
print("Loading SentenceTransformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings for sample company names...")
embeddings = model.encode(
    company_names_sample,
    convert_to_numpy=True,  # ask for a NumPy array rather than tensors
)

# Width of one embedding vector (second axis of the embedding matrix).
embedding_dim = embeddings.shape[1]


In [None]:

# --- 2. Build a Small HNSW Index (Conceptualizing your 200M index) ---
# In a real system this index would be pre-built and loaded, not built here.

# Two-way lookup between the integer labels stored in the index and the names.
id_to_name = dict(enumerate(company_names_sample))
name_to_id = {name: idx for idx, name in id_to_name.items()}

num_elements = len(company_names_sample)
print(f"Building HNSW index with {len(company_names_sample)} embeddings...")

# Cosine space over the embedding vectors; M and ef_construction are the
# construction parameters to adjust as needed.
hnsw_index = hnswlib.Index(space='cosine', dim=embedding_dim)
hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)

# Label i in the index corresponds to company_names_sample[i].
hnsw_labels = np.arange(num_elements, dtype=np.int32)
hnsw_index.add_items(embeddings, hnsw_labels)
print("HNSW index built.")


In [None]:

# --- 3. Extract Connections for Visualization ---
# How many neighbours to request per company when the edges are built.
num_neighbors_to_visualize = 5

# Physics settings (barnesHut solver with a stabilization phase) handed to
# pyvis unchanged via set_options.
physics_options = """
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -20000,
      "centralGravity": 0.3,
      "springLength": 100,
      "springConstant": 0.005,
      "damping": 0.9
    },
    "maxVelocity": 50,
    "minVelocity": 0.1,
    "solver": "barnesHut",
    "stabilization": {
      "enabled": true,
      "iterations": 1000,
      "updateInterval": 25,
      "onlyDynamicEdges": false,
      "fit": true
    },
    "timestep": 0.5,
    "adaptiveTimestep": true
  }
}
"""

net = Network(
    notebook=True,
    height="750px",
    width="100%",
    directed=False,
    cdn_resources='remote',
)
net.set_options(physics_options)

# Register every company as a node up front, keyed by its integer ID, so that
# any edge added afterwards refers to nodes that already exist in the graph.
for node_id, company_name in id_to_name.items():
    net.add_node(node_id, label=company_name, title=company_name, size=15)

# Undirected pairs (smaller_id, larger_id) that already have an edge, so a pair
# that appears in both endpoints' neighbour lists is only drawn once.
added_edges = set()

# Request one neighbour more than we want to draw, because a point normally
# retrieves itself as its own nearest neighbour. Never request more items than
# the index holds: hnswlib raises a RuntimeError when it cannot return k results.
k_query = min(num_neighbors_to_visualize + 1, num_elements)

print(f"Extracting up to {num_neighbors_to_visualize} connections per node for visualization...")

# knn_query returns (labels, distances) -- labels FIRST. Unpacking them the
# other way round turns the distances into "ids" (all truncated to 0) and the
# labels into "distances", which links every node to node 0 only.
# A single batched call over the whole (N, dim) matrix replaces N separate calls.
neighbor_labels, neighbor_distances = hnsw_index.knn_query(embeddings, k=k_query)

for i, (label_row, distance_row) in enumerate(zip(neighbor_labels, neighbor_distances)):
    # .tolist() converts the NumPy labels to native Python ints, matching the
    # int node ids that were registered with pyvis.
    candidates = [
        (neighbor_id, distance)
        for neighbor_id, distance in zip(label_row.tolist(), distance_row)
        # Drop the point itself and any label that has no matching pyvis node.
        if neighbor_id != i and 0 <= neighbor_id < num_elements
    ]

    # Cap at the requested count: if the point did not retrieve itself, the
    # extra slot would otherwise yield one neighbour too many.
    for neighbor_id, distance in candidates[:num_neighbors_to_visualize]:
        edge_tuple = (min(i, neighbor_id), max(i, neighbor_id))
        if edge_tuple in added_edges:
            continue

        # In hnswlib's 'cosine' space the distance is 1 - cosine similarity.
        similarity = float(1 - distance)

        net.add_edge(
            i,
            neighbor_id,
            value=similarity * 10,  # edge weight on a 0-10 scale
            title=f"Similarity: {similarity:.2f}",
            color={'color': 'blue', 'highlight': 'red'},
        )
        added_edges.add(edge_tuple)

print(f"Added {len(net.get_edges())} connections to the visualization.")

output_file = "hnsw_company_network_visualization.html"
net.show(output_file)