# Similarity Enrichment

## 0. Environment

In [18]:
import os
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain import PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI
from neo4j import GraphDatabase

load_dotenv(override=True)
uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
embedding_model = OpenAIEmbeddings()

with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    driver.verify_connectivity()

### 1. Embeddings on Terms (OpenAI)

In [2]:
vectorstore = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url=uri,
    username=user,
    password=password,
    index_name='term',
    node_label="Term",
    embedding_node_property="embedding",
    text_node_properties=["name"]
)

retrieval_query = """
    MATCH (node:Term)
    WITH node, score
    return node.name AS text, score,
        node {name: node.name} 
    as metadata
    """
contextualized_vectorstore = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(),
    url=uri,
    username=user,
    password=password,
    index_name="term",
    retrieval_query=retrieval_query,
)

### 2. Get Top PageRank Terms (Percentile 90)

In [3]:
top_terms = []
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    records, summary, keys = driver.execute_query("""
        // 1) Gets p90 of PageRank Property
        CALL () {
            MATCH (t:Term)
            RETURN percentileCont(t.pageRank, 0.90) AS p90
        }

        // 2) Gets Terms in p90 of PageRank Property
        MATCH (t:Term)
        WHERE t.pageRank >= p90
        RETURN
            t.name,
            t.pageRank
        ORDER BY t.pageRank DESC
        """,
        database_="neo4j"
    )

    for record in records:
        top_terms.append(record['t.name'])

### 3. Get All PageRank Terms

In [None]:
terms = []
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    records, summary, keys = driver.execute_query("""
        // 1) Gets All Terms (LIMITED TO 1000!!)
        MATCH (t:Term)
        RETURN
            t.name
        LIMIT 1000
        """,
        database_="neo4j"
    )

    for record in records:
        terms.append(record['t.name'])

    print(terms)

['Flexible hoses, not of metal', 'Cutters', 'Reservation services for airline travel', 'Flowers', 'Coffee', 'Adhesives for household purposes', 'Magnetic data media', 'Cosmetics', 'Coffee flavourings', 'Dairy spreads', 'Agricultural produce (Unprocessed -)', 'Writing of texts, other than publicity texts', 'Milk', 'Scarves', 'False eyelashes', 'Honey', 'Towels of textile', 'Hand tools and implements [hand-operated]', 'Fruit-based snack food', 'Mushrooms, preserved', 'Desserts made from milk products', 'Meat extracts', 'Photocopy paper', 'Adhesives for stationery', 'Database design and development', 'Sound reproduction apparatus', 'Peanuts, prepared', 'Rental of audio equipment', 'Steam engines', 'Pizzas', 'String', 'Heat exchangers [parts of machines]', 'Ready-made clothing', 'Non-alcoholic beer', 'Non-alcoholic beverages', 'Production of musical videos', 'Arranging of musical events', 'Apparatus for the transmission of data', 'Sandwiches', 'Jellies for food', 'Paper', 'False nails', 'B

### 4. Similarity and Comparison from KG

In [26]:
# Similar Results
k = 100

with GraphDatabase.driver(uri, auth=(user, password)) as driver:

    print(f"\nFound {len(top_terms)} TopTerms:")

    for top_term in top_terms:

        print(f"\nLooking for Similars to «{top_term}»…")

        # 1) Get Name/Score by Similarity
        similar = contextualized_vectorstore.similarity_search_with_score(
            top_term, k
        )
        similar_terms = [(doc.metadata["name"], score) for doc, score in similar]
        names_list = [name for name, _ in similar_terms]

        # 2) Get Terms not related for TopTerm
        records, summary, keys = driver.execute_query("""
            MATCH (t:Term {name: $topTerm})
            UNWIND $namesList AS candidate
            MATCH (o:Term {name: candidate})
            WHERE NOT EXISTS {
                (t)-[:HAS_SIMILARITY*1..3]-(o)
            }
            RETURN o.name AS name;
            """,
            topTerm=top_term, 
            namesList=names_list, 
            database_="neo4j"
        )

        # 3) Not related Name/Score List
        not_related = {r["name"] for r in records}
        result = [
            (name, score)
            for name, score in similar_terms
            if name in not_related
        ]
        
        print(f"Similars with no Previous Relation ({len(result)} of {k}):")
        for name, score in result:
            print(f"  • {name}: {score:.4f}")

        #break


Found 268 TopTerms:

Looking for Similars to «Cosmetics»…
Similars with no Previous Relation (23 of 100):
  • Face creams for cosmetic use: 0.9334
  • Skin care creams, other than for medical use: 0.9160
  • Cosmetic bags sold empty: 0.9131
  • Skin pomades: 0.9115
  • Skin moisturisers: 0.9102
  • Hairsprays: 0.9094
  • Body soaps: 0.9085
  • Decorative articles for the hair: 0.9070
  • Shaving cream: 0.9054
  • Bath preparations, not for medical purposes: 0.9046
  • Contact lenses: 0.9041
  • Hair pomades: 0.9039
  • Toiletry bags: 0.9038
  • Doll accessories: 0.9032
  • Chemical preparations for use in photography: 0.9027
  • Colorants: 0.9024
  • Conditioners: 0.9019
  • Dolls' clothing accessories: 0.9019
  • Lacquers and varnishes: 0.9019
  • Spectacles: 0.9014
  • Hair gel: 0.9012
  • Medical dressings, coverings and applicators: 0.9008
  • Hand-operated hygienic and beauty implements for humans and animals: 0.9008

Looking for Similars to «Vehicles»…
Similars with no Previous 