# Similarity Enrichment

### 0. Setup Environment

In [34]:
import os
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain import PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI
from neo4j import GraphDatabase

# Load settings from the local .env file. override=True is passed so values in
# the file take precedence over variables already set in the process
# (see python-dotenv documentation).
load_dotenv(override=True)

# Fail fast with a readable message. Previously a missing OPENAI_API_KEY
# surfaced as an opaque TypeError (None cannot be assigned into os.environ),
# and missing Neo4j settings only failed later inside the driver.
required_vars = ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD", "OPENAI_API_KEY")
missing_vars = [name for name in required_vars if os.getenv(name) is None]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_vars)
    )

uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")

# No explicit re-assignment of OPENAI_API_KEY is needed: load_dotenv() has
# already placed it in os.environ, which is where os.getenv() read it from.
# NOTE(review): OpenAIEmbeddings is presumed to pick the key up from the
# environment, since no key is passed explicitly - confirm against LangChain docs.
embedding_model = OpenAIEmbeddings()

# Open a short-lived driver only to check that the database is reachable with
# these credentials; the context manager closes it again.
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    driver.verify_connectivity()

### 1. Embeddings on Taxonomies (OpenAI)

In [36]:
# Vector store over :Taxonomy nodes: the `title` property is the embedded text,
# the vector is kept in the `embedding` property, and the index is named "term".
# NOTE(review): per the LangChain docs, from_existing_graph is expected to create
# the index and fill in missing embeddings - confirm for the installed version.
vectorstore = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url=uri,
    username=user,
    password=password,
    index_name='term',
    node_label="Taxonomy",
    embedding_node_property="embedding",
    text_node_properties=["title"]
)

# Custom retrieval fragment. `node` and `score` are not defined in this
# fragment; they are expected to be supplied by the vector-index lookup that
# Neo4jVector runs before it. Each hit is returned as text (the title), its
# score, and a metadata map holding the title.
retrieval_query = """
    MATCH (node:Taxonomy)
    WITH node, score
    return node.title AS text, score,
        node {title: node.title} 
    as metadata
    """

# Reuse the shared `embedding_model` rather than building a second
# OpenAIEmbeddings(): queries must be embedded with exactly the model that
# populated the "term" index, and a single instance keeps the two in sync.
contextualized_vectorstore = Neo4jVector.from_existing_index(
    embedding_model,
    url=uri,
    username=user,
    password=password,
    index_name="term",
    retrieval_query=retrieval_query,
)

### 2. Get Top PageRank Taxonomies (90th Percentile and Above)

In [38]:
# Collect the titles of every Taxonomy whose pageRank is at or above the 90th
# percentile, highest pageRank first.
top_taxonomies = []
with GraphDatabase.driver(uri, auth=(user, password)) as driver:
    pagerank_result = driver.execute_query("""
        // 1) Gets p90 of PageRank Property
        CALL () {
            MATCH (t:Taxonomy)
            RETURN percentileCont(t.pageRank, 0.90) AS p90
        }

        // 2) Gets Taxonomies in p90 of PageRank Property
        MATCH (t:Taxonomy)
        WHERE t.pageRank >= p90
        RETURN
            t.title,
            t.pageRank
        ORDER BY t.pageRank DESC
        """,
        database_="neo4j"
    )

    # The query returns unaliased columns, so the record key is the literal 't.title'.
    top_taxonomies.extend(row['t.title'] for row in pagerank_result.records)

    print(top_taxonomies)

['Cosmetics', 'Vehicles', 'Apparatus for locomotion by land', 'Computers', 'Telecommunication services', 'Clothing', 'Musical instruments', 'Furniture', 'Pharmaceutical preparations', 'Soap', 'Services for providing food and drink', 'Cakes', 'Motors, other than for land vehicles', 'Insurances', 'Computer software', 'Beds', 'Coffee', 'Footwear', 'Headgear', 'Industrial chemicals', 'Games', 'Machine tools', 'Streaming of audio content via a global computer network', 'Apparatus for locomotion by water', 'Computer programming', 'Guns [weapons]', 'Apparatus for locomotion by air', 'Dolls', 'Stationery', 'Business administration', 'Aceites esenciales', 'Frameworks of metal', 'Insurance services', 'Hotel services', 'Cosmetic hand care products', 'Data processing equipment', 'Jewellery', 'Travelling bags', 'Provision of food and drink', 'Teaching', 'Cards', 'Software', 'Business management', 'Magnetic data media', 'Cars', 'Fish, not live', 'Confectionery', 'Preparations for destroying vermin',

### 3. Similarity and Comparison from KG

In [41]:
# Similar Results
k = 100        # nearest neighbours requested from the vector index per top term
max_terms = 1  # how many top taxonomies to process; set to None to process all

# An explicit, tunable limit replaces the former unconditional `break` at the
# end of the loop body, which silently stopped after the first term.
terms_to_process = top_taxonomies if max_terms is None else top_taxonomies[:max_terms]

with GraphDatabase.driver(uri, auth=(user, password)) as driver:

    print(f"\nFound {len(top_taxonomies)} Top Taxonomies:")

    for top_term in terms_to_process:

        print(f"\nLooking for Similars to «{top_term}»…")

        # 1) Get Name/Score by Similarity
        similar = contextualized_vectorstore.similarity_search_with_score(
            top_term, k=k
        )
        similar_taxonomies = [(doc.metadata["title"], score) for doc, score in similar]
        names_list = [name for name, _ in similar_taxonomies]

        # 2) Get Taxonomies not related for TopTerm
        # A candidate counts as "related" when any undirected HAS_SIMILARITY
        # path of length 1..3 links it to the top term. `o <> t` keeps the top
        # term (normally its own nearest neighbour) out of its own result list.
        records, summary, keys = driver.execute_query("""
            MATCH (t:Taxonomy {title: $topTerm})
            UNWIND $namesList AS candidate
            MATCH (o:Taxonomy {title: candidate})
            WHERE o <> t AND NOT EXISTS {
                (t)-[:HAS_SIMILARITY*1..3]-(o)
            }
            RETURN o.title AS title;
            """,
            topTerm=top_term,
            namesList=names_list,
            database_="neo4j"
        )

        # 3) Not related Name/Score List
        # Filter the scored list so the order returned by the similarity
        # search is preserved.
        not_related = {r["title"] for r in records}
        result = [
            (title, score)
            for title, score in similar_taxonomies
            if title in not_related
        ]

        print(f"Similars with no Previous Relation ({len(result)} of {k}):")
        for title, score in result:
            print(f"  • {title}: {score:.4f}")


Found 268 Top Taxonomies:

Looking for Similars to «Cosmetics»…
Similars with no Previous Relation (31 of 100):
  • Cosmetic, hygiene and beauty care utensils: 0.9333
  • Face creams for cosmetic use: 0.9309
  • Chemical compositions and materials for use in the manufacture of cosmetics: 0.9295
  • Body cleaning and beauty care preparations: 0.9188
  • Skin, eye and nail care preparations: 0.9179
  • Skin care creams, other than for medical use: 0.9138
  • Feminine hygiene products: 0.9111
  • Cosmetic bags sold empty: 0.9107
  • Perfumery and fragrances: 0.9076
  • Glasses, sunglasses and contact lenses: 0.9066
  • Skin pomades: 0.9062
  • Soaps and gels: 0.9060
  • Decorative articles for the hair: 0.9046
  • Hair styling appliances: 0.9042
  • Hairsprays: 0.9041
  • Body soaps: 0.9040
  • Hair removal and shaving preparations: 0.9037
  • Chemical preparations for use in photography: 0.9036
  • Skin moisturisers: 0.9034
  • Rental of equipment for human hygiene and beauty care: 0.90

### 4. Other uses: similarity search by Class text to find a Class and retrieve all its Taxonomies

In [42]:
# Vector store over :Class nodes: the `text` property is the embedded text,
# the vector is kept in the `embedding` property, and the index is named "class".
# NOTE(review): per the LangChain docs, from_existing_graph is expected to create
# the index and fill in missing embeddings - confirm for the installed version.
vectorstore2 = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    url=uri,
    username=user,
    password=password,
    index_name='class',
    node_label="Class",
    embedding_node_property="embedding",
    text_node_properties=["text"]
)

# Custom retrieval fragment. `node` and `score` are not defined here; they are
# expected to come from the vector-index lookup Neo4jVector runs first.
# For each matched Class it follows outgoing HAS_CHILD chains of any length to
# collect descendant titles, and one incoming relationship (any type) to a
# `category` node. Both are plain MATCH patterns, so a Class with no HAS_CHILD
# descendants or no incoming relationship produces no row.
retrieval_query2 = """
    MATCH (t)<-[rc:HAS_CHILD*]-(node:Class)<-[rcat]-(category)
    WITH node, category, collect(t.title) AS taxonomies, score
    RETURN node.text AS text, score,
        node {title: node.title, text: node.text, category: category.title, taxonomies: taxonomies} 
    AS metadata
    """

# Reuse the shared `embedding_model` rather than building a second
# OpenAIEmbeddings(): queries must be embedded with exactly the model that
# populated the "class" index, and a single instance keeps the two in sync.
contextualized_vectorstore2 = Neo4jVector.from_existing_index(
    embedding_model,
    url=uri,
    username=user,
    password=password,
    index_name="class",
    retrieval_query=retrieval_query2,
)

In [43]:
# Sample free-text description used as the similarity-search query below.
question = (
    "It provides comprehensive IT and technology services—including software "
    "and hardware development, hosting and SaaS, consultancy and security, "
    "data processing and testing, design, and related equipment rentals."
)
def kg_class_response(input, k=3):
    """Return a text listing of the Classes most similar to ``input``.

    Runs a similarity search against ``contextualized_vectorstore2`` and
    formats every hit as a block of ``Score`` / ``Title`` / ``Category`` /
    ``Text`` / ``Taxonomies`` lines taken from the hit's metadata.

    Args:
        input: Free-text query to search with. The name shadows the ``input``
            builtin but is kept so existing keyword callers keep working.
        k: Number of hits to request from the vector store. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        One string with the formatted hits joined by newlines; an empty
        string when the search returns no hits.
    """
    hits = contextualized_vectorstore2.similarity_search_with_score(input, k=k)
    # Metadata is read with .get(), so a key missing from a hit prints as None
    # instead of raising.
    return "\n".join(
        f"\nScore: {score}\nTitle: {doc.metadata.get('title')}\nCategory: {doc.metadata.get('category')}\nText: {doc.metadata.get('text')}\nTaxonomies: {doc.metadata.get('taxonomies')}."
        for doc, score in hits
    )
# Show the formatted Class matches for the sample question.
class_matches = kg_class_response(question)
print(class_matches)


Score: 0.9415740966796875
Title: 42
Category: Services
Text: <ol><li>IT services namely Software development, programming and implementation, Computer hardware development, Hosting services, and software as a service and rental of software, Rental of computer hardware and facilities, IT consultancy, advisory and information services, IT security, protection and restoration, Data duplication and conversion services, data coding services, Computer analysis and diagnostics, Research, development and implementation of computers and systems, Computer project management services, Data mining, Digital watermarking, Computer services, Technological services relating to computers</li><li>Science and technology services</li><li>Testing, authentication and quality control</li><li>Design services</li><li>And rental, hire and leasing in connection with the aforesaid, included in the class and advice, consultancy and information for the aforesaid, included in the class Computer network services, Up

In [44]:
# Prompt text for the answer-generation step. It has two placeholders:
# {user_query} (the user's text) and {classes} (the formatted retrieval listing).
# NOTE(review): the field list below describes Score, Title, Text and Taxonomies
# but not the "Category" line that kg_class_response also emits, and it says
# "guideline" where "class" seems intended - confirm whether the wording should
# be aligned.
template = '''
You are an expert assistant retrieving information about Taxonomies for users in a company. The user will ask you about this Class:
  "{user_query}"

You have found the following classes most relevant to the question:
{classes}

For these class you have the following:
	•	Score: Similarity score for the guideline
	•	Title: Number of the Class
	•	Text: HTML text that makes a summary of the Class
	•	Taxonomies: All the taxonomies included in this class

Generate a humanized response that includes in a sentence the Title and a Summary from the text of the highest-scoring class and also list all its taxonomies.

At the end, include the titles of the other Class in case they may be useful to the user, with a question like: Other classes you were expecting?.
'''

# Wrap the template text, declaring the two placeholders it contains.
prompt = PromptTemplate(
    template=template,
    input_variables=["user_query", "classes"],
)

# Chat model (gpt-4o, temperature 0.7) and the chain that feeds it the prompt.
llm = ChatOpenAI(temperature=0.7, model="gpt-4o")
chain = LLMChain(prompt=prompt, llm=llm)

def humanized_response(question):
    """Answer ``question`` in natural language from the retrieved Classes.

    Looks up the matching Classes with ``kg_class_response(question)``, then
    asks ``chain`` to write the reply, passing the question as ``user_query``
    and the retrieval listing as ``classes``.

    Returns:
        Whatever ``chain.predict`` returns for those two inputs.
    """
    retrieved_classes = kg_class_response(question)
    return chain.predict(user_query=question, classes=retrieved_classes)

# Pass the raw question. humanized_response() calls kg_class_response() on its
# argument itself, so handing it the already-formatted class listing made the
# similarity search run on that listing and put the listing, not the user's
# text, into the prompt's {user_query} slot.
response = humanized_response(question)
print(response)

The class with the highest relevance is Class 42, which falls under the category of Services. This class encompasses a wide range of IT services, including software development, programming, computer hardware development, hosting services, and software as a service. It also includes IT consultancy, security, data duplication, computer analysis, and technological services related to computers. Furthermore, Class 42 covers science and technology services, testing, authentication, quality control, design services, and various related advisory and information services.

Here is a comprehensive list of taxonomies included in Class 42:
- IT services
- Software development, programming and implementation
- Computer hardware development
- Hosting services, software as a service, and rental of software
- Rental of computer hardware and facilities
- IT consultancy, advisory and information services
- IT security, protection and restoration
- Data duplication and conversion services, data coding 