# Extract Definitions

This module has the following objectives:
- Creating a graph from Unstructured Data

In [None]:
# %pip install graphdatascience neo4j python-dotenv pydantic openai langchain-openai pandas

Import our usual suspects (and some more...)

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from typing import List, Optional
from pydantic import BaseModel, Field, validator
from openai import OpenAI
import json
from langchain_openai import OpenAIEmbeddings

# Setup

Load env variables

In [None]:
# Path to the dotenv file that holds the Neo4j and OpenAI settings read in the next cell.
env_file = '../ws.env'

In [None]:
# Load settings from the dotenv file when it exists; otherwise fall back to
# whatever is already exported in the process environment, so the variables
# below are always defined and later cells do not fail with a NameError.
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Re-export only when a key was found: assigning None into os.environ raises TypeError.
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
LLM = os.getenv('LLM')
EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')

# Surface unset settings right here instead of as an obscure error further down.
# NEO4J_DATABASE is not checked: it is presumably optional (driver default database) - TODO confirm.
for _name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD',
              'OPENAI_API_KEY', 'LLM', 'EMBEDDINGS_MODEL'):
    if not os.getenv(_name):
        print(f"Environment variable {_name} is not set.")

Connect to neo4j db

In [None]:
# Open a driver for the configured Neo4j URI, authenticating with the
# username/password pair loaded during setup.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

Test the connection

In [None]:
# Smoke test: count every node in the database and show the result as a DataFrame.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

## Unstructured data

Let's load some unstructured data (text chunks) from some of our documents

In [None]:
# Fetch every chunk together with the file name of the document it is part of.
# Resulting columns: text, chunk_id, document.
chunk_df = driver.execute_query(
    """
    MATCH (c:Chunk)-[:PART_OF]->(d:Document) RETURN c.chunk_eng as text, c.id as chunk_id, d.file_name as document
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [None]:
# Preview the first rows of the loaded chunks.
chunk_df.head()

## Define the Domain Model

[Pydantic Models](https://docs.pydantic.dev/latest/api/base_model/) are simply classes which inherit from BaseModel and define fields as annotated attributes.

In [None]:
class Definition(BaseModel):
    """
    Represents a term together with its description and the chunk it came from.
    """
    # Each field needs its own accurate description: the model is passed as
    # `response_format` to the LLM call, so these texts presumably reach the
    # LLM as part of the output schema.
    term: str = Field(..., description="The term or abbreviation being defined")
    description: str = Field(..., description="The description or explanation of the term")
    chunk_id: int = Field(..., description="The id of the chunk from which the term was derived from.")

class DefinitionList(BaseModel):
    """
    Container for all definitions extracted from a piece of text.
    """
    definitions:List[Definition]

In [None]:
# System prompt for the definition-extraction call. The three bullet points
# mirror the fields of the Definition model (term, description, chunk_id).
system_message = """
    You are an expert in identifying definitions and terms in a piece of text. 
    Identify defnitions/terms that are explicitly explained in the text. Please extract the following details: 
    - Term: The term or abbreviation being defined
    - Description: The description or explanation of the term
    - chunk_id: The id of the chunk from which the term was derived from. 

    Be concise, take the following points in consideration:
    - Don't come up with anything yourself. 
    - Focus on terms that are explained in the text. Focus on abbreviations and specific context depended terms. 
    - Only map explicit definitions and terms that are explained in the text.
    - Don't state obvious terms but focus on specific ones here. 
    - Don't translate things. Only store them in the origal language.
    - Skip chapters or sections headers which are very generic. 

    Present the extracted information in a clear, structured format. Be concise.
"""

In [None]:
# OpenAI client used by the extraction calls. No key is passed explicitly, so it
# presumably picks up OPENAI_API_KEY from the environment populated during setup.
client = OpenAI()

In [None]:
def extract(document, chunk_id, model=LLM, temperature=0):
    """
    Extract explicitly defined terms from one text chunk via a structured-output LLM call.

    Args:
        document: Text of the chunk to analyse.
        chunk_id: Identifier of the chunk; sent as a separate user message so the
            model can echo it into every extracted definition.
        model: Name of the chat model to use (defaults to the configured LLM).
        temperature: Sampling temperature forwarded to the API.

    Returns:
        dict: The decoded JSON response, shaped like ``{"definitions": [...]}``
        with ``term``, ``description`` and ``chunk_id`` per entry. The list is
        empty when the model returned no content (for example a refusal).
    """
    import warnings  # local import: only needed on the no-content path

    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
            {"role": "user", "content": "chunk_id: " + str(chunk_id)},
        ],
        response_format=DefinitionList,
    )
    message = response.choices[0].message

    # A refusal leaves `content` empty; json.loads(None) would raise a TypeError
    # and abort a whole batch run, so skip the chunk with a warning instead.
    if message.content is None:
        reason = getattr(message, "refusal", None) or "empty response"
        warnings.warn(f"No definitions extracted for chunk {chunk_id}: {reason}")
        return {"definitions": []}

    return json.loads(message.content)

In [None]:
# Peek at a handful of rows to sanity-check the extraction input; printing every
# row of the frame would flood the notebook output.
for _, row in chunk_df.head().iterrows():
    print(row)

In [None]:
# Run the LLM extraction once per chunk, pairing each text with its chunk id.
rows = [
    extract(text, chunk_id)
    for text, chunk_id in zip(chunk_df["text"], chunk_df["chunk_id"])
]

In [None]:
# Flatten the per-chunk results into a single {'definitions': [...]} payload.
# The isinstance guard keeps this cell re-runnable: `rows` is rebound from a list
# to a dict here, so a second run would otherwise iterate over dict keys and fail.
if isinstance(rows, list):
    rows = {'definitions': [entry for d in rows for entry in d['definitions']]}

In [None]:
# Validate every extracted entry against the Definition schema.
structured_data = DefinitionList(definitions=rows['definitions'])

In [None]:
# Pretty-print the validated model: one heading per top-level field, then each
# entry's key/value pairs, with a blank line after every entry.
for section, entries in structured_data.model_dump().items():
    print(section)
    for entry in entries:
        for field_name, field_value in entry.items():
            print(f"  {field_name}: {field_value}")
        print()

## Graph creation
Now that data is structured and validated, we can save it to the database

Create Definition Nodes

In [None]:
# Upsert one Definition node per extracted term. Terms are lower-cased so that
# spellings differing only in case merge into a single node; when a term occurs
# in several rows, the last row's description and chunk id win.
# toLower() matches the function used when linking chunks to definitions;
# lower() exists only as a newer alias and may be missing on older servers.
records, summary, keys = driver.execute_query(
    """
        UNWIND $rows AS row
        MERGE (d:Definition{term:toLower(row.term)})
        SET d.description = row.description
        SET d.original_chunk_id = row.chunk_id
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE,
    rows = rows['definitions']
)

Create relationships to Chunks

In [None]:
# Link each Chunk to every Definition whose term occurs (case-insensitively)
# in the chunk's English text, via a MENTIONS relationship.
records, summary, keys = driver.execute_query(
    """
    MATCH (d:Definition)
    MATCH (c:Chunk)
    WHERE toLower(c.chunk_eng) CONTAINS toLower(d.term)
    MERGE (c)-[:MENTIONS]->(d)
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

## Create embeddings

Create embeddings for the definition terms.

In [None]:
# Embedding client for the configured model, authenticated with the key loaded during setup.
embeddings_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=EMBEDDINGS_MODEL)

In [None]:
# Read back every Definition's term and description as a DataFrame.
df = driver.execute_query(
    """
    MATCH (def:Definition)
    RETURN def.term AS term, def.description AS description
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [None]:
# Embed all terms in one batched call instead of one API request per row.
# The Series is built on df.index so every vector stays aligned with its term.
df['embedding'] = pd.Series(
    embeddings_model.embed_documents(df['term'].tolist()),
    index=df.index,
    dtype=object,
)

In [None]:
# Persist all embeddings in a single batched write (UNWIND) instead of one
# query per row. The procedure itself stores the vector on the node, so no
# separate SET is needed, and a write with no returned rows needs no
# DataFrame transformer.
records, summary, keys = driver.execute_query(
    """
    UNWIND $rows AS row
    MATCH (def:Definition {term: row.term})
    CALL db.create.setNodeVectorProperty(def, "embedding", row.embedding)
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE,
    rows=df[['term', 'embedding']].to_dict('records'),
)