# Module 2 - Extract Definitions

This module has the following objectives:
- Creating a graph from Unstructured Data

In [1]:
# !pip install graphdatascience neo4j python-dotenv pydantic openai pandas langchain-openai

Import our usual suspects (and some more...)

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from typing import List, Optional
from pydantic import BaseModel, Field, validator
from openai import OpenAI
import json
from langchain_openai import OpenAIEmbeddings

# Setup

Load env variables

In [3]:
# Path of the dotenv file that holds the Neo4j and OpenAI settings.
env_file = "credentials.env"

In [4]:
# Load settings from the dotenv file when it exists; otherwise fall back to
# whatever is already present in the process environment, so the configuration
# names below are always defined for the cells that follow.
if os.path.exists(env_file):
    # override=True: values from the file win over pre-existing environment variables.
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# os.environ only accepts strings; assigning None would raise a TypeError,
# so the key is only exported when it is actually configured.
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
LLM = os.getenv('LLM')
EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')

Connect to neo4j db

In [5]:
# Open a driver for the configured Neo4j instance, authenticating with the
# username/password read from the environment.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

Test the connection

In [6]:
# Smoke test: count every node in the database and show the result as a DataFrame.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    result_transformer_=lambda result: result.to_df(),
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

Unnamed: 0,Count
0,763


## Unstructured data

Let's load some unstructured data (text chunks) from some of our documents.

In [7]:
# Fetch every chunk together with the file name of the document it belongs to.
# Resulting columns: text, chunk_id, document.
chunk_df = driver.execute_query(
    """
    MATCH (c:Chunk)-[:PART_OF]->(d:Document) RETURN c.chunk_eng as text, c.id as chunk_id, d.file_name as document
    """,
    result_transformer_=lambda result: result.to_df(),
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [8]:
# Preview the first five chunks.
chunk_df.head(n=5)

Unnamed: 0,text,chunk_id,document
0,Interpolis\nShort-term\nTravel Insurance\nInsu...,0,Interpolis Short-Term Travel Insurance.pdf
1,2 of 22 Insurance Terms and Conditions Interpo...,1,Interpolis Short-Term Travel Insurance.pdf
2,3. Which events are covered? And which are not...,2,Interpolis Short-Term Travel Insurance.pdf
3,5. How is the amount of the damage determined?...,3,Interpolis Short-Term Travel Insurance.pdf
4,6.2 What if the insured does not comply with t...,4,Interpolis Short-Term Travel Insurance.pdf


## Define the Domain Model

[Pydantic Models](https://docs.pydantic.dev/latest/api/base_model/) are simply classes which inherit from BaseModel and define fields as annotated attributes.

In [9]:
# NOTE: the class docstring and the Field descriptions are part of the JSON
# schema derived from these models, so they double as instructions for the
# model when DefinitionList is used as a response format.
class Definition(BaseModel):
    """
    Represents a term with its definition and description.
    """
    # The word or phrase being defined (previously carried the description's text by mistake).
    term: str = Field(..., description="The term or concept that is being defined")
    # The explanation given for the term in the source text.
    description: str = Field(..., description="The description or explanation of the term")
    # Provenance: id of the chunk in which the definition was found.
    chunk_id: int = Field(..., description="The id of the chunk from which the term was derived from.")

# Wrapper model holding the list of extracted definitions; used as the
# response format of the extraction call and for validating the merged results.
class DefinitionList(BaseModel):
    definitions: List[Definition]

In [10]:
# System prompt for the definition-extraction call. The "Term" bullet now
# describes the term itself (it used to repeat the description's text), and
# the misspellings in the instructions are corrected so the model is not left
# to guess what was meant.
system_message = """
    You are an expert in extracting structured information from a text.
    Identify definitions/terms that are explicitly explained in the text. Please extract the following details:
    - Term: The term or concept that is being defined
    - Description: The description or explanation of the term
    - chunk_id: The id of the chunk from which the term was derived from.

    Be concise, take the following points in consideration:
    - Don't come up with anything yourself.
    - Only map explicit definitions and terms that are explained in the text.
    - Don't translate things. Only store them in the original language.

    Present the extracted information in a clear, structured format. Be concise.
"""

In [11]:
# OpenAI client used by the extraction function below. No API key is passed
# explicitly; presumably the library picks up OPENAI_API_KEY from the
# environment (the setup cell exports it) -- confirm if credentials change.
client = OpenAI()

In [12]:
def extract(document, chunk_id, model=LLM, temperature=0):
    """
    Extract the definitions explained in one text chunk as structured data.

    Args:
        document: Text of the chunk to analyse; sent as a user message.
        chunk_id: Id of the chunk. It is sent to the model as a second user
            message and also stamped on every returned definition.
        model: Name of the chat model to call (defaults to the configured LLM).
        temperature: Sampling temperature passed to the API.

    Returns:
        dict: The decoded JSON response. With DefinitionList as response
        format this is expected to hold a 'definitions' list of dicts with
        'term', 'description' and 'chunk_id'.

    Raises:
        ValueError: If the response carries no content (for example when the
            model refuses), or if the content is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
            {"role": "user", "content": "chunk_id: " + str(chunk_id)},
        ],
        response_format=DefinitionList,
    )
    message = response.choices[0].message

    # Without content json.loads(None) would fail with an opaque TypeError;
    # report which chunk failed and why instead.
    if message.content is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"No structured output returned for chunk_id {chunk_id}: {refusal!r}"
        )

    extracted = json.loads(message.content)

    # The chunk id is known here, so do not rely on the model echoing it back
    # correctly: overwrite it on every definition to keep provenance exact.
    for definition in extracted.get("definitions", []):
        definition["chunk_id"] = chunk_id

    return extracted

In [13]:
# Debug aid: print each chunk row (a pandas Series) in turn.
for _index, chunk_row in chunk_df.iterrows():
    print(chunk_row)

text        Interpolis\nShort-term\nTravel Insurance\nInsu...
chunk_id                                                    0
document           Interpolis Short-Term Travel Insurance.pdf
Name: 0, dtype: object
text        2 of 22 Insurance Terms and Conditions Interpo...
chunk_id                                                    1
document           Interpolis Short-Term Travel Insurance.pdf
Name: 1, dtype: object
text        3. Which events are covered? And which are not...
chunk_id                                                    2
document           Interpolis Short-Term Travel Insurance.pdf
Name: 2, dtype: object
text        5. How is the amount of the damage determined?...
chunk_id                                                    3
document           Interpolis Short-Term Travel Insurance.pdf
Name: 3, dtype: object
text        6.2 What if the insured does not comply with t...
chunk_id                                                    4
document           Interpolis Short-Term

In [14]:
# Run the extraction once per chunk; each element is the dict returned by extract().
rows = [
    extract(chunk["text"], chunk["chunk_id"])
    for _index, chunk in chunk_df.iterrows()
]

In [15]:
# Flatten the per-chunk results into a single {'definitions': [...]} payload.
# `rows` is rebound from a list to a dict here, so only flatten while it is
# still the list of per-chunk results: re-running this cell is then a no-op
# instead of a TypeError.
if isinstance(rows, list):
    rows = {
        'definitions': [
            entry
            for chunk_result in rows
            for entry in chunk_result['definitions']
        ]
    }

In [16]:
# Validate the merged extraction results against the DefinitionList schema.
structured_data = DefinitionList.model_validate(
    dict(definitions=rows['definitions'])
)

In [17]:
# for k, details_list in structured_data.model_dump().items():
#     print(f"{k}")
#     for details in details_list:
#         for key, value in details.items():
#             print(f"  {key}: {value}")
#         print()

## Graph creation
Now that the data is structured and validated, we can save it to the database.

Create Definition Nodes

In [23]:
# Upsert one Definition node per extracted term (keyed on the lower-cased
# term) and store its description and originating chunk id.
# - The payload comes from `structured_data`, i.e. the definitions that
#   passed validation, rather than from the raw extraction dicts.
# - toLower() matches the function used when linking chunks to definitions
#   and does not depend on the server supporting the lower() alias.
records, summary, keys = driver.execute_query(
    """
        UNWIND $rows AS row
        MERGE (d:Definition{term:toLower(row.term)})
        SET d.description = row.description
        SET d.original_chunk_id = row.chunk_id
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE,
    rows=structured_data.model_dump()['definitions'],
)

Create relationships to Chunks

In [24]:
# Link every chunk to each definition whose term occurs (case-insensitively)
# in the chunk's text. MERGE keeps the relationship unique on re-runs.
records, summary, keys = driver.execute_query(
    """
    MATCH (d:Definition)
    MATCH (c:Chunk)
    WHERE toLower(c.chunk_eng) CONTAINS toLower(d.term)
    MERGE (c)-[:MENTIONS]->(d)
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

## Create embeddings

Create embeddings for the definition terms.

In [25]:
# Embedding client configured with the model name and API key from the setup cell.
embeddings_model = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model=EMBEDDINGS_MODEL,
)

In [26]:
# Read back every Definition node as a DataFrame with columns: term, description.
df = driver.execute_query(
    """
    MATCH (def:Definition)
    RETURN def.term AS term, def.description AS description
    """,
    result_transformer_=lambda result: result.to_df(),
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [27]:
# Embed all terms through the batched embed_documents() call instead of one
# API round-trip per row. Wrapping the result in a Series aligned on df.index
# stores one vector (a list of floats) per row.
df['embedding'] = pd.Series(
    embeddings_model.embed_documents(df['term'].tolist()),
    index=df.index,
    dtype=object,
)

In [28]:
# Store the embeddings on their Definition nodes in a single batched query
# (UNWIND over all rows) rather than one round-trip per definition.
# db.create.setNodeVectorProperty writes the `embedding` property itself, so
# no separate SET is needed; the query returns nothing, so no result
# transformer is used.
records, summary, keys = driver.execute_query(
    """
    UNWIND $rows AS row
    MATCH (def:Definition {term: row.term})
    CALL db.create.setNodeVectorProperty(def, "embedding", row.embedding)
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE,
    rows=df[['term', 'embedding']].to_dict('records'),
)