# Module 2 - Taming Unstructured Data

This module has the following objectives:
- Creating a graph from Unstructured Data

In [58]:
# !pip install graphdatascience neo4j dotenv pydantic openai

Import our usual suspects (and some more...)

In [59]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from typing import List, Optional
from pydantic import BaseModel, Field, validator
from openai import OpenAI
import json

# Setup

Load env variables

In [60]:
# Path of the dotenv file that holds the Neo4j and OpenAI settings for this notebook.
env_file = "credentials.env"

In [61]:
# Load settings from the dotenv file when it exists. A missing file is reported
# but is not fatal: the variables may already be present in the process
# environment.
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Settings are assigned unconditionally so that later cells never fail with a
# NameError; a variable that is not set simply yields None.

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
# The key is read from os.environ, so it is already exported for the OpenAI
# client. Writing it back is unnecessary, and assigning None to os.environ
# raises TypeError when the key is unset.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
LLM = os.getenv('LLM')
EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')

# Report configuration gaps here instead of letting them surface as obscure
# errors in the cells that connect to Neo4j or call the LLM.
_missing = [
    name
    for name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD', 'OPENAI_API_KEY', 'LLM')
    if not os.getenv(name)
]
if _missing:
    print(f"Missing environment variables: {', '.join(_missing)}")

Connect to neo4j db

In [62]:
# Create the Neo4j driver for the configured URI, authenticating with the
# username/password pair loaded from the environment.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

Test the connection

In [63]:
# Smoke-test the connection: count all nodes in the target database and
# display the single-row result as a DataFrame.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

Unnamed: 0,Count
0,400


## Unstructured data

Let's define some unstructured data from some of our Neo4j colleagues

In [64]:
# Fetch every chunk together with the file name of the document it belongs to.
# The result is a DataFrame with the columns `text` and `document`.
chunk_df = driver.execute_query(
    """
    MATCH (c:Chunk)-[:PART_OF]->(d:Document) RETURN c.chunk as text, d.file_name as document
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [65]:
# Preview the first five chunks.
chunk_df.head(n=5)

Unnamed: 0,text,document
0,Rabo \nSpaarRekening 2020,Rabo SpaarRekening 2020.pdf
1,Pagina 2/14\nInhoud\nRabo SpaarRekening Novem...,Rabo SpaarRekening 2020.pdf
2,"Pagina 3/14\nProductkenmerken, Rabo SpaarReken...",Rabo SpaarRekening 2020.pdf
3,"openen, wijzigen of opheffen. Hiervoor zijn ee...",Rabo SpaarRekening 2020.pdf
4,wij de bijbehorende rente. Wij kunnen de schij...,Rabo SpaarRekening 2020.pdf


## Define the Domain Model

[Pydantic Models](https://docs.pydantic.dev/latest/api/base_model/) are simply classes which inherit from BaseModel and define fields as annotated attributes.

In [66]:
# NOTE: pydantic includes model docstrings and Field descriptions in the
# generated JSON schema. The schema of DefinitionList is what gets passed to the
# LLM as `response_format`, so the wording below is part of the prompt.
class Definition(BaseModel):
    """
    Represents a term with its definition and description.
    """
    # The word or phrase being defined, kept exactly as it appears in the text.
    term: str = Field(..., description="The term or concept that is being defined")
    # The explanation the text gives for `term`.
    description: str = Field(..., description="The description or explanation of the term")

# Container model: a flat list of the definitions extracted from one piece of
# text. Documented with a comment rather than a docstring so the schema sent to
# the LLM does not gain an extra description.
class DefinitionList(BaseModel):
    definitions: List[Definition]

In [67]:
# System prompt for the extraction call. It instructs the model to return only
# definitions that are explicitly present in the text, untranslated.
system_message = """
    You are an expert in extracting structured information from a text.
    Identify definitions/terms that are explicitly explained in the text. Please extract the following details:
    - Term: The term or concept that is being defined
    - Description: The description or explanation of the term

    Be concise, take the following points in consideration:
    - Don't come up with anything yourself.
    - Only map explicit definitions and terms that are explained in the text.
    - Don't translate things. Only store them in the original language.

    Present the extracted information in a clear, structured format. Be concise.
"""

In [68]:
# OpenAI client used by the extraction function below. No api_key is passed
# here; presumably the client picks up OPENAI_API_KEY from the environment
# populated in the setup cell — TODO confirm.
client = OpenAI()

In [69]:
def extract(document, model=LLM, temperature=0):
    """
    Extract the explicitly defined terms from a piece of text using the LLM.

    Args:
        document: The text to analyse; sent as the user message.
        model: Name of the chat model. The default is the value `LLM` had
            when this function was defined.
        temperature: Sampling temperature passed to the model.

    Returns:
        A dict of the form ``{"definitions": [{"term": ..., "description": ...}, ...]}``.
        When the model refuses or returns no parsable content, the list is
        empty and the reason is printed.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
        ],
        response_format=DefinitionList,
    )
    message = response.choices[0].message

    # On a refusal the message has no content, so json.loads() on it would
    # raise an opaque TypeError. Skip the chunk so one refusal does not abort
    # the extraction of all remaining chunks.
    if message.parsed is None:
        reason = message.refusal or "no structured content returned"
        print(f"No definitions extracted: {reason}")
        return {"definitions": []}

    # `parsed` is the DefinitionList instance already validated by the SDK.
    return message.parsed.model_dump()

In [70]:
# Run the extraction once per chunk; each element is the dict returned by extract().
rows = [extract(text) for text in chunk_df['text']]

In [71]:
# Flatten the per-chunk results into one {'definitions': [...]} payload.
# This cell rebinds `rows` from a list to a dict, so re-running it would fail
# when it iterates the dict. The type guard makes a second run a no-op.
if isinstance(rows, list):
    rows = {'definitions': [entry for d in rows for entry in d['definitions']]}

In [72]:
# Validate the flattened definitions against the domain model.
structured_data = DefinitionList(definitions=rows['definitions'])

In [73]:
# for k, details_list in structured_data.model_dump().items():
#     print(f"{k}")
#     for details in details_list:
#         for key, value in details.items():
#             print(f"  {key}: {value}")
#         print()

## Graph creation
Now that the data is structured and validated, we can save it to the database.

Create Definition Nodes

In [74]:
# Upsert one Definition node per extracted term. MERGE keys on `term`, so a
# term that occurs more than once ends up with the description of its last row.
records, summary, keys = driver.execute_query(
    """
        UNWIND $rows AS row
        MERGE (d:Definition{term:row.term})
        SET d.description = row.description
    """,
    parameters_={'rows': rows['definitions']},
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

Create relationships to Chunks

In [75]:
# Link each Chunk to every Definition whose term occurs in the chunk text
# (case-insensitive substring match). MERGE avoids duplicate relationships.
records, summary, keys = driver.execute_query(
    """
    MATCH (d:Definition)
    MATCH (c:Chunk)
    WHERE toLower(c.chunk) CONTAINS toLower(d.term)
    MERGE (c)-[:MENTIONS]->(d)
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)