# Module 2 - Taming Unstructured Data

This module has the following objectives:
- Creating a graph from Unstructured Data

In [3]:
# %pip install graphdatascience neo4j python-dotenv pydantic openai

Import our usual suspects (and some more...)

In [4]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from typing import List, Optional
from pydantic import BaseModel, Field, validator
from openai import OpenAI
import json

# Setup

Load env variables

In [5]:
# File (relative to the notebook) holding the Neo4j and OpenAI settings.
env_file = "credentials.env"

In [6]:
# Load the credentials file when it exists; otherwise report it and fall back to
# whatever is already exported in the process environment.
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
# OPENAI_API_KEY is read from os.environ, so writing it back is unnecessary; doing so
# with a missing key would raise "TypeError: str expected, not NoneType".
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
LLM = os.getenv('LLM')
EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL')

# Fail here with a readable message instead of a NameError/TypeError in a later cell.
missing_settings = [
    name
    for name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD', 'OPENAI_API_KEY', 'LLM')
    if not os.getenv(name)
]
if missing_settings:
    raise RuntimeError(
        f"Missing required settings: {', '.join(missing_settings)}. "
        f"Define them in {env_file} or in the environment."
    )

Connect to neo4j db

In [7]:
# Driver for the Neo4j instance at HOST, authenticated with USERNAME / PASSWORD.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

Test the connection

In [8]:
# Smoke test: count every node in the database and show the result as a DataFrame.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

Unnamed: 0,Count
0,902


## Unstructured data

Let's define some unstructured data from some of our Neo4j colleagues

In [71]:
# Fetch ten chunks with their id and the file name of the document they are part of.
chunk_df = driver.execute_query(
    """
    MATCH (c:Chunk)-[:PART_OF]->(d:Document) RETURN c.chunk as text, c.id as chunk_id, d.file_name as document LIMIT 10
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [72]:
# Preview the first five fetched chunks.
chunk_df.head(5)

Unnamed: 0,text,chunk_id,document
0,Rabo \nSpaarRekening 2020,0,Rabo SpaarRekening 2020.pdf
1,Pagina 2/14\nInhoud\nRabo SpaarRekening Novem...,1,Rabo SpaarRekening 2020.pdf
2,"Pagina 3/14\nProductkenmerken, Rabo SpaarReken...",2,Rabo SpaarRekening 2020.pdf
3,"openen, wijzigen of opheffen. Hiervoor zijn ee...",3,Rabo SpaarRekening 2020.pdf
4,wij de bijbehorende rente. Wij kunnen de schij...,4,Rabo SpaarRekening 2020.pdf


## Define the Domain Model

[Pydantic Models](https://docs.pydantic.dev/latest/api/base_model/) are simply classes which inherit from BaseModel and define fields as annotated attributes.

In [73]:
# NOTE: the Field descriptions below are emitted in the model's JSON schema, so each
# one must describe its own field. `term` previously carried the text meant for
# `description`.
class Definition(BaseModel):
    """
    Represents a term with its definition and description.
    """
    term: str = Field(..., description="The term or concept that is being defined")
    description: str = Field(..., description="The description or explanation of the term")
    chunk_id: int = Field(..., description="The id of the chunk from which the term was derived from.")

# Container for a batch of Definition objects.
class DefinitionList(BaseModel):
    definitions: List[Definition]

In [74]:
# System prompt for the extraction call. The "Term" bullet previously repeated the
# "Description" text, and the prompt contained typos ("defnitions", "origal").
system_message = """
    You are an expert in extracting structured information from a text.
    Identify definitions/terms that are explicitly explained in the text. Please extract the following details:
    - Term: The term or concept that is being defined
    - Description: The description or explanation of the term
    - chunk_id: The id of the chunk from which the term was derived from.

    Be concise, take the following points in consideration:
    - Don't come up with anything yourself.
    - Only map explicit definitions and terms that are explained in the text.
    - Don't translate things. Only store them in the original language.

    Present the extracted information in a clear, structured format. Be concise.
"""

In [75]:
# OpenAI client used by the extraction step. No API key is passed explicitly;
# presumably the SDK reads OPENAI_API_KEY from the environment — TODO confirm.
client = OpenAI()

In [76]:
def extract(document, chunk_id, model=LLM, temperature=0):
    """
    Extract term definitions from one text chunk with the chat model.

    Sends the system prompt, the chunk text and the chunk id to the model and
    requests a reply shaped like ``DefinitionList``.

    Args:
        document: Text of the chunk to analyse.
        chunk_id: Id of the chunk; converted with ``str`` and sent as its own user message.
        model: Name of the chat model to call (defaults to ``LLM``).
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The model's reply decoded from JSON.

    Raises:
        ValueError: If the model refused the request or returned no content.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
            {"role": "user", "content": "chunk_id: " + str(chunk_id)},
        ],
        response_format=DefinitionList,
    )
    message = response.choices[0].message

    # A refusal or empty reply leaves nothing to decode; report which chunk failed
    # instead of letting json.loads(None) raise an unrelated TypeError.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused to process chunk {chunk_id}: {refusal}")
    if not message.content:
        raise ValueError(f"Model returned no content for chunk {chunk_id}.")

    return json.loads(message.content)

In [77]:
# Show every fetched chunk row (text, chunk_id, document) before extraction.
for _, chunk_row in chunk_df.iterrows():
    print(chunk_row)

text          Rabo \nSpaarRekening 2020
chunk_id                              0
document    Rabo SpaarRekening 2020.pdf
Name: 0, dtype: object
text        Pagina 2/14\nInhoud\nRabo SpaarRekening  Novem...
chunk_id                                                    1
document                          Rabo SpaarRekening 2020.pdf
Name: 1, dtype: object
text        Pagina 3/14\nProductkenmerken, Rabo SpaarReken...
chunk_id                                                    2
document                          Rabo SpaarRekening 2020.pdf
Name: 2, dtype: object
text        openen, wijzigen of opheffen. Hiervoor zijn ee...
chunk_id                                                    3
document                          Rabo SpaarRekening 2020.pdf
Name: 3, dtype: object
text        wij de bijbehorende rente. Wij kunnen de schij...
chunk_id                                                    4
document                          Rabo SpaarRekening 2020.pdf
Name: 4, dtype: object
text        U kunt er

In [78]:
# Run the extraction once per chunk; each element is the decoded reply for one chunk.
rows = [
    extract(chunk["text"], chunk["chunk_id"])
    for _, chunk in chunk_df.iterrows()
]

In [79]:
# Flatten the per-chunk replies into a single {'definitions': [...]} payload.
# `rows` is rebound from a list to a dict here, so the guard keeps a re-run of this
# cell a no-op instead of failing while iterating over the already-flattened dict.
if isinstance(rows, list):
    rows = {'definitions': [entry for d in rows for entry in d['definitions']]}

In [80]:
# Validate the flattened definitions against the DefinitionList model.
structured_data = DefinitionList.model_validate(
    dict(definitions=rows['definitions'])
)

In [81]:
# Print each top-level field of the validated model, then every entry's
# key/value pairs, with a blank line after each entry.
for section, entries in structured_data.model_dump().items():
    print(f"{section}")
    for entry in entries:
        for field_name in entry:
            print(f"  {field_name}: {entry[field_name]}")
        print()

definitions
  term: Rabo SpaarRekening
  description: A type of savings account offered by Rabobank.
  chunk_id: 0

  term: Rabo SpaarRekening
  description: A type of savings account offered by Rabobank.
  chunk_id: 1

  term: Rabo SpaarRekening
  description: Een spaarrekening waarvan u het tegoed altijd en zonder kosten kunt opnemen. Het rentepercentage is afhankelijk van de hoogte van het tegoed.
  chunk_id: 2

  term: Rabo SpaarRekening voor particuliere klanten
  description: Een vrij opneembare spaarrekening voor particuliere klanten, te gebruiken via overboekingsformulieren of Rabo Online Bankieren.
  chunk_id: 2

  term: Rabo SpaarRekening openen
  description: U kunt de Rabo SpaarRekening kosteloos online openen, of via de Rabobank. U mag drie Rabo SpaarRekeningen bij ons aanhouden op uw eigen naam, en drie Rabo SpaarRekeningen samen met iemand anders. U kunt een rekening openen voor uzelf of samen met uw partner of iemand anders, of op naam van uw kind.
  chunk_id: 2

  term

## Graph creation
Now that the data is structured and validated, we can save it to the database.

Create Definition Nodes

In [17]:
# Upsert one Definition node per term and set its description.
# Send the validated model dump rather than the raw LLM dicts, so what is written
# to the graph is the data that passed DefinitionList validation.
records, summary, keys = driver.execute_query(
    """
        UNWIND $rows AS row
        MERGE (d:Definition{term:row.term})
        SET d.description = row.description
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE,
    rows=structured_data.model_dump()['definitions']
)

Create relationships to Chunks

In [18]:
# Link chunks to definitions: every (Chunk, Definition) pair is compared, and a
# MENTIONS relationship is merged when the chunk text contains the term
# (case-insensitive match).
records, summary, keys = driver.execute_query(
    """
    MATCH (d:Definition)
    MATCH (c:Chunk)
    WHERE toLower(c.chunk) CONTAINS toLower(d.term)
    MERGE (c)-[:MENTIONS]->(d)
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)