# **AutoCimKG: Tutorial**

In [None]:
import os

import pandas as pd
import numpy as np

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

from datetime import datetime

from autocimkg import AutoCimKGCore
from autocimkg import DocumentsDistiller
from autocimkg.models import Document, Employee, Ontology, KnowledgeGraph, KnowledgeGraphVersion
from autocimkg.graph_integration import GraphIntegrator
from autocimkg.metadata_integration import MetadataIntegrator
from autocimkg.utils import ScientificArticle, AuthorsOnly

### **initialisation of models, import of required data and pre-processing**  
The first step is to initialise the chat and embedding model with the respective API token(s).

In [None]:
# Take the key from the OPENAI_API_KEY environment variable so the secret is
# not stored in the notebook itself; falls back to "" (the previous
# placeholder) when the variable is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

# Chat model: gpt-4o at temperature 0, up to two retries, with the
# "json_object" response format requested through model_kwargs.
openai_llm_model = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    model_kwargs={"response_format": {"type": "json_object"}},
)

# Embedding model; it is used further down to embed employee names and is
# also handed to AutoCimKGCore.
openai_embeddings_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-large",
)

Next, the raw documents are loaded from the project workspace. For each document to process, specify its path (e.g. '../tutorial/data/abc.pdf'), the pages to exclude (if any) and a type (e.g. 'scientific article').

In [None]:
raw_documents = []

# One entry per PDF: (path, page numbers to exclude, document type).
# Exclusion is checked against `metadata["page"] + 1`, so the numbers listed
# here are offset by one from the loader's own page index.
documents_to_fetch = [
    ("../tutorial/data/abc.pdf", list(range(2, 50)), 'scientific article'),
    ("../tutorial/data/def.pdf", list(range(2, 50)), 'scientific article'),
]

# Single-pass replacement of curly braces by square brackets in the page text.
_BRACES_TO_BRACKETS = str.maketrans("{}", "[]")

for path_, exclude_pages, type_ in documents_to_fetch:
    # Build the lookup set once per document instead of scanning a list for
    # every loaded chunk.
    excluded = set(exclude_pages)
    loader = PyPDFLoader(path_)
    # Drop unwanted pages (e.g. the references) and sanitise the remaining text.
    pages = [
        page.page_content.translate(_BRACES_TO_BRACKETS)
        for page in loader.load_and_split()
        if page.metadata["page"] + 1 not in excluded
    ]
    raw_documents.append({
        'name': os.path.basename(path_),  # file name without its directory
        'content': pages,
        'extract_type': type_,
        'doc_type': type_,
    })

Optionally, the raw documents can be condensed into blocks. Here, the LLM uses a user-defined blueprint to extract certain parts of the full text (e.g. title, abstract, keywords and authors).

In [None]:
documents: list[Document] = []
document_distiller = DocumentsDistiller(llm_model=openai_llm_model)

# Distillation blueprint for each supported 'extract_type'.
blueprints = {
    'scientific article': ScientificArticle,
    'authors only': AuthorsOnly,
}

for raw_document in raw_documents:

    # determine blueprint; documents of an unknown type are reported and skipped
    blueprint = blueprints.get(raw_document['extract_type'])
    if blueprint is None:
        print(f"No template for document distillation found for {raw_document['extract_type']}")
        continue

    # determine context: all content entries joined into a one-element list;
    # AUTHORS ONLY restricts the input to the first two entries
    if blueprint is AuthorsOnly:
        pages = raw_document['content'][:2]
    else:
        pages = raw_document['content']
    context = ['\n\n'.join(pages)]

    # distill
    distilled_doc = document_distiller.distill(
        documents=context,
        document_type=raw_document['doc_type'],
        output_data_structure=blueprint,
    )

    # assemble one "<doc_type>'s <key> - <value>" block per non-empty field;
    # authors are kept out of the content so they are not processed later on
    content = [
        f"{raw_document['doc_type']}'s {key} - {value}".replace("{", "[").replace("}", "]")
        for key, value in distilled_doc.items()
        if value and key != "authors"
    ]
    if not content:
        # AUTHORS ONLY: nothing but authors was distilled, so take the whole content
        content = raw_document['content']

    # Always hand over a list, even if the key is missing or its value is empty/None.
    authors = distilled_doc.get("authors") or []

    document = Document(
        name=raw_document['name'],
        doc_type=raw_document['doc_type'],
        content=content,
        authors=authors,
        language="eng",
    )
    # create / save metadata!

    documents.append(document)

Lastly, the relational master data needs to be loaded and preprocessed. AutoCimKG expects a semicolon-separated CSV file with the following columns for each person: ID, GIVENNAME, SURNAME, COMPANY, DEPARTMENT and STATUS.

In [None]:
# Semicolon-separated master data; column names are taken from the header row.
employees_df = pd.read_csv('../tutorial/data/employee.csv', sep=";", header='infer')

# Vectorised "<SURNAME> <GIVENNAME>"; unlike a row-wise DataFrame.apply this
# still yields a column when the file contains no rows.
employees_df['NAME'] = employees_df['SURNAME'] + ' ' + employees_df['GIVENNAME']

# resolution based on full, lowercase name
employees_df['NAME_EMBEDDING'] = employees_df['NAME'].map(
    lambda name: np.array(openai_embeddings_model.embed_query(name.lower()))
)

# One Employee per row; status is True only for the literal value "active".
employees = [
    Employee(
        id_=id_,
        name=name,
        name_embedding=name_embedding,
        company=company,
        department=department,
        status=status == "active",
    )
    for id_, name, company, department, status, name_embedding in zip(
        employees_df['ID'],
        employees_df['NAME'],
        employees_df['COMPANY'],
        employees_df['DEPARTMENT'],
        employees_df['STATUS'],
        employees_df['NAME_EMBEDDING'],
    )
]

### **KG construction and maintenance**  
Now, the actual creation and upkeep of the competency KG can be initiated. Here, AutoCimKG offers many options for parametrisation. This step consumes the prepared documents, the relational master data and, optionally, a lightweight ontology to align the whole text-based extraction. If a competency KG already exists and needs to be incrementally maintained with new documents, it is passed to the corresponding function call here. The source code provides insight into all available options.

In [None]:
# Timestamp taken before any construction work starts.
start_proc_ts = datetime.now()

# Lightweight ontology passed to build_graph below (strict=False).
ont = Ontology(
    topics=[
        {"Subject area": "Concise description of a domain-specific area."},
    ],
    relations=[
        {"Relationship type": "Concise description of a desired relation type between a START and END entity."},
    ],
    strict=False,
)

autocimkg = AutoCimKGCore(
    llm_model=openai_llm_model,
    embeddings_model=openai_embeddings_model,
)

# build_graph returns the graph together with an ontology, which replaces
# the one defined above.
kg, ont = autocimkg.build_graph(
    ontology=ont,
    documents=documents,
    employees=employees,
    domain="financial supervisory domain",
    expert_threshold=0.8,
    ent_threshold=0.8,
    rel_threshold=0.8,
    max_tries_isolated_entities=0,
)

# STOP KGC
# Collect the run artefacts that the database cells store later on.
log, conf, agent = autocimkg.log[0], autocimkg.conf[0], autocimkg.agent

# Model configurations with the API key blanked out before they are stored.
chat_conf = {**autocimkg.llm_model.model_dump(), "openai_api_key": ""}
emd_conf = {**autocimkg.embeddings_model.model_dump(), "openai_api_key": ""}

# Timestamp taken after the run artefacts have been collected.
end_proc_ts = datetime.now()

### **database integration**  
After successful construction or maintenance of a competency KG, AutoCimKG can be used to store graph data and metadata in a connected database and to retrieve them from it.

In [None]:
# Connection parameters of the target database (fill in before running).
HOST = ""
PORT = 0
DBNAME = ""
USERNAME = ""
PASSWORD = ""

# Name under which the knowledge graph is stored.
GRAPH = "kg_v1"

# Both integrators are created with the same connection settings.
connection_settings = {
    "host": HOST,
    "port": PORT,
    "dbname": DBNAME,
    "username": USERNAME,
    "password": PASSWORD,
}
graph_integrator = GraphIntegrator(**connection_settings)
metadata_integrator = MetadataIntegrator(**connection_settings)

In [None]:
# Delete, write and then read back the graph stored under GRAPH.
# NOTE(review): deleting first presumably makes this cell re-runnable —
# confirm that delete_graph tolerates a graph that does not exist yet.
graph_integrator.delete_graph(GRAPH)
graph_integrator.write_graph(GRAPH, kg)
# Copy of the graph as read back from the database.
kg_db = graph_integrator.read_graph(GRAPH)

In [None]:
# Initialise the metadata database before the create_*/read_* calls below.
# NOTE(review): presumably sets up the required schema — confirm what init_db
# creates and whether re-running it is safe.
metadata_integrator.init_db()

In [None]:
# Record this run (graph name, agent, start/end timestamps) as a KG version,
# then print every KG version returned by the metadata integrator.
metadata_integrator.create_kg_version(KnowledgeGraphVersion(kg_name=GRAPH, agent=agent, start_proc_ts=start_proc_ts, end_proc_ts=end_proc_ts))
for kg_version in metadata_integrator.read_kg_versions():
    print(kg_version.kg_name, kg_version.agent, kg_version.start_proc_ts, kg_version.end_proc_ts)
# Optional cleanup (uncomment to run):
# metadata_integrator.delete_kg_version(GRAPH)

In [None]:
# Store the log collected during KG construction under GRAPH.
metadata_integrator.create_logs(kg_name=GRAPH, logs=log)

# Print the log entries read back for GRAPH.
for entry in metadata_integrator.read_logs(kg_name= GRAPH):
    print(entry.ts, entry.logger_name, entry.log_level, entry.message)
# Optional cleanup (uncomment to run):
# metadata_integrator.delete_logs(GRAPH)

In [None]:
# Store the processed documents as data sources.
metadata_integrator.create_data_sources(documents=documents)

# Print the data sources read back from the metadata database.
for doc in metadata_integrator.read_data_sources():
    print(doc.name, doc.language, doc.authors, doc.doc_type)
# Optional cleanup (uncomment to run):
# metadata_integrator.delete_data_sources()

In [None]:
# Store the ontology returned by build_graph under GRAPH.
metadata_integrator.create_ontology(GRAPH, ont)

# Print the ontologies read back for GRAPH.
for ontology in metadata_integrator.read_ontologies(GRAPH):
    print(ontology.topics, ontology.relations, ontology.strict)
# Optional cleanup (uncomment to run):
# metadata_integrator.delete_ontologies(GRAPH)

In [None]:
# Store the chat and embedding model configurations (API keys were blanked
# when chat_conf / emd_conf were created) under GRAPH.
metadata_integrator.create_llm_config(kg_name=GRAPH, llm_config=chat_conf)
metadata_integrator.create_llm_config(kg_name=GRAPH, llm_config=emd_conf)

# Print the model configurations read back for GRAPH.
for config in metadata_integrator.read_llm_configs(GRAPH):
    print(config)
# Optional cleanup (uncomment to run):
# metadata_integrator.delete_llm_configs(GRAPH)

In [None]:
# Store the AutoCimKG configuration captured from the construction run under GRAPH.
metadata_integrator.create_autocimkg_config(GRAPH, conf)

# Print the AutoCimKG configurations read back for GRAPH.
for config in metadata_integrator.read_autocimkg_configs(GRAPH):
    print(config)
# Optional cleanup (uncomment to run):
# metadata_integrator.delete_autocimkg_configs(GRAPH)