<a href="https://colab.research.google.com/github/graphlit/graphlit-samples/blob/main/python/Notebook%20Examples/Graphlit_2024_09_16_Build_Medical_Knowledge_Graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Description**

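This example shows how to ingest medical-related documentation from an Azure Blob Storage container, extract entities with Anthropic Claude 3.5 Sonnet, and visualize the resulting knowledge graph with Pyvis.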

**Requirements**

Prior to running this notebook, you will need to [sign up](https://docs.graphlit.dev/getting-started/signup) for Graphlit, and [create a project](https://docs.graphlit.dev/getting-started/create-project).

You will need the Graphlit organization ID, preview environment ID and JWT secret from your created project.

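Assign these properties as Colab secrets: GRAPHLIT_ORGANIZATION_ID, GRAPHLIT_ENVIRONMENT_ID and GRAPHLIT_JWT_SECRET. The example also reads the Colab secrets AZURE_STORAGE_ACCESS_KEY and AZURE_STORAGE_ACCOUNT_NAME to connect to the Azure Blob Storage container it ingests from.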


---

Install Graphlit Python client SDK

In [None]:
!pip install --upgrade graphlit-client

Install Pyvis

In [None]:
!pip install --upgrade pyvis

Initialize Graphlit

In [None]:
import os
from google.colab import userdata
from graphlit import Graphlit
from graphlit_api import input_types, enums, exceptions

# Copy the Graphlit credentials from Colab secrets into environment variables.
# NOTE(review): presumably Graphlit() picks these variables up when it is
# constructed without arguments - confirm against the SDK documentation.
os.environ['GRAPHLIT_ORGANIZATION_ID'] = userdata.get('GRAPHLIT_ORGANIZATION_ID')
os.environ['GRAPHLIT_ENVIRONMENT_ID'] = userdata.get('GRAPHLIT_ENVIRONMENT_ID')
os.environ['GRAPHLIT_JWT_SECRET'] = userdata.get('GRAPHLIT_JWT_SECRET')

# Shared client instance used by every helper function in this notebook.
graphlit = Graphlit()

Define Graphlit helper functions

In [None]:
from typing import List, Optional

# Create specification for Anthropic Sonnet 3.5
async def create_anthropic_specification():
    """Create an extraction specification that uses Anthropic Claude 3.5 Sonnet.

    Returns:
        The ID of the created specification, or None when the Graphlit client
        is unavailable, the response contains no specification, or the request
        raises a GraphQLClientError (the error is printed).
    """
    if graphlit.client is None:
        return None

    # Deliberately not named `input`: the notebook later calls the builtin input().
    specification = input_types.SpecificationInput(
        name="Anthropic Claude Sonnet 3.5",
        type=enums.SpecificationTypes.EXTRACTION,
        serviceType=enums.ModelServiceTypes.ANTHROPIC,
        anthropic=input_types.AnthropicModelPropertiesInput(
            model=enums.AnthropicModels.CLAUDE_3_5_SONNET,
        )
    )

    try:
        response = await graphlit.client.create_specification(specification)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    created = response.create_specification
    return created.id if created is not None else None

# Create entity extraction and entity enrichment workflow using LLM specification
async def create_workflow(specification_id: str):
    """Create a workflow with a single model-text entity extraction job.

    Args:
        specification_id: ID of the LLM specification the extraction job
            should use.

    Returns:
        The ID of the created workflow, or None when the Graphlit client is
        unavailable, the response contains no workflow, or the request raises
        a GraphQLClientError (the error is printed).
    """
    if graphlit.client is None:
        return None

    extraction_job = input_types.ExtractionWorkflowJobInput(
        connector=input_types.EntityExtractionConnectorInput(
            type=enums.EntityExtractionServiceTypes.MODEL_TEXT,
            modelText=input_types.ModelTextExtractionPropertiesInput(
                specification=input_types.EntityReferenceInput(id=specification_id)
            )
        )
    )

    # Deliberately not named `input`: the notebook later calls the builtin input().
    workflow = input_types.WorkflowInput(
        name="Entity Extraction",
        extraction=input_types.ExtractionWorkflowStageInput(
            jobs=[extraction_job]
        )
    )

    try:
        response = await graphlit.client.create_workflow(workflow)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    created = response.create_workflow
    return created.id if created is not None else None

async def create_feed(read_limit: int, workflow_id: str, storage_access_key: str, account_name: str, container_name: str, prefix: str):
    """Create an Azure Blob site feed whose content is processed by a workflow.

    Args:
        read_limit: Value passed as the feed's `readLimit`.
        workflow_id: ID of the workflow to attach to the feed.
        storage_access_key: Azure storage access key.
        account_name: Azure storage account name.
        container_name: Azure blob container name.
        prefix: Blob prefix to read from within the container.

    Returns:
        The ID of the created feed, or None when the Graphlit client is
        unavailable, the response contains no feed, or the request raises a
        GraphQLClientError (the error is printed).
    """
    if graphlit.client is None:
        return None

    # Deliberately not named `input`: the notebook later calls the builtin input().
    feed = input_types.FeedInput(
        name="Azure Blob",
        type=enums.FeedTypes.SITE,
        site=input_types.SiteFeedPropertiesInput(
            type=enums.FeedServiceTypes.AZURE_BLOB,
            azureBlob=input_types.AzureBlobFeedPropertiesInput(
                storageAccessKey=storage_access_key,
                accountName=account_name,
                containerName=container_name,
                prefix=prefix,
            ),
            isRecursive=True, # NOTE: ingest folders recursively
            readLimit=read_limit
        ),
        workflow=input_types.EntityReferenceInput(
            id=workflow_id
        )
    )

    try:
        response = await graphlit.client.create_feed(feed)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    created = response.create_feed
    return created.id if created is not None else None

async def is_feed_done(feed_id: str):
    """Ask Graphlit whether the feed identified by *feed_id* is done.

    Returns:
        The `result` field of the response, or None when the Graphlit client
        is unavailable or the response carries no `is_feed_done` payload.
        Errors raised by the client are not caught here.
    """
    if graphlit.client is None:
        return None

    response = await graphlit.client.is_feed_done(feed_id)

    status = response.is_feed_done
    if status is None:
        return None

    return status.result

async def query_contents_graph(feed_id: str, search_text: Optional[str], medical_condition_id: Optional[str] = None):
    """Query the knowledge graph of the contents ingested by one feed.

    Args:
        feed_id: ID of the feed whose contents are queried.
        search_text: Text for the hybrid search, or None.
        medical_condition_id: When given, only contents with an observation of
            this medical condition are matched.

    Returns:
        The `graph` of the contents response, or None when the Graphlit client
        is unavailable, the response has no contents, or the request raises a
        GraphQLClientError (the error is printed).
    """
    if graphlit.client is None:
        return None

    # Only constrain by observation when a medical condition was selected.
    observation_filters = None
    if medical_condition_id is not None:
        condition_reference = input_types.EntityReferenceFilter(id=medical_condition_id)
        observation_filters = [
            input_types.ObservationReferenceFilter(
                type=enums.ObservableTypes.MEDICAL_CONDITION,
                observable=condition_reference
            )
        ]

    content_filter = input_types.ContentFilter(
        search=search_text,
        searchType=enums.SearchTypes.HYBRID,
        feeds=[input_types.EntityReferenceFilter(id=feed_id)],
        observations=observation_filters
    )

    graph_input = input_types.ContentGraphInput(
        # Uncomment to filter on observable types
        # types=[enums.ObservableTypes.MEDICAL_CONDITION,enums.ObservableTypes.ORGANIZATION]
    )

    try:
        response = await graphlit.client.query_contents_graph(
            filter=content_filter,
            graph=graph_input
        )
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    contents = response.contents
    return contents.graph if contents is not None else None

async def query_medical_conditions(search_text: str):
    """Search medical-condition entities with a hybrid search.

    Args:
        search_text: Text to search for.

    Returns:
        The `results` of the medical-conditions response (at most 10 are
        requested), or None when the Graphlit client is unavailable, the
        response has no payload, or the request raises a GraphQLClientError
        (the error is printed).
    """
    if graphlit.client is None:
        return None

    condition_filter = input_types.MedicalConditionFilter(
        search=search_text,
        searchType=enums.SearchTypes.HYBRID, # or, select KEYWORD for exact match
        limit=10 # return top 10, if we want to have drop-down list
    )

    try:
        response = await graphlit.client.query_medical_conditions(filter=condition_filter)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    conditions = response.medical_conditions
    return conditions.results if conditions is not None else None

async def delete_all_workflows():
    """Delete all workflows, requesting synchronous deletion.

    Does nothing when the Graphlit client is unavailable. Always returns None.
    """
    if graphlit.client is not None:
        await graphlit.client.delete_all_workflows(is_synchronous=True)

async def delete_all_contents():
    """Delete all contents, requesting synchronous deletion.

    Does nothing when the Graphlit client is unavailable. Always returns None.
    """
    if graphlit.client is not None:
        await graphlit.client.delete_all_contents(is_synchronous=True)

async def delete_all_feeds():
    """Delete all feeds, requesting synchronous deletion.

    Does nothing when the Graphlit client is unavailable. Always returns None.
    """
    if graphlit.client is not None:
        await graphlit.client.delete_all_feeds(is_synchronous=True)


Define Graphlit knowledge graph helper functions. Required for rendering the knowledge graph with Pyvis.

In [None]:
import random
import os
import json
from pyvis.network import Network
from typing import Optional

def select_emoji(entity_type):
    """Return the emoji associated with an observable type name.

    Args:
        entity_type: Observable type name such as "PERSON" or "MEDICAL_DRUG".

    Returns:
        The matching emoji, or the page-facing-up emoji for unknown types.
    """
    fallback = "📄"  # Page facing up, also used for generic content

    emoji_by_type = {
        "CONTENT": "📄",  # Page facing up
        "LABEL": "🏷️",   # Label
        "PERSON": "🧑",  # Person
        "ORGANIZATION": "🏢",  # Office building
        "PLACE": "🌍",  # Globe showing Europe-Africa
        "PRODUCT": "🛍️",  # Shopping bags
        "SOFTWARE": "💻",  # Laptop
        "REPO": "🗂️",  # Card index dividers
        "EVENT": "🎉",  # Party popper
        "MEDICAL_STUDY": "📊",  # Bar chart
        "MEDICAL_CONDITION": "🤒",  # Face with thermometer
        "MEDICAL_GUIDELINE": "📜",  # Scroll
        "MEDICAL_DRUG": "💊",  # Pill
        "MEDICAL_DRUG_CLASS": "🧬",  # DNA
        "MEDICAL_INDICATION": "🔍",  # Magnifying glass
        "MEDICAL_CONTRAINDICATION": "🚫",  # Prohibited
        "MEDICAL_TEST": "🧪",  # Test tube
        "MEDICAL_DEVICE": "🦾",  # Mechanical arm
        "MEDICAL_THERAPY": "🩺",  # Stethoscope
        "MEDICAL_PROCEDURE": "🔧",  # Wrench
    }

    if entity_type in emoji_by_type:
        return emoji_by_type[entity_type]

    return fallback

def lookup_node_shape(entity_type, content_type, file_type):
    """Pick the Pyvis node shape (a FontAwesome icon spec) for a graph node.

    The most specific classification wins: file type, then content type, then
    entity type.

    Args:
        entity_type: Entity type name, used only when both other arguments
            are None.
        content_type: Content type key such as "PAGE", or None.
        file_type: File type key such as "DOCUMENT", or None.

    Returns:
        A dict with a "shape" key and, for icons, an "icon" key. Unknown file
        or content types fall back to the generic file icon; unknown entity
        types fall back to {"shape": "dot"}.
    """
    content_color = "#aec7e8"  # shared colour of every content/file icon

    def fa_icon(code, color=content_color):
        # Build one Pyvis icon-shape spec for a FontAwesome glyph code.
        return {"shape": "icon", "icon": {"face": "FontAwesome", "code": code, "color": color}}

    # (glyph code, colour) per entity type
    entity_glyphs = {
        "CONTENT": ("\uf15c", "#aec7e8"),
        "LABEL": ("\uf02b", "#ffbb78"),
        "PERSON": ("\uf007", "#98df8a"),
        "ORGANIZATION": ("\uf1ad", "#ff9896"),
        "PLACE": ("\uf3c5", "#c5b0d5"),
        "PRODUCT": ("\uf1b2", "#c49c94"),
        "SOFTWARE": ("\uf085", "#f7b6d2"),
        "REPO": ("\uf1c0", "#c7c7c7"),
        "EVENT": ("\uf073", "#dbdb8d"),
        "MEDICAL_STUDY": ("\uf201", "#17becf"),
        "MEDICAL_CONDITION": ("\uf0f1", "#d62728"),
        "MEDICAL_GUIDELINE": ("\uf02d", "#9467bd"),
        "MEDICAL_DRUG": ("\uf484", "#bcbd22"),
        "MEDICAL_DRUG_CLASS": ("\uf471", "#1f77b4"),
        "MEDICAL_INDICATION": ("\uf0ae", "#2ca02c"),
        "MEDICAL_CONTRAINDICATION": ("\uf05e", "#ff7f0e"),
        "MEDICAL_TEST": ("\uf469", "#e377c2"),
        "MEDICAL_DEVICE": ("\uf6a8", "#7f7f7f"),
        "MEDICAL_THERAPY": ("\uf0f9", "#8c564b"),
        "MEDICAL_PROCEDURE": ("\uf0ad", "#17becf"),
    }

    # glyph code per content type
    content_glyphs = {
        "FILE": "\uf15c",
        "PAGE": "\uf0ac",
        "MESSAGE": "\uf075",
        "TEXT": "\uf15c",
        "POST": "\uf1ea",
        "EMAIL": "\uf0e0",
        "EVENT": "\uf073",
        "ISSUE": "\uf188",
    }

    # glyph code per file type
    file_glyphs = {
        "VIDEO": "\uf03d",
        "AUDIO": "\uf028",
        "IMAGE": "\uf03e",
        "DOCUMENT": "\uf15b",
        "EMAIL": "\uf0e0",
        "CODE": "\uf121",
        "DATA": "\uf1c0",
    }

    entity_icons = {name: fa_icon(code, color) for name, (code, color) in entity_glyphs.items()}
    content_icons = {name: fa_icon(code) for name, code in content_glyphs.items()}
    file_icons = {name: fa_icon(code) for name, code in file_glyphs.items()}

    generic_file_icon = content_icons.get("FILE")

    if file_type is not None:
        return file_icons.get(file_type, generic_file_icon)

    if content_type is not None:
        return content_icons.get(content_type, generic_file_icon)

    # Unknown entity types are drawn as a simple dot.
    return entity_icons.get(entity_type, {"shape": "dot"})

def lookup_node_color(entity_type):
    """Return the hex colour used to draw nodes of the given entity type.

    Args:
        entity_type: Entity type name such as "PERSON".

    Returns:
        A "#rrggbb" string; white ("#ffffff") for unknown entity types.
    """
    unknown_color = "#ffffff"  # White

    color_by_type = {
        "CONTENT": "#aec7e8",  # Soft blue
        "LABEL": "#ffbb78",   # Soft orange
        "PERSON": "#98df8a",  # Pale green
        "ORGANIZATION": "#ff9896",  # Soft red
        "PLACE": "#c5b0d5",  # Soft purple
        "PRODUCT": "#c49c94",  # Soft brown
        "SOFTWARE": "#f7b6d2",  # Light pink
        "REPO": "#c7c7c7",  # Light gray
        "EVENT": "#dbdb8d",  # Soft yellow
        "MEDICAL_STUDY": "#17becf",  # Cyan
        "MEDICAL_CONDITION": "#d62728",  # Strong red
        "MEDICAL_GUIDELINE": "#9467bd",  # Medium purple
        "MEDICAL_DRUG": "#bcbd22",  # Olive green
        "MEDICAL_DRUG_CLASS": "#1f77b4",  # Strong blue
        "MEDICAL_INDICATION": "#2ca02c",  # Vivid green
        "MEDICAL_CONTRAINDICATION": "#ff7f0e",  # Vivid orange
        "MEDICAL_TEST": "#e377c2",  # Soft magenta
        "MEDICAL_DEVICE": "#7f7f7f",  # Medium gray
        "MEDICAL_THERAPY": "#8c564b",  # Muted brown
        "MEDICAL_PROCEDURE": "#17becf",  # Cyan (same as MEDICAL_STUDY)
    }

    if entity_type in color_by_type:
        return color_by_type[entity_type]

    return unknown_color

def parse_metadata(metadata):
    """Extract the content type and file type from a node's JSON metadata.

    Args:
        metadata: JSON string describing the node, or None.

    Returns:
        A `(content_type, file_type)` tuple. Each element is the
        `enums.ContentTypes` / `enums.FileTypes` member named by the "type" /
        "fileType" key, or None when that key is absent. `(None, None)` is
        returned when *metadata* is None or does not decode to a JSON object.

    Raises:
        KeyError: If "type" or "fileType" names a member the enum lacks.
    """
    if metadata is None:
        return None, None

    o = json.loads(metadata)

    # JSON such as "null" decodes to a non-dict; treat it as "no metadata"
    # instead of failing on the key lookups below.
    if not isinstance(o, dict):
        return None, None

    content_type = enums.ContentTypes[o["type"]] if "type" in o else None
    file_type = enums.FileTypes[o["fileType"]] if "fileType" in o else None

    return content_type, file_type

def pretty_print_json(dictionary):
    """Render a dict as "key: value" lines, skipping keys that start with '@'."""
    visible_items = [(key, value) for key, value in dictionary.items() if not key.startswith('@')]

    return '\n'.join(f"{key}: {value}" for key, value in visible_items)

def parse_title(metadata):
    """Build the multi-line hover title for a node from its JSON metadata.

    Args:
        metadata: JSON string describing the node, or None.

    Returns:
        The metadata rendered as "key: value" lines, preceded by a "URI: ..."
        line when a non-null "uri" is present. None when *metadata* is None or
        decodes to JSON null.
    """
    if metadata is None:
        return None

    parsed = json.loads(metadata)
    if parsed is None:
        return None

    uri = None
    if "uri" in parsed:
        uri = parsed["uri"]

    details = pretty_print_json(parsed)

    if uri is None:
        return details

    return '\n'.join([f'URI: {uri}', details])

def parse_label(metadata):
    """Choose a display label for a content node from its JSON metadata.

    The title of the "document", "video" or "audio" section is used, checked
    in that order; the first section that has a "title" key decides. If that
    title is null or no section has one, "fileName" is used instead.

    Args:
        metadata: JSON string describing the node, or None.

    Returns:
        The label string, or None when neither a title nor a file name is
        available, or when *metadata* is None or not a JSON object.
    """
    if metadata is None:
        return None

    o = json.loads(metadata)

    # JSON such as "null" decodes to a non-dict; there is nothing to label.
    if not isinstance(o, dict):
        return None

    label = None

    for section_name in ("document", "video", "audio"):
        section = o.get(section_name)

        # Ignore sections that are not JSON objects instead of failing on them.
        if isinstance(section, dict) and "title" in section:
            label = section["title"]
            break

    return label if label is not None else o.get("fileName")

def format_relation(relation: str):
    """Turn an edge relation into a display label.

    Returns None for "observed-by" so those edges carry no label; every other
    relation has its hyphens replaced by spaces.
    """
    hidden_relation = "observed-by"

    if relation != hidden_relation:
        return " ".join(relation.split("-"))

    return None

def create_pyvis_contents_graph(graph):
    """Convert a Graphlit contents graph into a Pyvis network.

    Args:
        graph: Object with optional `nodes` (each having id, name, type and
            metadata) and optional `edges` (each having from_, to and
            relation).

    Returns:
        The populated network created by create_pyvis_network().
    """
    net = create_pyvis_network()

    if graph.nodes is not None:
        for node in graph.nodes:
            type_name = node.type.name
            heading = f'{type_name} [{node.id}]'

            content_type = file_type = label = None
            title = heading  # used as-is when the node has no metadata

            details = parse_title(node.metadata)

            if details is not None:
                title = heading + '\n' + details

                # Only content nodes carry content/file types and a label.
                if node.type == enums.EntityTypes.CONTENT:
                    content_type, file_type = parse_metadata(node.metadata)
                    label = parse_label(node.metadata)

            shape = lookup_node_shape(type_name, content_type, file_type)

            if shape is None:
                continue

            net.add_node(
                node.id,
                label=node.name if label is None else label,
                shape=shape["shape"],
                icon=shape.get("icon"),
                color=lookup_node_color(type_name),
                title=title,
            )

    if graph.edges is not None:
        for edge in graph.edges:
            # ensure start and end vertex exist in graph
            for endpoint in (edge.from_, edge.to):
                if endpoint not in net.node_ids:
                    net.add_node(endpoint)

            relation = format_relation(edge.relation)

            # "observed-by" edges are drawn thinner than every other relation.
            width = 1 if edge.relation == "observed-by" else 3

            net.add_edge(
                edge.from_,
                edge.to,
                label=relation,
                title=relation,
                width=width,
                arrowStrikethrough=False,
                arrows="middle",
            )

    return net

def create_pyvis_network():
    """Create the empty directed Pyvis network used to render the graph.

    The network is 900px high and full width, with resources in-lined.
    """
    settings = {
        "notebook": False,
        "directed": True,
        "cdn_resources": "in_line",
        "height": "900px",
        "width": "100%",
    }

    return Network(**settings)

Execute Graphlit example

In [None]:
from IPython.display import display, Markdown, HTML
import ipywidgets as widgets
import time

# Remove any existing feeds, contents and workflows; only needed for notebook example
await delete_all_workflows()
await delete_all_contents()
await delete_all_feeds()

print('Deleted all feeds, contents and workflows.')

read_limit = 1000 # how many files to ingest from feed

# Specify the Azure container name and blob prefix
container_name = "test"
prefix = "customers/intelinotion/2024-09-10/"

# Each step below only runs when the previous one returned an ID:
# specification -> workflow -> feed.
extraction_specification_id = await create_anthropic_specification()

if extraction_specification_id is not None:
    print(f'Created extraction specification [{extraction_specification_id}].')

    workflow_id = await create_workflow(extraction_specification_id)

    if workflow_id is not None:
        print(f'Created workflow [{workflow_id}].')

        # The Azure credentials are read from Colab secrets.
        feed_id = await create_feed(read_limit, workflow_id, userdata.get('AZURE_STORAGE_ACCESS_KEY'), userdata.get('AZURE_STORAGE_ACCOUNT_NAME'), container_name, prefix)

        if feed_id is not None:
            print(f'Created feed [{feed_id}].')

            # Wait for feed to complete, since ingestion happens asynchronously
            done = False
            time.sleep(5)
            # NOTE(review): is_feed_done() can return None, which this loop
            # treats as "not done", so polling continues until it returns a
            # truthy value.
            while not done:
                done = await is_feed_done(feed_id)

                if not done:
                    # Print a progress dot and poll again after two seconds.
                    print('.', end="")
                    time.sleep(2)

            print()
            print(f'Completed feed [{feed_id}].')

In [None]:
            # Prompt for the search text (this is the builtin input()).
            search_text = input("Enter name of medical condition, or any search text (or just hit <enter> for everything): ")

            # Use the first matching medical condition, if any, as the graph filter.
            medical_conditions = await query_medical_conditions(search_text)
            medical_condition = medical_conditions[0] if medical_conditions is not None and len(medical_conditions) > 0 else None
            medical_condition_id = medical_condition.id if medical_condition is not None else None

            if medical_condition is not None:
                print(f'Filtering by medical condition [{medical_condition.id}]: {medical_condition.name}')
            else:
                # No medical condition matched; the raw text is used for searching instead.
                print(f'Searching for [{search_text}]')

In [None]:
            # Query the resulting knowledge graph for the feed.
            # The search text is only passed when no medical condition was selected;
            # otherwise the query filters by the medical condition ID alone.
            graph = await query_contents_graph(feed_id, search_text if medical_condition_id is None else None, medical_condition_id)

            if graph is not None:
                g = create_pyvis_contents_graph(graph)

                # Configure the physics layout (forceAtlas2Based solver).
                g.set_options("""
                var options = {
                    "physics": {
                        "forceAtlas2Based": {
                            "gravitationalConstant": -50,
                            "centralGravity": 0.01,
                            "springLength": 100,
                            "springConstant": 0.08
                        },
                        "maxVelocity": 100,
                        "solver": "forceAtlas2Based",
                        "timestep": 0.25,
                        "stabilization": {
                            "iterations": 10
                        }
                    }
                    }
                """)

                # render with random file name
                graph_html = g.generate_html(f"graph_{random.randint(0, 1000)}.html")

                # Inject the FontAwesome kit <script> tag right after <head>,
                # presumably so the icon-shaped nodes can render their glyphs.
                font_awesome_link = '<script src="https://kit.fontawesome.com/2c74303849.js" crossorigin="anonymous"></script>'
                graph_html = graph_html.replace('<head>', f'<head>{font_awesome_link}')

                # Show the generated HTML inline in the notebook output.
                display(HTML(graph_html))