<a href="https://colab.research.google.com/github/graphlit/graphlit-samples/blob/main/python/Notebook%20Examples/Graphlit_2024_09_25_Connected_Data_Knowledge_Graph_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Description**

This example shows how to generate a knowledge graph from extracted entities, based on the [Connected Data Knowledge Graph Challenge](https://github.com/Connected-Data/cdkg-challenge) dataset.

**Requirements**

Prior to running this notebook, you will need to [sign up](https://docs.graphlit.dev/getting-started/signup) for Graphlit, and [create a project](https://docs.graphlit.dev/getting-started/create-project).

You will need the Graphlit organization ID, preview environment ID and JWT secret from your created project.

Assign these properties as Colab secrets: GRAPHLIT_ORGANIZATION_ID, GRAPHLIT_ENVIRONMENT_ID and GRAPHLIT_JWT_SECRET.


---

Install Graphlit Python client SDK

In [2]:
!pip install --upgrade graphlit-client



Install Pyvis

In [3]:
!pip install --upgrade pyvis



Clone the original repo

In [4]:
!rm -rf cdkg-challenge
!git clone https://github.com/Connected-Data/cdkg-challenge

Cloning into 'cdkg-challenge'...
remote: Enumerating objects: 105, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 105 (delta 18), reused 4 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (105/105), 1.11 MiB | 1.54 MiB/s, done.
Resolving deltas: 100% (18/18), done.


In [5]:
# Directory created by the `git clone` of the cdkg-challenge repository (relative to the working directory)
DATA_DIRECTORY = "cdkg-challenge"

# Folder inside the cloned repository that holds the transcripts; the trailing slash is part of the value
PATH_TRANSCRIPTS = DATA_DIRECTORY + "/Transcripts/"

Initialize Graphlit

In [6]:
import os
from google.colab import userdata
from graphlit import Graphlit
from graphlit_api import input_types, enums, exceptions

# Copy the Graphlit credentials from Colab secrets into environment variables
os.environ['GRAPHLIT_ORGANIZATION_ID'] = userdata.get('GRAPHLIT_ORGANIZATION_ID')
os.environ['GRAPHLIT_ENVIRONMENT_ID'] = userdata.get('GRAPHLIT_ENVIRONMENT_ID')
os.environ['GRAPHLIT_JWT_SECRET'] = userdata.get('GRAPHLIT_JWT_SECRET')

# Shared client wrapper used by every helper below
# NOTE(review): presumably picks up the environment variables set above — confirm against the SDK
graphlit = Graphlit()

Define Graphlit helper functions

In [44]:
import base64
import mimetypes
import asyncio
from typing import List, Optional
from tqdm.asyncio import tqdm

async def process_file(filename, workflow_id, progress_bar):
    """Ingest one file, report the outcome, and advance the progress bar.

    Args:
        filename: Path of the file to ingest.
        workflow_id: ID of the workflow to apply to the ingested content.
        progress_bar: Progress bar object; advanced by one step once the
            ingestion attempt has returned.
    """
    print(f'Ingesting content from [{filename}]')

    ingested_id = await ingest_file(filename, workflow_id)

    # A missing ID covers both skipped files and failed requests
    outcome = (
        f'Failed to ingest content from [{filename}].'
        if ingested_id is None
        else f'Ingested content [{ingested_id}] from [{filename}].'
    )
    print(outcome)

    progress_bar.update(1)

# NOTE: for local files, load from disk and convert to Base64 data
async def ingest_file(file_path: str, workflow_id: str):
    """Ingest a local file into Graphlit as Base64-encoded content.

    The MIME type is guessed from the file name; files whose type cannot be
    inferred are treated as plain text, and CSV files are skipped.

    Args:
        file_path: Path of the local file to ingest.
        workflow_id: ID of the workflow to apply to the ingested content.

    Returns:
        The ID of the ingested content, or None when the Graphlit client is
        not initialized, the file is skipped or missing, the response carries
        no content, or the request fails with a GraphQL client error (which
        is printed).
    """
    if graphlit.client is None:
        return None

    try:
        file_name = os.path.basename(file_path)
        content_name, _ = os.path.splitext(file_name)

        mime_type = mimetypes.guess_type(file_name)[0]

        if mime_type is None:
            print(f'Failed to infer MIME type [{file_name}], treat as plain text')
            mime_type = "text/plain"

        if mime_type == "text/csv":
            print(f'Skipping MIME type [{mime_type}]')
            return None

        if not os.path.exists(file_path):
            print(f'File [{file_path}] does not exist')
            return None

        # Read and encode the file, then close it BEFORE awaiting the request;
        # otherwise every parallel task keeps its file handle open for the
        # full duration of the ingestion call.
        with open(file_path, "rb") as file:
            base64_content = base64.b64encode(file.read()).decode('utf-8')

        # Using synchronous mode, so the notebook waits for the content to be ingested
        response = await graphlit.client.ingest_encoded_file(
            name=content_name,
            data=base64_content,
            mime_type=mime_type,
            workflow=input_types.EntityReferenceInput(id=workflow_id),
            is_synchronous=True,
        )

        return response.ingest_encoded_file.id if response.ingest_encoded_file is not None else None
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

# Create specification for Anthropic Sonnet 3.5
async def create_anthropic_specification():
    """Create an extraction specification backed by Anthropic Claude 3.5 Sonnet.

    Returns:
        The ID of the created specification, or None when the Graphlit client
        is not initialized, the response carries no specification, or the
        request fails with a GraphQL client error (which is printed).
    """
    if graphlit.client is None:
        return None

    # Named `specification` rather than `input` so the builtin is not shadowed
    specification = input_types.SpecificationInput(
        name="Anthropic Claude Sonnet 3.5",
        type=enums.SpecificationTypes.EXTRACTION,
        serviceType=enums.ModelServiceTypes.ANTHROPIC,
        anthropic=input_types.AnthropicModelPropertiesInput(
            model=enums.AnthropicModels.CLAUDE_3_5_SONNET,
        )
    )

    try:
        response = await graphlit.client.create_specification(specification)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    return response.create_specification.id if response.create_specification is not None else None

# Create specification for OpenAI GPT-4o
async def create_openai_specification():
    """Create an extraction specification backed by OpenAI GPT-4o.

    Returns:
        The ID of the created specification, or None when the Graphlit client
        is not initialized, the response carries no specification, or the
        request fails with a GraphQL client error (which is printed).
    """
    if graphlit.client is None:
        return None

    # Named `specification` rather than `input` so the builtin is not shadowed
    specification = input_types.SpecificationInput(
        name="OpenAI GPT-4o",
        type=enums.SpecificationTypes.EXTRACTION,
        serviceType=enums.ModelServiceTypes.OPEN_AI,
        openAI=input_types.OpenAIModelPropertiesInput(
            model=enums.OpenAIModels.GPT4O_128K,
        )
    )

    try:
        response = await graphlit.client.create_specification(specification)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    return response.create_specification.id if response.create_specification is not None else None

# Create entity extraction workflow using LLM specification
async def create_workflow(specification_id: str):
    """Create an entity-extraction workflow that uses the given LLM specification.

    The workflow has a single extraction job whose connector is the
    model-text extraction service, pointed at `specification_id`.

    Args:
        specification_id: ID of the specification the extraction job should use.

    Returns:
        The ID of the created workflow, or None when the Graphlit client is
        not initialized, the response carries no workflow, or the request
        fails with a GraphQL client error (which is printed).
    """
    if graphlit.client is None:
        return None

    # Named `workflow` rather than `input` so the builtin is not shadowed
    workflow = input_types.WorkflowInput(
        name="Entity Extraction",
        extraction=input_types.ExtractionWorkflowStageInput(
            jobs=[
                input_types.ExtractionWorkflowJobInput(
                    connector=input_types.EntityExtractionConnectorInput(
                        type=enums.EntityExtractionServiceTypes.MODEL_TEXT,
                        modelText=input_types.ModelTextExtractionPropertiesInput(
                            specification=input_types.EntityReferenceInput(id=specification_id)
                        )
                    )
                )
            ]
        )
    )

    try:
        response = await graphlit.client.create_workflow(workflow)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    return response.create_workflow.id if response.create_workflow is not None else None

async def query_contents_graph(types: Optional[list] = None):
    """Query the knowledge graph built from the ingested contents.

    Args:
        types: Optional list of `enums.ObservableTypes` members used to
            restrict the graph to those node types, e.g.
            `[enums.ObservableTypes.SOFTWARE, enums.ObservableTypes.ORGANIZATION]`.
            When omitted or empty, no type filter is sent.

    Returns:
        The graph from the response, or None when the Graphlit client is not
        initialized, the response carries no contents, or the request fails
        with a GraphQL client error (which is printed).
    """
    if graphlit.client is None:
        return None

    # Only set the filter when one was requested, so the default call sends
    # the same empty graph input as before.
    if types:
        graph_input = input_types.ContentGraphInput(types=types)
    else:
        graph_input = input_types.ContentGraphInput()

    try:
        response = await graphlit.client.query_contents_graph(graph=graph_input)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    return response.contents.graph if response.contents is not None else None

async def query_contents_facets():
    """Query the observable facets of the ingested contents.

    The content filter uses offset 0 and limit 0, and a single facet of type
    OBSERVABLE is requested.

    Returns:
        The facets from the response, or None when the Graphlit client is not
        initialized, the response carries no contents, or the request fails
        with a GraphQL client error (which is printed).
    """
    if graphlit.client is None:
        return None

    content_filter = input_types.ContentFilter(
        offset=0,
        limit=0,
    )
    observable_facet = input_types.ContentFacetInput(
        facet=enums.ContentFacetTypes.OBSERVABLE
    )

    try:
        response = await graphlit.client.query_contents_facets(
            filter=content_filter,
            facets=[observable_facet]
        )
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None
    else:
        if response.contents is None:
            return None

        return response.contents.facets

# NOTE: these functions are just used to clean-up old data before executing the example
async def delete_all_specifications():
    """Delete all specifications, requesting synchronous completion.

    Does nothing when the Graphlit client is not initialized.
    """
    if graphlit.client is not None:
        await graphlit.client.delete_all_specifications(is_synchronous=True)

async def delete_all_contents():
    """Delete all contents, requesting synchronous completion.

    Does nothing when the Graphlit client is not initialized.
    """
    if graphlit.client is not None:
        await graphlit.client.delete_all_contents(is_synchronous=True)

async def delete_all_workflows():
    """Delete all workflows, requesting synchronous completion.

    Does nothing when the Graphlit client is not initialized.
    """
    if graphlit.client is not None:
        await graphlit.client.delete_all_workflows(is_synchronous=True)


Define Graphlit knowledge graph helper functions. Required for rendering the knowledge graph with Pyvis.

In [38]:
import random
import os
import json
from pyvis.network import Network
from typing import Optional

def select_emoji(entity_type):
    """Return the emoji used to represent an observable entity type.

    Args:
        entity_type: Name of the entity type, e.g. "PERSON".

    Returns:
        The mapped emoji, or the page-facing-up emoji for unknown types.
    """
    fallback_emoji = "📄"

    emoji_by_type = {
        # General observables
        "CONTENT": "📄",
        "LABEL": "🏷️",
        "PERSON": "🧑",
        "ORGANIZATION": "🏢",
        "PLACE": "🌍",
        "PRODUCT": "🛍️",
        "SOFTWARE": "💻",
        "REPO": "🗂️",
        "EVENT": "🎉",
        # Medical observables
        "MEDICAL_STUDY": "📊",
        "MEDICAL_CONDITION": "🤒",
        "MEDICAL_GUIDELINE": "📜",
        "MEDICAL_DRUG": "💊",
        "MEDICAL_DRUG_CLASS": "🧬",
        "MEDICAL_INDICATION": "🔍",
        "MEDICAL_CONTRAINDICATION": "🚫",
        "MEDICAL_TEST": "🧪",
        "MEDICAL_DEVICE": "🦾",
        "MEDICAL_THERAPY": "🩺",
        "MEDICAL_PROCEDURE": "🔧",
    }

    if entity_type in emoji_by_type:
        return emoji_by_type[entity_type]

    return fallback_emoji

def lookup_node_shape(entity_type, content_type, file_type):
    """Pick the Pyvis node shape (and FontAwesome icon) for a graph node.

    The most specific classification wins: a file type takes precedence over
    a content type, which takes precedence over the entity type. Unknown file
    or content types fall back to the generic "FILE" icon; unknown entity
    types fall back to a plain dot.

    Args:
        entity_type: Name of the entity type, e.g. "PERSON".
        content_type: Content type key (e.g. "PAGE"), or None.
        file_type: File type key (e.g. "VIDEO"), or None.

    Returns:
        A dictionary with a "shape" key and, for icon shapes, an "icon" key
        holding the font face, glyph code and color.
    """
    # All content and file icons share this color
    content_color = "#aec7e8"

    def icon(code, color):
        # Describe one icon-shaped node
        return {"shape": "icon", "icon": {"face": "FontAwesome", "code": code, "color": color}}

    entity_icons = {
        "CONTENT": icon("\uf15c", "#aec7e8"),  # file-text
        "LABEL": icon("\uf02b", "#ffbb78"),  # tag
        "PERSON": icon("\uf007", "#98df8a"),  # user
        "ORGANIZATION": icon("\uf1ad", "#ff9896"),  # building
        "PLACE": icon("\uf3c5", "#c5b0d5"),
        "PRODUCT": icon("\uf1b2", "#c49c94"),  # cube
        "SOFTWARE": icon("\uf085", "#f7b6d2"),
        "REPO": icon("\uf1c0", "#c7c7c7"),  # database
        "EVENT": icon("\uf073", "#dbdb8d"),  # calendar
        "MEDICAL_STUDY": icon("\uf201", "#17becf"),
        "MEDICAL_CONDITION": icon("\uf0f1", "#d62728"),
        "MEDICAL_GUIDELINE": icon("\uf02d", "#9467bd"),  # book
        "MEDICAL_DRUG": icon("\uf484", "#bcbd22"),
        "MEDICAL_DRUG_CLASS": icon("\uf471", "#1f77b4"),
        "MEDICAL_INDICATION": icon("\uf0ae", "#2ca02c"),
        "MEDICAL_CONTRAINDICATION": icon("\uf05e", "#ff7f0e"),  # ban
        "MEDICAL_TEST": icon("\uf469", "#e377c2"),
        "MEDICAL_DEVICE": icon("\uf6a8", "#7f7f7f"),
        "MEDICAL_THERAPY": icon("\uf0f9", "#8c564b"),
        "MEDICAL_PROCEDURE": icon("\uf0ad", "#17becf"),  # wrench
    }

    content_icons = {
        "FILE": icon("\uf15c", content_color),  # file-text
        "PAGE": icon("\uf0ac", content_color),  # globe
        "MESSAGE": icon("\uf075", content_color),  # comment
        "TEXT": icon("\uf15c", content_color),  # file-text
        "POST": icon("\uf1ea", content_color),  # newspaper
        "EMAIL": icon("\uf0e0", content_color),  # envelope
        "EVENT": icon("\uf073", content_color),  # calendar
        "ISSUE": icon("\uf188", content_color),  # bug
    }

    file_icons = {
        "VIDEO": icon("\uf03d", content_color),  # video-camera
        "AUDIO": icon("\uf028", content_color),  # volume-up
        "IMAGE": icon("\uf03e", content_color),  # picture
        "DOCUMENT": icon("\uf15b", content_color),
        "EMAIL": icon("\uf0e0", content_color),  # envelope
        "CODE": icon("\uf121", content_color),  # code
        "DATA": icon("\uf1c0", content_color),  # database
    }

    # Shared fallback for unrecognized file and content types
    generic_file_icon = content_icons["FILE"]

    if file_type is not None:
        return file_icons.get(file_type, generic_file_icon)

    if content_type is not None:
        return content_icons.get(content_type, generic_file_icon)

    # Unknown entity types are drawn as a simple dot
    return entity_icons.get(entity_type, {"shape": "dot"})

def lookup_node_color(entity_type):
    """Return the hex color used to draw nodes of the given entity type.

    Args:
        entity_type: Name of the entity type, e.g. "PERSON".

    Returns:
        The mapped hex color string, or white ("#ffffff") for unknown types.
    """
    unknown_color = "#ffffff"

    color_by_type = {
        # General observables (soft palette)
        "CONTENT": "#aec7e8",
        "LABEL": "#ffbb78",
        "PERSON": "#98df8a",
        "ORGANIZATION": "#ff9896",
        "PLACE": "#c5b0d5",
        "PRODUCT": "#c49c94",
        "SOFTWARE": "#f7b6d2",
        "REPO": "#c7c7c7",
        "EVENT": "#dbdb8d",
        # Medical observables (stronger palette)
        "MEDICAL_STUDY": "#17becf",
        "MEDICAL_CONDITION": "#d62728",
        "MEDICAL_GUIDELINE": "#9467bd",
        "MEDICAL_DRUG": "#bcbd22",
        "MEDICAL_DRUG_CLASS": "#1f77b4",
        "MEDICAL_INDICATION": "#2ca02c",
        "MEDICAL_CONTRAINDICATION": "#ff7f0e",
        "MEDICAL_TEST": "#e377c2",
        "MEDICAL_DEVICE": "#7f7f7f",
        "MEDICAL_THERAPY": "#8c564b",
        "MEDICAL_PROCEDURE": "#17becf",  # intentionally shares its color with MEDICAL_STUDY
    }

    if entity_type in color_by_type:
        return color_by_type[entity_type]

    return unknown_color

def parse_metadata(metadata):
    """Extract the content type and file type from a node's JSON metadata.

    Args:
        metadata: JSON string, or None.

    Returns:
        A `(content_type, file_type)` tuple. Each element is the enum member
        looked up by name from the "type" / "fileType" key, or None when that
        key is absent. `(None, None)` when `metadata` is None.
    """
    if metadata is None:
        return None, None

    parsed = json.loads(metadata)

    content_type = None
    if "type" in parsed:
        content_type = enums.ContentTypes[parsed["type"]]

    file_type = None
    if "fileType" in parsed:
        file_type = enums.FileTypes[parsed["fileType"]]

    return content_type, file_type

def pretty_print_json(dictionary):
    """Render a dictionary as newline-separated "key: value" lines.

    Keys that start with '@' are left out.
    """
    rendered_lines = []

    for key, value in dictionary.items():
        if key.startswith('@'):
            continue

        rendered_lines.append(f"{key}: {value}")

    return '\n'.join(rendered_lines)

def parse_title(metadata):
    """Build the hover title for a node from its JSON metadata.

    Args:
        metadata: JSON string, or None.

    Returns:
        The metadata rendered by `pretty_print_json`, prefixed with a
        "URI: ..." line when the metadata has a non-null "uri" value.
        None when `metadata` is None or decodes to JSON null.
    """
    if metadata is None:
        return None

    parsed = json.loads(metadata)

    if parsed is None:
        return None

    uri = None
    if "uri" in parsed:
        uri = parsed["uri"]

    body = pretty_print_json(parsed)

    if uri is None:
        return body

    return f'URI: {uri}' + '\n' + body

def parse_label(metadata):
    """Choose a display label for a content node from its JSON metadata.

    The first of the "document", "video" and "audio" sections (in that order)
    that has a "title" key supplies the label. If no section has one, or the
    chosen title is null, the "fileName" value is used instead.

    Args:
        metadata: JSON string, or None.

    Returns:
        The label string, or None when `metadata` is None, does not decode to
        a JSON object, or provides neither a title nor a file name.
    """
    if metadata is None:
        return None

    parsed = json.loads(metadata)

    # JSON `null` (or any other non-object payload) carries no label
    # information; return None instead of failing on the key lookups below.
    if not isinstance(parsed, dict):
        return None

    file_name = parsed.get("fileName")

    label = None

    for section_key in ("document", "video", "audio"):
        section = parsed.get(section_key)

        # Require a JSON object: a plain string such as "untitled" would
        # otherwise pass a substring check for "title" and then fail on lookup.
        if isinstance(section, dict) and "title" in section:
            label = section["title"]
            break

    return label if label is not None else file_name

def format_relation(relation: str):
    """Turn an edge relation into a display label.

    The "observed-by" relation gets no label (None); any other relation is
    returned with its hyphens replaced by spaces.
    """
    is_unlabeled = relation == "observed-by"

    return None if is_unlabeled else relation.replace("-", " ")

def create_pyvis_graph(graph):
    """Convert a Graphlit contents graph into a Pyvis network.

    Every node gets a hover title of the form "TYPE [id]", followed by its
    rendered metadata when the metadata can be parsed. Content nodes also get
    an icon chosen from their content/file type and a label taken from their
    metadata; all other nodes are labelled with their name.

    Args:
        graph: Object with optional `nodes` and `edges` collections, as
            returned by `query_contents_graph`.

    Returns:
        The populated Pyvis network.
    """
    network = create_pyvis_network()

    if graph.nodes is not None:
        for node in graph.nodes:
            header = f'{node.type.name} [{node.id}]'
            details = parse_title(node.metadata)

            content_type = file_type = label = None
            title = header

            if details is not None:
                title = header + '\n' + details

                # Only content nodes carry content/file types and a metadata label
                if node.type == enums.EntityTypes.CONTENT:
                    content_type, file_type = parse_metadata(node.metadata)
                    label = parse_label(node.metadata)

            shape = lookup_node_shape(node.type.name, content_type, file_type)

            if shape is None:
                continue

            network.add_node(
                node.id,
                label=node.name if label is None else label,
                shape=shape["shape"],
                icon=shape.get("icon"),
                color=lookup_node_color(node.type.name),
                title=title,
            )

    if graph.edges is not None:
        for edge in graph.edges:
            # ensure start and end vertex exist in graph
            for endpoint in (edge.from_, edge.to):
                if endpoint not in network.node_ids:
                    network.add_node(endpoint)

            relation = format_relation(edge.relation)

            # "observed-by" edges are drawn thinner than every other relation
            width = 1 if edge.relation == "observed-by" else 3

            network.add_edge(edge.from_, edge.to, label=relation, title=relation, width=width, arrowStrikethrough=False, arrows="middle")

    return network

def create_pyvis_network():
    """Create the empty, directed Pyvis network used to render the graph.

    The network is configured for non-notebook rendering with inlined
    resources, at 900px height and full width.
    """
    network_settings = {
        "notebook": False,
        "directed": True,
        "cdn_resources": "in_line",
        "height": "900px",
        "width": "100%",
    }

    return Network(**network_settings)

Execute Graphlit example

In [45]:
# Remove any existing specifications, workflows and contents; only needed for notebook example
await delete_all_workflows()
await delete_all_specifications()
await delete_all_contents()

print('Deleted all specifications, workflows and contents.')

# NOTE: depending on LLM rate limits, specify if ingestion/extraction should run in parallel or not
run_parallel = True

# NOTE: select the LLM to use for extraction
specification_id = await create_openai_specification()
#specification_id = await create_anthropic_specification()

if specification_id is not None:
    print(f'Created specification [{specification_id}].')

    workflow_id = await create_workflow(specification_id)

    if workflow_id is not None:
        print(f'Created workflow [{workflow_id}].')

        # Get all files in the directory and subdirectories
        files = [os.path.join(root, file) for root, _, files in os.walk(PATH_TRANSCRIPTS) for file in files]

        progress_bar = tqdm(total=len(files))

        if run_parallel:
            tasks = [process_file(filename, workflow_id, progress_bar) for filename in files]

            await asyncio.gather(*tasks)
        else:
            # NOTE: not running parallel, to keep within LLM rate limits
            for filename in files:
                await process_file(filename, workflow_id, progress_bar)

        print('Ingested all contents.')

Deleted all specifications, workflows and contents.
Created specification [20162969-c9d1-4892-9b7e-3edfa47b30da].
Created workflow [b7bc8378-02ea-40f6-91e4-dad3ff2e2efa].



100%|██████████| 55/55 [28:35<00:00, 31.20s/it]

Ingesting content from [cdkg-challenge/Transcripts/Connected Data Knowledge Graph Challenge - Transcript Metadata.csv]
Skipping MIME type [text/csv]
Failed to ingest content from [cdkg-challenge/Transcripts/Connected Data Knowledge Graph Challenge - Transcript Metadata.csv].
Ingesting content from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Omar Khan and David Newman - The Enterprise Knowledge Graph _ CDW21 Presentation.srt]
Ingesting content from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Dave Duggal - One graph to bind them all! Linking data & apps for event-driven interoperability.srt]
Ingesting content from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Jörg Schad - Graph Analytics vs Graph Machine Learning _ CDW21 Presentation.srt]
Ingesting content from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Hybridization of Machine Learning and Operational Research is the future of AI _ Nikolaj van



  4%|▎         | 2/55 [00:37<16:28, 18.65s/it][A

Ingested content [eafebd0f-3bcb-4e93-b5fe-347e9a425bb7] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Legal Knowledge Graphs _ Vaishali Raghvani _ Connected Data World.srt].



  5%|▌         | 3/55 [00:41<10:53, 12.57s/it][A

Ingested content [a37a18af-1280-4a08-ab97-bb780bc670e7] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Omar Khan and David Newman - The Enterprise Knowledge Graph _ CDW21 Presentation.srt].



  7%|▋         | 4/55 [00:41<06:51,  8.08s/it][A

Ingested content [93ebdaec-14fc-4a7f-900f-8d991ae4b5c7] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Dave Duggal - One graph to bind them all! Linking data & apps for event-driven interoperability.srt].



  9%|▉         | 5/55 [00:42<04:29,  5.39s/it][A

Ingested content [7922c071-83f6-405c-80ba-5cf1cfb6a8cc] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Knowledge Graphs_ Moving Beyond RDF _ Kurt Cagle _ Connected Data World 2021 Presentation.srt].



 11%|█         | 6/55 [00:42<03:03,  3.74s/it][A

Ingested content [414b693d-515e-4a70-ac2a-fcbe17a49851] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Mike Dillinger - Taxonomies_ Connecting Data with Duct Tape _ CDW21 Presentation.srt].



 13%|█▎        | 7/55 [00:43<02:15,  2.82s/it][A

Ingested content [790e9279-557f-48a1-b171-9934cac5f56d] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/The Business Case for Data Management _ Mike Atkin _ Connected Data World 2021.srt].



 15%|█▍        | 8/55 [00:47<02:30,  3.20s/it][A

Ingested content [5962cb77-40cb-445e-a6fd-3d5f7aa06d82] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Implementing Informed Consent with Knowledge Graphs _ Anelia Kurteva _ Connected Data World 2021.srt].



 16%|█▋        | 9/55 [00:52<02:58,  3.88s/it][A

Ingested content [76b2a45f-12b1-4c63-a8f7-8d511da0ddef] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/David Amzallag & Szymon Klarman - Knowledge Mesh_ From Data Silos to Data Fabric at Global 2000 EPs.srt].



 18%|█▊        | 10/55 [00:57<03:05,  4.13s/it][A

Ingested content [0b5316e7-4988-4a6a-98f5-f59aa1f8676a] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Data Observability_ How to Eliminate Data Downtime and Start Trusting Your Data _ Barr Moses.srt].



 20%|██        | 11/55 [00:58<02:12,  3.02s/it][A

Ingested content [8c99f91e-a200-45f8-8acb-05f2c461de76] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Humans and the Graph - Inspiring and empowering GraphQL adoption _ Dan Boerner _ Connected Data Worl.srt].



 22%|██▏       | 12/55 [01:03<02:41,  3.75s/it][A

Ingested content [fcd683fb-f217-48e8-85a4-0b1e7390c496] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/RDF Leveled the Advantages of LPG and Keeps 3 Key Benefits_ Standards, Semantics & Interoperability.srt].



 24%|██▎       | 13/55 [01:07<02:42,  3.88s/it][A

Ingested content [b8bb9931-32d0-4407-bae0-fd18a86dcdc7] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Jörg Schad - Graph Analytics vs Graph Machine Learning _ CDW21 Presentation.srt].
Ingested content [2d2e855f-4f85-4ec4-8003-29c6c2c543c9] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Hybridization of Machine Learning and Operational Research is the future of AI _ Nikolaj van Omme.srt].



 27%|██▋       | 15/55 [01:08<01:26,  2.15s/it][A

Ingested content [1a0602d6-45c4-484e-94dc-8bf8dbc64ac8] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Introducing D3FEND_ A Knowledge Graph of Cybersecurity Countermeasures _ Peter Kaloroumakis _ CDW21.srt].
Ingested content [24c615c3-7ef4-4916-bee3-32f4083bb02f] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Natasa+recording.srt].



 31%|███       | 17/55 [01:12<01:26,  2.27s/it][A

Ingested content [dbf00aec-8dae-43b5-846c-7b8e94ea1b20] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Peter Crocker - Reasoning for the Answer_ Who is the Greatest Formula 1 Driver of all Time_ _ CDW21.srt].



 33%|███▎      | 18/55 [01:13<01:07,  1.81s/it][A

Ingested content [c55777fa-9964-4ae1-bb37-087d7832144e] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/How a Knowledge Graph Can Support Advanced Price Analytics in Supply Chain Management _ Marcus Nölke.srt].



 35%|███▍      | 19/55 [01:13<00:53,  1.47s/it][A

Ingested content [4b9a58f9-4dc6-4731-9ec7-6056280ca316] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/The game plan for your Knowledge Graph-driven FAIR data platform.srt].



 36%|███▋      | 20/55 [01:13<00:41,  1.18s/it][A

Ingested content [4f7e07c0-0dac-4692-99bc-5425453716c2] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Supporting Ontology and Knowledge Graph Development on the Web.srt].



 38%|███▊      | 21/55 [01:14<00:31,  1.09it/s][A

Ingested content [8cf48c48-7c87-4090-b622-6a546797fc57] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Systems that learn and reason _ Frank van Harmelen _ Connected Data World 2021.srt].



 40%|████      | 22/55 [01:18<00:58,  1.77s/it][A

Ingested content [1b72f50f-2ed9-4e4b-a1a5-56b339ef8a15] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Andreea Deac - Neural Algorithmic Reasoning_ Combining Classical Algorithms and Neural Networks.srt].
Ingested content [e3568ffb-226e-46e9-8196-437dcc170a18] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Graph algorithms over data lakes and warehouses.srt].



 44%|████▎     | 24/55 [01:18<00:34,  1.12s/it][A

Ingested content [716d7177-d5ab-4a8c-8c70-3aa3d1979137] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Sales AI - Building and maintaining a knowledge graph.srt].
Ingested content [60959fe8-66c1-4cf3-894b-aa09301d22fd] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Modelling regulation requirements using SHACL.srt].



 47%|████▋     | 26/55 [01:18<00:21,  1.37it/s][A

Ingested content [c0fb2303-2766-46fb-8724-31d491c0f869] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Gadi Singer - Thrill-K_ Rethinking knowledge layering and construction for higher machine cognition.srt].



 49%|████▉     | 27/55 [01:19<00:17,  1.58it/s][A

Ingested content [1065d724-9b01-42bf-93f3-13887a38d6d9] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Unprecedented Products in Commercial Real Estate with Automated Generation of Large Knowledge Graphs.srt].



 51%|█████     | 28/55 [01:22<00:36,  1.34s/it][A

Ingested content [cb72921b-b678-46a9-8521-53aaffe37b21] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Ora Lassila - Graph Abstractions Matter _ CDW21 Presentations.srt].



 53%|█████▎    | 29/55 [01:23<00:31,  1.22s/it][A

Ingested content [be65dd09-fc78-4040-8538-ea9272fabb10] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Victor_Lee_graph_alg_and_graph_ML_making_sense_Recording_1920x1050.srt].



 55%|█████▍    | 30/55 [01:28<00:55,  2.22s/it][A

Ingested content [61be5f5b-d30d-4e70-b44c-c17d02a1f456] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Keshav Pingali - Knowledge Graphs, Graph AI, & the Need for High-performance Graph Computing _ CDW21.srt].
Ingested content [9ffcd08b-c385-4873-8733-5ef69ad70b27] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Applying Semantic Web Standards for Knowledge Representation at Elsevier _ Veronique Moore.srt].
Ingested content [f4cf712d-d76e-41ef-8e59-3293311895cd] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Big Graphs & Machine Learning & Rich Interactions the recipe for the future of search.srt].



 60%|██████    | 33/55 [01:33<00:42,  1.94s/it][A
 65%|██████▌   | 36/55 [01:33<00:21,  1.13s/it][A

Ingested content [5d34e248-14b4-4f7b-92b4-894ed3a3d544] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/GraphEDM A Unified Framework for Machine Learning on Graphs.srt].
Ingested content [69d655a6-796d-4fbb-9d8b-e16baed3d513] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Knowledge Graphs as Hub for Data, Metadata and Content. And the Partners Needed to Deliver the Multitude of Applications.srt].
Ingested content [05b4916d-9db3-42ea-874e-493ef69e5115] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/The Road to AI requires a Semantic Data Fabric.srt].
Ingested content [8602aae7-5171-4071-84e5-220e3b9b3d6f] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Leveraging Graphcore’s IPU architecture for large scale GNN compute _ Carlo Luschi _ Connected Data.srt].



 67%|██████▋   | 37/55 [01:38<00:31,  1.76s/it][A

Ingested content [b830e24e-a712-405e-8c73-3feb192a54f8] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Deep Learning on Graphs_ Past, Present, And Future.srt].



 69%|██████▉   | 38/55 [01:39<00:24,  1.47s/it][A
 71%|███████   | 39/55 [01:39<00:18,  1.18s/it][A

Ingested content [59a40ba2-67f4-4637-b719-4033ff160996] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Van Gogh Worldwide -- constructing and searching a knowledge graph of linked art.srt].
Ingested content [2ff8b069-0514-48e4-ac41-e13e85ca952b] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/(DataCatalog)_-[poweredBy]-_(KnowledgeGraph).srt].



 73%|███████▎  | 40/55 [01:43<00:30,  2.02s/it][A

Ingested content [57dfdd48-fe72-454c-9021-d61f4d63027b] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Knowledge Graphs The Frontier.srt].
Ingested content [5d7cb7c6-0360-4489-91dc-27a7d2c88e9c] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Graph stories How stories and metaphors can help you promote Enterprise Knowledge Graphs (1).srt].



 76%|███████▋  | 42/55 [01:48<00:29,  2.23s/it][A

Ingested content [e229ce71-bc74-494f-a202-5b79a9880405] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/From discovering data to trusting data.srt].



 78%|███████▊  | 43/55 [01:53<00:33,  2.83s/it][A

Ingested content [13a8f0dd-f314-4001-8e5a-0c6dd805a75c] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Protecting vital public health programs with AI and Knowledge Graphs.srt].



 80%|████████  | 44/55 [01:54<00:24,  2.19s/it][A


Ingested content [6eae7d67-302f-4dbc-9874-242266a765a7] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Powering the Connected Enterprise in a Hybrid, Multicloud World.srt].
Ingested content [bd150d3f-95e6-4196-9315-c66dac5a1c02] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/KG Enterprise Adoption Roundtable Summary.srt].
Ingested content [e6e2252c-c302-4481-885e-5046c41aaf49] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Data-Centric Security.srt].
Ingested content [899c1e51-ce6b-4d83-9d6f-6cf1c8fc4797] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Reinforcement Learning for Knowledge Graph Reasoning.srt].
Ingested content [0de14b72-9119-4a51-b3f4-f9d770631fc1] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Kirell_Benzi_Secret Knowledge_ Visualizing Complexity.srt].


 87%|████████▋ | 48/55 [01:54<00:06,  1.05it/s][A
 89%|████████▉ | 49/55 [02:03<00:14,  2.43s/it][A

Ingested content [d3b919dc-b65c-4dae-affa-1b2bf526d07b] from [cdkg-challenge/Transcripts/Connected Data World 2021/Presentations/Graph Thinking _ Paco Nathan _ Connected Data World 2021.srt].



 91%|█████████ | 50/55 [02:09<00:14,  2.98s/it][A

Ingested content [03867a01-67cc-4fd7-9560-29142b327ce0] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/DBpedia Databus - A platform to evolve knowledge and AI from versioned web files.srt].



 93%|█████████▎| 51/55 [02:13<00:13,  3.35s/it][A

Ingested content [4d5d6d42-d21e-4553-bf90-e2caf2afe67e] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Leveraging the Semantics of Adverse Childhood Experiences for Explainable AI Recommendations.srt].



 95%|█████████▍| 52/55 [02:14<00:07,  2.66s/it][A

Ingested content [8d00e167-b0fb-4b8c-961d-f036325e042d] from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Rebooting AI Adding Knowledge to Deep Learning.srt].



 96%|█████████▋| 53/55 [02:19<00:06,  3.27s/it][A

Ingested content [d87b3d63-3783-46ac-a955-14287058348a] from [cdkg-challenge/Transcripts/2024/Connected Data London Roundtable July 3 2024.txt].



 98%|█████████▊| 54/55 [03:51<00:27, 27.01s/it][A

HTTP status code: 500
Failed to ingest content from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/Illuminating the Last Vestiges of Dark Data - Speech Recognition for Knowledge Graphs.srt].
HTTP status code: 500
Failed to ingest content from [cdkg-challenge/Transcripts/Knowledge Connexions 2020/Presentations/BizDevOps Aligning the Business and IT with a Graph Knowledge Base.srt].
Ingested all contents.


In [46]:
from IPython.display import display, HTML, JSON

# Query the resulting knowledge graph
graph = await query_contents_graph()

if graph is not None:
    # Convert the query result into a Pyvis network
    g = create_pyvis_graph(graph)

    # Physics/layout options for the rendered network, using the forceAtlas2Based solver
    g.set_options("""
    var options = {
        "physics": {
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "centralGravity": 0.01,
                "springLength": 100,
                "springConstant": 0.08
            },
            "maxVelocity": 50,
            "solver": "forceAtlas2Based",
            "timestep": 0.35,
            "stabilization": {
                "iterations": 100
            }
        }
        }
    """)

    # Generate the HTML document, passing a randomized file name
    graph_html = g.generate_html(f"graph_{random.randint(0, 1000)}.html")

    # Inject the FontAwesome kit <script> right after the opening <head> tag
    # NOTE(review): presumably needed so the icon-shaped nodes can render their glyphs — confirm
    font_awesome_link = '<script src="https://kit.fontawesome.com/2c74303849.js" crossorigin="anonymous"></script>'
    graph_html = graph_html.replace('<head>', f'<head>{font_awesome_link}')

    # Show the interactive graph inline in the notebook output
    display(HTML(graph_html))

Display histogram of extracted entities

In [47]:
import plotly.express as px
import pandas as pd

facets = await query_contents_facets()

if facets is not None:
    json_strings = [facet.model_dump_json(indent=2) for facet in facets if facet is not None]  # Using indent for pretty printing
    json_dicts = [json.loads(js) for js in json_strings]

    df = pd.json_normalize(json_dicts, sep='_')

    # Rename columns for clarity
    df.rename(columns={
        'observable_type': 'Type',
        'observable_observable_name': 'Name',
        'count': 'Count'
    }, inplace=True)

    # Sort the DataFrame by 'Name' alphabetically
    df = df.sort_values(by='Name')

    # Using Plotly for more customizable visualization
    fig = px.bar(df, x='Name', y='Count', color='Type',
                 hover_data={'Name': True, 'Count': True, 'Type': True},
                 labels={'Count': 'Count', 'Name': 'Observable Name', 'Type': 'Observable Type'})

    fig.show()