<a href="https://colab.research.google.com/github/graphlit/graphlit-samples/blob/main/python/Notebook%20Examples/Graphlit_2024_12_28_Assign_Labels_During_Ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Description**

This example shows how to assign labels to content during ingestion, both when ingesting by URI and when ingesting by feed.

**Requirements**

Prior to running this notebook, you will need to [sign up](https://docs.graphlit.dev/getting-started/signup) for Graphlit, and [create a project](https://docs.graphlit.dev/getting-started/create-project).

You will need the Graphlit organization ID, preview environment ID and JWT secret from your created project.

Assign these properties as Colab secrets: GRAPHLIT_ORGANIZATION_ID, GRAPHLIT_ENVIRONMENT_ID and GRAPHLIT_JWT_SECRET.

---

Install Graphlit Python client SDK

In [8]:
!pip install --upgrade graphlit-client



Initialize Graphlit

In [9]:
import os
from google.colab import userdata
from graphlit import Graphlit
from graphlit_api import input_types, enums, exceptions

# Copy each Colab secret into the process environment under the same name,
# then construct the Graphlit client (called with no arguments).
for secret_name in (
    'GRAPHLIT_ORGANIZATION_ID',
    'GRAPHLIT_ENVIRONMENT_ID',
    'GRAPHLIT_JWT_SECRET',
):
    os.environ[secret_name] = userdata.get(secret_name)

graphlit = Graphlit()

Define Graphlit helper functions

In [10]:
from typing import List, Optional
import json

async def create_web_feed(uri: str, workflow_id: Optional[str] = None, read_limit: int = 5):
    """Create a web feed for `uri` and return the new feed's ID.

    Args:
        uri: URI passed to the web feed properties.
        workflow_id: ID of the workflow to reference from the feed; when None,
            the feed is created without a workflow reference.
        read_limit: Value passed as the feed's `readLimit`, limiting the number
            of web pages read. Defaults to 5, the value previously hard-coded.

    Returns:
        The created feed's ID, or None when the Graphlit client is not
        available, the response carries no `create_feed` payload, or the
        request raises `GraphQLClientError` (the error text is printed).
    """
    if graphlit.client is None:
        return None

    workflow = (
        input_types.EntityReferenceInput(id=workflow_id)
        if workflow_id is not None
        else None
    )

    # Named `feed_input` rather than `input` so the builtin is not shadowed.
    feed_input = input_types.FeedInput(
        name="Web",
        type=enums.FeedTypes.WEB,
        web=input_types.WebFeedPropertiesInput(
            uri=uri,
            readLimit=read_limit
        ),
        workflow=workflow,
    )

    try:
        response = await graphlit.client.create_feed(feed_input)

        return response.create_feed.id if response.create_feed is not None else None
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

async def is_feed_done(feed_id: str):
    """Ask Graphlit whether the feed identified by `feed_id` has finished.

    Returns the `result` field of the `is_feed_done` response, or None when
    the Graphlit client is not available or the response has no
    `is_feed_done` payload. Exceptions from the request are not caught here.
    """
    if graphlit.client is None:
        return None

    response = await graphlit.client.is_feed_done(feed_id)

    if response.is_feed_done is None:
        return None

    return response.is_feed_done.result

async def create_ingestion_workflow(label_id: str):
    """Create a workflow named "Ingestion" whose ingestion stage references a label.

    Args:
        label_id: ID of the label referenced by the ingestion stage's
            single LABEL observation.

    Returns:
        The created workflow's ID, or None when the Graphlit client is not
        available, the response has no `create_workflow` payload, or the
        request raises `GraphQLClientError` (the error text is printed).
    """
    if graphlit.client is None:
        return None

    # Build the input from the inside out: label reference -> ingestion stage -> workflow.
    label_observation = input_types.ObservationReferenceInput(
        type=enums.ObservableTypes.LABEL,
        observable=input_types.NamedEntityReferenceInput(id=label_id)
    )
    ingestion_stage = input_types.IngestionWorkflowStageInput(
        observations=[label_observation]
    )
    workflow_input = input_types.WorkflowInput(
        name="Ingestion",
        ingestion=ingestion_stage
    )

    try:
        response = await graphlit.client.create_workflow(workflow_input)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    if response.create_workflow is None:
        return None

    return response.create_workflow.id

async def ingest_uri(uri: str, label_id: str):
    """Ingest `uri` with a LABEL observation referencing `label_id`.

    The request is made with `is_synchronous=True`, so the notebook waits for
    the content to be ingested.

    Returns:
        The ingested content's ID, or None when the Graphlit client is not
        available, the response has no `ingest_uri` payload, or the request
        raises `GraphQLClientError` (the error text is printed).
    """
    if graphlit.client is None:
        return None

    try:
        label_observation = input_types.ObservationReferenceInput(
            type=enums.ObservableTypes.LABEL,
            observable=input_types.NamedEntityReferenceInput(id=label_id)
        )

        response = await graphlit.client.ingest_uri(
            uri=uri,
            observations=[label_observation],
            is_synchronous=True
        )

        ingested = response.ingest_uri
        return None if ingested is None else ingested.id
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

async def query_contents(label_id: str):
    """Query contents filtered by a LABEL observation referencing `label_id`.

    Returns:
        The `results` of the contents query, or None when the Graphlit client
        is not available, the response has no `contents` payload, or the
        request raises `GraphQLClientError` (the error text is printed).
    """
    if graphlit.client is None:
        return None

    try:
        label_filter = input_types.ObservationReferenceFilter(
            type=enums.ObservableTypes.LABEL,
            observable=input_types.EntityReferenceFilter(id=label_id)
        )
        content_filter = input_types.ContentFilter(observations=[label_filter])

        response = await graphlit.client.query_contents(filter=content_filter)

        if response.contents is None:
            return None

        return response.contents.results
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

async def create_label(name: str):
    """Create a label called `name` and return its ID.

    Returns:
        The created label's ID, or None when the Graphlit client is not
        available, the response has no `create_label` payload, or the request
        raises `GraphQLClientError` (the error text is printed).
    """
    if graphlit.client is None:
        return None

    try:
        label_input = input_types.LabelInput(name=name)
        response = await graphlit.client.create_label(label=label_input)

        created = response.create_label
        return None if created is None else created.id
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

async def query_labels(search_text: Optional[str] = None):
    """Query labels, passing `search_text` as the label filter's `search` value.

    Returns:
        The `results` of the labels query, or None when the Graphlit client
        is not available, the response has no `labels` payload, or the
        request raises `GraphQLClientError` (the error text is printed).
    """
    if graphlit.client is None:
        return None

    try:
        label_filter = input_types.LabelFilter(search=search_text)
        response = await graphlit.client.query_labels(filter=label_filter)

        if response.labels is None:
            return None

        return response.labels.results
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

async def delete_all_workflows():
    """Delete all workflows (requested with `is_synchronous=True`); no-op without a client."""
    if graphlit.client is not None:
        await graphlit.client.delete_all_workflows(is_synchronous=True)

async def delete_all_contents():
    """Delete all contents (requested with `is_synchronous=True`); no-op without a client."""
    if graphlit.client is not None:
        await graphlit.client.delete_all_contents(is_synchronous=True)

async def delete_all_feeds():
    """Delete all feeds (requested with `is_synchronous=True`); no-op without a client."""
    if graphlit.client is not None:
        await graphlit.client.delete_all_feeds(is_synchronous=True)

async def delete_all_observables():
    """Delete every observable entity kind listed below, one request at a time.

    Does nothing when the Graphlit client is not available. The client
    methods are awaited in the order listed, each called with no arguments.
    """
    if graphlit.client is None:
        return

    delete_methods = (
        'delete_all_persons',
        'delete_all_organizations',
        'delete_all_places',
        'delete_all_events',
        'delete_all_products',
        'delete_all_softwares',
        'delete_all_repos',
        'delete_all_labels',
        'delete_all_categories',
    )

    for method_name in delete_methods:
        await getattr(graphlit.client, method_name)()

Execute Graphlit example

In [11]:
from IPython.display import display, Markdown
import time

# Remove any existing feeds, contents and workflows; only needed for notebook example
await delete_all_workflows()
await delete_all_feeds()
await delete_all_contents()
await delete_all_observables()

print('Deleted all feeds, contents, and workflows.')

other_label_id = await create_label(name="Other")

if other_label_id is not None:
    print(f'Created other label [{other_label_id}].')

label_id = await create_label(name="Graphlit")

if label_id is not None:
    print(f'Created label [{label_id}].')

    uri = "https://www.graphlit.com"

    content_id = await ingest_uri(uri, label_id)

    if content_id is not None:
        print(f'Ingested URI [{uri}].')

    workflow_id = await create_ingestion_workflow(label_id)

    if workflow_id is not None:
        print(f'Created workflow [{workflow_id}].')


Deleted all feeds, contents, and workflows.
Created other label [ef3f4327-bdd8-4cf7-b2e8-f142b6579475].
Created label [56a5ded9-028d-4a53-9fde-f992931c05c3].
Ingested URI [https://www.graphlit.com].
Created workflow [ecc89ef5-e0cc-4716-bef9-05d4b9b8a6a0].


Ingest web feed

In [12]:
        uri = "https://changelog.graphlit.dev"

        feed_id = await create_web_feed(uri, workflow_id)

        if feed_id is not None:
            print(f'Created feed [{feed_id}].')

            # Wait for feed to complete, since ingestion happens asychronously
            done = False
            time.sleep(5)
            while not done:
                done = await is_feed_done(feed_id)

                if not done:
                    time.sleep(10)

            print(f'Completed feed [{feed_id}].')

Created feed [62906623-1ab4-4038-b30e-16e1d43bf81c].
Completed feed [62906623-1ab4-4038-b30e-16e1d43bf81c].


Search within ingested pages

In [13]:
    # Query contents by label; should get content from both graphlit.com and changelog.graphlit.dev
    contents = await query_contents(label_id)

    if contents is not None and len(contents) > 0:
        print(f'Found {len(contents)} contents by label [{label_id}].')

        for content in contents:
            if content is not None:
                display(Markdown(f'### Found {content.type} content [{content.id}]: URI [{content.uri}]'))

                if content.observations is not None:
                    for observation in content.observations:
                        if observation is not None and observation.observable is not None:
                            print(f'{observation.type} [{observation.id}]: {observation.observable.name} [{observation.observable.id}]')

                print()
            else:
                print('No content found')
    else:
        print(f'No contents found.')


Found 6 contents by label [56a5ded9-028d-4a53-9fde-f992931c05c3].


### Found PAGE content [c6e6afc9-6dea-442c-a346-6fdbc7fb0400]: URI [https://changelog.graphlit.dev/august-2023/august-3-new-data-model-for-observations-new-category-entity]

LABEL [6dd1dc59-b455-4d7b-b72f-94f977b785fe]: Graphlit [56a5ded9-028d-4a53-9fde-f992931c05c3]



### Found PAGE content [d3c40a0e-e21e-424c-98c3-64512090471d]: URI [https://changelog.graphlit.dev/april-2024/april-7-support-for-discord-feeds-cohere-reranking-section-aware-chunking-and-retrieval]

LABEL [140309b5-89db-458d-9c94-a900394a494d]: Graphlit [56a5ded9-028d-4a53-9fde-f992931c05c3]



### Found PAGE content [8443663e-d7ec-486e-8b98-f02f4863c7de]: URI [https://changelog.graphlit.dev/august-2023/august-17-prepare-for-usage-based-billing-append-sas-tokens-to-uris]

LABEL [c8ae67a8-9a96-4d34-a233-d5b350062eae]: Graphlit [56a5ded9-028d-4a53-9fde-f992931c05c3]



### Found PAGE content [e6eaf3cd-6dfe-428e-9686-e822a1181a14]: URI [https://changelog.graphlit.dev/]

LABEL [899bb896-a9bb-4e27-80e7-d7a13dbd4aaa]: Graphlit [56a5ded9-028d-4a53-9fde-f992931c05c3]



### Found PAGE content [1ee0f338-8a9c-4394-b7cb-aa54bb7278be]: URI [https://changelog.graphlit.dev/april-2024/april-23-support-for-python-and-typescript-sdks-latest-openai-cohere-and-groq-models-bug-fixes]

LABEL [6ea0c1c3-d3f1-48b5-85c1-be4bf4b5f776]: Graphlit [56a5ded9-028d-4a53-9fde-f992931c05c3]



### Found PAGE content [7fc2b7d8-bc8c-476f-a8db-a4a7772e4fa0]: URI [https://www.graphlit.com/]

LABEL [3a3bc832-1577-4cb0-bb23-f3e90af059c9]: Graphlit [56a5ded9-028d-4a53-9fde-f992931c05c3]



In [14]:
# Query contents by label; should get nothing since it's using the other label
if other_label_id is not None:
    contents = await query_contents(other_label_id)

    if contents is not None and len(contents) > 0:
        print(f'Found {len(contents)} contents by label [{other_label_id}].')

        for content in contents:
            if content is not None:
                display(Markdown(f'### Found {content.type} content [{content.id}]: URI [{content.uri}]'))

                if content.observations is not None:
                    for observation in content.observations:
                        if observation is not None and observation.observable is not None:
                            print(f'{observation.type} [{observation.id}]: {observation.observable.name} [{observation.observable.id}]')

                print()
            else:
                print('No content found')
    else:
        print(f'No contents found.')


No contents found.
