<a href="https://colab.research.google.com/github/graphlit/graphlit-samples/blob/main/python/Notebook%20Examples/Graphlit_2024_12_24_Assign_Labels_at_Ingestion_Time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Description**

This example shows how to assign labels to files upon ingestion. As an alternative to using entity extraction to assign labels, this shows how to label content manually.

**Requirements**

Prior to running this notebook, you will need to [sign up](https://docs.graphlit.dev/getting-started/signup) for Graphlit, and [create a project](https://docs.graphlit.dev/getting-started/create-project).

You will need the Graphlit organization ID, preview environment ID and JWT secret from your created project.

Assign these properties as Colab secrets: GRAPHLIT_ORGANIZATION_ID, GRAPHLIT_ENVIRONMENT_ID and GRAPHLIT_JWT_SECRET.

---

Install Graphlit Python client SDK

In [22]:
!pip install --upgrade graphlit-client



Initialize Graphlit

In [23]:
import os
from google.colab import userdata
from graphlit import Graphlit
from graphlit_api import input_types, enums, exceptions

# Copy the Graphlit credentials from the Colab secrets store into the process
# environment before constructing the client.
# NOTE(review): presumably the argument-less Graphlit() constructor reads these
# environment variables -- confirm against the SDK.
for secret_name in ('GRAPHLIT_ORGANIZATION_ID', 'GRAPHLIT_ENVIRONMENT_ID', 'GRAPHLIT_JWT_SECRET'):
    os.environ[secret_name] = userdata.get(secret_name)

graphlit = Graphlit()

Define Graphlit helper functions

In [24]:
from typing import List, Optional
import json

# NOTE: `labels` holds label names (not label entity IDs)
async def ingest_uri(uri: str, labels: Optional[List[str]] = None):
    """Ingest the content at `uri`, attaching one LABEL observation per label name.

    Args:
        uri: Location of the content to ingest.
        labels: Label names to assign at ingestion time; when None, no
            observations are sent with the request.

    Returns:
        The ID of the ingested content, or None when the client is not
        initialized, the response carries no ingested content, or the API
        raised a GraphQLClientError (the error text is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    try:
        # Build the observation list up front so the API call below stays readable.
        label_observations = None
        if labels is not None:
            label_observations = [
                input_types.ObservationReferenceInput(
                    type=enums.ObservableTypes.LABEL,
                    observable=input_types.NamedEntityReferenceInput(name=label_name),
                )
                for label_name in labels
            ]

        # Synchronous mode: the notebook waits until the content has been ingested.
        response = await client.ingest_uri(
            uri=uri,
            observations=label_observations,
            is_synchronous=True,
        )

        ingested = response.ingest_uri
        if ingested is None:
            return None
        return ingested.id
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

async def get_content_observations(id: str):
    """Fetch a content item by ID and return its observations.

    Args:
        id: ID of the content to look up.

    Returns:
        The `observations` attribute of the fetched content, or None when the
        client is not initialized, no content came back, or the API raised a
        GraphQLClientError (the error text is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    try:
        response = await client.get_content(id=id)

        content = response.content
        if content is None:
            return None
        return content.observations
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

# NOTE: `labels` holds label entity IDs (not label names)
async def query_contents(search_text: Optional[str] = None, labels: Optional[List[str]] = None):
    """Query contents using hybrid search, optionally restricted by label entity IDs.

    Args:
        search_text: Optional search text passed as the filter's `search` value.
        labels: Optional label entity IDs; each one becomes a LABEL observation
            filter. When None, no observation filter is applied.

    Returns:
        The query results, or None when the client is not initialized, the
        response carries no contents, or the API raised a GraphQLClientError
        (the error text is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    try:
        # One observation filter per label ID; None leaves the filter unset.
        label_filters = None
        if labels is not None:
            label_filters = [
                input_types.ObservationReferenceFilter(
                    type=enums.ObservableTypes.LABEL,
                    observable=input_types.EntityReferenceFilter(id=label_id),
                )
                for label_id in labels
            ]

        content_filter = input_types.ContentFilter(
            search=search_text,
            searchType=enums.SearchTypes.HYBRID,
            observations=label_filters,
        )

        response = await client.query_contents(filter=content_filter)

        contents = response.contents
        if contents is None:
            return None
        return contents.results
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

async def query_labels(search_text: Optional[str] = None):
    """Query labels, optionally narrowed by search text.

    Args:
        search_text: Optional text passed as the label filter's `search`
            value; None applies no search text.

    Returns:
        The query results, or None when the client is not initialized, the
        response carries no labels, or the API raised a GraphQLClientError
        (the error text is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    try:
        label_filter = input_types.LabelFilter(search=search_text)
        response = await client.query_labels(filter=label_filter)

        found = response.labels
        if found is None:
            return None
        return found.results
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

async def delete_all_contents():
    """Delete all contents via the Graphlit client, in synchronous mode.

    Does nothing when the client is not initialized. API errors are not
    caught here and propagate to the caller.
    """
    client = graphlit.client
    if client is not None:
        # The response is not needed; only the deletion itself matters.
        await client.delete_all_contents(is_synchronous=True)

async def delete_all_observables():
    """Delete all observable entities, one entity type at a time.

    Calls the client's `delete_all_*` methods for persons, organizations,
    places, events, products, softwares, repos, labels and categories, in
    that order, awaiting each before starting the next. Does nothing when the
    client is not initialized. API errors are not caught here and propagate
    to the caller.
    """
    if graphlit.client is None:
        return None

    # Order matches the sequence of the individual delete calls.
    delete_method_names = (
        'delete_all_persons',
        'delete_all_organizations',
        'delete_all_places',
        'delete_all_events',
        'delete_all_products',
        'delete_all_softwares',
        'delete_all_repos',
        'delete_all_labels',
        'delete_all_categories',
    )

    for method_name in delete_method_names:
        # Resolve each method just before calling it; responses are discarded.
        delete_all = getattr(graphlit.client, method_name)
        await delete_all()


In [25]:
from IPython.display import display, Markdown
import time

# Remove any existing contents; only needed for notebook example
await delete_all_contents()
await delete_all_observables()

print('Deleted all contents.')

uri = "https://graphlitplatform.blob.core.windows.net/test/audio/podcasts/Bring%20Order%20To%20The%20Chaos%20Of%20Your%20Unstructured%20Data%20Assets%20With%20Unstruk-Data%20Engineering%20Podcast.mp3"
labels = ['podcast', 'unstructured data']

content_id = await ingest_uri(uri=uri, labels=labels)

if content_id is not None:
    print(f'Ingested content [{content_id}].')

    observations = await get_content_observations(content_id)

    if observations is not None:
        print('Found content observations:')

        for observation in observations:
            if observation is not None and observation.observable is not None:
                print(f'{observation.type}: {observation.observable.name} [{observation.observable.id}]')

uri = "https://graphlitplatform.blob.core.windows.net/test/documents/Unifying%20Large%20Language%20Models%20and%20Knowledge%20Graphs%20A%20Roadmap-2306.08302.pdf"
labels = ['knowledge graphs', 'unstructured data']

content_id = await ingest_uri(uri=uri, labels=labels)

if content_id is not None:
    print(f'Ingested content [{content_id}].')

    observations = await get_content_observations(content_id)

    if observations is not None:
        print('Found content observations:')

        for observation in observations:
            if observation is not None and observation.observable is not None:
                print(f'{observation.type}: {observation.observable.name} [{observation.observable.id}]')


Deleted all contents.
Ingested content [ea93c1b6-851b-48a1-b110-734b57a02778].
Found content observations:
LABEL: podcast [5d44b756-6d25-4dde-8e94-76743fa05bcb]
LABEL: unstructured data [7fd66dfe-17a3-4fa1-932b-0daf7aa408d2]
Ingested content [f9052e0b-fd67-45f2-99d6-1ae22f8e2726].
Found content observations:
LABEL: knowledge graphs [d6e0d625-5bed-4751-87af-c57485bb4be0]
LABEL: unstructured data [7fd66dfe-17a3-4fa1-932b-0daf7aa408d2]


Query all labels

In [26]:
labels = await query_labels()

if labels is not None:
    for label in labels:
        if label is not None:
            print(f'Found Label [{label.id}]: {label.name}')

Found Label [d6e0d625-5bed-4751-87af-c57485bb4be0]: knowledge graphs
Found Label [5d44b756-6d25-4dde-8e94-76743fa05bcb]: podcast
Found Label [7fd66dfe-17a3-4fa1-932b-0daf7aa408d2]: unstructured data


Query labels by name (uses full-text search)

In [27]:
#label_name = 'unstructured data'
label_name = 'Knowledge'

labels = await query_labels(label_name)
label = labels[0] if labels is not None and len(labels) > 0 else None

if label is not None:
    print(f'Found Label [{label.id}]: {label.name}')

Found Label [d6e0d625-5bed-4751-87af-c57485bb4be0]: knowledge graphs


Query contents by label ID

In [28]:
if label is not None:
    contents = await query_contents(labels=[label.id])

    if contents is not None:
        print(f'Found contents by label [{label.id}]:')

        for content in contents:
            if content is not None and content.observations is not None:
                print(f'{content.file_type}: {content.name}:')

                for observation in content.observations:
                    if observation is not None and observation.observable is not None:
                        print(f'{observation.type}: {observation.observable.name} [{observation.observable.id}]')

Found contents by label [d6e0d625-5bed-4751-87af-c57485bb4be0]:
DOCUMENT: Unifying Large Language Models and Knowledge Graphs A Roadmap-2306.08302.pdf:
LABEL: knowledge graphs [d6e0d625-5bed-4751-87af-c57485bb4be0]
LABEL: unstructured data [7fd66dfe-17a3-4fa1-932b-0daf7aa408d2]
