<a href="https://colab.research.google.com/github/graphlit/graphlit-samples/blob/main/python/Notebook%20Examples/Graphlit_2024_12_09_Locate_Microsoft_Emails_by_Organization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Description**

This example shows how to ingest emails from a user's Microsoft email account, automatically extract Organizations using Anthropic Claude 3.5 Sonnet, and filter emails by a specific Organization.

**Requirements**

Prior to running this notebook, you will need to [sign up](https://docs.graphlit.dev/getting-started/signup) for Graphlit, and [create a project](https://docs.graphlit.dev/getting-started/create-project).

You will need the Graphlit organization ID, preview environment ID and JWT secret from your created project.

Assign these properties as Colab secrets: GRAPHLIT_ORGANIZATION_ID, GRAPHLIT_ENVIRONMENT_ID and GRAPHLIT_JWT_SECRET.

To access your Microsoft email, assign these properties as Colab secrets: MICROSOFT_CLIENT_ID, MICROSOFT_CLIENT_SECRET and MICROSOFT_REFRESH_TOKEN.

---

Install Graphlit Python client SDK

In [None]:
!pip install --upgrade graphlit-client

Initialize Graphlit

In [None]:
import os
from google.colab import userdata
from graphlit import Graphlit
from graphlit_api import input_types, enums, exceptions

# Copy the Graphlit project credentials from Colab secrets into the
# process environment, one variable per secret of the same name.
for secret_name in (
    'GRAPHLIT_ORGANIZATION_ID',
    'GRAPHLIT_ENVIRONMENT_ID',
    'GRAPHLIT_JWT_SECRET',
):
    os.environ[secret_name] = userdata.get(secret_name)

# NOTE(review): presumably the client picks up the environment variables
# set above -- confirm against the Graphlit SDK documentation.
graphlit = Graphlit()

In [None]:
# Copy the Microsoft OAuth credentials from Colab secrets into the process
# environment; create_feed() reads them back from os.environ.
for secret_name in (
    'MICROSOFT_CLIENT_ID',
    'MICROSOFT_CLIENT_SECRET',
    'MICROSOFT_REFRESH_TOKEN',
):
    os.environ[secret_name] = userdata.get(secret_name)

Define Graphlit helper functions

In [None]:
from typing import List, Optional

async def create_extraction_specification(model: enums.AnthropicModels):
    """Create an extraction specification that uses an Anthropic model.

    Args:
        model: Anthropic model to assign to the specification; its string
            form is also embedded in the specification name.

    Returns:
        The ID of the created specification, or None when the Graphlit
        client is unavailable, the response contains no specification, or
        the request raises a GraphQLClientError (which is printed).
    """
    if graphlit.client is None:
        return None

    # Named `specification` rather than `input` to avoid shadowing the builtin.
    specification = input_types.SpecificationInput(
        name=f"Anthropic [{str(model)}]",
        type=enums.SpecificationTypes.EXTRACTION,
        serviceType=enums.ModelServiceTypes.ANTHROPIC,
        anthropic=input_types.AnthropicModelPropertiesInput(
            model=model
        )
    )

    try:
        response = await graphlit.client.create_specification(specification)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    created = response.create_specification
    return created.id if created is not None else None

async def create_workflow(extraction_specification_id: Optional[str]):
    """Create a workflow, optionally with a model-based entity extraction stage.

    Args:
        extraction_specification_id: ID of the specification the extraction
            job should reference. When None, the workflow is created
            without an extraction stage.

    Returns:
        The ID of the created workflow, or None when the Graphlit client is
        unavailable, the response contains no workflow, or the request
        raises a GraphQLClientError (which is printed).
    """
    if graphlit.client is None:
        return None

    # Only attach an extraction stage when a specification was provided.
    extraction_stage = None
    if extraction_specification_id is not None:
        extraction_stage = input_types.ExtractionWorkflowStageInput(
            jobs=[
                input_types.ExtractionWorkflowJobInput(
                    connector=input_types.EntityExtractionConnectorInput(
                        type=enums.EntityExtractionServiceTypes.MODEL_TEXT,
                        modelText=input_types.ModelTextExtractionPropertiesInput(
                            specification=input_types.EntityReferenceInput(id=extraction_specification_id)
                        )
                    )
                )
            ]
        )

    # Named `workflow` rather than `input` to avoid shadowing the builtin.
    workflow = input_types.WorkflowInput(
        name="Workflow",
        extraction=extraction_stage,
    )

    try:
        response = await graphlit.client.create_workflow(workflow)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    created = response.create_workflow
    return created.id if created is not None else None

async def create_feed(workflow_id: Optional[str]):
    """Create a Microsoft email feed that lists past emails.

    The Microsoft refresh token, client ID and client secret are read from
    the MICROSOFT_REFRESH_TOKEN, MICROSOFT_CLIENT_ID and
    MICROSOFT_CLIENT_SECRET environment variables.

    Args:
        workflow_id: ID of the workflow to attach to the feed. When None,
            the feed is created without a workflow reference.

    Returns:
        The ID of the created feed, or None when the Graphlit client is
        unavailable, the response contains no feed, or the request raises
        a GraphQLClientError (which is printed).

    Raises:
        KeyError: If one of the Microsoft environment variables is not set.
    """
    if graphlit.client is None:
        return None

    # Named `feed` rather than `input` to avoid shadowing the builtin.
    feed = input_types.FeedInput(
        name="Microsoft Email",
        type=enums.FeedTypes.EMAIL,
        email=input_types.EmailFeedPropertiesInput(
            type=enums.FeedServiceTypes.MICROSOFT_EMAIL,
            microsoft=input_types.MicrosoftEmailFeedPropertiesInput(
                type=enums.EmailListingTypes.PAST,
                refreshToken=os.environ['MICROSOFT_REFRESH_TOKEN'],
                clientId=os.environ['MICROSOFT_CLIENT_ID'],
                clientSecret=os.environ['MICROSOFT_CLIENT_SECRET']
            ),
            readLimit=100 # limiting to 100 emails
        ),
        workflow=input_types.EntityReferenceInput(id=workflow_id) if workflow_id is not None else None,
    )

    try:
        response = await graphlit.client.create_feed(feed)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    created = response.create_feed
    return created.id if created is not None else None

async def is_feed_done(feed_id: str):
    """Ask Graphlit whether the given feed has finished.

    Args:
        feed_id: ID of the feed to check.

    Returns:
        The `result` field of the `is_feed_done` response, or None when the
        Graphlit client is unavailable or the response carries no status.
        Errors raised by the client are not caught here.
    """
    client = graphlit.client
    if client is None:
        return None

    response = await client.is_feed_done(feed_id)

    status = response.is_feed_done
    if status is None:
        return None

    return status.result

async def query_organizations(name: str):
    """Query Graphlit for organizations filtered by name.

    Args:
        name: Organization name passed to the organization filter.

    Returns:
        The `results` of the organizations query, or None when the Graphlit
        client is unavailable, the response has no organizations payload,
        or the request raises a GraphQLClientError (which is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    organization_filter = input_types.OrganizationFilter(name=name)

    try:
        response = await client.query_organizations(filter=organization_filter)
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

    payload = response.organizations
    if payload is None:
        return None

    return payload.results

# Locate contents where organization was observed, with relevant search text
async def query_contents(organization_id: str, search_text: str):
    """Query contents that observed an organization, using hybrid search.

    Args:
        organization_id: ID of the organization the contents must reference
            as an observation.
        search_text: Text to search for within the filtered contents.

    Returns:
        The `results` of the contents query, or None when the Graphlit
        client is unavailable, the response has no contents payload, or the
        request raises a GraphQLClientError (which is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    # Restrict the results to contents where this organization was observed
    organization_observation = input_types.ObservationReferenceFilter(
        type=enums.ObservableTypes.ORGANIZATION,
        observable=input_types.EntityReferenceFilter(id=organization_id),
    )

    content_filter = input_types.ContentFilter(
        search=search_text,
        searchType=enums.SearchTypes.HYBRID,
        observations=[organization_observation],
    )

    try:
        response = await client.query_contents(filter=content_filter)
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

    payload = response.contents
    if payload is None:
        return None

    return payload.results

async def delete_all_feeds():
    """Delete every feed in the project; does nothing without a client.

    The request is issued with `is_synchronous=True` and its response is
    discarded. Errors raised by the client are not caught here.
    """
    client = graphlit.client
    if client is not None:
        await client.delete_all_feeds(is_synchronous=True)


Execute Graphlit example

In [None]:
from IPython.display import display, Markdown
import time

# Remove any existing feeds; only needed for notebook example
await delete_all_feeds()

print('Deleted all feeds.')

specification_id = await create_extraction_specification(enums.AnthropicModels.CLAUDE_3_5_SONNET)

if specification_id is not None:
    print(f'Created extraction specification [{specification_id}].')

    workflow_id = await create_workflow(specification_id)

    if workflow_id is not None:
        print(f'Created workflow [{workflow_id}].')

        feed_id = await create_feed(workflow_id)

        if feed_id is not None:
            print(f'Created feed [{feed_id}].')

            # Wait for feed to complete, since ingestion happens asychronously
            done = False
            time.sleep(5)
            while not done:
                done = await is_feed_done(feed_id)

                if not done:
                    time.sleep(2)

            print(f'Completed feed [{feed_id}].')

In [None]:
# NOTE: specify the name of the Organization to locate
name = "Microsoft"

# NOTE: specify the text to search for in the filtered emails
search_text = "Azure subscription"

organizations = await query_organizations(name)

organization = organizations[0] if organizations is not None and len(organizations) > 0 else None

if organization is not None:
    print(f'Found organization [{organization.id}] named [{organization.name}].')

    # Query contents by organization and search text
    contents = await query_contents(organization.id, search_text)

    if contents is not None:
        for content in contents:
            if content is not None:
                display(Markdown(f'### Found Microsoft email [{content.id}] that referenced organization [{name}] and search text [{search_text}].'))

                if content.original_date is not None:
                    print("Date: " + content.original_date)

                metadata = content.email

                if metadata is not None:
                    if metadata.subject is not None:
                        print("Subject: " + metadata.subject)
                    if metadata.to is not None and len(metadata.to) > 0:
                        print("To: " + ", ".join([f'"{item.name}" <{item.email}>' for item in metadata.to if item is not None]))
                    if metadata.from_ is not None and len(metadata.from_) > 0:
                        print("From: " + ", ".join([f'"{item.name}" <{item.email}>' for item in metadata.from_ if item is not None]))

                display(Markdown(content.markdown))
                print()
else:
    print(f'No organization with name [{name}] found.')