<a href="https://colab.research.google.com/github/graphlit/graphlit-samples/blob/main/python/Notebook%20Examples/Graphlit_2024_09_07_Locate_Google_Emails_by_Person.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Description**

This example shows how to ingest emails from a user's Gmail account, automatically extract to/from/cc/bcc as Persons, and filter emails by a specific Person.

**Requirements**

Prior to running this notebook, you will need to [sign up](https://docs.graphlit.dev/getting-started/signup) for Graphlit, and [create a project](https://docs.graphlit.dev/getting-started/create-project).

You will need the Graphlit organization ID, preview environment ID and JWT secret from your created project.

Assign these properties as Colab secrets: GRAPHLIT_ORGANIZATION_ID, GRAPHLIT_ENVIRONMENT_ID and GRAPHLIT_JWT_SECRET.


---

Install Graphlit Python client SDK

In [None]:
!pip install --upgrade graphlit-client

Install Google OAuth SDKs

In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

Initialize Graphlit

In [None]:
import os
from google.colab import userdata
from graphlit import Graphlit
from graphlit_api import input_types, enums, exceptions

# Copy each Graphlit project credential from the Colab secret store into the
# environment variable of the same name.
# NOTE(review): presumably Graphlit() picks these up from the environment,
# since it is constructed without arguments — confirm against the SDK docs.
for secret_name in (
    'GRAPHLIT_ORGANIZATION_ID',
    'GRAPHLIT_ENVIRONMENT_ID',
    'GRAPHLIT_JWT_SECRET',
):
    os.environ[secret_name] = userdata.get(secret_name)

graphlit = Graphlit()

Initialize Gmail authentication

Requirements:
- Create a Google OAuth 2.0 Client ID via the [Google Cloud Console](https://console.cloud.google.com/apis/credentials), download the JSON file, and assign the text of that file to a Google Colab secret named GOOGLE_OAUTH_CREDENTIALS. Make sure to add http://localhost as an authorized redirect URI.

In [None]:
from google_auth_oauthlib.flow import Flow
import json
from google.colab import output

# Needed to decode an authorization code copied straight from the address bar
from urllib.parse import unquote

# The scope for Gmail API
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# Get Google OAuth credentials from secret
client_secret_json = userdata.get('GOOGLE_OAUTH_CREDENTIALS')

client_secrets = json.loads(client_secret_json)

# Extract client ID and client secret; the credentials JSON nests them under
# either an "installed" or a "web" top-level key.
for app_type in ('installed', 'web'):
    if app_type in client_secrets:
        client_id = client_secrets[app_type]['client_id']
        client_secret = client_secrets[app_type]['client_secret']
        break
else:
    # Neither key was present (the loop finished without `break`)
    raise ValueError("Invalid JSON format for OAuth credentials.")

flow = Flow.from_client_config(
    client_secrets,
    scopes=SCOPES,
    redirect_uri='http://localhost' # NOTE: needs to be added as an authorized redirect URI
)

# Generate the authorization URL with offline access to request a refresh token
auth_url, _ = flow.authorization_url(prompt='consent', access_type='offline')

# Display the URL for the user to authorize access
print(f"Please go to this URL and authorize access: {auth_url}")

# Open the URL in the browser. json.dumps() emits a properly quoted and
# escaped string literal, so the URL cannot break out of the JavaScript call.
output.eval_js(f'window.open({json.dumps(auth_url)});')

# After granting permissions to Google, you will be redirected to http://localhost with a "code" in the URL
# NOTE: The page won't load, but you just need to capture the 'code' parameter from the URL.
# http://localhost/?state={state}&code={code}&scope=https://www.googleapis.com/auth/gmail.readonly

# Copy the "code" parameter from the URL and paste it here.
# The value is stripped of stray whitespace and percent-decoded (e.g. "4%2F..."
# becomes "4/..."), because a code copied from the address bar may still be
# URL-encoded.
code = unquote(input("Enter the authorization code you received: ").strip())

# Exchange the authorization code for credentials (including refresh token)
flow.fetch_token(code=code)

# Get the credentials
credentials = flow.credentials

# Now we have the client_id, client_secret and refresh_token to use with Graphlit
refresh_token = credentials.refresh_token

Define Graphlit helper functions

In [None]:
from typing import List, Optional

async def create_feed(refresh_token: str) -> Optional[str]:
    """Create a Google Email feed authorized with the given refresh token.

    The feed uses the PAST email listing type and a read limit of 25 emails.
    The OAuth client ID and secret are taken from the module-level
    `client_id` and `client_secret` variables.

    Args:
        refresh_token: Google OAuth refresh token used by the feed.

    Returns:
        The ID of the created feed, or None when the Graphlit client is not
        initialized, the response contains no feed, or the request raised a
        GraphQLClientError (which is printed).
    """
    if graphlit.client is None:
        return None

    # Named `feed_input` so the `input()` builtin is not shadowed.
    feed_input = input_types.FeedInput(
        name="Google Email",
        type=enums.FeedTypes.EMAIL,
        email=input_types.EmailFeedPropertiesInput(
            type=enums.FeedServiceTypes.GOOGLE_EMAIL,
            google=input_types.GoogleEmailFeedPropertiesInput(
                type=enums.EmailListingTypes.PAST,
                refreshToken=refresh_token,
                clientId=client_id,
                clientSecret=client_secret
            ),
            readLimit=25 # limiting to 25 emails
        )
    )

    # Only the network call is guarded; everything else cannot raise
    # GraphQLClientError.
    try:
        response = await graphlit.client.create_feed(feed_input)
    except exceptions.GraphQLClientError as e:
        print(str(e))
        return None

    return response.create_feed.id if response.create_feed is not None else None

async def is_feed_done(feed_id: str):
    """Ask Graphlit whether the feed with the given ID is done.

    Returns the `result` of the `is_feed_done` response, or None when the
    Graphlit client is not initialized or the response carries no
    `is_feed_done` payload. Exceptions from the client are not caught here.
    """
    client = graphlit.client
    if client is None:
        return None

    status = (await client.is_feed_done(feed_id)).is_feed_done
    if status is None:
        return None

    return status.result

async def query_persons(email: str):
    """Query Graphlit for persons, filtered by the given email address.

    Returns the `results` of the persons response, or None when the Graphlit
    client is not initialized, the response has no `persons` payload, or the
    query raised a GraphQLClientError (which is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    try:
        person_filter = input_types.PersonFilter(email=email)
        response = await client.query_persons(filter=person_filter)

        persons = response.persons
        if persons is None:
            return None
        return persons.results
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

# Locate contents where person was observed, with relevant search text
async def query_contents(person_id: str, search_text: str):
    """Query contents that match the search text and reference a person.

    The filter combines a HYBRID search on `search_text` with a single
    PERSON observation filter that references `person_id`.

    Returns the `results` of the contents response, or None when the
    Graphlit client is not initialized, the response has no `contents`
    payload, or the query raised a GraphQLClientError (which is printed).
    """
    client = graphlit.client
    if client is None:
        return None

    try:
        # Restrict matches to contents in which this person was observed
        person_observation = input_types.ObservationReferenceFilter(
            type=enums.ObservableTypes.PERSON,
            observable=input_types.EntityReferenceFilter(id=person_id),
        )
        content_filter = input_types.ContentFilter(
            search=search_text,
            searchType=enums.SearchTypes.HYBRID,
            observations=[person_observation],
        )

        response = await client.query_contents(filter=content_filter)

        contents = response.contents
        return None if contents is None else contents.results
    except exceptions.GraphQLClientError as error:
        print(str(error))
        return None

async def delete_all_feeds():
    """Delete all feeds via the Graphlit client, passing is_synchronous=True.

    Does nothing when the Graphlit client is not initialized. Always returns
    None; the client's response is discarded.
    """
    client = graphlit.client
    if client is not None:
        await client.delete_all_feeds(is_synchronous=True)


Execute Graphlit example

In [None]:
from IPython.display import display, Markdown
import time

# Remove any existing feeds; only needed for notebook example
await delete_all_feeds()

print('Deleted all feeds.')

# NOTE: specify the email of the Person to locate
email = "kirk@graphlit.com"

# NOTE: specify the text to search for in the filtered emails
search_text = "graphlit-client"

if refresh_token is not None:
    feed_id = await create_feed(refresh_token)

    if feed_id is not None:
        print(f'Created feed [{feed_id}].')

        # Wait for feed to complete, since ingestion happens asychronously
        done = False
        time.sleep(5)
        while not done:
            done = await is_feed_done(feed_id)

            if not done:
                time.sleep(2)

        print(f'Completed feed [{feed_id}].')

        persons = await query_persons(email)

        person = persons[0] if persons is not None and len(persons) > 0 else None

        if person is not None:
            print(f'Found person [{person.id}] with email [{person.email}] named [{person.name}].')

            # Query contents by person and search text
            contents = await query_contents(person.id, search_text)

            if contents is not None:
                for content in contents:
                    if content is not None:
                        display(Markdown(f'### Found Google email [{content.id}] that referenced Person [{email}] and search text [{search_text}].'))

                        metadata = content.email

                        if metadata is not None:
                            if metadata.subject is not None:
                                print("Subject: " + metadata.subject)
                            if metadata.to is not None and len(metadata.to) > 0:
                                print("To: " + ", ".join([f'"{item.name}" <{item.email}>' for item in metadata.to if item is not None]))
                            if metadata.from_ is not None and len(metadata.from_) > 0:
                                print("From: " + ", ".join([f'"{item.name}" <{item.email}>' for item in metadata.from_ if item is not None]))

                        display(Markdown(content.markdown))
                        print()
        else:
            print(f'No person with email [{email}] found.')