In [219]:
import openai
import notion_client
import json

# Read each API key from the first line of its local key file, then configure
# the corresponding client once the file handle has been closed.
with open('openai_secret_key') as key_file:
    openai_secret_key = key_file.readline().strip()
openai.api_key = openai_secret_key

with open('notion_secret_key') as key_file:
    notion_secret_key = key_file.readline().strip()
notion = notion_client.Client(auth=notion_secret_key)

# Entity extraction

In [304]:
# Completion model name; the top-level completion cells pass it as `engine=`.
engine = 'text-davinci-003'
# Notion database id that is queried for journal entries.
journal_id = '66a8cd52520543df9895c373627f821d'
# Presumably the Notion database ids for each entity type (TODO confirm);
# only `places_id` is referenced by the cells visible in this notebook.
organizations_id = 'ec516282ca5248c3bd52d7cede238bc6'
people_id = '748131dafe774e4a9ad5c0c259f69aba'
places_id = '9c8884e3303a49238e44d9aad9d7e50e'

In [47]:
# Query the journal database with a single call (no cursor handling here).
# NOTE(review): `entries` is not referenced again in the cells visible here.
entries = notion.databases.query(database_id=journal_id)

In [179]:
# Retrieve one page by a hard-coded id.
# NOTE(review): `entry_id` and `entry` are reassigned to a different page in a
# later cell, so which page is "current" depends on execution order.
entry_id = 'c6affbed81284a09affcd41c8fa4915b'
entry = notion.pages.retrieve(page_id=entry_id)

In [288]:
def get_entry_plain_text(entry_id):
    """Return the concatenated plain text of a Notion page's child blocks.

    Every block that carries a ``rich_text`` list contributes the
    ``plain_text`` of its fragments followed by the separator ``'  \\n\\n'``.
    Blocks whose payload has no ``rich_text`` entry are skipped instead of
    raising ``KeyError``. All result pages are read by following
    ``next_cursor`` while the response reports ``has_more``.

    Args:
        entry_id: Id of the page (or block) whose children are listed.

    Returns:
        str: The accumulated text; ``''`` when no block carries rich text.
    """
    plain_text = ''
    list_kwargs = {'block_id': entry_id}

    while True:
        response = notion.blocks.children.list(**list_kwargs)
        for block in response['results']:
            block_payload = block.get(block['type'])
            # Not every block type stores text under 'rich_text'; skip those.
            if not isinstance(block_payload, dict) or 'rich_text' not in block_payload:
                continue
            plain_text += ''.join(text['plain_text'] for text in block_payload['rich_text'])
            plain_text += '  \n\n'
        if not response.get('has_more'):
            break
        # Only send start_cursor once there is one, so the first request is
        # identical to the original single call.
        list_kwargs['start_cursor'] = response['next_cursor']
    return plain_text

In [382]:
def get_entity_tags(entry_plain_text, engine='text-davinci-003'):
    """Extract organizations, people, places and tags from a text via a completion model.

    A few-shot prompt (instructions plus one worked example) is sent to
    ``openai.Completion.create`` and the completion is parsed as JSON.

    Args:
        entry_plain_text: The text to analyse; interpolated into the prompt.
        engine: Completion engine name passed to the API.

    Returns:
        The decoded JSON value. The prompt asks for a dict with the keys
        "organizations", "people", "places" and "tags"; the keys are not
        validated here.

    Raises:
        json.JSONDecodeError: If no JSON value can be decoded from the completion.
    """
    extraction_prompt_template = """
    INSTRUCTIONS:
    From the text below, extract the following entities:
    Organizations: All platforms, products, companies and institutions mentioned by name (e.g. "Google", "FIFA", "World Bank"). Exclude broad terms such as industries ("Record Labels") that don't refer to specific companies. Exclude names of places (e.g. "Switzerland") and people (e.g. "Bill Gates").
    People: All people mentioned by their full name (e.g. John Doe). Exclude people mentioned only by first name (e.g. Peter).
    Places: All cities (e.g. Barcelona), regions (e.g. California) and countries (e.g. Italy) mentioned by name.
    Tags: At least 20 and ideally 50 of the most relevant, unusual, surprising and salient topics, ideas and themes the author is exploring and writing about. Also include topics that are mentioned in passing, or in relatively short fragments of the text. Make the topic names short and concise (e.g. use "Streaming Economics" instead of "The economics of music streaming")
    Capitalize each of the entities in title case.
    Format the results as a JSON dictionary containing all of the lists described above:
    {
        "organizations": [
            # list of organization entities goes here
        ],
        "people": [
            # list of people entities goes here
        ],
        "places": [
            # list of place entities goes here
        ],
        "tags": [
            # list of tag entities goes here
        ]
    }
    
    EXAMPLE TEXT:
    ###
    Pedro Cano and I were talking about the electric car industry and he suggested that we rent a Tesla for our trip to LA with Alex and Enric. I’m finding it harder to hate things lately. I find it hard to sympathize with people who immediately assign malice to people or companies they know little about, instead of trying to understand what they might not know. Despite not being a huge fan of Elon Musk, I don’t feel the need to join the online polarization about him and transfer that energy to my impressions of a Tesla.
    ###
    
    EXAMPLE OUTPUT:
    ###
    {
        "organizations": [
            "Tesla"
        ],
        "people": [
            "Pedro Cano", "Elon Musk"
        ],
        "places": [
            "Los Angeles"
        ],
        "tags": [
            "Electric Cars", "Empathy", "Hating", "Polarization", "Celebrities", "Online Behavior"
        ]
    }
    ###
    """
    extraction_prompt = extraction_prompt_template + f"""
    REAL TEXT:
    ###
    {entry_plain_text}
    ###
    
    REAL OUTPUT:
    """
    extraction = openai.Completion.create(
        engine=engine,
        prompt=extraction_prompt,
        max_tokens=1024,
        temperature=0.1
    )
    entities_text = extraction['choices'][0]['text']
    # The example output in the prompt is wrapped in '###' lines, so the model
    # may echo those delimiters; parse tolerantly instead of a bare json.loads.
    return _parse_entities_json(entities_text)


def _parse_entities_json(completion_text):
    """Decode the JSON value in a completion, tolerating text around the object."""
    try:
        return json.loads(completion_text)
    except json.JSONDecodeError:
        start = completion_text.find('{')
        if start == -1:
            # Nothing that looks like a JSON object: surface the original error.
            raise
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing delimiters or commentary after the object are ignored.
        entities, _ = json.JSONDecoder().raw_decode(completion_text, start)
        return entities

## Old stuff (earlier prompt experiments, kept for reference)

In [278]:
# Top-level version of the extraction prompt: instructions and output schema only,
# followed by the text to analyse.
extraction_prompt_template = """
From the text below, extract the following entities:
Organizations: All platforms, products, companies and institutions mentioned by name. Exclude broad terms such as industries ("Record Labels") that don't refer to specific companies. Exclude names of places and people.
People: All people mentioned by their full name (e.g. John Doe). Exclude people mentioned only by first name (e.g. Peter).
Places: All cities (e.g. Barcelona), regions (e.g. California) and countries (e.g. Italy) mentioned by name.
Tags: At least 20 and ideally 50 of the most relevant, unusual, surprising and salient topics, ideas and themes the author is exploring and writing about. Also include topics that are mentioned in passing, or in relatively short fragments of the text. Make the topic names short and concise (e.g. use "Streaming Economics" instead of "The economics of music streaming")
Format the results as a JSON dictionary containing all of the lists described above:
{
    "organizations": [
        # list of organization entities goes here
    ],
    "people": [
        # list of people entities goes here
    ],
    "places": [
        # list of place entities goes here
    ],
    "tags": [
        # list of tag entities goes here
    ]
}
"""
# NOTE(review): `plain_text` is not assigned at notebook top level in the cells
# visible here (it only exists as a local inside get_entry_plain_text), so this
# f-string presumably relied on leftover kernel state. Re-running this cell also
# appends the text block again because the template is extended in place.
extraction_prompt_template += f"""
Text:###
{plain_text}
###
"""

In [279]:
# Request a completion for the extraction prompt and keep the raw completion text.
# NOTE(review): `extraction_prompt` is not assigned at top level in the cells
# visible here (the preceding cell builds `extraction_prompt_template`);
# confirm which prompt variable was intended.
extraction = openai.Completion.create(
    engine=engine,
    prompt=extraction_prompt,
    max_tokens=1024,
    temperature=0.2
)
entities_text = extraction['choices'][0]['text']

In [286]:
# Parse the raw completion as JSON; this raises if the text is not valid JSON.
entities = json.loads(entities_text)

In [247]:
# Alternative prompt: ask for one flat "entities" list instead of four typed lists.
single_list_prompt = """
INSTRUCTIONS:
From the text below, make a list of all entities mentioned. 
Include all platforms, products, companies and institutions mentioned by name. 
Include all people mentioned. 
Include all places mentioned, including cities (e.g. Barcelona), regions (e.g. California) and countries (e.g. Italy) mentioned by name.
Include all topics, ideas and themes the author is writing about. 
Include topics that are mentioned in passing, or in relatively short fragments of the text.
Make the list long (at least 30 entries and 50 if possible) and exhaustive (include any entities regardless of how often or thoroughly they are mentioned).
Format the results as a JSON dictionary containing a single list of all entities extracted as per above:
{
    "entities": [
        # list of all extracted entities goes here
    ]
}
"""
# NOTE(review): `plain_text` is not assigned at notebook top level in the cells
# visible here, so this f-string presumably relied on leftover kernel state.
single_list_prompt += f"""
TEXT:###
{plain_text}
###
"""

In [248]:
# Run the single-list prompt with temperature 0.9 and a 1500-token completion cap,
# then keep the raw completion text.
# NOTE(review): the saved output of the following print cell stops before the JSON
# is closed; whether the completion or the notebook export cut it off cannot be
# determined from here.
single_list_results = openai.Completion.create(
    engine=engine,
    prompt=single_list_prompt,
    max_tokens=1500,
    temperature=0.9
)
single_list = single_list_results['choices'][0]['text']

In [249]:
# Show the raw completion text of the single-list experiment.
print(single_list)

{
    "entities": [
        "Dominican barber",
        "Nic",
        "Spanish",
        "American",
        "Canarian",
        "Catalan",
        "Spain",
        "Catalonia",
        "United States",
        "Canary Islands",
        "Business",
        "Philosophy of Life",
        "Brian Schmitt",
        "Fort Greene",
        "Prospect Park",
        "Design",
        "Apple",
        "Product Strategy and Mangement",
        "BMAT",
        "Carina",
        "Songtradr",
        "Design Company",
        "Live Experience of Music",
        "Underdogs Function",
        "Asymmetric Bets",
        "Haircut",
        "Racial Stereotyping",
        "Mexicans",
        "LA",
        "Black People",
        "Minz",
        "Korean People",
        "Asian People",
        "Julie",
        "Japanese Mom",
        "Racially Homogeneous Cultures",
        "Island Nations",
        "United Kingdom",
        "Ireland",
        "Japan",
        "Mixed Cultures",
        "Mixed Races",
    

In [281]:
# Second-pass prompt: ask the model to filter previously extracted entity lists
# into people / organizations / places / tags.
filtering_prompt = """
Below are four lists of entities: People, Organizations, Places and Tags.
Filter each of the lists using the following instructions:
- People: keep only entries in the input list that are people referred to exclusively by their full name (e.g. 'John Doe'). Exclude from this list people referred to by their first name only (e.g. 'John').
- Organizations: keep only entries in the input list that refer to specific platforms (e.g. 'Twitter'), companies (e.g. 'Apple'), and institutions (e.g. 'NATO'). Do not include names of industries (e.g. 'Music Publishing') or specific products (e.g. 'iPhone').
- Places: keep only entries in the input list that refer to specific neighborhoods (e.g. 'Eixample'), cities (e.g. 'Barcelona'), regions (e.g. 'Catalonia'), countries (e.g. 'Spain'), continents (e.g. 'Asia') or natural geographical landmarks (e.g. 'The Himalayas').
- Tags: keep only entries in the input list that aren't already included in the lists above.
Capitalize all entries in title case.
Format the results as a JSON dictionary containing all of the lists described above:
{
    "organizations": [
        # list of organization entities goes here
    ],
    "people": [
        # list of people entities goes here
    ],
    "places": [
        # list of place entities goes here
    ],
    "tags": [
        # list of tag entities goes here
    ]
}
"""

# The input lists are the raw completion text held in `entities_text`, inserted verbatim.
# Re-running this cell appends the list block again because the prompt is extended in place.
filtering_prompt += f"""
List:###
{entities_text}
###
"""

In [282]:
# Run the filtering prompt (temperature 0.3, 1024-token cap) and keep the raw completion text.
filtering = openai.Completion.create(
    engine=engine,
    prompt=filtering_prompt,
    max_tokens=1024,
    temperature=0.3
)
filtered_text = filtering['choices'][0]['text']

In [284]:
# Show the raw completion text of the filtering pass.
print(filtered_text)


Filtered List:

{
    "organizations": [
        "Apple",
        "BMAT",
        "Songtradr"
    ],
    "people": [
        "Brian Schmitt",
        "Carina",
        "Minz",
        "Julie",
        "Mel",
        "JJ",
        "Zal"
    ],
    "places": [
        "Dominican Republic",
        "Spain",
        "Catalonia",
        "United States",
        "Canary Islands",
        "The Philippines",
        "Scandinavia",
        "South Korea",
        "Japan"
    ],
    "tags": [
        "Cultures of Me",
        "Underdog Functions",
        "Rabbit Hole",
        "Stereotypes",
        "Beauty",
        "Island Nations",
        "K-Pop",
        "Fandom",
        "World of Warcraft",
        "Metal Music",
        "Tumblr",
        "Emo Music",
        "Middle-Class Upbringing",
        "Financial Struggle",
        "Psychological Struggle",
        "Social Justice",
        "Atheistic Liberals",
        "Journaling",
        "Narratives",
        "Mood Swings",
        "Cycles",

# Entity retrieval/creation and relation creation

In [376]:
# Select the journal entry to tag; fetch its page object and its body text.
# NOTE(review): this reuses the names `entry_id` / `entry` that an earlier cell
# assigns for a different page id.
entry_id = 'cdd12f54517c453bb25538e5fb0d2152'
entry = notion.pages.retrieve(page_id=entry_id)
entry_plain_text = get_entry_plain_text(entry_id)
#print(entry_plain_text)

In [383]:
# Run the extraction on the entry text; the bare expression displays the result.
entity_tags = get_entity_tags(entry_plain_text)
entity_tags

{'organizations': ['BMAT', 'Songtradr', 'Apple'],
 'people': ['Nic', 'Brian Schmitt', 'Carina', 'Minz', 'Julie', 'Mel'],
 'places': ['Fort Greene',
  'Prospect Park',
  'Los Angeles',
  'The Philippines',
  'Scandinavia',
  'South Korea',
  'United Kingdom',
  'Ireland',
  'Japan'],
 'tags': ['Racial Stereotyping',
  'Racism',
  'Genetic Diversity',
  'Beauty Culture',
  'Fandom',
  'Escapism',
  'Middle-Class Upbringing',
  'Financial Struggle',
  'Psychological Struggle',
  'Social Justice',
  'Atheistic Liberals',
  'Mood Swings',
  'American Worldview']}

In [367]:
def get_all_pages(database_id):
    """Collect every page of a Notion database query into one list.

    Queries the database repeatedly, passing the previous response's
    ``next_cursor`` as ``start_cursor`` for as long as ``has_more`` is truthy.

    Args:
        database_id: Id of the database to query.

    Returns:
        list: The ``results`` entries of all responses, in response order.
    """
    collected = []
    cursor_kwargs = {}
    while True:
        response = notion.databases.query(database_id=database_id, **cursor_kwargs)
        collected.extend(response['results'])
        if not response['has_more']:
            return collected
        cursor_kwargs = {'start_cursor': response['next_cursor']}


def get_pages_dict(raw_pages):
    """Index Notion page objects by their title.

    The name of the title property is detected from the first page (the
    property whose ``type`` is ``'title'``). Each page is then keyed by the
    ``plain_text`` of the first fragment of that property.

    Pages without a usable title are skipped. Previously such a page was
    stored under the title of the page processed just before it, replacing
    that page in the result (or raising if it was the first page).

    Args:
        raw_pages: List of page dicts, each with a ``'properties'`` mapping.

    Returns:
        dict: ``{title: page}``; empty when ``raw_pages`` is empty. If several
        pages share a title, the last one wins.

    Raises:
        ValueError: If the first page has no property of type ``'title'``.
    """
    if not raw_pages:
        return {}

    # Get name of title property
    title_property = next(
        (
            p_name
            for p_name, p_values in raw_pages[0]['properties'].items()
            if p_values['type'] == 'title'
        ),
        None,
    )
    if title_property is None:
        raise ValueError("No property of type 'title' found on the first page")

    # Parse all pages into a dict {'page_title': page}
    pages = {}
    for page in raw_pages:
        try:
            name = page['properties'][title_property]['title'][0]['plain_text']
        except (KeyError, IndexError):
            # Empty title list or missing property: there is no key to index by.
            continue
        pages[name] = page
    return pages

In [394]:
# Fetch all pages of the places database and index them by title for lookup by tag name.
raw_pages = get_all_pages(places_id)
pages_dict = get_pages_dict(raw_pages)

In [358]:
# take original entry
# for each autotagged entity type
#  get all existing pages of that type
#  for each autotagged entity of that type
#   if it doesn't have a page, create it
#   add a relation to the page in the original entry

In [409]:
# Build one {'id': ...} relation reference for every extracted place that already
# has a page in `pages_dict`; places without a matching page are left out.
relation_ids = [
    {'id': page['id']}
    for page in map(pages_dict.get, entity_tags['places'])
    if page
]
    

In [415]:
# Merge the matched place pages into the entry's existing 'Places' relation.
# Work on a copy of the property dict so `entry` is not mutated, and key the
# merged list by page id so that re-running this cell (or a tag that resolves to
# an already linked page) cannot add the same relation twice.
prop = dict(entry['properties']['Places'])
merged_relations = {}
for relation in prop['relation'] + relation_ids:
    # setdefault keeps the first occurrence, preserving the existing order.
    merged_relations.setdefault(relation['id'], relation)
prop['relation'] = list(merged_relations.values())
property_updates = {'Places': prop}

In [418]:
# Send `property_updates` to the entry page; the call's return value is displayed as the cell output.
notion.pages.update(page_id=entry_id, properties=property_updates)

{'object': 'page',
 'id': 'cdd12f54-517c-453b-b255-38e5fb0d2152',
 'created_time': '2022-12-30T17:30:00.000Z',
 'last_edited_time': '2022-12-30T18:03:00.000Z',
 'created_by': {'object': 'user',
  'id': 'fef31225-60a6-48ca-9c0a-78051140f2ff'},
 'last_edited_by': {'object': 'user',
  'id': '42bea9d8-261e-416e-a0cd-8e90482f044d'},
 'cover': None,
 'icon': {'type': 'external',
  'external': {'url': 'https://www.notion.so/icons/book-closed_gray.svg'}},
 'parent': {'type': 'database_id',
  'database_id': '66a8cd52-5205-43df-9895-c373627f821d'},
 'archived': False,
 'properties': {'Dots': {'id': '%3EBvH',
   'type': 'relation',
   'relation': [{'id': 'cc8e0758-05f1-43f2-ad04-0b2fed565e69'},
    {'id': '9c1db92a-b364-4534-875f-8a985a39ef91'}],
   'has_more': False},
  'Media': {'id': 'Ajzp',
   'type': 'relation',
   'relation': [],
   'has_more': False},
  'Date': {'id': 'SJ%3AQ',
   'type': 'date',
   'date': {'start': '2022-08-20', 'end': None, 'time_zone': None}},
  'Places': {'id': 'VGoF'

In [416]:
# NOTE(review): the saved output under this cell is a 'Places' relation dict rather
# than a page id string, so the cell was presumably edited after it last ran.
entry_id

{'Places': {'id': 'VGoF',
  'type': 'relation',
  'relation': [{'id': '792abf97-f88d-4c9d-a3f3-60eb4420339b'},
   {'id': '514aa001-a67b-48a8-ab43-9be17d415245'},
   {'id': 'c017eac5-9265-44f1-8ac0-1d744f39146d'},
   {'id': 'd395efbb-947b-4ebb-9943-2e70bc9c2158'},
   {'id': '22c8f5cd-876e-4266-9ec7-9d4c7c9481a2'},
   {'id': 'c189aced-be2d-4967-aa2b-7d1e44186c3c'},
   {'id': '810aec71-49e7-45fb-93ef-5410911a0ed3'},
   {'id': 'f28d8f05-f7c0-4218-a8cb-40c3b0d4aa70'},
   {'id': '68d44bd0-3d53-42b3-98d7-03f2ead4dda3'},
   {'id': '45fa6213-3ba7-45d0-bed6-5cecbf18e268'},
   {'id': '589f3239-50c1-456f-a1c4-e6ae8b4f43f2'},
   {'id': '41d006f8-9c93-4b60-9965-64486ff09efd'},
   {'id': 'c189aced-be2d-4967-aa2b-7d1e44186c3c'},
   {'id': '810aec71-49e7-45fb-93ef-5410911a0ed3'},
   {'id': 'f28d8f05-f7c0-4218-a8cb-40c3b0d4aa70'},
   {'id': '68d44bd0-3d53-42b3-98d7-03f2ead4dda3'},
   {'id': '45fa6213-3ba7-45d0-bed6-5cecbf18e268'},
   {'id': '589f3239-50c1-456f-a1c4-e6ae8b4f43f2'},
   {'id': '41d006f8-9c

In [417]:
# Page id with dashes removed; the saved output shows the 32-character undashed form.
entry_id.replace('-','')

'cdd12f54517c453bb25538e5fb0d2152'

In [354]:
# NOTE(review): `results` is not assigned in any cell visible here; this cell
# presumably relied on leftover kernel state.
len(results)

243