# Knowledge Extraction

In [21]:
import json, requests, re
import numpy as np, pandas as pd

import geovpylib.database as db
import geovdata.sparql as sparql
import geovdata.kit as kit

import spacy
import scipy

# Open the connection to the "switzerland_and_beyond" YELLOW database;
# the db.query(...) call further down goes through the `db` module.
db.connect_yellow('switzerland_and_beyond')

def ask_ollama(prompt, model='mistral'):
    """Send `prompt` to the local Ollama server and return the generated text.

    Args:
        prompt: Text sent as the `prompt` field of the generate request.
        model: Name of the Ollama model to query.

    Returns:
        The concatenation of every `response` fragment of the reply, with
        leading and trailing whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        RuntimeError: If a reply chunk carries an `error` field.
    """
    url = 'http://localhost:11434/api/generate'

    # The request field is `options` (plural). It was previously spelled
    # `option`, so the temperature setting was not sent under the right key.
    payload = {'model': model, 'prompt': prompt, 'options': {'temperature': 0}}
    response = requests.post(url, json=payload)
    response.raise_for_status()

    # The reply is parsed as newline-delimited JSON, one object per fragment.
    # Raw bytes are handed to json.loads so accented characters do not depend
    # on the encoding guessed for `response.text`.
    fragments = []
    for line in response.iter_lines():
        if not line:
            continue  # tolerate blank separator lines
        chunk = json.loads(line)
        if 'error' in chunk:
            raise RuntimeError(f"Ollama error: {chunk['error']}")
        fragments.append(chunk.get('response', ''))

    return ''.join(fragments).strip()


# Load the large English spaCy pipeline, then append spaCy's built-in
# "merge_entities" component so that each named entity becomes a single token.
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("merge_entities")

[DB] Requests will not be executed
[DB] Connecting to YELLOW database "switzerland_and_beyond" ... Connected!


<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

# Raw text

In [2]:
# Fetch every row of the hls.person table.
persons = db.query(f'select * from hls.person')

# Pick one person at random. No seed is passed to sample(), so re-running
# this cell may select a different person (and thus a different notice).
person = persons.sample(1).iloc[0]
name = person['name']
notice = person['notice']

# Display the notice through kit.wrap (presumably wraps long lines for
# display -- confirm in geovdata.kit).
print(kit.wrap(notice))

Naît le 19.11.1932 à Rupperswil, meurt le 5.11.1966 à Zurich, d'Untersiggenthal. Fils d'Ernst Friedrich, maître d'école secondaire, et de Margrit
Kyburz. Marié(e) à Anna Maria Chabloz, fille de Jean Pierre. Etudes d'architecture à l'EPF de Zurich, diplôme en 1958. Séjour de plusieurs mois à
Stockholm (1958-1959) et au Brésil (en 1959, puis vers 1962). Hans Ulrich Scherer est connu pour avoir eu l'idée pionnière de quartiers d'habitation
implantés en terrasse, à flanc de coteau. Il établit les plans et réalisa, avec ses partenaires, des quartiers modèles à Klingnau, Umiken et
Oberrohrdorf. Ses réalisations et ses articles spécialisés sont à l'origine de l'intérêt marqué des architectes suisses pour les quartiers en
terrasse.


# Pseudo structured data

In [3]:
# Ask the LLM to restate the notice as short, triple-like statements.
# The person's name is prepended ("About {name}.") to the notice text, and
# dates are requested in day.month.year form, which is the format
# parse_date() understands further down.
llm_response = ask_ollama(f"""
Provide me all the statements you understand from the following text.
Statements should be short, concise like RDF triples.
If date are provided, they should have the following format: day.month.year
Nothing should be implied.
                    
Text: "About {name}. {notice}"
""")
# print(llm_response)

In [4]:
# Turn the LLM answer into one clean assertion per non-empty line.
# Only a leading list marker ("1.", "2)", "-", "*", "•") is stripped. The
# former "drop everything up to the first space" rule also removed the first
# word of any line that carried no marker, and kept blank lines as empty
# assertions.
list_marker = re.compile(r'^\s*(?:\d+[.)]|[-*•])\s+')
assertions = [
    list_marker.sub('', line).strip()
    for line in llm_response.split("\n")
    if line.strip()
]

for assertion in assertions:
    print(assertion)

Hans Ulrich Scherer was born on 19.11.1932 in Rupperswil.
Hans Ulrich Scherer died on 5.11.1966 in Zurich.
Hans Ulrich Scherer's parents were Ernst Friedrich, a secondary school teacher, and Margrit Kyburz.
Hans Ulrich Scherer married Anna Maria Chabloz, daughter of Jean Pierre.
Hans Ulrich Scherer studied architecture at the EPF Zurich and graduated in 1958.
Hans Ulrich Scherer spent several months in Stockholm between 1958 and 1959.
Hans Ulrich Scherer spent time in Brazil around 1959 and sometime around 1962.
Hans Ulrich Scherer is known for his pioneering idea of housing estates built on terraces, on the side of a slope.
Hans Ulrich Scherer designed and implemented model housing estates in Klingnau, Umiken and Oberrohrdorf.
Hans Ulrich Scherer's works and specialized articles influenced the interest of Swiss architects in terraced housing estates.


# Extract structured data

## Code the extraction

**Tool functions**

In [9]:
# Shared counter behind get_index(); it always holds the last value handed out.
index = 0
def get_index():
    """Advance the module-level counter by one and return the new value."""
    global index
    next_value = index + 1
    index = next_value
    return next_value


# Parse a date: 31.12.2020 => (2020, 12, 31)
def parse_date(number):
    """Parse a `day.month.year` date or a bare year.

    Args:
        number: Text such as "31.12.2020" or "1958". Surrounding whitespace
            and a trailing sentence period are ignored.

    Returns:
        A `(year, month, day)` tuple of ints for a three-part date, or a
        single int when only one number (a year) is given.

    Raises:
        ValueError: If the text is neither of the two supported forms.
    """
    # A date closing a sentence may still carry the final period.
    text = number.strip().rstrip('.')
    parts = text.split('.')

    try:
        values = [int(part) for part in parts]
    except ValueError:
        raise ValueError(f"Cannot parse a date from {number!r}") from None

    if len(values) == 3:
        day, month, year = values
        return (year, month, day)
    if len(values) == 1:
        return values[0]

    # Two or 4+ components are ambiguous (e.g. a decimal number): refuse them.
    raise ValueError(f"Cannot parse a date from {number!r}")


**BIRTHS**

In [17]:
def handle_birth(doc, verbose=False):
    """Yield the triples describing a birth mentioned in `doc`.

    This is a generator. It yields nothing when no birth keyword is found;
    otherwise it yields up to three `(subject, predicate, object)` tuples
    sharing one "Birth (n)" subject:

    * "brought into life"   -> the first PERSON entity,
    * "at some time within" -> the first NUM token that parse_date() accepts,
    * "took place at"       -> the first GPE entity.

    Args:
        doc: Iterable of tokens exposing `text`, `lemma_` and `pos_`, with an
            `ents` attribute whose items expose `label_` and `text` (a spaCy
            Doc in this notebook).
        verbose: When true, print what was detected.
    """

    # Detection parameters (matched case-insensitively)
    exact_words = ['birth', 'born']
    lemmas = ['born', 'birth']

    # Detection. `token.lemma_` is the lemma text; the bare `token.lemma`
    # used before is an integer id and could never match a string.
    mentioned = any(
        token.text.lower() in exact_words or token.lemma_.lower() in lemmas
        for token in doc
    )
    if not mentioned:
        return  # generator: a bare return simply yields nothing

    # Extraction
    if verbose: print('BIRTH')
    pk_birth = get_index()

    # Extract persons
    persons = [entity for entity in doc.ents if entity.label_ == "PERSON"]
    if persons:
        if verbose: print("-- Person(s) found:", persons)
        pk_person = get_index()
        yield (f"Birth ({pk_birth})", "brought into life", f"{persons[0].text} ({pk_person})")

    # Extract dates: take the first number that actually parses, so that a
    # stray numeral ahead of the date does not abort the whole extraction.
    numbers = [token for token in doc if token.pos_ == "NUM"]
    if numbers:
        if verbose: print("-- Number(s) found:", numbers)
        for number in numbers:
            try:
                date = parse_date(number.text)
            except ValueError:
                continue
            yield (f"Birth ({pk_birth})", "at some time within", date)
            break

    # Extract places
    gpes = [entity for entity in doc.ents if entity.label_ == "GPE"]
    if gpes:
        if verbose: print("-- GPE(s) found:", gpes)
        pk_place = get_index()
        yield (f"Birth ({pk_birth})", "took place at", f"{gpes[0].text} ({pk_place})")

**DEATH**

In [15]:
def handle_death(doc, verbose=False):
    """Yield the triples describing a death mentioned in `doc`.

    This is a generator. It yields nothing when no death keyword is found;
    otherwise it yields up to three `(subject, predicate, object)` tuples
    sharing one "Death (n)" subject:

    * "was death of"        -> the first PERSON entity,
    * "at some time within" -> the first NUM token that parse_date() accepts,
    * "took place at"       -> the first GPE entity.

    Args:
        doc: Iterable of tokens exposing `text`, `lemma_` and `pos_`, with an
            `ents` attribute whose items expose `label_` and `text` (a spaCy
            Doc in this notebook).
        verbose: When true, print what was detected.
    """

    # Detection parameters (matched case-insensitively)
    exact_words = ['death', 'died']
    lemmas = ['died', 'die', 'death']

    # Detection. `token.lemma_` is the lemma text; the bare `token.lemma`
    # used before is an integer id and could never match a string.
    mentioned = any(
        token.text.lower() in exact_words or token.lemma_.lower() in lemmas
        for token in doc
    )
    if not mentioned:
        return  # generator: a bare return simply yields nothing

    # Extraction
    if verbose: print('DEATH')
    pk_death = get_index()

    # Extract persons
    persons = [entity for entity in doc.ents if entity.label_ == "PERSON"]
    if persons:
        if verbose: print("-- Person(s) found:", persons)
        pk_person = get_index()
        yield (f"Death ({pk_death})", "was death of", f"{persons[0].text} ({pk_person})")

    # Extract dates: take the first number that actually parses, so that a
    # stray numeral ahead of the date does not abort the whole extraction.
    numbers = [token for token in doc if token.pos_ == "NUM"]
    if numbers:
        if verbose: print("-- Number(s) found:", numbers)
        for number in numbers:
            try:
                date = parse_date(number.text)
            except ValueError:
                continue
            yield (f"Death ({pk_death})", "at some time within", date)
            break

    # Extract places
    gpes = [entity for entity in doc.ents if entity.label_ == "GPE"]
    if gpes:
        if verbose: print("-- GPE(s) found:", gpes)
        pk_place = get_index()
        yield (f"Death ({pk_death})", "took place at", f"{gpes[0].text} ({pk_place})")

**CONFESSION**

In [33]:
def handle_confession(doc, verbose=False):
    """Extract a "<confession> pertains to <person>" triple from `doc`.

    Args:
        doc: Iterable of tokens exposing `text` and `lemma_`, with an `ents`
            attribute whose items expose `label_` and `text` (a spaCy Doc in
            this notebook).
        verbose: When true, print what was detected.

    Returns:
        A list holding exactly one `(subject, predicate, object)` triple when
        one confession and one person are found, otherwise an empty list.
        A list (rather than a bare tuple) is returned so that
        `graph += handle_confession(doc)` adds one triple instead of three
        separate strings.
    """

    known_confessions = ['catholic', 'protestant']

    # Detection (case-insensitive, on either the surface form or the lemma)
    mentioned = any(
        token.text.lower() in known_confessions
        or token.lemma_.lower() in known_confessions
        for token in doc
    )
    if not mentioned: return []

    # Extraction
    if verbose: print('CONFESSION')

    # Extract confession. Any NORP entity used to be accepted here; only the
    # ones naming a known confession are kept, so that an unrelated NORP in
    # the same sentence is not reported as the confession.
    confessions = [
        entity for entity in doc.ents
        if entity.label_ == "NORP"
        and any(known in entity.text.lower() for known in known_confessions)
    ]
    if verbose and confessions: print("-- NORP(s) found:", confessions)

    # Extract person
    persons = [entity for entity in doc.ents if entity.label_ == "PERSON"]
    if verbose and persons: print("-- Person(s) found:", persons)

    # Return
    if len(confessions) == 1 and len(persons) == 1:
        # Indices are only drawn once the triple is known to be emitted.
        pk_confession = get_index()
        pk_person = get_index()
        return [(f"{confessions[0].text} ({pk_confession})", "pertains to", f"{persons[0].text} ({pk_person})")]

    print('---')
    print('Error in extracting confession')
    print('Doc:', doc)
    print('Confessions:', confessions)
    print('Persons:', persons)
    print('---')
    return []

## Extract the graph

In [34]:
graph = []

# Handlers are applied to every assertion in this order, each paired with
# the verbosity flag it is called with.
handlers = (
    (handle_birth, False),
    (handle_death, False),
    (handle_confession, True),
)

for doc in nlp.pipe(assertions):
    print(doc)
    for handler, verbose in handlers:
        graph.extend(handler(doc, verbose))


Hans Ulrich Scherer was born on 19.11.1932 in Rupperswil.
Hans Ulrich Scherer died on 5.11.1966 in Zurich.
Hans Ulrich Scherer's parents were Ernst Friedrich, a secondary school teacher, and Margrit Kyburz.
Hans Ulrich Scherer married Anna Maria Chabloz, daughter of Jean Pierre.
Hans Ulrich Scherer studied architecture at the EPF Zurich and graduated in 1958.
Hans Ulrich Scherer spent several months in Stockholm between 1958 and 1959.
Hans Ulrich Scherer spent time in Brazil around 1959 and sometime around 1962.
Hans Ulrich Scherer is known for his pioneering idea of housing estates built on terraces, on the side of a slope.
Hans Ulrich Scherer designed and implemented model housing estates in Klingnau, Umiken and Oberrohrdorf.
Hans Ulrich Scherer's works and specialized articles influenced the interest of Swiss architects in terraced housing estates.


In [35]:
# Show the extracted graph, one triple per line (nothing when it is empty).
if graph:
    print('\n'.join(str(triplet) for triplet in graph))

('Birth (48)', 'brought into life', 'Hans Ulrich Scherer (49)')
('Birth (48)', 'at some time within', (1932, 11, 19))
('Death (50)', 'was death of', 'Hans Ulrich Scherer (51)')
('Death (50)', 'at some time within', (1966, 11, 5))
('Death (50)', 'took place at', 'Zurich (52)')
