# Notice Knowledge extraction

In [134]:
import json, requests, re
import numpy as np, pandas as pd

import geovpylib.database as db
import geovdata.sparql as sparql
import geovdata.kit as kit

import spacy
import scipy

db.connect_yellow('switzerland_and_beyond')

def ask_ollama(prompt, model='mistral', print_prompt=False):

    if print_prompt:
        print()
        print('===== Prompt sent to', model, '=====')
        print(prompt)
        print('====================================')

    url='http://localhost:11434/api/generate'
    response = requests.post(url, json={'model':model,'prompt':prompt, 'option':{'temperature':0}})
    text = response.text.strip()
    lines = text.split('\n')
    tokens = list(map(lambda line: json.loads(line)['response'], lines))
    formated = ''.join(tokens)
    answer = formated.strip()
    return answer


nlp = spacy.load('fr_core_news_lg')

[DB] Requests will not be executed
[DB] Connecting to YELLOW database "switzerland_and_beyond" ... Connected!


## Test data

In [135]:
person_nb = 52
person = db.query(f'select * from hls.person where id = {person_nb}').iloc[0]
notice = person['notice']
person_name = person['name']
doc = nlp(notice)

print(kit.wrap(notice))

Naît le 26.12.1777 à Plaisance, meurt après 1843, catholique, de Plaisance. Fils de Gaspare, peintre. Pietro Antonio Landi résida tantôt à Plaisance,
tantôt à Rome jusqu'à l'âge de 30 ans, puis s'établit à Milan où il s'associa au typographe Visai. En 1817, il ouvrit une typographie à Mendrisio, qui
resta en activité quelques mois seulement et fut le lieu de rencontre d'exilés italiens (d'anciens officiers napoléoniens). Ceci déplut aux autorités
autrichiennes de la Lombardie qui firent pression sur le gouvernement tessinois pour qu'il mette Pietro Antonio Landi sous surveillance. Accusé de ne
pas avoir respecté l'interdiction de publier des opuscules politiques, il dut subir un procès et des perquisitions. En octobre 1817, le gouvernement
ordonna la fermeture de l'imprimerie et l'expulsion de son propriétaire.


## Transform the texts into pseudo structured data

In [154]:
response = ask_ollama(f"""
Provide me all the statements you understand from the following text.
Statements should be short, concise like RDF triples.
If date are provided, they should have the following format: day.month.year
Nothing should be implied.
                    
Text: "About {person_name}. {notice}"
""")
print(response)

1. Pietro Antonio Landi was born on 26.12.1777 in Plaisance.
2. Pietro Antonio Landi died after 1843.
3. Pietro Antonio Landi was Catholic and from Plaisance.
4. Pietro Antonio Landi is the son of Gaspare, a painter.
5. Pietro Antonio Landi lived in Plaisance and Rome until the age of 30.
6. Pietro Antonio Landi established a typography in Mendrisio in 1817.
7. Pietro Antonio Landi's typography in Mendrisio was active for only a few months.
8. Pietro Antonio Landi's typography in Mendrisio was a meeting place for Italian exiles.
9. Austrian authorities put pressure on the Tessinian government to monitor Pietro Antonio Landi.
10. Pietro Antonio Landi was accused of publishing political opuscules despite an interdiction.
11. Pietro Antonio Landi underwent a trial and searches.
12. The government ordered the closure of Pietro Antonio Landi's typography in October 1817.
13. Pietro Antonio Landi was expelled from Mendrisio.


## Extract statements from LLM

In [155]:
statements = response.split("\n")
statements = [text[text.find(' ') + 1:] for text in statements]

for statement in statements:
    print(statement)

Pietro Antonio Landi was born on 26.12.1777 in Plaisance.
Pietro Antonio Landi died after 1843.
Pietro Antonio Landi was Catholic and from Plaisance.
Pietro Antonio Landi is the son of Gaspare, a painter.
Pietro Antonio Landi lived in Plaisance and Rome until the age of 30.
Pietro Antonio Landi established a typography in Mendrisio in 1817.
Pietro Antonio Landi's typography in Mendrisio was active for only a few months.
Pietro Antonio Landi's typography in Mendrisio was a meeting place for Italian exiles.
Austrian authorities put pressure on the Tessinian government to monitor Pietro Antonio Landi.
Pietro Antonio Landi was accused of publishing political opuscules despite an interdiction.
Pietro Antonio Landi underwent a trial and searches.
The government ordered the closure of Pietro Antonio Landi's typography in October 1817.
Pietro Antonio Landi was expelled from Mendrisio.


## Get main information about a statement

### Attempt 1

In [138]:
# Attempt 1:
# Here we try to find the main information in the text by doing the cosine similarity with some predefined words.

nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("merge_entities")

infos = ["Birth", "Death", "Marriage", "Study", "Family"]
infos_embeddings = []
for info in infos:
    infos_embeddings.append(nlp(info).vector)


for statement in statements:
    doc = nlp(statement)    
    embedding = doc.vector

    cosines_similarities = [
        {
            'index': i,
            'similarity': 1 - float(scipy.spatial.distance.cosine(embedding, info_emb))
        }
        for i, info_emb in enumerate(infos_embeddings)
    ]

    cosines_similarities.sort(key=lambda calc: calc['similarity'], reverse=True)

    print(infos[cosines_similarities[0]['index']], f'(score:{round(cosines_similarities[0]['similarity']*100)})', " ---> " ,statement)
    

Marriage (score:17)  --->  Pietro Antonio Landi was born on 26.12.1777 in Plaisance.
Death (score:20)  --->  Landi died after 1843.
Marriage (score:29)  --->  Landi was Catholic and from Plaisance.
Marriage (score:22)  --->  Landi's father, Gaspare, was a painter.
Marriage (score:25)  --->  Landi lived in Plaisance and Rome until the age of 30.
Marriage (score:21)  --->  Landi then established a typography in Mendrisio in 1817.
Marriage (score:26)  --->  The Mendrisio typography was a gathering place for Italian exiles.
Marriage (score:25)  --->  Authorities in Lombardy disapproved and pressured the Tessin government to monitor Landi.
Marriage (score:24)  --->  Landi was accused of publishing political opuscules despite an interdiction.
Marriage (score:24)  --->  Landi underwent a trial and house searches.
Marriage (score:28)  --->  In October 1817, the government ordered the closure of Landi's typography and his expulsion.


___

### Attempt2

In [139]:
def extract_birth(doc):
    print("Extract Birth from:", doc)
    
    graph = []

    # Extract person
    person = list(filter(lambda entity: entity.label_ == "PERSON", doc.ents))[0]
    if person: graph += [(f"Birth", "brought into life", person.text)]

    # Extract date
    date_pattern = r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b'
    dates = re.findall(date_pattern, str(doc))
    if len(dates) > 0:
        day, month, year = dates[0]
        graph += [(f"Birth", "at some time within", (int(day), int(month), int(year)))]

    # Extract place
    place = list(filter(lambda entity: entity.label_ == "GPE", doc.ents))[0]
    if place: graph += [(f"Birth", "took place at", place.text)]

    return graph


def extract_death(doc):
    print("Extract Death from:", doc)

    graph = []

    # Extract person
    persons = list(filter(lambda entity: entity.label_ == "PERSON", doc.ents))
    if len(persons) > 0:
        graph += [("Death", "was death of", persons[0].text)]

    # Extract date
    date_pattern = r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b'
    dates = re.findall(date_pattern, str(doc))
    if len(dates) > 0:
        day, month, year = dates[0]
        graph += [("Death", "at some time within", (int(day), int(month), int(year)))]

    # Extract place
    places = list(filter(lambda entity: entity.label_ == "GPE", doc.ents))
    if len(places) > 0: 
        graph += [("Death", "took place at", places[0].text)]

    return graph



def extract_marriage(doc):
    print("Extract Marriage from:", doc)

    graph = []

    # Extract persons
    persons = list(filter(lambda entity: entity.label_ == "PERSON", doc.ents))
    if persons: 
        graph += [("Union", "has partner", persons[1].text)]
        graph += [("Union", "has partner", persons[0].text)]

    return graph



def extract_parents(doc):
    print("Extract Parent from:", doc)

    graph = []
    matcher = spacy.matcher.Matcher(nlp.vocab)

    pattern_child = [{'LEMMA':'parent'}, {'LEMMA':'of'}, {"ENT_TYPE": "PERSON"}]
    pattern_parent1 = [{'LEMMA':'be'}, {"ENT_TYPE": "PERSON"}]
    pattern_parent2 = [{'LEMMA':'and'}, {"ENT_TYPE": "PERSON"}]
    matcher.add('CHILD', [pattern_child])
    matcher.add('PARENT', [pattern_parent1, pattern_parent2])

    matchings = matcher(doc)
    childs = []
    parents = []
    for id, start, end in matchings:
        if doc.vocab.strings[id] == 'CHILD': childs.append(str(doc[start+2:end]))
        if doc.vocab.strings[id] == 'PARENT': parents.append(str(doc[start+1:end]))


    for parent in parents: 
        graph += [("Union", "has partner", parent)]

    graph += [('Birth', 'stemmed from', 'Union')]

    for child in childs: 
        graph += [("Birth", "brought into life", child)]
    
    return graph


In [140]:

def extract_infos(info_name, spacy_doc):
    if info_name == 'Birth': return extract_birth(spacy_doc)
    if info_name == 'Death': return extract_death(spacy_doc)
    if info_name == 'Marriage': return extract_marriage(spacy_doc)
    if info_name == 'Parent': return extract_parents(spacy_doc)

    return []

In [141]:
# Attempt 2:
# Using document similarities

nlp = spacy.load('en_core_web_lg')
nlp.add_pipe("merge_entities")
docs = nlp.pipe(statements)

witnesses = [
    {'info': 'Birth', 'text': 'Someone is born in a place on a date.'},
    {'info': 'Birth', 'text': 'John Doe is born in New York on 1.1.2020.'},
    {'info': 'Death', 'text': 'John Doe died in New York on 1.1.2020.'},
    {'info': 'Parent', 'text': 'John Doe\'s parents are Jeane and Martin.'},
    {'info': 'Parent', 'text': 'Parents of John Doe are Jeane and Martin.'},
    {'info': 'Parent', 'text': 'Martin is John\'s father.'},
    {'info': 'Parent', 'text': 'John Doe is the son of Martin.'},
    {'info': 'Parent', 'text': 'Jeane is John\'s mother.'},
    {'info': 'Parent', 'text': 'John Doe is the son of Jeanne.'},
    {'info': 'Marriage', 'text': 'John Doe married Eva.'},
    {'info': 'Marriage', 'text': 'Eva is the spouse of'},
    {'info': 'Occupation', 'text': 'John Doe was a carpenter.'},
    {'info': 'Occupation', 'text': 'John Doe was a developer.'},
    {'info': 'Occupation', 'text': 'John Doe was a builder.'},
    {'info': 'Occupation', 'text': 'John Doe commanded a special unit of the Swiss army.'},
    {'info': 'Occupation', 'text': 'John Doe was in the army.'},
    {'info': 'Religion', 'text': 'John Doe was a Muslim.'},
    {'info': 'Religion', 'text': 'John Doe was a Jew.'},
    {'info': 'Religion', 'text': 'John Doe was a Christian.'},
    {'info': 'Religion', 'text': 'John Doe was a Catholic.'},
    {'info': 'Religion', 'text': 'John Doe was a Protestant.'},
    {'info': 'Study', 'text': 'John Doe studied architecture in New York.'},
    {'info': 'Study', 'text': 'John Doe studied in England.'},
    {'info': 'Study', 'text': 'John Doe studied at INSA Strasbourg.'},
    {'infos': 'Birth', 'text': 'A person is born in a place'},
    {'infos': 'Birth', 'text': 'A person is born on a date'},
    {'infos': 'Birth', 'text': 'A person is born in a place on a date'},
    {'infos': 'Death', 'text': 'A person died in a place'},
    {'infos': 'Death', 'text': 'A person died on a date'},
    {'infos': 'Death', 'text': 'A person died in a place on a date'}
]

for witness in witnesses:
    witness['doc'] = nlp(witness['text'])

graph = []

for doc in nlp.pipe(statements):
    best_score = 0
    best_info = ''
    for witness in witnesses:
        score = doc.similarity(witness['doc'])
        if score > best_score:
            best_score = score
            best_info = witness['info']

    graph += extract_infos(best_info, doc)

    # print(best_info, f'(score: {(round(best_score*100))})', doc)


for triplet in graph:
    print(triplet)

Extract Birth from: Pietro Antonio Landi was born on 26.12.1777 in Plaisance.


IndexError: list index out of range

___

### Attempt 3

In [158]:
# Attempt 3: Use spacy and text analysis to detect the main info in a statement

index = 0
def get_index():
    global index
    index += 1
    return index

def detect_information(doc):
    information = []
    for info_type in knex_config:
        detection = info_type['detection']
        for exact_word_lower in detection['exact_word_lower']:
            if exact_word_lower in doc.text.lower(): information.append(info_type)

    if len(information) == 0: return {'info_name': 'Unknown'}
    if len(information) == 1: return information[0]
    if len(information) > 1: return {'info_name': 'Multiple'}


def parse_date(number):
    splitted = number.split('.')
    if len(splitted) == 3: return (int(splitted[0]), int(splitted[1]), int(splitted[2]))
    return int(number)


def extract_birth(doc):
    birth_index = get_index()
    # Extract persons
    persons = list(filter(lambda entity: entity.label_ == "PERSON", doc.ents))
    if len(persons) > 0: yield (f"Birth ({birth_index})", "brought into life", persons[0].text)
    # Extract dates
    numbers = list(filter(lambda token: token.pos_ == "NUM", doc))
    if len(numbers) > 0: yield (f"Birth ({birth_index})", "at some time within", parse_date(numbers[0].text))
    # Extract places
    gpes = list(filter(lambda entity: entity.label_ == "GPE", doc.ents))
    if len(gpes) > 0: yield (f"Birth ({birth_index})", "took place at", gpes[0].text)


def extract_death(doc):
    death_index = get_index()
    # Extract persons
    persons = list(filter(lambda entity: entity.label_ == "PERSON", doc.ents))
    if len(persons) > 0: yield (f"Death ({death_index})", "was death of", persons[0].text)
    # Extract dates
    numbers = list(filter(lambda token: token.pos_ == "NUM", doc))
    if len(numbers) > 0: yield (f"Death ({death_index})", "at some time within", parse_date(numbers[0].text))
    # Extract places
    gpes = list(filter(lambda entity: entity.label_ == "GPE", doc.ents))
    if len(gpes) > 0: yield (f"Death ({death_index})", "took place at", gpes[0].text)


def extract_confession(doc):
    religion_index = get_index()
    # Extract persons
    persons = list(filter(lambda entity: entity.label_ == "PERSON", doc.ents))
    if len(persons) > 0: yield (f"Religious entity ({religion_index})", "pertains to", persons[0].text)
    # Extract religion
    religions = list(filter(lambda entity: entity.label_ == "NORP", doc.ents))
    if len(religions) > 0: yield (f"Religious entity ({religion_index})", "has name", religions[0].text)


def extract_parents(doc):

    print("parents", doc.text)

    matcher = spacy.matcher.Matcher(nlp.vocab)

    # Define pattenrs
    pattern_child = [{'LEMMA':'parent'}, {'LEMMA':'of'}, {"ENT_TYPE": "PERSON"}]
    pattern_parent1 = [{'LEMMA':'be'}, {"ENT_TYPE": "PERSON"}]
    pattern_parent2 = [{'LEMMA':'and'}, {"ENT_TYPE": "PERSON"}]

    # Search for the patterns
    matcher.add('CHILD', [pattern_child])
    matcher.add('PARENT', [pattern_parent1, pattern_parent2])
    matchings = matcher(doc)

    # Extract persons
    childs = []
    parents = []
    for id, start, end in matchings:
        if doc.vocab.strings[id] == 'CHILD': childs.append(str(doc[start+2:end]))
        if doc.vocab.strings[id] == 'PARENT': parents.append(str(doc[start+1:end]))

    if len(parents) > 0:
        union_index = get_index()

    for parent in parents: 
        yield (f"Union ({union_index})", "has partner", parent)

    for child in childs: 
        birth_index = get_index()
        yield (f"Birth ({birth_index})", "brought into life", child)
        yield (f"Birth ({birth_index})", "stemmed from", f"Union ({union_index})")


def extract_son_daughter_of(doc):

    print('son_daughter', doc.text)

    matcher = spacy.matcher.Matcher(nlp.vocab)

    # Define patterns
    pattern_child = [{"ENT_TYPE": "PERSON"}, {'LEMMA': 'be'}]
    pattern_parent1 = [{'LEMMA': {'IN': ['son', 'daughter']}}, {'LEMMA':'of'}, {"ENT_TYPE": "PERSON"}]
    pattern_parent2 = [{'LEMMA':'and'}, {"ENT_TYPE": "PERSON"}]

    # Search for the patterns
    matcher.add('CHILD', [pattern_child])
    matcher.add('PARENT', [pattern_parent1, pattern_parent2])
    matchings = matcher(doc)

    # Extract persons
    childs = []
    parents = []
    for id, start, end in matchings:
        if doc.vocab.strings[id] == 'CHILD': childs.append(str(doc[start:end]))
        if doc.vocab.strings[id] == 'PARENT': parents.append(str(doc[start+1:end]))

    if len(parents) > 0:
        union_index = get_index()

    for parent in parents: 
        yield (f"Union ({union_index})", "has partner", parent)

    for child in childs: 
        birth_index = get_index()
        yield (f"Birth ({birth_index})", "brought into life", child)
        yield (f"Birth ({birth_index})", "stemmed from", f"Union ({union_index})")



knex_config =[
        {
            'info_name':'BIRTH',
            'detection': {
                'exact_word_lower': ['birth', 'born'], 
            },
            'extraction': {
                'function': extract_birth
            },
        },
        {
            'info_name': 'DEATH',
            'detection': {
                'exact_word_lower': ['death', 'died'], 
            },
            'extraction': {
                'function': extract_death
            },
        },
        {
            'info_name': 'CONFESSION',
            'detection': {
                'exact_word_lower': ['catholic', 'protestant'], 
            },
            'extraction': {
                'function': extract_confession
            },
        },
        {
            'info_name': 'PARENT',
            'detection': {
                'exact_word_lower': ['parents'], 
            },
            'extraction': {
                'function': extract_parents
            },
        },
        {
            'info_name': 'SON_DAUGHTER',
            'detection': {
                'exact_word_lower': ['son', 'daughter'], 
            },
            'extraction': {
                'function': extract_son_daughter_of
            },
        }
    ]

graph = []
for doc in nlp.pipe(statements):
    print(doc, end=' ')
    infos = detect_information(doc)
    print('==>', infos['info_name'])
    if infos['info_name'] != 'Unknown' and infos['info_name'] != 'Multiple':
        new_graph = infos['extraction']['function'](doc)
        for triple in new_graph:
            print(triple)
        graph += new_graph
    

Pietro Antonio Landi was born on 26.12.1777 in Plaisance. ==> BIRTH
('Birth (1)', 'brought into life', 'Pietro Antonio Landi')
('Birth (1)', 'at some time within', (26, 12, 1777))
Pietro Antonio Landi died after 1843. ==> DEATH
('Death (2)', 'was death of', 'Pietro Antonio Landi')
('Death (2)', 'at some time within', 1843)
Pietro Antonio Landi was Catholic and from Plaisance. ==> CONFESSION
('Religious entity (3)', 'pertains to', 'Pietro Antonio Landi')
('Religious entity (3)', 'has name', 'Catholic')
Pietro Antonio Landi is the son of Gaspare, a painter. ==> SON_DAUGHTER
son_daughter Pietro Antonio Landi is the son of Gaspare, a painter.
('Union (4)', 'has partner', 'of Gaspare')
('Birth (5)', 'brought into life', 'Pietro Antonio Landi is')
('Birth (5)', 'stemmed from', 'Union (4)')
Pietro Antonio Landi lived in Plaisance and Rome until the age of 30. ==> Unknown
Pietro Antonio Landi established a typography in Mendrisio in 1817. ==> Unknown
Pietro Antonio Landi's typography in Mendri

In [159]:
doc2 = nlp('Pietro Antonio Landi was Catholic and from Plaisance')
for token in doc2:
    print(token.text, token.pos_, token.lemma_)

for entity in doc2.ents:
    print(entity.text, entity.label_)

Pietro Antonio Landi PROPN Pietro Antonio Landi
was AUX be
Catholic ADJ catholic
and CCONJ and
from ADP from
Plaisance NOUN plaisance
Pietro Antonio Landi PERSON
Catholic NORP
Plaisance LOC
