# POC: Extract information from notices

In [2]:
# %load /Users/gaetanmuck/Development/geovpylib/templates/heading-admin.py
%load_ext autoreload
%autoreload 2

# Common imports
import os
import pandas as pd, numpy as np
import datetime
# import math
#import time
import json
import requests
#import duckdb
#import plotly.express as px
# from multiprocessing import Pool

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.decorators as d
import geovpylib.importer as i
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.queries as q
import geovpylib.record_linkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u
# Progress tracker; the extraction loop further down drives it through begin() / iter() / end()
eta = u.Eta()

# Specific imports
# ...

# Global variables
# ...

db_url_env_var_name = 'YELLOW_SWITZERLAND_AND_BEYOND' # Name of the environment variable that holds the Postgres database URL
execute = True # Forwarded to db.connect_external(execute=...); presumably False keeps statements from being run against the database - TODO confirm in geovpylib


In [3]:
def ask_ollama(prompt, model='mistral'):
    """Send a prompt to the local Ollama server and return the generated text.

    Args:
        prompt: Text sent as the 'prompt' field of the request.
        model: Name of the model to query (defaults to 'mistral').

    Returns:
        The 'response' fragments of every JSON line of the reply, joined
        together and stripped of leading/trailing whitespace.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        json.JSONDecodeError: if a non-blank line of the reply is not valid JSON.
    """
    url = 'http://localhost:11434/api/generate'
    response = requests.post(url, json={'model': model, 'prompt': prompt})

    # Surface server-side failures (unknown model, server error, ...) as an
    # explicit HTTP error instead of an opaque KeyError while parsing below.
    response.raise_for_status()

    # The reply is parsed as one JSON document per line. Work on the raw bytes:
    # json.loads decodes UTF-8 by itself, whereas response.text depends on the
    # charset guessed by requests, which can corrupt accented characters.
    fragments = []
    for raw_line in response.content.splitlines():
        if not raw_line.strip():
            continue  # tolerate blank separator lines
        fragments.append(json.loads(raw_line).get('response', ''))

    return ''.join(fragments).strip()


def until_comma_or_point_or_bracket(string):
    """Return the start of `string`, cut before the first ',', '.' or '('.

    Args:
        string: Text to truncate.

    Returns:
        The part of `string` that precedes the earliest of the three
        delimiters, or the whole string when none of them is present.
        Surrounding whitespace is left untouched.
    """
    # str.find returns -1 when the delimiter is absent; keep real positions only
    positions = [string.find(delimiter) for delimiter in (',', '.', '(')]
    found = [position for position in positions if position != -1]

    return string[:min(found, default=len(string))]


# Instruction block sent to the model; the extraction loop appends the quoted notice after it.
# The answer labels below ("- Mother:", "- Father:", "- Occupation:", "- Confession:") are
# matched literally when the answer is parsed, so keep both places in sync when editing.
prompt = """
I would like to extract information about the person which the following text is about.
Those information are: mother name, father name, occupation, and confession.
For occupations, if there is multiple, concatenate them with semi-colons.

Can you format the answer as:
- Mother:
- Father:
- Occupation: 
- Confession:

Something really important: all information should be as short as possible.
"""

In [5]:
# Connect to the database whose URL is stored in the configured environment variable
database_url = os.getenv(db_url_env_var_name)
db.connect_external(database_url, execute=execute)

# Clear the four extracted columns on every row where they currently hold a value
for column in ('mother', 'father', 'occupation', 'confession'):
    db.execute(f'update hls.person set {column} = NULL where {column} is not null;')

# Load all persons, ordered by id
person_query = "select id, name, notice, mother, father, occupation, confession from hls.person"
persons = db.query(person_query).sort_values('id')

[DB] Connecting to PGSQL Database ... Connected!


In [None]:
# Answer label requested in the prompt -> hls.person column it fills.
# Column names come only from this fixed mapping, never from the model output.
label_to_column = {
    '- Father:': 'father',
    '- Mother:': 'mother',
    '- Occupation:': 'occupation',
    '- Confession:': 'confession',
}

eta.begin(len(persons), 'Finding informations')
for _, person in persons.iterrows():
    # Pull the values out first: reusing the same quote type inside an f-string
    # is a SyntaxError on Python versions older than 3.12.
    notice = person['notice']
    person_id = person['id']
    full_message = f'{prompt}\n"{notice}"'
    response = ask_ollama(full_message)

    # Extracting information: one "<label> <value>" line is expected per field
    for line in response.split('\n'):
        for label, column in label_to_column.items():
            if label not in line:
                continue

            # Keep only what follows the label, shorten it, and trim whitespace so that
            # answers such as "Unknown (not mentioned)" are recognised as unknown.
            raw_value = line.split(label, 1)[1]
            value = until_comma_or_point_or_bracket(raw_value).strip()
            if not value or value.lower() == 'unknown':
                continue  # nothing usable: leave the column untouched

            # NOTE(review): the value is model output interpolated into SQL; quotes are
            # doubled here, but bound parameters would be safer if db.execute supports
            # them - TODO confirm against geovpylib.
            escaped = value.replace("'", "''")
            db.execute(f"update hls.person set {column} = '{escaped}' where person.id = {person_id};")

    eta.iter()
eta.end()

---

In [51]:
index = 1
person_row = persons.iloc[index]
name = person_row['name'].replace(',', '')
notice = person_row['notice']

# Fixed prompt (no interpolation): asks the model for five descriptions of a hand-written entity
generation_prompt = """
Can you generate me 5 texts that describe the following entity:
- Class: Person,
- Name: Rudolph Muller,
- Birth place: Paris,
- Birth date: 1st of january 2023,
- Father: Albert
"""
response = ask_ollama(generation_prompt)

print(response)

1. Rudolph Muller was born on the 1st of January 2023 in the beautiful city of Paris, France. He is the proud son of Albert, a respected figure in their community.

2. Born and raised in the romantic city of Paris on the 1st of January 2023, Rudolph Muller is the beloved child of esteemed father, Albert.

3. Rudolph Muller, born on January 1, 2023, hails from the elegant and historic city of Paris. He is the cherished offspring of his devoted father, Albert.

4. The year was 2023 when Rudolph Muller entered the world in the captivating city of Paris. His proud father, Albert, looked on with immense joy as he welcomed this new addition to their family.

5. Parisian-born Rudolph Muller came into existence on January 1, 2023, the pride and joy of his esteemed father, Albert.


In [35]:
index = 0
person_row = persons.iloc[index]
person_name = person_row['name'].replace(',', '')
person_notice = person_row['notice']

# Ask the model to turn the selected notice into structured triples
triples_prompt = f"""
I would like to have all information that are in the following text in form of structured triples.
The person is named {person_name}.
                      
"{person_notice}"
"""
response = ask_ollama(triples_prompt)

print(response)

[("Fischer Carl Viktor von", "Born", "1766")]
["Fischer Carl Viktor von", "Died", "1766"]
["Fischer Carl Viktor von", "BornIn", "Berne"]
["Fischer Carl Viktor von", "Parents", ["Emanuel Friedrich"]]
["Fischer Carl Viktor von", "MaritalStatus", "Celibate"]
["Fischer Carl Viktor von", "MemberOf", ["Grand Conseil (1795)"]]
["Fischer Carl Viktor von", "Occupation", "Officier (1782)"]
["Fischer Carl Viktor von", "Rank", "Capitaine"]
["Fischer Carl Viktor von", "Occupation", "Officier (1792)"]
["Fischer Carl Viktor von", "Location", "Bernois etat-major"]
["Fischer Carl Viktor von", "MilitaryService", ["Hollande (dès 1782)", "Capitaine etat-major bernois (1792)"]]
["Fischer Carl Viktor von", "Event", ["Combat (Fraubrunnen)", "AlliedWith (Ferdinand Isaac de Rovéréa, 1798)"]]
["Fischer Carl Viktor von", "Location", "Zurich"]
["Fischer Carl Viktor von", "Event", ["Blessed"]]
["Fischer Carl Viktor von", "MilitaryService", ["England (dès 1801)", "Major", "Combattant (Naples, Sicile et Egypte)"]]
[

In [14]:
index = 1
person_row = persons.iloc[index]
person_name = person_row['name']
person_notice = person_row['notice']

# Ask the model to summarise the selected notice as a list of short sentences
sentences_prompt = f"""
Can you extract structured data from the following text, knowning that it is about a person named {person_name}.
Format should be a list of short sentences about information present in the text.

"{person_notice}"
"""
response = ask_ollama(sentences_prompt)

print(response)

[1. "Fleckenstein, Ludwig Xaver was born on August 1, 1677, in Lucerne.", 
  2. "He died on August 1, 1677, in Villmergen, canton of Lucerne.", 
  3. "His father, Jost (died in 1706), was a counselor and high-ranking officer in the service of Spain and the emperor.", 
  4. "Fleckenstein was a member of the Grand Council of Lucerne from 1703 to 1706.", 
  5. "He succeeded his father as a member of the Petit Conseil from 1706 to 1712.", 
  6. "Fleckenstein was the bailiff of Ruswil from 1709 to 1710.", 
  7. "In 1704, Fleckenstein became a colonel in the service of Savoie."]

[8. "During the Battle of Villmergen, Fleckenstein captured a Bern flag.", 
  9. "He died under enemy fire but was mistakenly killed by his own troops."]


In [11]:
index = 1
person_row = persons.iloc[index]
person_name = person_row['name']
person_notice = person_row['notice']

# Ask the model for an RDF rendering of the selected notice
rdf_prompt = f"""
Can you extract structured data from the following text in a RDF manner, knowning that it is about a person named {person_name}
"{person_notice}"
"""
response = ask_ollama(rdf_prompt)

print(response)

Yes, I can help you extract structured data from the given text about Ludwig Xaver Fleckenstein in RDF (Resource Description Framework) format. RDF is a standard model for data interchange on the web. Here's a possible way to represent the information:

```rdf
@prefix rdf: <http://www.w3.org/1999/01/rdf-schema#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .

<Ludwig_Xaver_Fleckenstein>  a foaf:Person ;
    dct:dateOfBirth "1.8.1677"^^xsd:date ;
    dct:dateOfDeath "1.8.1677"^^xsd:date ;
    rdf:type foaf:Person .

<Jost>  a foaf:Person ;
    dct:isSuccessorOf <Ludwig_Xaver_Fleckenstein> ;
    dct:isPrecededBy [ rdf:type foaf:Person ; dct:dateOfDeath "1706"^^xsd:date ] ;
    rdf:type foaf:Agent .

<Lucerne>  a dct:City ;
    foaf:homepage <http://www.lucerne.ch/> ;
    dct:name "Lucerne" ;
    rdfs:label "Lucerne" .

<Villmergen>  a dct:City ;
    dct:name "Villmergen" ;
    rdfs:label "Villmergen" .

<Espagne>  a dct:Country ;
    rdfs:label 

In [10]:
index = 2
person_row = persons.iloc[index]
person_name = person_row['name']
person_id = person_row['id']
person_notice = person_row['notice']

# Same RDF request, this time giving the model the URI to use for the person
rdf_uri_prompt = f"""
Can you extract structured data from the following text in a RDF manner, 
knowning that it is about a person named {person_name} 
which has the URI https://www.geovistory.org/page/i{person_id}:
"{person_notice}"
"""
response = ask_ollama(rdf_uri_prompt)

print(response)

Yes, I can help you extract structured data from the given text about Ludwig Xaver Fleckenstein in RDF (Resource Description Framework) format. RDF is a standard model for data interchange on the web and represents data in the form of triples (subject-predicate-object statements).

Based on the provided text, we can extract the following triples:

1. <https://www.geovistory.org/page/i2#LudwigXaverFleckenstein> <nao:wasBornOn> "1.8.1677-08-01" .
2. <https://www.geovistory.org/page/i2#LudwigXaverFleckenstein> <nao:diedOn> "1.8.1677-08-01" .
3. <https://www.geovistory.org/page/i2#LudwigXaverFleckenstein> <dbo:sonOf> <https://www.geovistory.org/page/i2#Jost> .
4. <https://www.geovistory.org/page/i2#Jost> <nao:diedOn> "1706" .
5. <https://www.geovistory.org/page/i2#LudwigXaverFleckenstein> <dbo:memberOf> <http://dbpedia.org/ontology/GrandCouncil> .
6. <https://www.geovistory.org/page/i2#LudwigXaverFleckenstein> <nao:wasSuccessorOf> <https://www.geovistory.org/page/i2#Jost> .
7. <https://www