In [1]:
%load_ext autoreload
%autoreload 2


pk_project_bhp = 12782 # The pk project in BHP
pk_project_gv = 373987 # The pk project in GEOVISTORY

# Not to change:
pk_symogih_gv = 6857901 # The pk project of symogih on GEOVISTORY

env = 'prod'
execute = True
metadata_str = 'bhp-add-entity-to-project-Siprojuris'
import_manner = 'one-shot'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.queries as q
import geovpylib.pks as pks
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

# Add entities to project

The goal here is to add a list of entities that were in a particular BHP project, to the equivalent Geovistory project.

## Fetch Geovistory SYMOGIH data

In [2]:
db.connect_geovistory(env, pk_symogih_gv, False, skip_protection=True)

symogih_entities_gv = db.query(f"""
    select
        r.pk_entity as pk_gv,
        a3.string as uri        
    from information.resource r 
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_symogih_gv} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_symogih_gv} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
""")

# Only symogih URIs
symogih_entities_gv = symogih_entities_gv[symogih_entities_gv['uri'].str.contains('symogih.org')]

# Extract PK
symogih_entities_gv['pk_bhp'] = symogih_entities_gv['uri'].str.replace('http://symogih.org/resource/', '', regex=False)
symogih_entities_gv.drop(columns=['uri'], inplace=True)

a.infos(symogih_entities_gv)

db.disconnect()

# 12s

[DB] Requests will not be executed
[DB] Connecting to PRODUCTION Database ... Connected!
Shape:  (117108, 2) - extract:


Unnamed: 0,pk_gv,pk_bhp
0,6486809,Actr14444
61,6485877,Actr13646
62,6485930,Actr13694
63,6485937,Actr13701
64,6485947,Actr13711


[DB] Database correctly disconnected.


## Fetch BHP entities

In [3]:
db.connect_external(os.environ.get('YELLOW_BHP'))

symogih_entities_bhp = db.query(f"""
    select 
        fk_collective_actor as pk_bhp_project, fk_object as pk_bhp
    from bhp.associate_project ap
    where ap.fk_collective_actor = {pk_project_bhp}
""")

symogih_entities_bhp = symogih_entities_bhp[['pk_bhp']]

a.infos(symogih_entities_bhp)

db.disconnect()

# 2s

[DB] Connecting to PGSQL Database ... Connected!
Shape:  (2131, 1) - extract:


Unnamed: 0,pk_bhp
0,AOTP131
1,Actr5
2,Actr13
3,Actr35
4,Actr36


[DB] Database correctly disconnected.


## Filter only the wanted

In [4]:
entities = symogih_entities_gv.merge(symogih_entities_bhp, how='inner')['pk_gv'].tolist()

print('Entity number:', len(entities))

Entity number: 2128


## Get all information we have about those entities in Geovistory Symogih project

In [5]:
db.connect_geovistory(env, -1, False, skip_protection=True)


# Get the whole graph ids of those entities (with rules; eg only incoming properties for TeEn)
peits, teens, stmts = q.get_graph_pks(entities, pk_symogih_gv, max_level=10)

# Make the graph explicit
# the_graph = db.explicit_statements(stmts)
# print('Explicit graph:')
# display(the_graph)

# Make a visual graph out of the explicit statements  
# graphs.show(the_graph, 'test.html')

db.disconnect()

# 55s

[DB] Requests will not be executed
[DB] Connecting to PRODUCTION Database ... Connected!
> Getting graph of 2128 entities
  > Getting graph of 5054 entities
    > Getting graph of 5 entities
    > Getting graph of 9 entities
    > Getting graph of 4 entities
  > Getting graph of 6635 entities
    > Getting graph of 2750 entities
      > Getting graph of 426 entities
        > Getting graph of 2 entities
        > Getting graph of 2 entities
    > Getting graph of 2324 entities
  > Getting graph of 2515 entities
[DB] Database correctly disconnected.


In [6]:
### For info, if needed, to spot unwanted incoming entities

# db.connect_geovistory(env, -1, False, skip_protection=True)
# stmts_expl = db.tools.explicit_statements(stmts)

# display(a.group_by_count(stmts_expl, 'subject')[0:10])
# display(a.group_by_count(stmts_expl, 'property')[0:10])
# display(a.group_by_count(stmts_expl, 'object')[0:10])

## Add those information to the project

In [7]:
db.connect_geovistory(env, pk_project_gv, execute)

[DB] Connecting to PRODUCTION Database ... Connected!


### Before import

In [8]:
ipr_before = db.query(f'select pk_entity from projects.info_proj_rel where fk_project = {pk_project_gv} and is_in_project = true')

print('Number of information before import:', len(ipr_before))
print('Number of information should be after import:', (len(entities) + len(peits) + len(teens) + len(stmts) ))

entities_str = u.get_sql_ready_str(entities) # peits + teens)
classes = db.query(f'''
    select 
        r.pk_entity, r.fk_class, ac.dfh_class_label
    from information.resource r
    left join data_for_history.api_class ac on ac.dfh_pk_class = r.fk_class and ac.dfh_class_label_language = 'en'
    where r.pk_entity in {entities_str}
''')
classes['class'] = classes['dfh_class_label'] + ' (' + classes['fk_class'].astype(str) + ')'
classes.drop_duplicates(inplace=True)
print('Details about classes that are the source of what is going to be imported:')
display(a.group_by_count(classes, 'class'))

Number of information before import: 96762
Number of information should be after import: 45372
Details about classes that are the source of what is going to be imported:


Unnamed: 0,class,count,percent
0,Person (21),2128,100.00%


In [9]:
db.info_proj_rels.create(entities)
db.info_proj_rels.create(peits)
db.info_proj_rels.create(teens)
db.info_proj_rels.create(stmts)

Creating info_proj_rel of 2128 entities with project <373987> ... Done in [00h00m02s]
Creating info_proj_rel of 6608 entities with project <373987> ... Done in [00h00m10s]
Creating info_proj_rel of 6646 entities with project <373987> ... Done in [00h00m07s]
Creating info_proj_rel of 29990 entities with project <373987> ... Done in [00h00m28s]


In [10]:
ipr_after = db.query(f'select pk_entity from projects.info_proj_rel where fk_project = {pk_project_gv} and is_in_project = true')

print('Number of information after import:', len(ipr_after))

Number of information after import: 141434


## Add favorites (take column `ord_num_of_domain` and `ord_num_of_range`)

### Prepare

In [11]:
db.connect_geovistory(env, pk_symogih_gv, execute)

# Fetch all IPR source
ipr_source = db.query(f"""
    select
        pk_entity as ipr_source, fk_entity, fk_project, ord_num_of_domain, ord_num_of_range
    from projects.info_proj_rel
    where fk_project = {pk_symogih_gv}
""")
print('IPR source:', len(ipr_source))

# Fetch all IPR target
ipr_target = db.query(f"""
    select
        pk_entity as ipr_target, fk_entity
    from projects.info_proj_rel
    where fk_project = {pk_project_gv}
""")
print('IPR target:', len(ipr_target))

# Take only the ones we are interested in
iprs = ipr_target.merge(ipr_source)

# Format & prepare request
iprs_to_reset = iprs.ipr_target.unique().tolist()
iprs_to_1 = iprs[iprs.ord_num_of_domain == 1].ipr_target.unique().tolist()

print('IPRs to reset:', len(iprs_to_reset))
print('IPRs to set to 1:', len(iprs_to_1))

# 12s

[DB] Connecting to PRODUCTION Database ... Connected!
IPR source: 1918490
IPR target: 141529
IPRs to reset: 54025
IPRs to set to 1: 2131


### Execute

In [12]:
iprs_to_reset_str = u.get_sql_ready_str(iprs_to_reset)

db.execute(f"""
    update projects.info_proj_rel
        set ord_num_of_domain = NULL
    where pk_entity in {iprs_to_reset_str};
""")

In [13]:
iprs_to_1_str = u.get_sql_ready_str(iprs_to_1)
db.execute(f"""
    update projects.info_proj_rel
        set ord_num_of_domain = 1
    where pk_entity in {iprs_to_1_str};
""")

### Finish

In [14]:
db.disconnect()

[DB] Database correctly disconnected.
