In [69]:
import sqlite3
import pandas as pd
import sys
import plotly.graph_objects as go

sys.path.append('../')

from dotenv import load_dotenv

load_dotenv()
import os
DB_PATH = os.getenv("DB_PATH")



conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [70]:
# Load individuals Main Informations
individuals = pd.read_sql_query("SELECT * FROM individuals_kept", conn)

In [71]:
# Load identifiers meta-data
df_identifiers = pd.read_sql_query("SELECT * FROM identifiers", conn)
df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''
df_identifiers['identifier_name'] = df_identifiers.apply(lambda x : x['identifier_name'] + ' ('  + x['country_name'] + ')' if x['country_name']!='' else x['identifier_name'], axis=1)
df_identifiers = df_identifiers.drop(['country_wikidata_id', 'count_records', 'identifier_url'], axis=1).drop_duplicates()
df_identifiers = df_identifiers.drop(['country_name', 'identifier_name_country'], axis=1).drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''


In [72]:
# Load infirmation about individuals and identifiers
df_ind_identifiers = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_ind_identifiers = df_ind_identifiers.drop('identifier_name', axis=1)
df_ind_identifiers = pd.merge(df_ind_identifiers, df_identifiers, on = 'identifiers_wikidata_id', how = 'left')

# Freebase is now mixed with Google ID
df_ind_identifiers['identifier_name'][df_ind_identifiers['identifier_name']=='Freebase ID'] = 'Google Knowledge Graph ID'
df_ind_identifiers['identifiers_wikidata_id'][df_ind_identifiers['identifiers_wikidata_id']=='P646'] = 'P2671'

# Mix the two GND
df_ind_identifiers['identifier_name'][df_ind_identifiers['identifier_name']=='Deutsche Biographie (GND) ID (Germany)'] = 'GND ID (Germany)'
df_ind_identifiers['identifiers_wikidata_id'][df_ind_identifiers['identifiers_wikidata_id']=='P7902'] = 'P227'

# Clean wrong objects
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['individual_name'].str.contains('Painter')]

# Remove superceded
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['identifier_name'].str.contains('superceded')]
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['identifier_name'].str.contains('obsolete')]

# Drop duplicated after mergeing GND and Freebase
df_ind_identifiers = df_ind_identifiers.drop_duplicates()

In [73]:
df_ind_identifiers

Unnamed: 0,individual_wikidata_id,individual_name,identifiers_wikidata_id,identifier_name
0,Q46995803,Johann Michael Vogt,P214,VIAF ID
1,Q46995803,Johann Michael Vogt,P227,GND ID (Germany)
2,Q46995803,Johann Michael Vogt,P2671,Google Knowledge Graph ID
3,Q46995803,Johann Michael Vogt,P3413,Leopoldina member ID (superseded) (Germany)
4,Q4684802,Adriaen Matham,P213,ISNI
...,...,...,...,...
3003267,Q60214671,Joseph Schneller,P9223,Provenio UUID (Czech Republic)
3003268,Q97571271,Antonio González de Cedillo,P245,Union List of Artist Names ID (United States o...
3003269,Q97571271,Antonio González de Cedillo,P2843,Benezit ID
3003270,Q97571271,Antonio González de Cedillo,P7848,Frick Art Reference Library Artist File ID


In [74]:
len(set(df_ind_identifiers.identifiers_wikidata_id))

1672

In [75]:
df_final = pd.merge(df_ind_identifiers, individuals, on = 'individual_wikidata_id')
df_stats = df_final[['individual_wikidata_id', 'identifier_name']].drop_duplicates()

In [76]:
len(set(df_stats['identifier_name']))

1652

In [77]:
df_stats

Unnamed: 0,individual_wikidata_id,identifier_name
0,Q46995803,VIAF ID
1,Q46995803,GND ID (Germany)
2,Q46995803,Google Knowledge Graph ID
3,Q46995803,Leopoldina member ID (superseded) (Germany)
4,Q4684802,ISNI
...,...,...
2525790,Q60214671,Provenio UUID (Czech Republic)
2525791,Q97571271,Union List of Artist Names ID (United States o...
2525792,Q97571271,Benezit ID
2525793,Q97571271,Frick Art Reference Library Artist File ID


In [78]:
len(list(set(df_stats.identifier_name)))

1652

In [79]:
# Group by 'identifier_name' and calculate the count and percentage
grouped = df_stats.groupby('identifier_name').size().reset_index(name='count')
grouped = grouped.sort_values('count', ascending=False).reset_index(drop=True)
grouped['percent'] = round(grouped['count']/len(individuals)*100, 1)
grouped.columns = ['Catalog', 'N Individuals', '%']

grouped['Catalog'] = grouped['Catalog'].str.replace('United States of America', 'US')
grouped.head(20)

Unnamed: 0,Catalog,N Individuals,%
0,VIAF ID,139198,80.5
1,Google Knowledge Graph ID,127136,73.5
2,GND ID (Germany),100841,58.3
3,ISNI,99399,57.5
4,CERL Thesaurus ID,78983,45.7
5,Library of Congress authority ID (US),76335,44.1
6,IdRef ID (France),53717,31.0
7,Nationale Thesaurus voor Auteurs ID (Netherlands),51121,29.5
8,Bibliothèque nationale de France ID (France),50378,29.1
9,FAST ID,45324,26.2


In [93]:
ind_unseen = pd.read_csv('unseen_species_model/unseen_species_model_bayesian.csv')
df_stats_filtered = df_stats[df_stats['individual_wikidata_id'].isin(list(set(ind_unseen['individual_wikidata_id'])))]
print(len(set(df_stats_filtered['identifier_name'])))


df_identifiers = pd.read_sql_query("SELECT * FROM identifiers", conn)
df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''
df_identifiers['identifier_name'] = df_identifiers.apply(lambda x : x['identifier_name'] + ' ('  + x['country_name'] + ')' if x['country_name']!='' else x['identifier_name'], axis=1)


# Load infirmation about individuals and identifiers
df_ind_identifiers = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_ind_identifiers = df_ind_identifiers.drop('identifier_name', axis=1)
df_ind_identifiers = pd.merge(df_ind_identifiers, df_identifiers, on = 'identifiers_wikidata_id', how = 'left')
df_ind_identifiers

1360


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''


Unnamed: 0,individual_wikidata_id,individual_name,identifiers_wikidata_id,identifier_name,country_wikidata_id,country_name,count_records,identifier_url,identifier_name_country
0,Q46995803,Johann Michael Vogt,P214,VIAF ID,,,33656281.0,https://viaf.org,VIAF ID
1,Q46995803,Johann Michael Vogt,P227,GND ID (Germany),Q183,Germany,9217240.0,https://portal.dnb.de,GND ID (Germany)
2,Q46995803,Johann Michael Vogt,P2671,Google Knowledge Graph ID,,,,https://developers.google.com/knowledge-graph/,Google Knowledge Graph ID
3,Q46995803,Johann Michael Vogt,P3413,Leopoldina member ID (superseded) (Germany),Q183,Germany,,http://www.leopoldina.org/de/mitglieder/mitgli...,Leopoldina member ID (superseded) (Germany)
4,Q4684802,Adriaen Matham,P213,ISNI,,,11020000.0,,ISNI
...,...,...,...,...,...,...,...,...,...
3003267,Q60214671,Joseph Schneller,P9223,Provenio UUID (Czech Republic),Q213,Czech Republic,48290.0,https://provenio.net,Provenio UUID (Czech Republic)
3003268,Q97571271,Antonio González de Cedillo,P245,Union List of Artist Names ID (United States o...,Q30,United States of America,356156.0,,Union List of Artist Names ID (United States o...
3003269,Q97571271,Antonio González de Cedillo,P2843,Benezit ID,,,,https://www.oxfordartonline.com/benezit/,Benezit ID
3003270,Q97571271,Antonio González de Cedillo,P7848,Frick Art Reference Library Artist File ID,,,,https://library.frick.org,Frick Art Reference Library Artist File ID


In [94]:



df_stats_filtered = pd.merge(df_stats_filtered, df_identifiers, on = 'identifier_name')
df_stats_filtered

Unnamed: 0,individual_wikidata_id,identifier_name,identifiers_wikidata_id,country_wikidata_id,country_name,count_records,identifier_url,identifier_name_country
0,Q46995803,VIAF ID,P214,,,33656281.0,https://viaf.org,VIAF ID
1,Q56599708,VIAF ID,P214,,,33656281.0,https://viaf.org,VIAF ID
2,Q10377783,VIAF ID,P214,,,33656281.0,https://viaf.org,VIAF ID
3,Q1319090,VIAF ID,P214,,,33656281.0,https://viaf.org,VIAF ID
4,Q2080951,VIAF ID,P214,,,33656281.0,https://viaf.org,VIAF ID
...,...,...,...,...,...,...,...,...
1396773,Q19325138,Whitney Museum of American Art artist ID (Unit...,P6714,Q30,United States of America,3533.0,https://whitney.org/artists,Whitney Museum of American Art artist ID (Unit...
1396774,Q27500953,Irish playography person ID (Republic of Ireland),P7934,Q27,Republic of Ireland,,http://www.irishplayography.com/,Irish playography person ID (Republic of Ireland)
1396775,Q60515669,South Carolina Encyclopedia ID (United States ...,P7690,Q30,United States of America,,http://www.scencyclopedia.org/sce/,South Carolina Encyclopedia ID (United States ...
1396776,Q18672131,Film.ru person ID (Russia),P10302,Q159,Russia,1770332.0,https://film.ru,Film.ru person ID (Russia)


In [95]:
df_stats_filtered['country_name'].nunique()

71

Unnamed: 0,identifiers_wikidata_id,identifier_name,country_wikidata_id,country_name,count_records,identifier_url,identifier_name_country
0,P212,ISBN-13,,,,,ISBN-13
1,P213,ISNI,,,11020000.0,,ISNI
2,P214,VIAF ID,,,33656281.0,https://viaf.org,VIAF ID
3,P218,ISO 639-1 code,,,184.0,,ISO 639-1 code
4,P219,ISO 639-2 code,,,486.0,,ISO 639-2 code
...,...,...,...,...,...,...,...
8017,P11281,Biographical Dictionary of the Australian Sena...,Q408,Australia,600.0,https://biography.senate.gov.au/,Biographical Dictionary of the Australian Sena...
8018,P11282,Tanzania Parliament member ID,Q924,Tanzania,618.0,https://www.parliament.go.tz/mps-list,Tanzania Parliament member ID (Tanzania)
8019,P11283,JCDb ID,Q17,Japan,49095.0,https://www.japanese-cinema-db.jp/,JCDb ID (Japan)
8020,P11284,Dictionnaire de l'Académie française ID (1st e...,Q142,France,,,Dictionnaire de l'Académie française ID (1st e...
