In [1]:
import sqlite3
import pandas as pd
import plotly.express as px
import sys
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os

sys.path.append('../')
from functions_env import DB_PATH



NEW_DB_PATH = os.getenv("NEW_DB_PATH")

conn = sqlite3.connect(NEW_DB_PATH)
cursor = conn.cursor()

In [2]:
# Load individuals Main Informations
df_individual = pd.read_sql_query("SELECT * FROM individuals_main_information", conn)
df_individual = df_individual[['individual_wikidata_id', 'birthyear', 'individual_name']]

In [3]:
len(df_individual)

220770

In [4]:
# Load invividuals Regions
df_individual_region = pd.read_sql_query("SELECT * FROM individuals_regions", conn)
df_individual_region = df_individual_region[['individual_wikidata_id', 'region_name']].drop_duplicates()

In [5]:
# Load identifiers meta-data
df_identifiers = pd.read_sql_query("SELECT * FROM identifiers", conn)
df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''
df_identifiers['identifier_name'] = df_identifiers.apply(lambda x : x['identifier_name'] + ' ('  + x['country_name'] + ')' if x['country_name']!='' else x['identifier_name'], axis=1)
df_identifiers = df_identifiers.drop(['country_wikidata_id', 'count_records', 'identifier_url'], axis=1).drop_duplicates()
df_identifiers = df_identifiers.drop(['country_name', 'identifier_name_country'], axis=1).drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''


In [6]:
# Load infirmation about individuals and identifiers
df_ind_identifiers = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_ind_identifiers = df_ind_identifiers.drop('identifier_name', axis=1)
df_ind_identifiers = pd.merge(df_ind_identifiers, df_identifiers, on = 'identifiers_wikidata_id', how = 'left')

# Freebase is now mixed with Google ID
df_ind_identifiers['identifier_name'][df_ind_identifiers['identifier_name']=='Freebase ID'] = 'Google Knowledge Graph ID'
df_ind_identifiers['identifiers_wikidata_id'][df_ind_identifiers['identifiers_wikidata_id']=='P646'] = 'P2671'

# Mix the two GND
df_ind_identifiers['identifier_name'][df_ind_identifiers['identifier_name']=='Deutsche Biographie (GND) ID (Germany)'] = 'GND ID (Germany)'
df_ind_identifiers['identifiers_wikidata_id'][df_ind_identifiers['identifiers_wikidata_id']=='P7902'] = 'P227'

# Clean wrong objects
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['individual_name'].str.contains('Painter')]

# Remove superceded
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['identifier_name'].str.contains('superceded')]
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['identifier_name'].str.contains('obsolete')]

# Drop duplicated after mergeing GND and Freebase
df_ind_identifiers = df_ind_identifiers.drop_duplicates()

In [7]:
df_final = df_individual[~df_individual['birthyear'].isna()]
df_final = df_final[~df_final['individual_name'].str.contains('Painter')]
df_final = df_final[df_final['individual_wikidata_id'].isin(set(df_individual_region.individual_wikidata_id))]
df_final = df_final[df_final['individual_wikidata_id'].isin(set(df_ind_identifiers.individual_wikidata_id))]

In [8]:
len(set(df_final.individual_wikidata_id))

159394

In [9]:
df_final[['individual_wikidata_id']].drop_duplicates().to_sql('individuals_kept', conn, if_exists = 'replace', index=False)

159394