In [187]:
import sys

sys.path.append('../')

from functions_env import DB_PATH
import sqlite3
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib_venn import venn3

In [188]:
conn = sqlite3.connect(DB_PATH)

In [189]:

df_individual = pd.read_sql_query("SELECT * FROM individuals_main_information", conn)
df_individual = df_individual[['individual_wikidata_id', 'birthyear']]
df_individual_region = pd.read_sql_query("SELECT * FROM individuals_regions", conn)
df_individual_region = df_individual_region[['individual_wikidata_id', 'region_name']].drop_duplicates()

In [190]:
df_identifiers = pd.read_sql_query("SELECT * FROM identifiers", conn)
df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''
df_identifiers['identifier_name'] = df_identifiers.apply(lambda x : x['identifier_name'] + ' ('  + x['country_name'] + ')' if x['country_name']!='' else x['identifier_name'], axis=1)
df_identifiers = df_identifiers.drop(['country_wikidata_id', 'count_records', 'identifier_url'], axis=1).drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_identifiers['country_name'][df_identifiers['country_name'].isna()]=''


In [191]:
df_ind_identifiers = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_ind_identifiers = df_ind_identifiers.drop('identifier_name', axis=1)
df_ind_identifiers = pd.merge(df_ind_identifiers, df_identifiers, on = 'identifiers_wikidata_id', how = 'left')

# Freebase is now mixed with Google ID
df_ind_identifiers['identifier_name'][df_ind_identifiers['identifier_name']=='Freebase ID'] = 'Google Knowledge Graph ID'
df_ind_identifiers['identifiers_wikidata_id'][df_ind_identifiers['identifiers_wikidata_id']=='P646'] = 'P2671'

# Mix the two GND
df_ind_identifiers['identifier_name'][df_ind_identifiers['identifier_name']=='Deutsche Biographie (GND) ID (Germany)'] = 'GND ID (Germany)'
df_ind_identifiers['identifiers_wikidata_id'][df_ind_identifiers['identifiers_wikidata_id']=='P7902'] = 'P227'

# Remove superceded
df_ind_identifiers = df_ind_identifiers[~df_ind_identifiers['identifier_name'].str.contains('superceded')]

df_ind_identifiers = df_ind_identifiers.drop_duplicates()

df_final = pd.merge(df_individual, df_ind_identifiers, on = 'individual_wikidata_id', how = 'left')
df_final = pd.merge(df_final, df_individual_region, on = 'individual_wikidata_id')


In [192]:
region_name = 'Latin World'
min_year = -800
max_year = 300

df_country = df_final[df_final['region_name']==region_name]
df_country = df_country[(df_country['birthyear']>=min_year)&(df_country['birthyear']<=max_year)].reset_index(drop=True)
df_country["year"] = df_country["birthyear"].apply(lambda x : round(x/10)*10)

top_catalogs = df_country[['individual_wikidata_id', 'identifier_name']].drop_duplicates()
top_catalogs = top_catalogs.identifier_name.value_counts().head(20)
top_catalogs = list(top_catalogs.index)

df_country = df_country.groupby(['year', 'identifier_name'])['individual_wikidata_id'].count().reset_index()
df_country = df_country[df_country['identifier_name'].isin(top_catalogs)]
df_country = df_country.pivot(index = 'year', columns = 'identifier_name', values = 'individual_wikidata_id')
df_country = df_country.reset_index()
df_country = df_country.sort_values('year', ascending=True)
df_country = df_country.fillna(0)
df_country = df_country.set_index('year')
mean_overall_correlation = df_country.corr().stack().mean()
round(mean_overall_correlation, 2)

0.91

In [193]:
region_name = 'Greek World'
min_year = -800
max_year = 500

df_table_1 = df_final[df_final['region_name']==region_name]
df_table_1 = df_table_1[~df_table_1['individual_name'].isna()]

# Get rid of mistakenly obejcts classifiers as humans
df_table_1 = df_table_1[~df_table_1['individual_name'].str.contains('Painter')]


df_table_1 = df_table_1.dropna()
df_table_1 = df_table_1[(df_table_1['birthyear']>=min_year)&(df_table_1['birthyear']<=max_year)].reset_index(drop=True)
df_filter = df_table_1[['individual_wikidata_id', 'identifier_name']].drop_duplicates()
df_filter['identifier_name'] = df_filter['identifier_name'].astype(str)
df_filter = df_filter[~df_filter['identifier_name'].str.contains('superceded')]
df_filter[df_filter['identifier_name']=='Deutsche Biographie (GND) ID (Germany)'] = 'GND ID (Germany)'
df_filter = df_filter.drop_duplicates()
top_catalogs_1 = df_filter.identifier_name.value_counts().head(20).reset_index()
#top_catalogs_1['region'] = 'Latin World'
top_catalogs_1['percent_of_total'] = top_catalogs_1['identifier_name']/len(set(df_table_1.individual_wikidata_id))
top_catalogs_1['percent_of_total'] = round(top_catalogs_1['percent_of_total'], 2)
top_catalogs_1.columns = ['catalog', 'number of individuals', '% of individuals']
top_catalogs_1.to_clipboard(index=False)
print(len(set(df_table_1.individual_wikidata_id)))

1040


In [194]:
region_name = 'Latin World'
min_year = -300
max_year = 500

df_table_1 = df_final[df_final['region_name']==region_name]
df_table_1 = df_table_1[(df_table_1['birthyear']>=min_year)&(df_table_1['birthyear']<=max_year)].reset_index(drop=True)
df_filter = df_table_1[['individual_wikidata_id', 'identifier_name']].drop_duplicates()
df_filter['identifier_name'] = df_filter['identifier_name'].astype(str)
df_filter = df_filter[~df_filter['identifier_name'].str.contains('superceded')]
df_filter[df_filter['identifier_name']=='Deutsche Biographie (GND) ID (Germany)'] = 'GND ID (Germany)'
df_filter = df_filter.drop_duplicates()
top_catalogs_2 = df_filter.identifier_name.value_counts().head(20).reset_index()
#top_catalogs_1['region'] = 'Latin World'
top_catalogs_2['percent_of_total'] = top_catalogs_2['identifier_name']/len(set(df_table_1.individual_wikidata_id))
top_catalogs_2['percent_of_total'] = round(top_catalogs_2['percent_of_total'], 2)
top_catalogs_2.columns = ['catalog', 'number of individuals', '% of individuals']
top_catalogs_2.to_clipboard(index=False)
print(len(set(df_table_1.individual_wikidata_id)))

422


In [195]:
final_catalog = pd.merge(top_catalogs_1, top_catalogs_2, on = 'catalog')
final_catalog.to_clipboard(index=False)