We load the SQLite database

In [50]:
import sys

sys.path.append("../")

import pandas as pd
import numpy as np

from dotenv import load_dotenv

load_dotenv()
import os

import sqlite3

# Location of the SQLite database file, read from the DB_PATH environment
# variable (load_dotenv() has been called above, so a .env file can supply it).
DB_PATH = os.getenv("DB_PATH")
if not DB_PATH:
    # Fail early with an explicit message: sqlite3.connect(None) raises an
    # opaque TypeError, and sqlite3.connect("") silently opens a temporary,
    # empty database on which every later query would fail.
    raise RuntimeError(
        "DB_PATH is not set; define it in the environment or in a .env file"
    )
DATA_PATH = "data"

# Connection object used by the cells below for all reads and writes.
conn = sqlite3.connect(DB_PATH)

import numpy as np

We compute the number of individuals per region and decade, keeping only individuals that appear in at least one catalog associated with a country

In [51]:
# Individuals Regions
df_ind_regions = pd.read_sql_query("SELECT * FROM individuals_regions", conn)

# Years: one row per distinct (individual, birth year) pair; rows with a
# missing id or birth year are dropped.
df_ind = pd.read_sql_query("SELECT * FROM individuals_main_information", conn)
df_ind_year = df_ind[["individual_wikidata_id", "birthyear"]].drop_duplicates().dropna()

# Bin each birth year to the nearest multiple of `temporal_resolution`.
# Python's built-in round() rounds exact halves to the nearest even integer, so
# years ending in 5 would alternate between neighbouring bins (1845 -> 1840 but
# 1855 -> 1860), giving bins that are 11 or 9 years wide. floor(x + 0.5) always
# rounds halves up, so every bin covers exactly `temporal_resolution` years.
temporal_resolution = 10
df_ind_year = df_ind_year.assign(
    decade=(
        np.floor(pd.to_numeric(df_ind_year["birthyear"]) / temporal_resolution + 0.5).astype("int64")
        * temporal_resolution
    )
)

# Load Catalogs Informations: the link table between individuals and catalogs,
# and the catalogs themselves restricted to those that have a country.
df_catalogs_id = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_catalogs = pd.read_sql_query("SELECT * FROM identifiers", conn)
df_catalogs = df_catalogs[['identifiers_wikidata_id', 'country_name']].dropna()

# Number of distinct catalog countries per individual ("score"), highest first.
df_cat = pd.merge(df_catalogs_id, df_catalogs, on='identifiers_wikidata_id')
df_cat_filtered = df_cat[['individual_wikidata_id', 'individual_name', 'country_name']].drop_duplicates()
df_cat_country = (
    df_cat_filtered
    .groupby(['individual_name', 'individual_wikidata_id'])['country_name']
    .count()
    .rename('score')
    .reset_index()
    .sort_values('score', ascending=False)
    .reset_index(drop=True)
)

# Inner joins: keep only individuals that have a region, a birth year and at
# least one catalog entry with a country.
df = pd.merge(df_ind_regions, df_ind_year, on='individual_wikidata_id')
df = pd.merge(df, df_cat_country, on='individual_wikidata_id')

# Score = number of distinct individuals per (region, decade).
df_final = (
    df[['individual_wikidata_id', 'region_name', 'decade']]
    .drop_duplicates()
    .groupby(['region_name', 'decade'])
    .size()
    .rename('score')
    .reset_index()
)
df_final.to_sql('region_score', conn, if_exists='replace', index=False)

5363

Plot the different trends

In [52]:
# NOTE: this cell is inert. The plotting code is wrapped in a string literal,
# so none of it runs; the cell only evaluates to (and echoes) the string.
# If enabled, the code would draw one line plot per region from `df_final`,
# showing `score` against `decade` for decades before 1850.
"""

import pandas as pd
import matplotlib.pyplot as plt

# Assuming you have a DataFrame called df
# Replace 'df' with your actual DataFrame name

# Get unique regions (countries)
unique_regions = df_final['region_name'].unique()

# Loop through each region and create a line plot
for region in unique_regions:
    region_data = df_final[df_final['region_name'] == region]
    region_data = region_data[region_data['decade']<1850]

    plt.figure(figsize=(8, 4))
    plt.plot(region_data['decade'], region_data['score'])
    plt.title(f'Score Over Decades for {region}')
    plt.xlabel('Decade')
    plt.ylabel('Score')
    plt.grid(True)
    plt.show()
    
"""


"\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n# Assuming you have a DataFrame called df\n# Replace 'df' with your actual DataFrame name\n\n# Get unique regions (countries)\nunique_regions = df_final['region_name'].unique()\n\n# Loop through each region and create a line plot\nfor region in unique_regions:\n    region_data = df_final[df_final['region_name'] == region]\n    region_data = region_data[region_data['decade']<1850]\n\n    plt.figure(figsize=(8, 4))\n    plt.plot(region_data['decade'], region_data['score'])\n    plt.title(f'Score Over Decades for {region}')\n    plt.xlabel('Decade')\n    plt.ylabel('Score')\n    plt.grid(True)\n    plt.show()\n    \n"

Keep only the most-referenced individuals, ranked by the number of international catalogs that reference them (the cut-off is chosen from the cumulative distribution printed below)

In [58]:
# For every individual, count the distinct catalogs (identifiers) that list them.
df_count_cat = (
    df_cat.loc[:, ['individual_wikidata_id', 'identifiers_wikidata_id']]
    .drop_duplicates()
    .groupby('individual_wikidata_id')['identifiers_wikidata_id']
    .count()
    .reset_index()
)

# Fraction of individuals at each catalog count, ordered by count and then
# accumulated: entry k is the share of individuals listed in at most k catalogs.
value_counts_normalized = (
    df_count_cat['identifiers_wikidata_id']
    .value_counts(normalize=True)
    .sort_index()
)
cumulative_sum = value_counts_normalized.cumsum()
print(cumulative_sum.head(20))

1     0.121739
2     0.280428
3     0.389469
4     0.472690
5     0.540014
6     0.594747
7     0.640916
8     0.680669
9     0.714042
10    0.743381
11    0.769292
12    0.792272
13    0.811567
14    0.828863
15    0.844627
16    0.859068
17    0.871101
18    0.882073
19    0.892051
20    0.900430
Name: identifiers_wikidata_id, dtype: float64


We note that if we keep only individuals with more than 5 mentions in catalogs, we remove about 54% of the data; keeping only individuals with more than 20 mentions removes about 90% of the data

In [59]:
# An individual is kept only when listed in MORE than this many distinct
# catalogs. Per the cumulative distribution printed in the previous cell, about
# 90% of individuals have 20 catalogs or fewer, so this keeps roughly the top 10%.
MIN_CATALOG_COUNT_EXCLUSIVE = 20

# Decades at or after this value are discarded. Birth years are binned to the
# nearest decade and (per the original author's note) the latest birth year in
# the data is 1850, so the bin labelled 1850 is missing the years 1851-1854 and
# its score would be artificially low compared with the 1840 bin.
FIRST_EXCLUDED_DECADE = 1850

# Attach birth decade and region to the selected individuals (inner joins),
# then keep only the complete decades.
df_top = (
    df_count_cat[df_count_cat['identifiers_wikidata_id'] > MIN_CATALOG_COUNT_EXCLUSIVE]
    .merge(df_ind_year, on='individual_wikidata_id')
    .merge(df_ind_regions, on='individual_wikidata_id')
    .loc[lambda frame: frame['decade'] < FIRST_EXCLUDED_DECADE]
)

Then we count the number of individuals per region and decade, keeping only the most-referenced individuals selected above (more than 20 catalog mentions, i.e. roughly the top 10%)

In [60]:
# One row per distinct (individual, region, decade) among the selected
# individuals, then the number of individuals in each (region, decade) cell is
# stored as `score`.
df_final_top = (
    df_top.loc[:, ['individual_wikidata_id', 'region_name', 'decade']]
    .drop_duplicates()
    .groupby(['region_name', 'decade'])['individual_wikidata_id']
    .count()
    .rename('score')
    .reset_index()
)

# Overwrite the table so that re-running the cell does not append duplicates.
df_final_top.to_sql(name='region_score_top_10', con=conn, if_exists='replace', index=False)

3530

Plot the results

In [56]:
# NOTE: this cell is inert. The plotting code is wrapped in a string literal,
# so none of it runs; the cell only evaluates to (and echoes) the string.
# If enabled, the code would draw one line plot per region from `df_final_top`,
# showing `score` against `decade` for decades before 1850.
'''

import pandas as pd
import matplotlib.pyplot as plt

# Get unique regions (countries)
unique_regions = df_final_top['region_name'].unique()

# Loop through each region and create a line plot
for region in unique_regions:
    region_data = df_final_top[df_final_top['region_name'] == region]
    region_data = region_data[region_data['decade']<1850]

    plt.figure(figsize=(8, 4))
    plt.plot(region_data['decade'], region_data['score'])
    plt.title(f'Score Over Decades for {region}')
    plt.xlabel('Decade')
    plt.ylabel('Score')
    plt.grid(True)
    plt.show()
    
'''

"\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n# Get unique regions (countries)\nunique_regions = df_final_top['region_name'].unique()\n\n# Loop through each region and create a line plot\nfor region in unique_regions:\n    region_data = df_final_top[df_final_top['region_name'] == region]\n    region_data = region_data[region_data['decade']<1850]\n\n    plt.figure(figsize=(8, 4))\n    plt.plot(region_data['decade'], region_data['score'])\n    plt.title(f'Score Over Decades for {region}')\n    plt.xlabel('Decade')\n    plt.ylabel('Score')\n    plt.grid(True)\n    plt.show()\n    \n"