In [2]:
import sys

sys.path.append("../")

import pandas as pd
from dotenv import load_dotenv

load_dotenv()
import os
import sqlite3

DB_PATH = os.getenv("DB_PATH")

conn = sqlite3.connect(DB_PATH)
pd.options.mode.chained_assignment = None

In [3]:
only_country = False

# Individuals Regions
df_ind_regions = pd.read_sql_query("SELECT * FROM individuals_regions", conn)

# Years
df_ind = pd.read_sql_query("SELECT * FROM individuals_main_information", conn)

df_ind_birthyear = df_ind[["individual_wikidata_id", "birthyear"]].drop_duplicates()
df_ind_deathyear = pd.read_sql_query("SELECT * FROM deathyear", conn)
df_ind_year = pd.merge(df_ind_birthyear, df_ind_deathyear, on = 'individual_wikidata_id', how = 'outer')

df_ind_year['birthyear'][(df_ind_year['birthyear'] % 100 == 0)&(df_ind_year['deathyear'].isna())] = df_ind_year['birthyear'] - 50

In [4]:
# Function to calculate productive_year
def calculate_productive_year(row):
    if pd.isna(row['birthyear']):
        return row['deathyear']
    productive_year = row['birthyear'] + 35
    if pd.notna(row['deathyear']) and productive_year > row['deathyear']:
        return row['deathyear']
    return productive_year


# Apply the function to the DataFrame
df_ind_year['productive_year'] = df_ind_year.apply(calculate_productive_year, axis=1)
#df_ind_year = df_ind_year[df_ind_year['productive_year']<=1880]

df_ind_year = df_ind_year[~df_ind_year['productive_year'].isna()]

temporal_resolution = 10
df_ind_year["decade"] = df_ind_year["productive_year"].apply(lambda x: round(x / temporal_resolution) * temporal_resolution)

#df_ind_year = df_ind_year[df_ind_year['productive_year']<=1880]

In [5]:
df_catalogs_id = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_catalogs = pd.read_sql_query("SELECT * FROM identifiers", conn)


if only_country:
    df_catalogs = df_catalogs[['identifiers_wikidata_id', 'country_name']].dropna()
    df_cat = pd.merge(df_catalogs_id, df_catalogs, on = 'identifiers_wikidata_id')
    df_cat = df_cat[['individual_wikidata_id','individual_name', 'country_name']].drop_duplicates()
    df_cat = df_cat.groupby(['individual_name', 'individual_wikidata_id'])['country_name'].count().rename('score').reset_index()
    df_cat = df_cat.sort_values('score', ascending=False).reset_index(drop=True)
else:
    df_cat = df_catalogs_id.groupby(['individual_name', 'individual_wikidata_id'])['identifiers_wikidata_id'].count().rename('score').reset_index()
    df_cat = df_cat.sort_values('score', ascending=False).reset_index(drop=True)
    
df_final = pd.merge(df_ind_year, df_cat, on = 'individual_wikidata_id')
df_final = pd.merge(df_final, df_ind_regions, on = ['individual_wikidata_id', 'individual_name'])
df_final = df_final.drop(['birthyear', 'deathyear'], axis=1)

df_final = df_final.drop_duplicates()

df_final.to_csv('results/df_individuals_score.csv')

In [21]:
test = df_final[df_final['region_name']=='Arabian peninsula']
test = test[test['decade']<=800]
test = test.sort_values('decade', ascending=True)


In [6]:
df_final.sort_values('score', ascending=False).sample(3)

Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_code,region_name
503059,Q3619014,1813.0,1810,Antonietta Fagnani Arese,15,re_southwestern_europe,Southwestern Europe
128259,Q8010250,1871.0,1870,William Grylls Adams,16,re_british_islands,British Islands
434160,Q1695561,1768.0,1770,Johann Michael Koneberg,6,re_western_europe,Western Europe


### Make scores for regions

In [23]:
# Make groups
df_group = df_final[['individual_wikidata_id', 'region_name', 'decade', 'score']].drop_duplicates()
df_group = df_group.reset_index(drop=True)

df_group = df_group.groupby(['region_name', 'decade'])['score'].sum().reset_index()
df_group

Unnamed: 0,region_name,decade,score
0,Arabian peninsula,180,1
1,Arabian peninsula,300,1
2,Arabian peninsula,480,14
3,Arabian peninsula,540,22
4,Arabian peninsula,560,85
...,...,...,...
5500,mediterranean World,1840,495
5501,mediterranean World,1850,489
5502,mediterranean World,1860,500
5503,mediterranean World,1870,571


In [24]:

df_group.to_sql('region_score',conn, if_exists = 'replace', index=False)
df_group.to_csv('results/df_region_score.csv')

In [8]:
# Individuals that are only scientists
df_scientists = pd.read_sql_query("SELECT * FROM individual_occupations", conn)
df_scientists = df_scientists[df_scientists['occupations_category'].str.contains('science')]

top_scientific_occupations = df_scientists.occupations_name.value_counts()
top_scientific_occupations = top_scientific_occupations.head(50).index
occupations_to_remove = ['pedagogue','explorer','music pedagogue', 'land surveyor', 'music theorist', 'folklorist', 'conquistador', 'urban planner', 'settler']
filtered_occupation_list = [occupation for occupation in top_scientific_occupations if occupation not in occupations_to_remove]
df_scientists = df_scientists[df_scientists['occupations_name'].isin(filtered_occupation_list)]

list_scientists = list(set(df_scientists['individual_wikidata_id']))
df_final_scientists = df_final[df_final['individual_wikidata_id'].isin(list_scientists)]
len(df_final_scientists)

df_final_scientists = df_final_scientists.drop_duplicates()
df_final_scientists.to_csv('results/df_individuals_score_science.csv')

### Make groups for scientists in regions

In [9]:
# Make groups
df_group = df_final_scientists[['individual_wikidata_id', 'region_name', 'decade', 'score']].drop_duplicates()
df_group = df_group.reset_index(drop=True)

df_group = df_group.groupby(['region_name', 'decade'])['score'].sum().reset_index()
df_group.to_sql('region_score_science',conn, if_exists = 'replace', index=False)
df_group.to_csv('results/df_region_score_science.csv')