In [8]:
import sys

sys.path.append("../")

import pandas as pd
from dotenv import load_dotenv

load_dotenv()
import os
import sqlite3

DB_PATH = os.getenv("DB_PATH")

conn = sqlite3.connect(DB_PATH)
pd.options.mode.chained_assignment = None

In [9]:
only_country = False

# Individuals Regions
df_ind_regions = pd.read_sql_query("SELECT * FROM individuals_regions", conn)

# Years
df_ind = pd.read_sql_query("SELECT * FROM individuals_main_information", conn)

df_ind_birthyear = df_ind[["individual_wikidata_id", "birthyear"]].drop_duplicates()
df_ind_deathyear = pd.read_sql_query("SELECT * FROM deathyear", conn)
df_ind_year = pd.merge(df_ind_birthyear, df_ind_deathyear, on = 'individual_wikidata_id', how = 'outer')


df_ind_year['birthyear'][(df_ind_year['birthyear'] < 0) & 
            (df_ind_year['birthyear'] % 100 == 0) & 
            (df_ind_year['deathyear'].isna())] = df_ind_year['birthyear'] + 50


In [10]:
df_ind_year[df_ind_year['individual_wikidata_id']=='Q44233']

Unnamed: 0,individual_wikidata_id,birthyear,deathyear
21502,Q44233,-750.0,


In [11]:
# Function to calculate productive_year
def calculate_productive_year(row):
    if pd.isna(row['birthyear']):
        return row['deathyear']
    productive_year = row['birthyear'] + 35
    if pd.notna(row['deathyear']) and productive_year > row['deathyear']:
        return row['deathyear']
    return productive_year


# Apply the function to the DataFrame
df_ind_year['productive_year'] = df_ind_year.apply(calculate_productive_year, axis=1)
#df_ind_year = df_ind_year[df_ind_year['productive_year']<=1880]

In [12]:
df_ind_year[df_ind_year['individual_wikidata_id']=='Q6691']

Unnamed: 0,individual_wikidata_id,birthyear,deathyear,productive_year
218656,Q6691,-900.0,-800.0,-865.0


In [13]:

df_ind_year = df_ind_year[~df_ind_year['productive_year'].isna()]

temporal_resolution = 10
df_ind_year["decade"] = df_ind_year["productive_year"].apply(lambda x: round(x / temporal_resolution) * temporal_resolution)

#df_ind_year = df_ind_year[df_ind_year['productive_year']<=1880]

df_ind_year[df_ind_year['individual_wikidata_id']=='Q6691']

Unnamed: 0,individual_wikidata_id,birthyear,deathyear,productive_year,decade
218656,Q6691,-900.0,-800.0,-865.0,-860


In [14]:
df_catalogs_id = pd.read_sql_query("SELECT * FROM individual_identifiers", conn)
df_catalogs = pd.read_sql_query("SELECT * FROM identifiers", conn)


if only_country:
    df_catalogs = df_catalogs[['identifiers_wikidata_id', 'country_name']].dropna()
    df_cat = pd.merge(df_catalogs_id, df_catalogs, on = 'identifiers_wikidata_id')
    df_cat = df_cat[['individual_wikidata_id','individual_name', 'country_name']].drop_duplicates()
    df_cat = df_cat.groupby(['individual_name', 'individual_wikidata_id'])['country_name'].count().rename('score').reset_index()
    df_cat = df_cat.sort_values('score', ascending=False).reset_index(drop=True)
else:
    df_cat = df_catalogs_id.groupby(['individual_name', 'individual_wikidata_id'])['identifiers_wikidata_id'].count().rename('score').reset_index()
    df_cat = df_cat.sort_values('score', ascending=False).reset_index(drop=True)
    
df_final = pd.merge(df_ind_year, df_cat, on = 'individual_wikidata_id')
df_final = pd.merge(df_final, df_ind_regions, on = ['individual_wikidata_id', 'individual_name'])
df_final = df_final.drop(['birthyear', 'deathyear'], axis=1)

df_final = df_final.drop_duplicates()

df_final.to_csv('results/df_individuals_score.csv')

In [15]:
test = df_final[df_final['region_name']=='Arabian peninsula']
test = test[test['decade']<=800]
test = test.sort_values('decade', ascending=True)


In [16]:
df_final.sort_values('score', ascending=False).sample(3)

Unnamed: 0,individual_wikidata_id,productive_year,decade,individual_name,score,region_code,region_name
71311,Q16933564,1866.0,1870,Andrew Scoble,13,re_western_europe,Western Europe
511127,Q782832,1857.0,1860,Carlo Alfredo Piatti,60,re_italy,Italy
1664,Q1526151,1628.0,1630,Giovanni Giacomo Tencalla,13,re_southwestern_europe,Southwestern Europe


### Make scores for regions

In [17]:
import math

def get_century(decade):
    return math.ceil(decade / 100)

In [18]:
# Make groups
df_group = df_final[['individual_wikidata_id', 'region_name', 'decade', 'score']].drop_duplicates()
df_group = df_group.reset_index(drop=True)
df_group['score'] = 1

df_group = df_group.groupby(['region_name', 'decade'])['score'].sum().reset_index()
df_group['century'] = df_group['decade'].apply(get_century)

df_start = df_group.copy()

min_individuals = 8

#def find_min_individuals()

# Applying the function to the 'decade' column
df_start['century'] = df_start['decade'].apply(get_century)
df_start = df_start.groupby(['region_name', 'century'])['score'].sum().reset_index()
df_start= df_start.reset_index()

# Filter rows with score >= 10
filtered_df = df_start[df_start['score'] >= min_individuals]

# Get index of the first occurrence of each region
indices = filtered_df.groupby('region_name').apply(lambda x: x.index[0])

df_index = indices.reset_index()
df_index.columns = ['region_name', 'min_index']


# For every region filter by the min century
final_new_region = []
for region in df_index['region_name'].unique():
    region_min_index = df_index[df_index['region_name']==region]['min_index'].iloc[0]
    df_filtered_region = df_start[(df_start['region_name']==region)&(df_start['index']>=region_min_index)]
    final_new_region.append(df_filtered_region)
df_final_new_region = pd.concat([x for x in final_new_region])
df_final_new_region = df_final_new_region[['region_name', 'century']].copy()


df_final_group = pd.merge(df_group, df_final_new_region, on = ['region_name', 'century'])
df_final_group = df_final_group.reset_index(drop=True)
df_final_group[df_final_group['region_name']=='Chinese world']

Unnamed: 0,region_name,decade,score,century
693,Chinese world,-280,1,-2
694,Chinese world,-270,1,-2
695,Chinese world,-260,2,-2
696,Chinese world,-240,2,-2
697,Chinese world,-230,2,-2
...,...,...,...,...
885,Chinese world,1840,50,19
886,Chinese world,1850,43,19
887,Chinese world,1860,66,19
888,Chinese world,1870,58,19


### this is a function to find the earlier date for every region with at least n indiviudla in a century


In [19]:
df_group = df_final_group.copy()
df_group.to_sql('region_score',conn, if_exists = 'replace', index=False)
df_group.to_csv('results/df_region_score.csv')

In [20]:
# Individuals that are only scientists
df_scientists = pd.read_sql_query("SELECT * FROM individual_occupations", conn)
df_scientists = df_scientists[df_scientists['occupations_category'].str.contains('science')]

top_scientific_occupations = df_scientists.occupations_name.value_counts()
top_scientific_occupations = top_scientific_occupations.head(50).index
occupations_to_remove = ['pedagogue','explorer','music pedagogue', 'land surveyor', 'music theorist', 'folklorist', 'conquistador', 'urban planner', 'settler']
filtered_occupation_list = [occupation for occupation in top_scientific_occupations if occupation not in occupations_to_remove]
df_scientists = df_scientists[df_scientists['occupations_name'].isin(filtered_occupation_list)]

list_scientists = list(set(df_scientists['individual_wikidata_id']))
df_final_scientists = df_final[df_final['individual_wikidata_id'].isin(list_scientists)]
len(df_final_scientists)

df_final_scientists = df_final_scientists.drop_duplicates()
df_final_scientists.to_csv('results/df_individuals_score_science.csv')

### Make groups for scientists in regions

In [21]:
# Make groups
df_group = df_final_scientists[['individual_wikidata_id', 'region_name', 'decade', 'score']].drop_duplicates()
df_group = df_group.reset_index(drop=True)
df_group['score'] = 1

In [22]:


df_group = df_group.groupby(['region_name', 'decade'])['score'].sum().reset_index()
df_group.to_sql('region_score_science',conn, if_exists = 'replace', index=False)
df_group.to_csv('results/df_region_score_science.csv')

## Make complexity score for regions

In [23]:
df_occupations = pd.read_sql_query("SELECT * FROM individual_occupations", conn)

In [24]:
df_complexity = pd.merge(df_final,df_occupations, on = 'individual_wikidata_id')

In [25]:
df_complexity_group = df_complexity[['occupations_wikidata_id', 'region_name', 'decade']].copy()
df_complexity_group = df_complexity_group.groupby(['region_name', 'decade'])['occupations_wikidata_id'].apply(lambda x : len(set(x))).reset_index()
df_complexity_group = df_complexity_group.rename(columns={'occupations_wikidata_id':'score'})


In [26]:
df_complexity_group.to_csv('results/df_region_score_complexity.csv')

In [27]:
#df_complexity_group[df_complexity_group['region_name']=='Japan'].sample(10)