In [16]:
import sys

sys.path.append("../")

import sqlite3
import pandas as pd

pd.options.mode.chained_assignment = None

import os

from dotenv import load_dotenv

load_dotenv()

DB_SCIENCE_PATH_NEW  = os.getenv("DB_SCIENCE_PATH_NEW")

conn = sqlite3.connect(DB_SCIENCE_PATH_NEW)


In [17]:
df_occupation = pd.read_sql("SELECT * FROM cleaned_occupations_science", conn)
df_occupation = df_occupation.groupby(['wikidata_id'])['meta_occupation'].apply(lambda x : ' | '.join(x))
df_occupation = df_occupation.reset_index()
df_occupation = df_occupation.drop_duplicates()
df_occupation

Unnamed: 0,wikidata_id,meta_occupation
0,Q1000034,mathematician
1,Q100038758,archeologist
2,Q1000581,mathematician
3,Q1000608,anthropologist
4,Q1000929,geneticist
...,...,...
32596,Q99909,zoologist
32597,Q999118,chemist
32598,Q99914,chemist
32599,Q99932403,geographer


In [18]:
df_ind_regions = pd.read_sql_query(
    "SELECT * FROM individuals_regions", conn
)
df_ind_regions = df_ind_regions.rename(
    columns={"individual_wikidata_id": "wikidata_id"}
)


df_temporal = pd.read_sql("SELECT * FROM individuals_occupation_information", conn)
df_temporal = df_temporal.rename(
    columns={"individual_wikidata_id": "wikidata_id"}
)
df_temporal = df_temporal[['wikidata_id', 'birthyear']].drop_duplicates()

In [19]:
df_random = pd.merge(df_ind_regions, df_temporal, on = 'wikidata_id')
df_random = pd.merge(df_random, df_occupation, on = 'wikidata_id')
df_random = df_random[['wikidata_id', 'meta_occupation']].drop_duplicates()
df_random_1 = df_random.sample(1000, random_state=42)
df_random_2 = df_random.sample(1000, random_state=41)

df_random_1.to_csv('data/random_1.csv')
df_random_2.to_csv('data/random_2.csv')

In [20]:


regions_europe = [
're_nordic_countries'
're_british_islands',
're_low_countries',
're_france',
're_spain',
're_italy',
're_german_world',
're_central_europe',
're_slav_world'
're_balkans']


df_ind_regions_europe = df_ind_regions[df_ind_regions["region_code"].isin(regions_europe)]

# starting from 500
df_temporal_europe = df_temporal[df_temporal["birthyear"] > 500]

df_europe = pd.merge(df_ind_regions_europe, df_temporal_europe, on = 'wikidata_id')
df_europe = pd.merge(df_europe, df_occupation, on = 'wikidata_id')

print(len(set(df_europe.wikidata_id)))

df_europe.to_csv('data/europe.csv')


17107


In [21]:
df_ind_regions_antiquity = df_ind_regions[df_ind_regions["region_code"].isin(['re_greek_world', 're_latin'])]
df_temporal_antiquity = df_temporal[df_temporal["birthyear"] <= 500]

df_antiquity = pd.merge(df_ind_regions_antiquity, df_temporal_antiquity, on = 'wikidata_id')
df_antiquity = df_antiquity.drop_duplicates('wikidata_id', keep='first')
df_antiquity = pd.merge(df_antiquity, df_occupation, on = 'wikidata_id')
df_antiquity.to_csv('data/antiquity.csv')

print(len(set(df_antiquity.wikidata_id)))

143


In [22]:
regions_non_europe = ['re_ottoman_turkey',
're_arabic_world',
're_persian_world',
're_indian_world',
're_chinese_world',
're_korea',
're_japan',
're_south_east_asia']

df_ind_regions_non_europe = df_ind_regions[df_ind_regions["region_code"].isin(regions_non_europe)]

# starting from 500
#df_temporal_europe = df_temporal[df_temporal["birthyear"] > 500]

df_non_europe = pd.merge(df_ind_regions_non_europe, df_temporal, on = 'wikidata_id')
df_non_europe = pd.merge(df_non_europe, df_occupation, on = 'wikidata_id')
print(len(set(df_non_europe.wikidata_id)))

df_arabs = df_non_europe[df_non_europe['region_code']=='re_arabic_world']
df_arabs = df_arabs[df_arabs['birthyear']>500]

df_non_europe = df_non_europe[df_non_europe['region_code'] !='re_arabic_world']
df_non_europe = pd.concat([df_non_europe, df_arabs])
df_non_europe = df_non_europe.reset_index(drop=True)


df_non_europe = pd.concat([df_antiquity, df_non_europe])
df_non_europe = df_non_europe.reset_index(drop=True)
print(len(set(df_non_europe.wikidata_id)))


df_non_europe.to_csv('data/non_europe.csv')



865
970


In [23]:
df_global = pd.concat([df_antiquity, df_europe, df_non_europe])
print(len(set(df_global.wikidata_id)))
df_global.to_csv('data/global.csv')


df_global_before_1500 = df_global[df_global['birthyear']<=1500]
print(len(set(df_global_before_1500.wikidata_id)))

df_global_before_1500 = df_global_before_1500.drop_duplicates()
df_global_before_1500.to_csv('data/global_before_1500.csv')


df_global_before_1700 = df_global[df_global['birthyear']<=1700]
print(len(set(df_global_before_1500.wikidata_id)))

df_global_before_1700 = df_global_before_1700.drop_duplicates()
df_global_before_1700.to_csv('data/global_before_1700.csv')

18077
921
921


['Korea',
 'Persian world',
 'Chinese world',
 'Latin World',
 'Spain',
 'Central Europe',
 'South East Asia',
 'Low countries',
 'German world',
 'Ottoman Turkey',
 'Japan',
 'France',
 'Indian world',
 'Italy',
 'Greek World',
 'Arabic world']

In [25]:

df_global_after_1500 = df_global[df_global['birthyear']>1500]
print(len(set(df_global_after_1500.wikidata_id)))
df_global_after_1500.to_csv('data/global_after_1500.csv')


17156


In [26]:
df_global['region'] = df_global['region_code']
df_global['region'][df_global['region_code'].isin(regions_europe)] = 're_europe'

final = []
for region in list(set(df_global['region'])):
    df_sample = df_global[df_global["region"] == region]
    individual_sample = 100

    if len(df_sample) > individual_sample:
        res = df_sample.sample(individual_sample, random_state=41)
    else:
        res = df_sample.copy()

    final.append(res)

df_fin = pd.concat([x for x in final])
df_fin = df_fin.reset_index(drop=True)

print(len(set(df_fin.wikidata_id)))
df_fin.to_csv('data/global_weighted.csv')

df_fin.region.value_counts()


779


re_chinese_world      100
re_japan              100
re_europe             100
re_greek_world        100
re_arabic_world       100
re_indian_world       100
re_persian_world       86
re_latin               64
re_south_east_asia     53
re_ottoman_turkey      22
re_korea               10
Name: region, dtype: int64

In [27]:
df_non_europe_before_1700 = df_non_europe[df_non_europe['birthyear']<=1700]
df_non_europe_before_1700.to_csv('data/non_europe_before_1700.csv')

df_europe_before_1700 = df_europe[df_europe['birthyear']<=1700]
df_europe_before_1700.to_csv('data/europe_before_1700.csv')


len_non_europe = len(df_non_europe_before_1700)


In [28]:
df_test = df_non_europe_before_1700.copy()
df_test = df_test[df_test['meta_occupation'].str.contains('historian')]
df_test = df_test[df_test['meta_occupation'].str.contains('botanist')]
df_test

Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation
154,Q353068,Akşemseddin,re_ottoman_turkey,Ottoman Turkey,1389.0,historian | botanist
167,Q293520,Abū Ḥanīfa Dīnawarī,re_persian_world,Persian world,828.0,botanist | historian | astronomer | geographer...
230,Q11826,Al-Biruni,re_persian_world,Persian world,973.0,botanist | anthropologist | historian | astron...
563,Q267350,Michał Boym,re_chinese_world,Chinese world,1612.0,historian | linguist | botanist
735,Q67657447,Hamzah ibn ʻAbd Allah Nashiri,re_arabic_world,Arabic world,1430.0,historian | botanist
822,Q5984319,Ibn al-Wardi,re_arabic_world,Arabic world,1290.0,historian | geographer | botanist | zoologist
871,Q12177924,Ahmad ibn Muhammad abu Yafar al-Gafiqi,re_arabic_world,Arabic world,1100.0,historian | botanist
958,Q337126,Al-Bakri,re_arabic_world,Arabic world,1014.0,historian | geographer | botanist


In [29]:
from tqdm import tqdm
for x in tqdm(range(100)):
    df_weighted = pd.concat([df_non_europe_before_1700, df_europe_before_1700.sample(len_non_europe, random_state=x)])
    df_weighted = df_weighted.reset_index(drop=True)
    df_weighted.to_csv(f'data/weighted/df_{x}.csv')

100%|██████████| 100/100 [00:00<00:00, 258.47it/s]
