In [1]:
import sys

sys.path.append("../")

import sqlite3
import pandas as pd

pd.options.mode.chained_assignment = None

import os

from dotenv import load_dotenv

load_dotenv()

# Location of the SQLite database, read from the environment (load_dotenv()
# above makes values from a local .env file available as well).
DB_SCIENCE_PATH_NEW = os.getenv("DB_SCIENCE_PATH_NEW")

# Fail early with an actionable message: sqlite3.connect(None) raises an opaque
# TypeError, and connecting to a path that does not exist silently creates an
# empty database whose queries then fail with "no such table".
if not DB_SCIENCE_PATH_NEW:
    raise RuntimeError(
        "DB_SCIENCE_PATH_NEW is not set; define it in the environment or in a .env file"
    )
if not os.path.isfile(DB_SCIENCE_PATH_NEW):
    raise FileNotFoundError(f"SQLite database not found: {DB_SCIENCE_PATH_NEW}")

# Connection shared by every query in this notebook.
conn = sqlite3.connect(DB_SCIENCE_PATH_NEW)


In [2]:
# One row per wikidata_id: all meta-occupations of an individual are joined
# into a single " | "-separated string.
occupations_raw = pd.read_sql("SELECT * FROM cleaned_occupations_science", conn)
df_occupation = (
    occupations_raw
    .groupby(['wikidata_id'])['meta_occupation']
    .apply(lambda occupations: ' | '.join(occupations))
    .reset_index()
    .drop_duplicates()
)

In [3]:
# Base directory for the CSV files written by this notebook.
data_path = "../networks/data"

In [4]:
# Region assignment of each individual; the key column is renamed to
# "wikidata_id" so it can be used as the merge key with the other tables.
df_ind_regions = pd.read_sql_query("SELECT * FROM individuals_regions", conn).rename(
    columns={"individual_wikidata_id": "wikidata_id"}
)

# Distinct (wikidata_id, birthyear) pairs.
df_temporal = (
    pd.read_sql("SELECT * FROM individuals_occupation_information", conn)
    .rename(columns={"individual_wikidata_id": "wikidata_id"})
    .loc[:, ['wikidata_id', 'birthyear']]
    .drop_duplicates()
)

# Number of distinct individuals in the temporal table.
len(set(df_temporal["wikidata_id"]))

71331

In [5]:
# Individuals present in the region, temporal and occupation tables, reduced
# to distinct (wikidata_id, meta_occupation) pairs.
df_random = (
    df_ind_regions
    .merge(df_temporal, on='wikidata_id')
    .merge(df_occupation, on='wikidata_id')
    .loc[:, ['wikidata_id', 'meta_occupation']]
    .drop_duplicates()
)

# Two seeded random samples of 1000 rows each.
df_random_1 = df_random.sample(n=1000, random_state=42)
df_random_2 = df_random.sample(n=1000, random_state=41)

df_random_1.to_csv(f'{data_path}/random_1.csv')
df_random_2.to_csv(f'{data_path}/random_2.csv')

In [6]:
# Region codes that make up the European subset.
regions_europe = [
    're_low_countries',
    're_france',
    're_spain',
    're_portugal',
    're_italy',
    're_german_world',
    're_central_europe',
    're_slav_world',
    're_balkans',
    're_nordic_countries',
    're_british_islands',
]

# Surrounding whitespace is stripped from the codes (in place, on the shared
# frame) before they are matched against the list above.
df_ind_regions['region_code'] = df_ind_regions['region_code'].str.strip()
is_european = df_ind_regions["region_code"].isin(regions_europe)
df_ind_regions_europe = df_ind_regions[is_european]


In [7]:
# Rows per European region, by display name.
df_ind_regions_europe["region_name"].value_counts()

German world        15250
France               7959
British Islands      5727
Italy                5189
Central Europe       4825
Nordic countries     2718
Slav world           2431
Low countries        2236
Spain                1966
Balkans               672
Portugal              407
Name: region_name, dtype: int64

In [8]:
# Rows per European region, by region code.
df_ind_regions_europe["region_code"].value_counts()

re_german_world        15250
re_france               7959
re_british_islands      5727
re_italy                5189
re_central_europe       4825
re_nordic_countries     2718
re_slav_world           2431
re_low_countries        2236
re_spain                1966
re_balkans               672
re_portugal              407
Name: region_code, dtype: int64

In [9]:
# Distinct region codes that survived the European filter.
set(df_ind_regions_europe["region_code"])

{'re_balkans',
 're_british_islands',
 're_central_europe',
 're_france',
 're_german_world',
 're_italy',
 're_low_countries',
 're_nordic_countries',
 're_portugal',
 're_slav_world',
 're_spain'}

In [10]:
# Rows of the region table whose code belongs to the European list.
df_ind_regions.loc[df_ind_regions["region_code"].isin(regions_europe)]

Unnamed: 0,wikidata_id,individual_name,region_code,region_name
0,Q2822993,Achard of Saint-Victor,re_british_islands,British Islands
4,Q246595,Joseph Smit,re_british_islands,British Islands
8,Q742311,Chad of Mercia,re_british_islands,British Islands
12,Q6240116,John Howe,re_british_islands,British Islands
16,Q5738046,Thomas Douglas Forsyth,re_british_islands,British Islands
...,...,...,...,...
180824,Q12278401,Dimitrije Kantakouzenos,re_balkans,Balkans
180826,Q3148191,Ignác Szentmártonyi,re_balkans,Balkans
180828,Q2548769,Gregory Akindynos,re_balkans,Balkans
180829,Q3633198,Balagrus,re_balkans,Balkans


In [11]:
# The European dataset only keeps birth years strictly after 500.
df_temporal_europe = df_temporal.loc[df_temporal["birthyear"] > 500]

# Join European regions with birth years and occupations on the individual id.
df_europe = (
    df_ind_regions_europe
    .merge(df_temporal_europe, on='wikidata_id')
    .merge(df_occupation, on='wikidata_id')
)

print(len(set(df_europe["wikidata_id"])))

df_europe.to_csv(f'{data_path}/europe.csv')

42489


In [12]:
# Every distinct region code present in the region table.
set(df_ind_regions["region_code"])

{'re_arabian_peninsula',
 're_arabic_world',
 're_austria',
 're_balkans',
 're_belgium',
 're_british_islands',
 're_central_europe',
 're_chinese_world',
 're_denmark',
 're_eastern_europe',
 're_finland',
 're_france',
 're_german_world',
 're_germany',
 're_greece',
 're_greek_world',
 're_iceland',
 're_indian_world',
 're_ireland',
 're_italy',
 're_japan',
 're_korea',
 're_latin',
 're_low_countries',
 're_mediterreanean',
 're_muslim_world',
 're_netherlands',
 're_nordic_countries',
 're_north_china',
 're_north_france',
 're_north_india',
 're_north_italy',
 're_north_japan',
 're_northwestern_europe',
 're_norway',
 're_ottoman_turkey',
 're_ottoman_world',
 're_persian_world',
 're_portugal',
 're_slav_world',
 're_south_china',
 're_south_east_asia',
 're_south_france',
 're_south_india',
 're_south_italy',
 're_south_japan',
 're_southwestern_europe',
 're_spain',
 're_sweden',
 're_switzerland',
 're_united_kingdom',
 're_western_europe',
 're_yangtze'}

In [13]:
# Antiquity: Greek-world and Latin rows with a birth year of 500 or earlier.
antiquity_regions = ['re_greek_world', 're_latin']
df_ind_regions_antiquity = df_ind_regions[df_ind_regions["region_code"].isin(antiquity_regions)]
df_temporal_antiquity = df_temporal[df_temporal["birthyear"] <= 500]

df_antiquity = (
    df_ind_regions_antiquity
    .merge(df_temporal_antiquity, on='wikidata_id')
    # keep only the first row of each individual
    .drop_duplicates(subset='wikidata_id', keep='first')
    .merge(df_occupation, on='wikidata_id')
)
df_antiquity.to_csv(f'{data_path}/antiquity.csv')

print(len(set(df_antiquity["wikidata_id"])))

657


In [14]:
# Region codes that make up the non-European subset.
regions_non_europe = [
    're_ottoman_turkey',
    're_arabic_world',
    're_persian_world',
    're_indian_world',
    're_chinese_world',
    're_korea',
    're_japan',
    're_south_east_asia',
]

df_ind_regions_non_europe = df_ind_regions[df_ind_regions["region_code"].isin(regions_non_europe)]

# No general birth-year cut-off here: the full temporal table is joined.
df_non_europe = (
    df_ind_regions_non_europe
    .merge(df_temporal, on='wikidata_id')
    .merge(df_occupation, on='wikidata_id')
)
print(len(set(df_non_europe["wikidata_id"])))

# Only the Arabic world is restricted to birth years after 500. The remaining
# regions come first and the filtered Arabic rows are appended after them.
is_arabic = df_non_europe['region_code'] == 're_arabic_world'
df_arabs = df_non_europe[is_arabic & (df_non_europe['birthyear'] > 500)]
df_non_europe = pd.concat([df_non_europe[~is_arabic], df_arabs]).reset_index(drop=True)

# The antiquity rows are placed in front of the non-European rows.
df_non_europe = pd.concat([df_antiquity, df_non_europe]).reset_index(drop=True)
print(len(set(df_non_europe["wikidata_id"])))

df_non_europe.to_csv(f'{data_path}/non_europe.csv')

2370
2916


In [15]:
# Tag every row of each dataset with the name of the dataset it belongs to.
for frame, label in (
    (df_antiquity, 'antiquity'),
    (df_europe, 'europe'),
    (df_non_europe, 'non_europe'),
):
    frame['meta_region'] = label

In [16]:
# Stack the antiquity, European and non-European datasets into one frame.
# NOTE(review): df_non_europe already contains the df_antiquity rows (they are
# concatenated into it before non_europe.csv is written), so antiquity
# individuals appear twice here, differing only in meta_region. Confirm that
# this is intended.
df_global = pd.concat([df_antiquity, df_europe, df_non_europe])
# Written below data_path like every other export, instead of repeating the
# directory as a hard-coded literal.
df_global.to_csv(data_path + '/global.csv')

# Birth year <= 1500, with exact duplicate rows removed.
df_global_before_1500 = df_global[df_global['birthyear'] <= 1500].drop_duplicates()
df_global_before_1500.to_csv(data_path + '/global_before_1500.csv')

# Birth year > 1500.
# NOTE(review): unlike the two "before" exports, duplicates are not dropped
# here. Confirm whether that asymmetry is deliberate.
df_global_after_1500 = df_global[df_global['birthyear'] > 1500]
df_global_after_1500.to_csv(data_path + '/global_after_1500.csv')

# Birth year <= 1700, with exact duplicate rows removed.
df_global_before_1700 = df_global[df_global['birthyear'] <= 1700].drop_duplicates()
df_global_before_1700.to_csv(data_path + '/global_before_1700.csv')

In [18]:
# Number of distinct individuals born in or before 1700.
len(set(df_global_before_1700["wikidata_id"]))

13556

In [19]:
# Collapse every European region code into a single 're_europe' bucket so that
# Europe is sampled as one region. The column is built in one step with
# Series.where: the former chained assignment
# (df_global['region'][mask] = ...) only ran quietly because the
# chained-assignment warning is disabled, and it leaves df_global unchanged
# under pandas Copy-on-Write.
df_global['region'] = df_global['region_code'].where(
    ~df_global['region_code'].isin(regions_europe), 're_europe'
)

# Maximum number of rows drawn per region.
individual_sample = 100

# Regions are visited in sorted order so the row order of the export is
# reproducible; iterating a set of strings depends on the per-process hash seed.
# NOTE(review): rows are sampled, not distinct individuals, so the same
# wikidata_id can be drawn more than once (the printed unique count may be
# lower than the number of rows).
final = []
for region in sorted(df_global['region'].dropna().unique()):
    df_sample = df_global[df_global["region"] == region]

    if len(df_sample) > individual_sample:
        res = df_sample.sample(individual_sample, random_state=41)
    else:
        # Small regions are kept whole.
        res = df_sample.copy()

    final.append(res)

df_fin = pd.concat(final).reset_index(drop=True)

print(len(set(df_fin.wikidata_id)))
df_fin.to_csv(data_path + '/global_weighted.csv')

df_fin.region.value_counts()

1032


re_korea              100
re_persian_world      100
re_indian_world       100
re_latin              100
re_japan              100
re_europe             100
re_arabic_world       100
re_greek_world        100
re_chinese_world      100
re_south_east_asia     94
re_ottoman_turkey      53
Name: region, dtype: int64

In [20]:
# Restrict both datasets to birth years of 1700 or earlier and export them.
df_non_europe_before_1700 = df_non_europe.loc[df_non_europe['birthyear'] <= 1700]
df_non_europe_before_1700.to_csv(f'{data_path}/non_europe_before_1700.csv')

df_europe_before_1700 = df_europe.loc[df_europe['birthyear'] <= 1700]
df_europe_before_1700.to_csv(f'{data_path}/europe_before_1700.csv')

# Row count of the non-European subset.
len_non_europe = df_non_europe_before_1700.shape[0]

### Weighted Data

In [21]:
from tqdm import tqdm

# Number of balanced datasets to write; the loop index doubles as the sampling
# seed, so each file is a different but reproducible draw.
n_weighted_datasets = 100

# to_csv does not create missing directories, so make sure the target exists.
weighted_dir = data_path + '/weighted'
os.makedirs(weighted_dir, exist_ok=True)

for x in tqdm(range(n_weighted_datasets)):
    # All non-European rows plus an equally sized random draw of European rows.
    europe_sample = df_europe_before_1700.sample(len_non_europe, random_state=x)
    df_weighted = pd.concat([df_non_europe_before_1700, europe_sample])
    df_weighted = df_weighted.reset_index(drop=True)
    df_weighted.to_csv(weighted_dir + f'/df_{x}.csv')

# Display the last generated dataset.
df_weighted


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:01<00:00, 71.93it/s]


Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
0,Q316119,Gnaeus Pompeius Trogus,re_latin,Latin World,-100.0,historian,non_europe
1,Q782074,Claudianus Mamertus,re_latin,Latin World,420.0,theologian | philosopher,non_europe
2,Q182123,Irenaeus,re_latin,Latin World,130.0,theologian | philosopher,non_europe
3,Q44344,Hilary of Poitiers,re_latin,Latin World,315.0,theologian | philosopher,non_europe
4,Q1430,Marcus Aurelius,re_latin,Latin World,121.0,philosopher,non_europe
...,...,...,...,...,...,...,...
4339,Q2857161,Antonino Diana,re_italy,Italy,1586.0,theologian,europe
4340,Q247826,Abraham Heidanus,re_low_countries,Low countries,1597.0,theologian,europe
4341,Q347726,Hans Tausen,re_nordic_countries,Nordic countries,1494.0,theologian | linguist,europe
4342,Q1664515,Nikolaus Elgard,re_german_world,German world,1547.0,theologian,europe
