In [7]:
import sys

sys.path.append("../")

import sqlite3
import pandas as pd

pd.options.mode.chained_assignment = None

import os

from dotenv import load_dotenv

load_dotenv()

DB_SCIENCE_PATH_NEW  = os.getenv("DB_SCIENCE_PATH_NEW")

conn = sqlite3.connect(DB_SCIENCE_PATH_NEW)


In [8]:
df_occupation = pd.read_sql("SELECT * FROM cleaned_occupations_science", conn)
df_occupation = df_occupation.groupby(['wikidata_id'])['meta_occupation'].apply(lambda x : ' | '.join(x))
df_occupation = df_occupation.reset_index()
df_occupation = df_occupation.drop_duplicates()




In [9]:
data_path = '../networks/data'

In [10]:
df_ind_regions = pd.read_sql_query(
    "SELECT * FROM individuals_regions", conn
)
df_ind_regions = df_ind_regions.rename(
    columns={"individual_wikidata_id": "wikidata_id"}
)


df_temporal = pd.read_sql("SELECT * FROM individuals_occupation_information", conn)
df_temporal = df_temporal.rename(
    columns={"individual_wikidata_id": "wikidata_id"}
)
df_temporal = df_temporal[['wikidata_id', 'birthyear']].drop_duplicates()
len(set(df_temporal.wikidata_id))

42668

In [11]:
df_random = pd.merge(df_ind_regions, df_temporal, on = 'wikidata_id')
df_random = pd.merge(df_random, df_occupation, on = 'wikidata_id')
df_random = df_random[['wikidata_id', 'meta_occupation']].drop_duplicates()
df_random_1 = df_random.sample(1000, random_state=42)
df_random_2 = df_random.sample(1000, random_state=41)

df_random_1.to_csv(data_path + '/random_1.csv')
df_random_2.to_csv(data_path + '/random_2.csv')

In [12]:
regions_europe = [
're_low_countries',
're_france',
're_spain',
're_portugal',
're_italy',
're_german_world',
're_central_europe',
're_slav_world',
're_balkans',
're_nordic_countries',
're_british_islands']

df_ind_regions['region_code'] = df_ind_regions['region_code'].str.strip()
df_ind_regions_europe = df_ind_regions[df_ind_regions["region_code"].isin(regions_europe)]


In [13]:
df_ind_regions_europe.region_name.value_counts()

German world        8036
France              5120
British Islands     4157
Central Europe      3081
Italy               2871
Slav world          1756
Nordic countries    1731
Low countries       1186
Spain               1096
Balkans              272
Portugal             246
Name: region_name, dtype: int64

In [14]:
df_ind_regions_europe.region_code.value_counts()

re_german_world        8036
re_france              5120
re_british_islands     4157
re_central_europe      3081
re_italy               2871
re_slav_world          1756
re_nordic_countries    1731
re_low_countries       1186
re_spain               1096
re_balkans              272
re_portugal             246
Name: region_code, dtype: int64

In [15]:
set(df_ind_regions_europe.region_code)

{'re_balkans',
 're_british_islands',
 're_central_europe',
 're_france',
 're_german_world',
 're_italy',
 're_low_countries',
 're_nordic_countries',
 're_portugal',
 're_slav_world',
 're_spain'}

In [16]:
df_ind_regions[df_ind_regions["region_code"].isin(regions_europe)]

Unnamed: 0,wikidata_id,individual_name,region_code,region_name
0,Q246595,Joseph Smit,re_british_islands,British Islands
4,Q5738046,Thomas Douglas Forsyth,re_british_islands,British Islands
8,Q55043617,William Henry Harwood,re_british_islands,British Islands
12,Q3290537,Margaretta Riley,re_british_islands,British Islands
16,Q7410983,Samuel Brewer,re_british_islands,British Islands
...,...,...,...,...
107197,Q12021904,Ivan Bohdan Staněk,re_central_europe,Central Europe
107199,Q65921411,Stefan Michal Puzyna,re_central_europe,Central Europe
107201,Q55065262,Johann Tobias Lowiz,re_slav_world,Slav world
107203,Q55680410,Ottomar Kayser,re_central_europe,Central Europe


In [17]:
df_ind_regions

Unnamed: 0,wikidata_id,individual_name,region_code,region_name
0,Q246595,Joseph Smit,re_british_islands,British Islands
1,Q246595,Joseph Smit,re_northwestern_europe,Northwestern Europe
2,Q246595,Joseph Smit,re_united_kingdom,United Kingdom
3,Q246595,Joseph Smit,re_western_europe,Western Europe
4,Q5738046,Thomas Douglas Forsyth,re_british_islands,British Islands
...,...,...,...,...
109575,Q5894051,Hong Dae-yong,re_korea,Korea
109576,Q12619924,Han-gi Ch'oe,re_korea,Korea
109577,Q6408542,Kim Dam,re_korea,Korea
109578,Q11095247,Yi Ji-bang,re_korea,Korea


In [18]:
# starting from 500
df_temporal_europe = df_temporal[df_temporal["birthyear"] > 500]

df_europe = pd.merge(df_ind_regions_europe, df_temporal_europe, on = 'wikidata_id')
df_europe = pd.merge(df_europe, df_occupation, on = 'wikidata_id')

print(len(set(df_europe.wikidata_id)))

df_europe.to_csv(data_path + '/europe.csv')

23200


In [19]:
set(df_ind_regions.region_code)

{'re_arabian_peninsula',
 're_arabic_world',
 're_austria',
 're_balkans',
 're_belgium',
 're_british_islands',
 're_central_europe',
 're_chinese_world',
 're_denmark',
 're_eastern_europe',
 're_finland',
 're_france',
 're_german_world',
 're_germany',
 're_greece',
 're_greek_world',
 're_iceland',
 're_indian_world',
 're_ireland',
 're_italy',
 're_japan',
 're_korea',
 're_latin',
 're_low_countries',
 're_mediterreanean',
 're_muslim_world',
 're_netherlands',
 're_nordic_countries',
 're_north_china',
 're_north_france',
 're_north_india',
 're_north_italy',
 're_north_japan',
 're_northwestern_europe',
 're_norway',
 're_ottoman_turkey',
 're_ottoman_world',
 're_persian_world',
 're_portugal',
 're_slav_world',
 're_south_china',
 're_south_east_asia',
 're_south_france',
 're_south_india',
 're_south_italy',
 're_south_japan',
 're_southwestern_europe',
 're_spain',
 're_sweden',
 're_switzerland',
 're_united_kingdom',
 're_western_europe',
 're_yangtze'}

In [20]:
df_ind_regions_antiquity = df_ind_regions[df_ind_regions["region_code"].isin(['re_greek_world', 're_latin'])]
df_temporal_antiquity = df_temporal[df_temporal["birthyear"] <= 500]

df_antiquity = pd.merge(df_ind_regions_antiquity, df_temporal_antiquity, on = 'wikidata_id')
df_antiquity = df_antiquity.drop_duplicates('wikidata_id', keep='first')
df_antiquity = pd.merge(df_antiquity, df_occupation, on = 'wikidata_id')
df_antiquity.to_csv(data_path + '/antiquity.csv')

print(len(set(df_antiquity.wikidata_id)))

143


In [21]:
regions_non_europe = ['re_ottoman_turkey',
're_arabic_world',
're_persian_world',
're_indian_world',
're_chinese_world',
're_korea',
're_japan',
're_south_east_asia']

df_ind_regions_non_europe = df_ind_regions[df_ind_regions["region_code"].isin(regions_non_europe)]

# starting from 500
#df_temporal_europe = df_temporal[df_temporal["birthyear"] > 500]

df_non_europe = pd.merge(df_ind_regions_non_europe, df_temporal, on = 'wikidata_id')
df_non_europe = pd.merge(df_non_europe, df_occupation, on = 'wikidata_id')
print(len(set(df_non_europe.wikidata_id)))

df_arabs = df_non_europe[df_non_europe['region_code']=='re_arabic_world']
df_arabs = df_arabs[df_arabs['birthyear']>500]

df_non_europe = df_non_europe[df_non_europe['region_code'] !='re_arabic_world']
df_non_europe = pd.concat([df_non_europe, df_arabs])
df_non_europe = df_non_europe.reset_index(drop=True)


df_non_europe = pd.concat([df_antiquity, df_non_europe])
df_non_europe = df_non_europe.reset_index(drop=True)
print(len(set(df_non_europe.wikidata_id)))

df_non_europe.to_csv(data_path + '/non_europe.csv')

865
970


In [22]:
df_antiquity['meta_region'] = 'antiquity'
df_europe['meta_region'] = 'europe'
df_non_europe['meta_region'] = 'non_europe'

In [23]:
df_global = pd.concat([df_antiquity, df_europe, df_non_europe])
df_global.to_csv('../networks/data/global.csv')

df_global_before_1500 = df_global[df_global['birthyear']<=1500]
df_global_before_1500 = df_global_before_1500.drop_duplicates()
df_global_before_1500.to_csv(data_path + '/global_before_1500.csv')

df_global_after_1500 = df_global[df_global['birthyear']>1500]
df_global_after_1500.to_csv(data_path + '/global_after_1500.csv')

df_global_before_1700 = df_global[df_global['birthyear']<=1700]
df_global_before_1700 = df_global_before_1700.drop_duplicates()
df_global_before_1700.to_csv(data_path + '/global_before_1700.csv')

In [27]:
df_global_before_1700

Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
0,Q50082878,Jess Wade,re_latin,Latin World,2.0,physicist,antiquity
1,Q723645,Victorius of Aquitaine,re_latin,Latin World,450.0,astronomer | mathematician,antiquity
2,Q297515,Pomponius Mela,re_latin,Latin World,15.0,historian | geographer,antiquity
3,Q755297,Athenaeus Mechanicus,re_latin,Latin World,-199.0,philosopher | mathematician,antiquity
4,Q706319,Marinus of Tyre,re_latin,Latin World,100.0,geographer | mathematician,antiquity
...,...,...,...,...,...,...,...
966,Q3603837,Abu 'Ali al-Khaiyat,re_arabic_world,Arabic world,770.0,astronomer,non_europe
967,Q10299689,Ibn Ghazi al-Miknasi,re_arabic_world,Arabic world,1437.0,historian | linguist | mathematician,non_europe
968,Q1168219,Josef ben Meir ibn Zabara,re_arabic_world,Arabic world,1140.0,physicist,non_europe
969,Q167852,Jabir ibn Hayyan,re_arabic_world,Arabic world,721.0,astronomer | philosopher | mathematician,non_europe


In [24]:
len(set(df_global_before_1700.wikidata_id))

4266

In [25]:
df_global['region'] = df_global['region_code']
df_global['region'][df_global['region_code'].isin(regions_europe)] = 're_europe'

final = []
for region in list(set(df_global['region'])):
    df_sample = df_global[df_global["region"] == region]
    individual_sample = 100

    if len(df_sample) > individual_sample:
        res = df_sample.sample(individual_sample, random_state=41)
    else:
        res = df_sample.copy()

    final.append(res)

df_fin = pd.concat([x for x in final])
df_fin = df_fin.reset_index(drop=True)

print(len(set(df_fin.wikidata_id)))
df_fin.to_csv(data_path + '/global_weighted.csv')

df_fin.region.value_counts()

779


re_arabic_world       100
re_indian_world       100
re_greek_world        100
re_chinese_world      100
re_japan              100
re_europe             100
re_persian_world       86
re_latin               64
re_south_east_asia     53
re_ottoman_turkey      22
re_korea               10
Name: region, dtype: int64

In [26]:
df_non_europe_before_1700 = df_non_europe[df_non_europe['birthyear']<=1700]
df_non_europe_before_1700.to_csv(data_path + '/non_europe_before_1700.csv')

df_europe_before_1700 = df_europe[df_europe['birthyear']<=1700]
df_europe_before_1700.to_csv(data_path + '/europe_before_1700.csv')

len_non_europe = len(df_non_europe_before_1700)

from tqdm import tqdm
for x in tqdm(range(100)):
    df_weighted = pd.concat([df_non_europe_before_1700, df_europe_before_1700.sample(len_non_europe, random_state=x)])
    df_weighted = df_weighted.reset_index(drop=True)
    df_weighted.to_csv(data_path + f'/weighted/df_{x}.csv')

df_weighted


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:00<00:00, 223.01it/s]


Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
0,Q50082878,Jess Wade,re_latin,Latin World,2.0,physicist,non_europe
1,Q723645,Victorius of Aquitaine,re_latin,Latin World,450.0,astronomer | mathematician,non_europe
2,Q297515,Pomponius Mela,re_latin,Latin World,15.0,historian | geographer,non_europe
3,Q755297,Athenaeus Mechanicus,re_latin,Latin World,-199.0,philosopher | mathematician,non_europe
4,Q706319,Marinus of Tyre,re_latin,Latin World,100.0,geographer | mathematician,non_europe
...,...,...,...,...,...,...,...
1175,Q64344,Johann Cochlaeus,re_central_europe,Central Europe,1479.0,musicologist | theologian,europe
1176,Q45367560,Johannes Bronkhorst,re_german_world,German world,1494.0,mathematician,europe
1177,Q78427,Johann Bayer,re_german_world,German world,1572.0,astronomer,europe
1178,Q920563,"Jean Charles, Chevalier Folard",re_france,France,1669.0,historian,europe
