- Find the number of people thta re pure specialist versus the people that have different occupations
- Show if the association is random or not

In [1]:
import pandas as pd

data = pd.read_csv('../networks/data/global_before_1700.csv', index_col = [0])


In [2]:
df_clip = data.groupby(['wikidata_id', 'birthyear']).agg({
    'individual_name': '|'.join,
    'region_code': '|'.join,
    'region_name': '|'.join,
    #'birthyear': '|'.join,
    'meta_occupation': '|'.join,
    'meta_region': '|'.join
}).reset_index()

df_clip.to_clipboard()

In [3]:
len_unique_ids = len(set(data.wikidata_id))
print('There are {} unique individuals.'.format(len_unique_ids))

print(max(data.birthyear))
print(min(data.birthyear))

There are 13374 unique individuals.
1700.0
-800.0


In [3]:
unique_occupations = len(set(data['meta_occupation'].apply(lambda x : x.split(' | ')).explode('meta_occupation')))
print(f'There are {unique_occupations} unique occupations in the dataset.')

There are 25 unique occupations in the dataset.


In [4]:
data_europe = data[data['meta_region']=='europe']
number_of_europeans = len(set(data_europe.wikidata_id))
percent_europeans = round(number_of_europeans/len_unique_ids*100, 1)

print(f'There are {number_of_europeans} europeans in the dataset.')
print(f'There are {percent_europeans}% of europeans in the dataset.')

data_europe[data_europe['meta_occupation'].str.contains(' | ')]

There are 11386 europeans in the dataset.
There are 84.0% of europeans in the dataset.


Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
40,Q579194,John Major,re_british_islands,British Islands,1469.0,theologian | philosopher | historian,europe
45,Q13129536,Lewis Caerleon,re_british_islands,British Islands,1500.0,theologian | astronomer | mathematician,europe
55,Q242640,"Margaret Cavendish, Duchess of Newcastle-upon-...",re_british_islands,British Islands,1623.0,philosopher | physicist,europe
71,Q185832,Emmanuel Swedenborg,re_british_islands,British Islands,1688.0,theologian | philosopher | physicist | mathema...,europe
87,Q559548,Michael Scot,re_british_islands,British Islands,1175.0,philosopher | mathematician,europe
...,...,...,...,...,...,...,...
42127,Q9376391,Wojciech Bystrzonowski,re_central_europe,Central Europe,1699.0,theologian | philosopher,europe
42265,Q100793338,Jan Mudran,re_central_europe,Central Europe,1647.0,theologian | philosopher,europe
42269,Q329201,Tomáš Pešina z Čechorodu,re_central_europe,Central Europe,1629.0,theologian | historian,europe
42414,Q57963,Johannes Hevelius,re_central_europe,Central Europe,1611.0,geographer | astronomer,europe


In [5]:
percent_europeans_after_1500 = round(len(set(data_europe[data_europe['birthyear']>=1500].wikidata_id))/len(data_europe)*100, 0)
print(f'There are {percent_europeans_after_1500} % of europeans born after 1500')

There are 82.0 % of europeans born after 1500


In [9]:
data_non_europe = data[data['meta_region']=='non_europe']
print(len(set(data_non_europe.wikidata_id)))

data_non_europe[data_non_europe['meta_occupation'].str.contains(' | ')]

2172


Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
1,Q782074,Claudianus Mamertus,re_latin,Latin World,420.0,theologian | philosopher,non_europe
2,Q182123,Irenaeus,re_latin,Latin World,130.0,theologian | philosopher,non_europe
3,Q44344,Hilary of Poitiers,re_latin,Latin World,315.0,theologian | philosopher,non_europe
7,Q723645,Victorius of Aquitaine,re_latin,Latin World,450.0,astronomer | mathematician,non_europe
8,Q209102,Lactantius,re_latin,Latin World,250.0,philosopher | historian,non_europe
...,...,...,...,...,...,...,...
2894,Q25468769,Majd ad-Dīn Ibn Athir,re_arabic_world,Arabic world,1150.0,linguist | historian,non_europe
2895,Q2737184,Ibn al-Khatib,re_arabic_world,Arabic world,1313.0,philosopher | historian,non_europe
2897,Q10299689,Ibn Ghazi al-Miknasi,re_arabic_world,Arabic world,1437.0,linguist | historian | mathematician,non_europe
2907,Q167852,Jabir ibn Hayyan,re_arabic_world,Arabic world,721.0,astronomer | philosopher | mathematician,non_europe


In [7]:
duplicated_ids = data[data.duplicated(subset='wikidata_id', keep=False)]
duplicated_ids = duplicated_ids.sort_values('wikidata_id')
duplicated_ids.sample(10)

Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
628,Q320042,Antiochus of Ascalon,re_greek_world,Greek World,-123.0,philosopher,non_europe
390,Q168261,Ptolemy I Soter,re_greek_world,Greek World,-365.0,historian,non_europe
329,Q949530,Demonax,re_greek_world,Greek World,200.0,philosopher,antiquity
541,Q2749849,Athenodorus of Soli,re_greek_world,Greek World,-300.0,philosopher,non_europe
538,Q343018,Eunapius,re_greek_world,Greek World,349.0,philosopher | historian,antiquity
205,Q794615,Aetius,re_latin,Latin World,200.0,astronomer | philosopher,non_europe
487,Q2696027,Alexander of Damascus,re_greek_world,Greek World,200.0,philosopher,antiquity
295,Q1747004,Timon of Athens,re_greek_world,Greek World,-500.0,philosopher,antiquity
423,Q43182,Eratosthenes,re_greek_world,Greek World,-275.0,musicologist | astronomer | mathematician | ge...,antiquity
416,Q721022,Malchus,re_greek_world,Greek World,450.0,historian,non_europe


In [10]:
df_specialist = data[['wikidata_id', 'meta_occupation']].drop_duplicates()
non_specialist = len(df_specialist[df_specialist['meta_occupation'].str.contains(' | ')])
specialist = len(df_specialist[~df_specialist['meta_occupation'].str.contains(' | ')])

print(f'There are {non_specialist} Non-specialists in the dataset.')
print(f'There are {specialist} specialists in the dataset.')

percent_specialit = round(specialist/len_unique_ids*100, 0)
print(f'There are {percent_specialit}% percent of specialists in the dataset.')

percent_non_specialit = round(non_specialist/len_unique_ids*100, 0)
print(f'There are {percent_non_specialit}% percent of non-specialists in the dataset.')
#non_specialist/len(df_specialist)

There are 2317 Non-specialists in the dataset.
There are 11239 specialists in the dataset.
There are 83.0% percent of specialists in the dataset.
There are 17.0% percent of non-specialists in the dataset.


In [9]:
import sys
sys.path.append('../')
from functions.env import DATA_PATH,  DB_SCIENCE_PATH_NEW
import sqlite3

conn = sqlite3.connect(DB_SCIENCE_PATH_NEW)
data_occupation = pd.read_sql("SELECT * FROM individuals_occupation_information", conn)
print(len(set(data_occupation.individual_wikidata_id)))
len(set(data_occupation[data_occupation['birthyear']<=1700].individual_wikidata_id))

71331


19432

In [10]:



data_occupation = data_occupation.rename(
    columns={
        "occupations_name": "occupation",
        "individual_wikidata_id": "wikidata_id",
    }
)


data_occupation = data_occupation[['wikidata_id', 'occupation']].drop_duplicates()


data_occupation_filtered = data_occupation[data_occupation['wikidata_id'].isin(list(set(data.wikidata_id)))]
len(set(data_occupation_filtered.occupation))

104

In [11]:
df_non_spec = data[data['meta_occupation'].str.contains(' | ')]
df_non_spec

Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
1,Q782074,Claudianus Mamertus,re_latin,Latin World,420.0,theologian | philosopher,antiquity
2,Q182123,Irenaeus,re_latin,Latin World,130.0,theologian | philosopher,antiquity
3,Q44344,Hilary of Poitiers,re_latin,Latin World,315.0,theologian | philosopher,antiquity
7,Q723645,Victorius of Aquitaine,re_latin,Latin World,450.0,astronomer | mathematician,antiquity
8,Q209102,Lactantius,re_latin,Latin World,250.0,philosopher | historian,antiquity
...,...,...,...,...,...,...,...
2894,Q25468769,Majd ad-Dīn Ibn Athir,re_arabic_world,Arabic world,1150.0,linguist | historian,non_europe
2895,Q2737184,Ibn al-Khatib,re_arabic_world,Arabic world,1313.0,philosopher | historian,non_europe
2897,Q10299689,Ibn Ghazi al-Miknasi,re_arabic_world,Arabic world,1437.0,linguist | historian | mathematician,non_europe
2907,Q167852,Jabir ibn Hayyan,re_arabic_world,Arabic world,721.0,astronomer | philosopher | mathematician,non_europe


In [12]:

df_non_spec.region_name.value_counts()


German world        505
Italy               328
France              324
Greek World         214
British Islands     198
Low countries       164
Central Europe      141
Arabic world        123
Nordic countries     79
Spain                79
Latin World          78
Chinese world        71
Persian world        54
Slav world           36
Indian world         23
Portugal             19
Balkans              15
Japan                 7
South East Asia       4
Ottoman Turkey        2
Name: region_name, dtype: int64

In [13]:
data[data['region_name']=='Japan']

Unnamed: 0,wikidata_id,individual_name,region_code,region_name,birthyear,meta_occupation,meta_region
951,Q6538434,Sawaguchi Kazuyuki,re_japan,Japan,1670.0,mathematician,non_europe
954,Q683792,Kitabatake Chikafusa,re_japan,Japan,1293.0,historian,non_europe
957,Q311478,Nichiren,re_japan,Japan,1222.0,philosopher,non_europe
960,Q786877,Shibukawa Shunkai,re_japan,Japan,1639.0,astronomer,non_europe
963,Q6538396,Andō Yūeki,re_japan,Japan,1624.0,astronomer,non_europe
...,...,...,...,...,...,...,...
1177,Q63566583,Jinzan Tani,re_japan,Japan,1663.0,astronomer,non_europe
1184,Q3128822,Hayashi Hōkō,re_japan,Japan,1644.0,philosopher,non_europe
1200,Q11564289,Minamoto no Yoshiari,re_japan,Japan,845.0,historian,non_europe
1203,Q1559382,Minamoto no Shitagō,re_japan,Japan,911.0,linguist,non_europe


In [14]:
data.region_name.value_counts()

German world        3768
Italy               1786
France              1729
British Islands      991
Greek World          884
Low countries        880
Central Europe       809
Spain                599
Arabic world         575
Nordic countries     430
Latin World          430
Chinese world        392
Persian world        199
Slav world           188
Indian world         127
Portugal             113
Korea                101
Balkans               93
Japan                 67
South East Asia       28
Ottoman Turkey        26
Name: region_name, dtype: int64