In [3]:
import sys
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import itertools

sys.path.append('../../')

from sys_utils import load_model
from data_model_region import Region
from data_model import Individual
import typing as t
import random
from tqdm import tqdm

In [4]:
# Checkpoint directory, given relative to the notebook's working directory.
checkpoint_path = '../../checkpoints_dev'

# Build both JSONL locations up front so the two load calls read symmetrically.
regions_file = checkpoint_path+"/regions.jsonl"
individuals_file = checkpoint_path + "/individuals.jsonl"

# load_model (imported from sys_utils) receives the model class and the file name.
regions = load_model(Region, name=regions_file)
individuals = load_model(Individual, name=individuals_file)

In [5]:
# Keep every individual that has at least one occupation whose category
# contains "science"; each individual is appended at most once.
individuals_science = []

for ind in tqdm(individuals):
    if any("science" in occupation.category for occupation in ind.id.occupations):
        individuals_science.append(ind)

100%|████████████████████████████████████████████████████| 225636/225636 [00:02<00:00, 107756.26it/s]


In [36]:
#individuals_filtered = [x for x in individuals_science if x.regions != None]

# One flat record per science individual; "occupation" holds the list of
# occupation names and "region_code" holds whatever x.regions contains.
individual_records = []
for x in individuals_science:
    individual_records.append(
        {
            "wikidata_id": x.id.wikidata_id,
            "name": x.id.name,
            "year": x.impact_years,
            "cultural_score": x.cultural_score,
            "region_code": x.regions,
            "occupation": [y.name for y in x.id.occupations],
        }
    )

# Rows with a missing value in any column are discarded.
df_individuals = pd.DataFrame(individual_records).dropna()

# Lookup table mapping a region code to its display name.
region_records = []
for x in regions:
    region_records.append({"region_code": x.code, "region_name": x.name})

df_regions = pd.DataFrame(region_records)

#### Study Occupations

In [40]:
# One row per (individual, region), then attach the region name.
df_individuals_regions = df_individuals.explode('region_code')
df_country = df_individuals_regions.merge(df_regions, on='region_code')

# Region-independent view: one row per (individual, occupation), de-duplicated
# because an individual attached to several regions would otherwise repeat.
df_occupation = (
    df_country
    .drop(columns=['region_name', 'region_code'])
    .explode('occupation')
    .drop_duplicates()
    .reset_index(drop=True)
)

# Copy the occupation frequency table to the system clipboard.
occupation_counts = df_occupation.occupation.value_counts().reset_index()
occupation_counts.to_clipboard(index=False)

In [41]:
# Write the exploded occupation table to disk. No `index=` argument is passed,
# so pandas also writes the row index as the first CSV column.
df_occupation.to_csv('data/df_indi_occupations.csv')

In [11]:
# Manually annotated occupation list; rows flagged with erase == 1 are excluded.
occupation_annotations = pd.read_csv('data/ENS - True Science - cleaning_top_occupations.csv')
not_erased = occupation_annotations['erase'] != 1
df_occupation_manual = occupation_annotations[not_erased].reset_index(drop=True)

In [12]:
# Spot-check: show the rows of df_occupation whose occupation is 'computer scientist'.
df_occupation[df_occupation['occupation']=='computer scientist']

Unnamed: 0,wikidata_id,name,year,cultural_score,occupation
3034,Q134661,George Boole,"(1830, 1880)",0.19178,computer scientist
3721,Q548672,Luigi Federico Menabrea,"(1820, 1870)",0.121815,computer scientist
9044,Q7259,Ada Lovelace,"(1830, 1880)",0.232519,computer scientist
10994,Q46633,Charles Babbage,"(1800, 1850)",0.238735,computer scientist
12769,Q57403,Wilhelm Schickard,"(1600, 1650)",0.116541,computer scientist
12788,Q964243,Per Georg Scheutz,"(1800, 1850)",0.105297,computer scientist
13399,Q76116,Johann Helfrich von Müller,"(1760, 1810)",0.101496,computer scientist
16641,Q216811,John Venn,"(1840, 1890)",0.125639,computer scientist
19224,Q77762,Karl Philipp Fohr,"(1810, 1860)",0.103314,computer scientist


In [8]:
# criteria for cleaning:
# - everything that is not a scientist
# - everything that is too general, i.e. not domain-specific

In [13]:
def get_top_occupation(df_country, region_name = 'Japan', top_n = 100, allowed_occupations = None):
    """Return the most frequent retained occupations for one region.

    Parameters
    ----------
    df_country : pd.DataFrame
        Frame with at least a ``region_name`` column and an ``occupation``
        column (scalar or list-like per row; list-likes are exploded).
    region_name : str
        Value of ``region_name`` to select.
    top_n : int
        Maximum number of occupations to return.
    allowed_occupations : iterable of str, optional
        Occupations to keep. When omitted, the ``occupation`` column of the
        notebook-level ``df_occupation_manual`` frame is used, which is the
        behaviour this function had before the parameter existed.

    Returns
    -------
    pd.DataFrame
        Columns ``occupation`` and ``count_occupation``, ordered by
        decreasing count, with at most ``top_n`` rows.
    """
    if allowed_occupations is None:
        # Fall back to the notebook global so existing calls keep working.
        allowed_occupations = df_occupation_manual['occupation']
    allowed = list(allowed_occupations)

    in_region = df_country[df_country['region_name'] == region_name]
    exploded = in_region.explode('occupation').reset_index(drop=True)
    retained = exploded[exploded['occupation'].isin(allowed)]

    top_occupations = retained['occupation'].value_counts().head(top_n).reset_index()
    top_occupations.columns = ['occupation', 'count_occupation']

    return top_occupations

def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order of ``lst1`` is preserved, and an item repeated in ``lst1`` is
    repeated in the result.
    """
    common = []
    for item in lst1:
        if item in lst2:
            common.append(item)
    return common

In [14]:
# Regions whose top occupations are compared pairwise.
countries = ['Arabic world', 'Chinese world', 'Japan', 'France', 'Germany', 'United Kingdom', 'Italy']

# Number of top occupations kept per region for the comparison.
TOP_N = 5

pairs = list(itertools.combinations(countries, 2))

final = []

for pair in pairs:

    top_occupations_1 = get_top_occupation(df_country, region_name = pair[0], top_n = TOP_N)
    top_occupations_2 = get_top_occupation(df_country, region_name = pair[1], top_n = TOP_N)

    lst1 = list(top_occupations_1['occupation'])
    lst2 = list(top_occupations_2['occupation'])

    lenght = [len(lst1), len(lst2)]

    # Share (in %) of the first region's top occupations that also appear in
    # the second region's top occupations. The measure is relative to the
    # first region only, so it is not symmetric when the lists differ in size.
    if lst1:
        percent_similarity = round(len(intersection(lst1, lst2))/len(lst1)*100, 2)
    else:
        # A region with no retained occupation would otherwise raise
        # ZeroDivisionError; the similarity is undefined in that case.
        percent_similarity = float('nan')

    final.append({'pair':pair,
                  'percent_similarity':percent_similarity,
                  'len':lenght,
                  'pair_1':lst1,
                  'pair_2':lst2})

In [15]:
# Rank the region pairs from most to least similar and copy the table to the clipboard.
overall = (
    pd.DataFrame(final)
    .sort_values('percent_similarity', ascending=False)
    .reset_index(drop=True)
)
overall.to_clipboard(index = False)

In [16]:
# Same region list as in the pairwise-comparison cell above, re-declared unchanged.
countries = ['Arabic world', 'Chinese world', 'Japan', 'France', 'Germany', 'United Kingdom', 'Italy']

In [17]:
# Top-5 retained occupations for the two regions inspected side by side.
top_occupations_1, top_occupations_2 = (
    get_top_occupation(df_country, region_name = region, top_n = 5)
    for region in ('Japan', 'Italy')
)

In [18]:
# Rows tagged 'naturalist', the distinct individuals behind them, and then
# every occupation row (naturalist or not) belonging to those individuals.
is_naturalist = df_occupation['occupation'] == 'naturalist'
df_spec = df_occupation[is_naturalist]
df_spec_indi = list(set(df_spec['wikidata_id']))
belongs_to_naturalist = df_occupation['wikidata_id'].isin(df_spec_indi)
df_spec_fin = df_occupation[belongs_to_naturalist]
#df_spec_fin = df_spec_fin[df_spec_fin['occupation']!= 'naturalist']

In [19]:
# Number of (non-null) occupation rows per individual in the naturalist subset.
occupations_per_individual = df_spec_fin.groupby('wikidata_id')['occupation'].count()
df_unique = occupations_per_individual.reset_index()

In [20]:
# Inspect the rows of a single individual (Wikidata id Q1001675) in the naturalist subset.
df_spec_fin[df_spec_fin['wikidata_id']=='Q1001675']

Unnamed: 0,wikidata_id,name,year,cultural_score,occupation
62100,Q1001675,Dániel Fischer,"(1710, 1760)",0.10083,naturalist


#### Exploratory analysis of meta-categories

In [23]:
# Occupation -> meta-occupation mapping from the manual annotation sheet:
# drop rows flagged erase == 1, keep occupations with a count of at least 10,
# and reduce to distinct (occupation, meta_occupation) pairs.
df_annotation = (
    pd.read_csv('data/ENS - True Science - cleaning_top_occupations.csv')
    .loc[lambda sheet: sheet['erase'] != 1]
    .drop(columns=['note', 'erase'])
    .loc[lambda sheet: sheet['count_occupation'] >= 10]
    .loc[:, ['occupation', 'meta_occupation']]
    .drop_duplicates()
    .sort_values('meta_occupation')
    .reset_index(drop=True)
)

In [25]:
# These occupations keep their own name as meta-category.
# The assignment goes through a single .loc call: the chained form
# df[col][mask] = value writes to an intermediate object, which raises
# SettingWithCopyWarning and leaves df_annotation unchanged when pandas
# Copy-on-Write is active.
standalone_occupations = ['ecologist', 'biologist', 'naturalist']
is_standalone = df_annotation['occupation'].isin(standalone_occupations)
df_annotation.loc[is_standalone, 'meta_occupation'] = df_annotation.loc[is_standalone, 'occupation']

In [26]:
# Attach the meta-occupation to each (individual, occupation) row, then keep
# the distinct (individual, meta_occupation) pairs.
df_occupation_label = (
    df_occupation
    .merge(df_annotation, on = 'occupation')
    .loc[:, ['wikidata_id', 'meta_occupation']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [27]:
import typing as t
def coocurrence_multiple(data_network: pd.DataFrame, index_var: str, variables: t.List[str]) -> pd.DataFrame:
    """Build a co-occurrence edge list between the values of ``variables``.

    Two values co-occur when they are attached to the same ``index_var``
    entry. Values coming from different columns of ``variables`` are pooled
    together before pairing.

    Parameters
    ----------
    data_network : pd.DataFrame
        Long-format frame containing ``index_var`` and every column listed
        in ``variables``.
    index_var : str
        Column identifying the entity that links values together.
    variables : list of str
        Columns whose values become the nodes of the network.

    Returns
    -------
    pd.DataFrame
        Columns ``source``, ``target`` and ``weight``. The table is directed
        and complete: each pair appears in both orders and self-pairs
        (``source == target``) are included. An empty frame with these
        columns is returned when ``variables`` is empty.
    """
    edge_columns = ["source", "target", "weight"]
    if not variables:
        # pd.concat raises on an empty list; an empty edge list is the natural result.
        return pd.DataFrame(columns=edge_columns)

    per_variable = []
    for var in variables:
        # Distinct (index, value) pairs, so each pair carries a weight of 1.
        distinct_pairs = data_network[[index_var, var]].drop_duplicates()
        weighted = (
            distinct_pairs.groupby([index_var, var])
            .size()
            .rename("weight")
            .reset_index()
            .rename(columns={var: "data"})
        )
        weighted["entity"] = var
        per_variable.append(weighted)

    fin = pd.concat(per_variable)

    # Self-join on the index: every ordered pair of values sharing an index entry.
    df_co = pd.merge(fin, fin, on=index_var)
    df_co["product"] = df_co["weight_x"] * df_co["weight_y"]
    edges = df_co.groupby(["data_x", "data_y"])["product"].sum().reset_index()
    edges.columns = edge_columns

    return edges

In [31]:
# Meta-occupations kept on both ends of an edge.
kept_labels = ['naturalist', 'zoologist', 'anatomist', 'botanist', 'ecologist', 'biologist']

df_res = coocurrence_multiple(df_occupation_label, index_var = 'wikidata_id', variables = ['meta_occupation'])

# Keep edges whose two ends are both in the label list and that are not self-loops.
both_ends_kept = df_res['source'].isin(kept_labels) & df_res['target'].isin(kept_labels)
not_self_loop = df_res["target"] != df_res["source"]
df_res = df_res[both_ends_kept & not_self_loop].reset_index(drop=True)
df_res.to_clipboard(index=False)

In [32]:
# Display the filtered co-occurrence edges; each pair appears once per direction.
df_res

Unnamed: 0,source,target,weight
0,anatomist,biologist,72
1,anatomist,botanist,86
2,anatomist,ecologist,1
3,anatomist,naturalist,33
4,anatomist,zoologist,127
5,biologist,anatomist,72
6,biologist,botanist,154
7,biologist,ecologist,2
8,biologist,naturalist,85
9,biologist,zoologist,203


In [33]:
# Individuals labelled with more than one of the two meta-occupations below.
pair_labels = ['botanist', 'anatomist']
test = (
    df_occupation_label[df_occupation_label['meta_occupation'].isin(pair_labels)]
    .groupby('wikidata_id')['meta_occupation']
    .apply(list)
    .reset_index()
    .assign(len=lambda grouped: grouped['meta_occupation'].apply(len))
    .loc[lambda grouped: grouped['len'] > 1]
)
test

Unnamed: 0,wikidata_id,meta_occupation,len
121,Q107619,"[botanist, anatomist]",2
173,Q110731,"[botanist, anatomist]",2
313,Q1187350,"[botanist, anatomist]",2
318,Q1191107,"[botanist, anatomist]",2
389,Q123225,"[botanist, anatomist]",2
...,...,...,...
5125,Q85511,"[botanist, anatomist]",2
5172,Q87828,"[botanist, anatomist]",2
5387,Q960658,"[botanist, anatomist]",2
5447,Q98053,"[botanist, anatomist]",2


In [17]:
# cleaning steps

In [18]:
# discipline that most often appears in the top 20